* [PATCH 01/15] block: rework flush sequencing for blk-mq
2014-02-05 12:41 [PATCH 00/15] A different approach for using blk-mq in the SCSI layer Christoph Hellwig
@ 2014-02-05 12:41 ` Christoph Hellwig
2014-02-06 2:08 ` Muthu Kumar
2014-02-05 12:41 ` [PATCH 02/15] blk-mq: support at_head insertions for blk_execute_rq Christoph Hellwig
` (15 subsequent siblings)
16 siblings, 1 reply; 31+ messages in thread
From: Christoph Hellwig @ 2014-02-05 12:41 UTC (permalink / raw)
To: Jens Axboe, James Bottomley, Nicholas Bellinger; +Cc: linux-scsi
[-- Attachment #1: 0001-block-rework-flush-sequencing-for-blk-mq.patch --]
[-- Type: text/plain, Size: 10580 bytes --]
Switch to using a preallocated flush_rq for blk-mq similar to what's done
with the old request path. This allows us to set up the request properly
with a tag from the actually allowed range and ->rq_disk as needed by
some drivers. To make life easier we also switch to dynamic allocation
of ->flush_rq for the old path.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
block/blk-core.c | 15 ++++++--
block/blk-flush.c | 97 +++++++++++++++++-------------------------------
block/blk-mq.c | 26 ++++++-------
block/blk-mq.h | 1 +
block/blk-sysfs.c | 2 +
include/linux/blk-mq.h | 2 +-
include/linux/blkdev.h | 9 +----
7 files changed, 65 insertions(+), 87 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index c00e0bd..d3eb330 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -693,11 +693,20 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
if (!uninit_q)
return NULL;
+ uninit_q->flush_rq = kzalloc(sizeof(struct request), GFP_KERNEL);
+ if (!uninit_q->flush_rq)
+ goto out_cleanup_queue;
+
q = blk_init_allocated_queue(uninit_q, rfn, lock);
if (!q)
- blk_cleanup_queue(uninit_q);
-
+ goto out_free_flush_rq;
return q;
+
+out_free_flush_rq:
+ kfree(uninit_q->flush_rq);
+out_cleanup_queue:
+ blk_cleanup_queue(uninit_q);
+ return NULL;
}
EXPORT_SYMBOL(blk_init_queue_node);
@@ -1127,7 +1136,7 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw,
struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
{
if (q->mq_ops)
- return blk_mq_alloc_request(q, rw, gfp_mask, false);
+ return blk_mq_alloc_request(q, rw, gfp_mask);
else
return blk_old_get_request(q, rw, gfp_mask);
}
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 9288aaf..57ef837 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -140,10 +140,16 @@ static void mq_flush_data_run(struct work_struct *work)
blk_mq_run_request(rq, true, false);
}
-static void blk_mq_flush_data_insert(struct request *rq)
+static bool blk_flush_queue_rq(struct request *rq)
{
- INIT_WORK(&rq->mq_flush_data, mq_flush_data_run);
- kblockd_schedule_work(rq->q, &rq->mq_flush_data);
+ if (rq->q->mq_ops) {
+ INIT_WORK(&rq->mq_flush_data, mq_flush_data_run);
+ kblockd_schedule_work(rq->q, &rq->mq_flush_data);
+ return false;
+ } else {
+ list_add_tail(&rq->queuelist, &rq->q->queue_head);
+ return true;
+ }
}
/**
@@ -187,12 +193,7 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
case REQ_FSEQ_DATA:
list_move_tail(&rq->flush.list, &q->flush_data_in_flight);
- if (q->mq_ops)
- blk_mq_flush_data_insert(rq);
- else {
- list_add(&rq->queuelist, &q->queue_head);
- queued = true;
- }
+ queued = blk_flush_queue_rq(rq);
break;
case REQ_FSEQ_DONE:
@@ -216,9 +217,6 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
}
kicked = blk_kick_flush(q);
- /* blk_mq_run_flush will run queue */
- if (q->mq_ops)
- return queued;
return kicked | queued;
}
@@ -230,10 +228,9 @@ static void flush_end_io(struct request *flush_rq, int error)
struct request *rq, *n;
unsigned long flags = 0;
- if (q->mq_ops) {
- blk_mq_free_request(flush_rq);
+ if (q->mq_ops)
spin_lock_irqsave(&q->mq_flush_lock, flags);
- }
+
running = &q->flush_queue[q->flush_running_idx];
BUG_ON(q->flush_pending_idx == q->flush_running_idx);
@@ -263,49 +260,14 @@ static void flush_end_io(struct request *flush_rq, int error)
* kblockd.
*/
if (queued || q->flush_queue_delayed) {
- if (!q->mq_ops)
- blk_run_queue_async(q);
- else
- /*
- * This can be optimized to only run queues with requests
- * queued if necessary.
- */
- blk_mq_run_queues(q, true);
+ WARN_ON(q->mq_ops);
+ blk_run_queue_async(q);
}
q->flush_queue_delayed = 0;
if (q->mq_ops)
spin_unlock_irqrestore(&q->mq_flush_lock, flags);
}
-static void mq_flush_work(struct work_struct *work)
-{
- struct request_queue *q;
- struct request *rq;
-
- q = container_of(work, struct request_queue, mq_flush_work);
-
- /* We don't need set REQ_FLUSH_SEQ, it's for consistency */
- rq = blk_mq_alloc_request(q, WRITE_FLUSH|REQ_FLUSH_SEQ,
- __GFP_WAIT|GFP_ATOMIC, true);
- rq->cmd_type = REQ_TYPE_FS;
- rq->end_io = flush_end_io;
-
- blk_mq_run_request(rq, true, false);
-}
-
-/*
- * We can't directly use q->flush_rq, because it doesn't have tag and is not in
- * hctx->rqs[]. so we must allocate a new request, since we can't sleep here,
- * so offload the work to workqueue.
- *
- * Note: we assume a flush request finished in any hardware queue will flush
- * the whole disk cache.
- */
-static void mq_run_flush(struct request_queue *q)
-{
- kblockd_schedule_work(q, &q->mq_flush_work);
-}
-
/**
* blk_kick_flush - consider issuing flush request
* @q: request_queue being kicked
@@ -340,19 +302,31 @@ static bool blk_kick_flush(struct request_queue *q)
* different from running_idx, which means flush is in flight.
*/
q->flush_pending_idx ^= 1;
+
if (q->mq_ops) {
- mq_run_flush(q);
- return true;
+ struct blk_mq_ctx *ctx = first_rq->mq_ctx;
+ struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu);
+
+ blk_mq_rq_init(hctx, q->flush_rq);
+ q->flush_rq->mq_ctx = ctx;
+
+ /*
+ * Reuse the tag value from the first waiting request,
+ * with blk-mq the tag is generated during request
+ * allocation and drivers can rely on it being inside
+ * the range they asked for.
+ */
+ q->flush_rq->tag = first_rq->tag;
+ } else {
+ blk_rq_init(q, q->flush_rq);
}
- blk_rq_init(q, &q->flush_rq);
- q->flush_rq.cmd_type = REQ_TYPE_FS;
- q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ;
- q->flush_rq.rq_disk = first_rq->rq_disk;
- q->flush_rq.end_io = flush_end_io;
+ q->flush_rq->cmd_type = REQ_TYPE_FS;
+ q->flush_rq->cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ;
+ q->flush_rq->rq_disk = first_rq->rq_disk;
+ q->flush_rq->end_io = flush_end_io;
- list_add_tail(&q->flush_rq.queuelist, &q->queue_head);
- return true;
+ return blk_flush_queue_rq(q->flush_rq);
}
static void flush_data_end_io(struct request *rq, int error)
@@ -558,5 +532,4 @@ EXPORT_SYMBOL(blkdev_issue_flush);
void blk_mq_init_flush(struct request_queue *q)
{
spin_lock_init(&q->mq_flush_lock);
- INIT_WORK(&q->mq_flush_work, mq_flush_work);
}
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 57039fc..5c3073f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -226,15 +226,14 @@ static struct request *blk_mq_alloc_request_pinned(struct request_queue *q,
return rq;
}
-struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
- gfp_t gfp, bool reserved)
+struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp)
{
struct request *rq;
if (blk_mq_queue_enter(q))
return NULL;
- rq = blk_mq_alloc_request_pinned(q, rw, gfp, reserved);
+ rq = blk_mq_alloc_request_pinned(q, rw, gfp, false);
if (rq)
blk_mq_put_ctx(rq->mq_ctx);
return rq;
@@ -258,7 +257,7 @@ EXPORT_SYMBOL(blk_mq_alloc_reserved_request);
/*
* Re-init and set pdu, if we have it
*/
-static void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq)
+void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq)
{
blk_rq_init(hctx->queue, rq);
@@ -1309,15 +1308,6 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg,
reg->queue_depth = BLK_MQ_MAX_DEPTH;
}
- /*
- * Set aside a tag for flush requests. It will only be used while
- * another flush request is in progress but outside the driver.
- *
- * TODO: only allocate if flushes are supported
- */
- reg->queue_depth++;
- reg->reserved_tags++;
-
if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN))
return ERR_PTR(-EINVAL);
@@ -1368,9 +1358,14 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg,
blk_mq_init_flush(q);
blk_mq_init_cpu_queues(q, reg->nr_hw_queues);
- if (blk_mq_init_hw_queues(q, reg, driver_data))
+ q->flush_rq = kzalloc(round_up(sizeof(struct request) + reg->cmd_size,
+ cache_line_size()), GFP_KERNEL);
+ if (!q->flush_rq)
goto err_hw;
+ if (blk_mq_init_hw_queues(q, reg, driver_data))
+ goto err_flush_rq;
+
blk_mq_map_swqueue(q);
mutex_lock(&all_q_mutex);
@@ -1378,6 +1373,9 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg,
mutex_unlock(&all_q_mutex);
return q;
+
+err_flush_rq:
+ kfree(q->flush_rq);
err_hw:
kfree(q->mq_map);
err_map:
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 5c39179..b771080 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -29,6 +29,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_init_flush(struct request_queue *q);
void blk_mq_drain_queue(struct request_queue *q);
void blk_mq_free_queue(struct request_queue *q);
+void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq);
/*
* CPU hotplug helpers
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 8095c4a..7500f87 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -549,6 +549,8 @@ static void blk_release_queue(struct kobject *kobj)
if (q->mq_ops)
blk_mq_free_queue(q);
+ kfree(q->flush_rq);
+
blk_trace_shutdown(q);
bdi_destroy(&q->backing_dev_info);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 161b231..c1684ec 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -123,7 +123,7 @@ void blk_mq_insert_request(struct request_queue *, struct request *, bool);
void blk_mq_run_queues(struct request_queue *q, bool async);
void blk_mq_free_request(struct request *rq);
bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
-struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, bool reserved);
+struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp);
struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, gfp_t gfp);
struct request *blk_mq_rq_from_tag(struct request_queue *q, unsigned int tag);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8678c43..6f02524 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -448,13 +448,8 @@ struct request_queue {
unsigned long flush_pending_since;
struct list_head flush_queue[2];
struct list_head flush_data_in_flight;
- union {
- struct request flush_rq;
- struct {
- spinlock_t mq_flush_lock;
- struct work_struct mq_flush_work;
- };
- };
+ struct request *flush_rq;
+ spinlock_t mq_flush_lock;
struct mutex sysfs_lock;
--
1.7.10.4
^ permalink raw reply related [flat|nested] 31+ messages in thread* Re: [PATCH 01/15] block: rework flush sequencing for blk-mq
2014-02-05 12:41 ` [PATCH 01/15] block: rework flush sequencing for blk-mq Christoph Hellwig
@ 2014-02-06 2:08 ` Muthu Kumar
2014-02-06 16:18 ` Christoph Hellwig
0 siblings, 1 reply; 31+ messages in thread
From: Muthu Kumar @ 2014-02-06 2:08 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Jens Axboe, James Bottomley, Nicholas Bellinger, linux-scsi
On Wed, Feb 5, 2014 at 4:41 AM, Christoph Hellwig <hch@infradead.org> wrote:
> Switch to using a preallocated flush_rq for blk-mq similar to what's done
> with the old request path. This allows us to set up the request properly
> with a tag from the actually allowed range and ->rq_disk as needed by
> some drivers. To make life easier we also switch to dynamic allocation
> of ->flush_rq for the old path.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> block/blk-core.c | 15 ++++++--
> block/blk-flush.c | 97 +++++++++++++++++-------------------------------
> block/blk-mq.c | 26 ++++++-------
> block/blk-mq.h | 1 +
> block/blk-sysfs.c | 2 +
> include/linux/blk-mq.h | 2 +-
> include/linux/blkdev.h | 9 +----
> 7 files changed, 65 insertions(+), 87 deletions(-)
>
> diff --git a/block/blk-core.c b/block/blk-core.c
> index c00e0bd..d3eb330 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -693,11 +693,20 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
> if (!uninit_q)
> return NULL;
>
> + uninit_q->flush_rq = kzalloc(sizeof(struct request), GFP_KERNEL);
Shouldn't this be kzalloc_node()?
Rest in this patch looks ok...
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: [PATCH 01/15] block: rework flush sequencing for blk-mq
2014-02-06 2:08 ` Muthu Kumar
@ 2014-02-06 16:18 ` Christoph Hellwig
0 siblings, 0 replies; 31+ messages in thread
From: Christoph Hellwig @ 2014-02-06 16:18 UTC (permalink / raw)
To: Muthu Kumar
Cc: Christoph Hellwig, Jens Axboe, James Bottomley,
Nicholas Bellinger, linux-scsi
On Wed, Feb 05, 2014 at 06:08:37PM -0800, Muthu Kumar wrote:
> > diff --git a/block/blk-core.c b/block/blk-core.c
> > index c00e0bd..d3eb330 100644
> > --- a/block/blk-core.c
> > +++ b/block/blk-core.c
> > @@ -693,11 +693,20 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
> > if (!uninit_q)
> > return NULL;
> >
> > + uninit_q->flush_rq = kzalloc(sizeof(struct request), GFP_KERNEL);
>
>
> Shouldn't this be kzalloc_node()?
Probably. There's also various kinds of optimization potential like
allocating one per hw_ctx or similar. But until we have a device that
has high enough IOPS to matter and needs cache flushes I wouldn't worry
about optimizing the flush path.
^ permalink raw reply [flat|nested] 31+ messages in thread
* [PATCH 02/15] blk-mq: support at_head insertions for blk_execute_rq
2014-02-05 12:41 [PATCH 00/15] A different approach for using blk-mq in the SCSI layer Christoph Hellwig
2014-02-05 12:41 ` [PATCH 01/15] block: rework flush sequencing for blk-mq Christoph Hellwig
@ 2014-02-05 12:41 ` Christoph Hellwig
2014-02-06 2:27 ` Muthu Kumar
2014-02-05 12:41 ` [PATCH 03/15] blk-mq: divert __blk_put_request for MQ ops Christoph Hellwig
` (14 subsequent siblings)
16 siblings, 1 reply; 31+ messages in thread
From: Christoph Hellwig @ 2014-02-05 12:41 UTC (permalink / raw)
To: Jens Axboe, James Bottomley, Nicholas Bellinger; +Cc: linux-scsi
[-- Attachment #1: 0002-blk-mq-support-at_head-inserations-for-blk_execute_r.patch --]
[-- Type: text/plain, Size: 4141 bytes --]
This is needed for proper SG_IO operation as well as various uses of
blk_execute_rq from the SCSI midlayer.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
block/blk-exec.c | 4 ++--
block/blk-mq.c | 17 ++++++++++-------
block/blk-mq.h | 2 ++
include/linux/blk-mq.h | 1 -
4 files changed, 14 insertions(+), 10 deletions(-)
diff --git a/block/blk-exec.c b/block/blk-exec.c
index bbfc072..16abf91 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -5,10 +5,10 @@
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
-#include <linux/blk-mq.h>
#include <linux/sched/sysctl.h>
#include "blk.h"
+#include "blk-mq.h"
/*
* for max sense size
@@ -65,7 +65,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
* be resued after dying flag is set
*/
if (q->mq_ops) {
- blk_mq_insert_request(q, rq, true);
+ blk_mq_insert_request(q, rq, at_head, true);
return;
}
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 5c3073f..6838fe8 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -692,13 +692,16 @@ static void blk_mq_work_fn(struct work_struct *work)
}
static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
- struct request *rq)
+ struct request *rq, bool at_head)
{
struct blk_mq_ctx *ctx = rq->mq_ctx;
trace_block_rq_insert(hctx->queue, rq);
- list_add_tail(&rq->queuelist, &ctx->rq_list);
+ if (at_head)
+ list_add(&rq->queuelist, &ctx->rq_list);
+ else
+ list_add_tail(&rq->queuelist, &ctx->rq_list);
blk_mq_hctx_mark_pending(hctx, ctx);
/*
@@ -708,7 +711,7 @@ static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
}
void blk_mq_insert_request(struct request_queue *q, struct request *rq,
- bool run_queue)
+ bool at_head, bool run_queue)
{
struct blk_mq_hw_ctx *hctx;
struct blk_mq_ctx *ctx, *current_ctx;
@@ -727,7 +730,7 @@ void blk_mq_insert_request(struct request_queue *q, struct request *rq,
rq->mq_ctx = ctx;
}
spin_lock(&ctx->lock);
- __blk_mq_insert_request(hctx, rq);
+ __blk_mq_insert_request(hctx, rq, at_head);
spin_unlock(&ctx->lock);
blk_mq_put_ctx(current_ctx);
@@ -759,7 +762,7 @@ void blk_mq_run_request(struct request *rq, bool run_queue, bool async)
/* ctx->cpu might be offline */
spin_lock(&ctx->lock);
- __blk_mq_insert_request(hctx, rq);
+ __blk_mq_insert_request(hctx, rq, false);
spin_unlock(&ctx->lock);
blk_mq_put_ctx(current_ctx);
@@ -797,7 +800,7 @@ static void blk_mq_insert_requests(struct request_queue *q,
rq = list_first_entry(list, struct request, queuelist);
list_del_init(&rq->queuelist);
rq->mq_ctx = ctx;
- __blk_mq_insert_request(hctx, rq);
+ __blk_mq_insert_request(hctx, rq, false);
}
spin_unlock(&ctx->lock);
@@ -949,7 +952,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
__blk_mq_free_request(hctx, ctx, rq);
else {
blk_mq_bio_to_request(rq, bio);
- __blk_mq_insert_request(hctx, rq);
+ __blk_mq_insert_request(hctx, rq, false);
}
spin_unlock(&ctx->lock);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index b771080..88a2366 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -30,6 +30,8 @@ void blk_mq_init_flush(struct request_queue *q);
void blk_mq_drain_queue(struct request_queue *q);
void blk_mq_free_queue(struct request_queue *q);
void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq);
+void blk_mq_insert_request(struct request_queue *, struct request *,
+ bool, bool);
/*
* CPU hotplug helpers
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index c1684ec..8cfdca6 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -119,7 +119,6 @@ void blk_mq_init_commands(struct request_queue *, void (*init)(void *data, struc
void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
-void blk_mq_insert_request(struct request_queue *, struct request *, bool);
void blk_mq_run_queues(struct request_queue *q, bool async);
void blk_mq_free_request(struct request *rq);
bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
--
1.7.10.4
^ permalink raw reply related [flat|nested] 31+ messages in thread* Re: [PATCH 02/15] blk-mq: support at_head insertions for blk_execute_rq
2014-02-05 12:41 ` [PATCH 02/15] blk-mq: support at_head insertions for blk_execute_rq Christoph Hellwig
@ 2014-02-06 2:27 ` Muthu Kumar
2014-02-06 16:17 ` Christoph Hellwig
0 siblings, 1 reply; 31+ messages in thread
From: Muthu Kumar @ 2014-02-06 2:27 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Jens Axboe, James Bottomley, Nicholas Bellinger, linux-scsi
> diff --git a/block/blk-mq.h b/block/blk-mq.h
> index b771080..88a2366 100644
> --- a/block/blk-mq.h
> +++ b/block/blk-mq.h
> @@ -30,6 +30,8 @@ void blk_mq_init_flush(struct request_queue *q);
> void blk_mq_drain_queue(struct request_queue *q);
> void blk_mq_free_queue(struct request_queue *q);
> void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq);
> +void blk_mq_insert_request(struct request_queue *, struct request *,
> + bool, bool);
>
> /*
> * CPU hotplug helpers
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index c1684ec..8cfdca6 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -119,7 +119,6 @@ void blk_mq_init_commands(struct request_queue *, void (*init)(void *data, struc
>
> void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
>
> -void blk_mq_insert_request(struct request_queue *, struct request *, bool);
> void blk_mq_run_queues(struct request_queue *q, bool async);
> void blk_mq_free_request(struct request *rq);
> bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
> --
> 1.7.10.4
>
Currently it's not used by any drivers. Are we sure we don't need it
public? If sure, please remove the EXPORT_SYMBOL() for it.
Rest looks good.
Regards,
Muthu
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-scsi" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: [PATCH 02/15] blk-mq: support at_head insertions for blk_execute_rq
2014-02-06 2:27 ` Muthu Kumar
@ 2014-02-06 16:17 ` Christoph Hellwig
2014-02-06 17:05 ` Muthu Kumar
0 siblings, 1 reply; 31+ messages in thread
From: Christoph Hellwig @ 2014-02-06 16:17 UTC (permalink / raw)
To: Muthu Kumar
Cc: Christoph Hellwig, Jens Axboe, James Bottomley,
Nicholas Bellinger, linux-scsi
On Wed, Feb 05, 2014 at 06:27:38PM -0800, Muthu Kumar wrote:
> Currently its not used by any drivers. Are we sure we don't need it
> public? If sure, please remove the EXPORT_SYMBOL() for it.
Drivers shouldn't use it, it's a low-level interface. As mentioned in
the intro this is not quite a coherent series yet, but I'll post the
blk-mq patches in a slightly nicer form against Jens' tree soon.
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: [PATCH 02/15] blk-mq: support at_head insertions for blk_execute_rq
2014-02-06 16:17 ` Christoph Hellwig
@ 2014-02-06 17:05 ` Muthu Kumar
2014-02-06 17:10 ` Christoph Hellwig
0 siblings, 1 reply; 31+ messages in thread
From: Muthu Kumar @ 2014-02-06 17:05 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Jens Axboe, James Bottomley, Nicholas Bellinger, linux-scsi,
Bart Van Assche
On Thu, Feb 6, 2014 at 8:17 AM, Christoph Hellwig <hch@infradead.org> wrote:
> On Wed, Feb 05, 2014 at 06:27:38PM -0800, Muthu Kumar wrote:
>> Currently its not used by any drivers. Are we sure we don't need it
>> public? If sure, please remove the EXPORT_SYMBOL() for it.
>
> Drivers shouldn't use it, it's a low-level interface. As mentioned in
> the intro this is not quite a coherent series yet, but I'll post the
> blk-mq patches in a slightly nicer form agains Jens' tree soon.
>
Alright then.. I will wait for that patch. BTW, is the scsi-mq work
done on your git tree only, or does anyone else have their own git tree
(Bart? NAB?)?
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: [PATCH 02/15] blk-mq: support at_head insertions for blk_execute_rq
2014-02-06 17:05 ` Muthu Kumar
@ 2014-02-06 17:10 ` Christoph Hellwig
0 siblings, 0 replies; 31+ messages in thread
From: Christoph Hellwig @ 2014-02-06 17:10 UTC (permalink / raw)
To: Muthu Kumar
Cc: Christoph Hellwig, Jens Axboe, James Bottomley,
Nicholas Bellinger, linux-scsi, Bart Van Assche
On Thu, Feb 06, 2014 at 09:05:10AM -0800, Muthu Kumar wrote:
> Alright then.. I will wait for that patch. BTW, is the scsi-mq work
> done on your git tree only or anyone else has their own git tree
For now I put out a git tree for those who don't like large patch
series. Happy to take patches for it, but we'll have to see who
has the energy to keep the tree in the long run.
That being said I hope to feed patches into the upstream trees as
quickly as possible and thus only keep a small stack of patches around.
^ permalink raw reply [flat|nested] 31+ messages in thread
* [PATCH 03/15] blk-mq: divert __blk_put_request for MQ ops
2014-02-05 12:41 [PATCH 00/15] A different approach for using blk-mq in the SCSI layer Christoph Hellwig
2014-02-05 12:41 ` [PATCH 01/15] block: rework flush sequencing for blk-mq Christoph Hellwig
2014-02-05 12:41 ` [PATCH 02/15] blk-mq: support at_head insertions for blk_execute_rq Christoph Hellwig
@ 2014-02-05 12:41 ` Christoph Hellwig
2014-02-05 12:41 ` [PATCH 04/15] blk-mq: handle dma_drain_size Christoph Hellwig
` (13 subsequent siblings)
16 siblings, 0 replies; 31+ messages in thread
From: Christoph Hellwig @ 2014-02-05 12:41 UTC (permalink / raw)
To: Jens Axboe, James Bottomley, Nicholas Bellinger; +Cc: linux-scsi
[-- Attachment #1: 0003-blk-mq-divert-__blk_put_request-for-MQ-ops.patch --]
[-- Type: text/plain, Size: 673 bytes --]
__blk_put_request needs to call into the blk-mq code just like
blk_put_request. As we don't have the queue lock in this case both
end up calling the same function.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
block/blk-core.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/block/blk-core.c b/block/blk-core.c
index d3eb330..853f927 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1287,6 +1287,11 @@ void __blk_put_request(struct request_queue *q, struct request *req)
if (unlikely(!q))
return;
+ if (q->mq_ops) {
+ blk_mq_free_request(req);
+ return;
+ }
+
blk_pm_put_request(req);
elv_completed_request(q, req);
--
1.7.10.4
^ permalink raw reply related [flat|nested] 31+ messages in thread* [PATCH 04/15] blk-mq: handle dma_drain_size
2014-02-05 12:41 [PATCH 00/15] A different approach for using blk-mq in the SCSI layer Christoph Hellwig
` (2 preceding siblings ...)
2014-02-05 12:41 ` [PATCH 03/15] blk-mq: divert __blk_put_request for MQ ops Christoph Hellwig
@ 2014-02-05 12:41 ` Christoph Hellwig
2014-02-05 12:41 ` [PATCH 05/15] blk-mq: initialize sg_reserved_size Christoph Hellwig
` (12 subsequent siblings)
16 siblings, 0 replies; 31+ messages in thread
From: Christoph Hellwig @ 2014-02-05 12:41 UTC (permalink / raw)
To: Jens Axboe, James Bottomley, Nicholas Bellinger; +Cc: linux-scsi
[-- Attachment #1: 0004-blk-mq-handle-dma_drain_size.patch --]
[-- Type: text/plain, Size: 876 bytes --]
Make blk-mq handle the dma_drain_size field the same way as the old request
path.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
block/blk-mq.c | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 6838fe8..0b72927 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -560,6 +560,16 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
list_del_init(&rq->queuelist);
blk_mq_start_request(rq);
+ if (q->dma_drain_size && blk_rq_bytes(rq)) {
+ /*
+ * make sure space for the drain appears we
+ * know we can do this because max_hw_segments
+ * has been adjusted to be one fewer than the
+ * device can handle
+ */
+ rq->nr_phys_segments++;
+ }
+
/*
* Last request in the series. Flag it as such, this
* enables drivers to know when IO should be kicked off,
--
1.7.10.4
^ permalink raw reply related [flat|nested] 31+ messages in thread* [PATCH 05/15] blk-mq: initialize sg_reserved_size
2014-02-05 12:41 [PATCH 00/15] A different approach for using blk-mq in the SCSI layer Christoph Hellwig
` (3 preceding siblings ...)
2014-02-05 12:41 ` [PATCH 04/15] blk-mq: handle dma_drain_size Christoph Hellwig
@ 2014-02-05 12:41 ` Christoph Hellwig
2014-02-05 12:41 ` [PATCH 06/15] scsi: reintroduce scsi_driver.init_command Christoph Hellwig
` (11 subsequent siblings)
16 siblings, 0 replies; 31+ messages in thread
From: Christoph Hellwig @ 2014-02-05 12:41 UTC (permalink / raw)
To: Jens Axboe, James Bottomley, Nicholas Bellinger; +Cc: linux-scsi
[-- Attachment #1: 0005-blk-mq-initialize-sg_reserved_size.patch --]
[-- Type: text/plain, Size: 599 bytes --]
To behave the same way as the old request path.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
block/blk-mq.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 0b72927..d116282 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1363,6 +1363,8 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg,
q->mq_ops = reg->ops;
q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
+ q->sg_reserved_size = INT_MAX;
+
blk_queue_make_request(q, blk_mq_make_request);
blk_queue_rq_timed_out(q, reg->ops->timeout);
if (reg->timeout)
--
1.7.10.4
^ permalink raw reply related [flat|nested] 31+ messages in thread* [PATCH 06/15] scsi: reintroduce scsi_driver.init_command
2014-02-05 12:41 [PATCH 00/15] A different approach for using blk-mq in the SCSI layer Christoph Hellwig
` (4 preceding siblings ...)
2014-02-05 12:41 ` [PATCH 05/15] blk-mq: initialize sg_reserved_size Christoph Hellwig
@ 2014-02-05 12:41 ` Christoph Hellwig
2014-02-05 12:41 ` [PATCH 07/15] block: remove unprep_rq_fn Christoph Hellwig
` (10 subsequent siblings)
16 siblings, 0 replies; 31+ messages in thread
From: Christoph Hellwig @ 2014-02-05 12:41 UTC (permalink / raw)
To: Jens Axboe, James Bottomley, Nicholas Bellinger; +Cc: linux-scsi
[-- Attachment #1: 0006-scsi-reintroduce-scsi_driver.init_command.patch --]
[-- Type: text/plain, Size: 11439 bytes --]
Instead of letting the ULD play games with the prep_fn move back to
the model of a central prep_fn with a callback to the ULD. This
already cleans up and shortens the code by itself, and will be required
to properly support blk-mq in the SCSI midlayer.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
drivers/scsi/scsi_lib.c | 57 +++++++++++++++++++++++---------------------
drivers/scsi/sd.c | 46 +++++++++++++----------------------
drivers/scsi/sr.c | 19 +++++----------
include/scsi/scsi_driver.h | 8 ++-----
4 files changed, 54 insertions(+), 76 deletions(-)
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 83c7e37..a206da5 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -482,6 +482,9 @@ static void scsi_requeue_command(struct request_queue *q, struct scsi_cmnd *cmd)
struct request *req = cmd->request;
unsigned long flags;
+ if (req->cmd_type == REQ_TYPE_FS)
+ scsi_cmd_to_driver(cmd)->uninit_command(cmd);
+
spin_lock_irqsave(q->queue_lock, flags);
blk_unprep_request(req);
req->special = NULL;
@@ -1123,15 +1126,7 @@ static struct scsi_cmnd *scsi_get_cmd_from_req(struct scsi_device *sdev,
int scsi_setup_blk_pc_cmnd(struct scsi_device *sdev, struct request *req)
{
- struct scsi_cmnd *cmd;
- int ret = scsi_prep_state_check(sdev, req);
-
- if (ret != BLKPREP_OK)
- return ret;
-
- cmd = scsi_get_cmd_from_req(sdev, req);
- if (unlikely(!cmd))
- return BLKPREP_DEFER;
+ struct scsi_cmnd *cmd = req->special;
/*
* BLOCK_PC requests may transfer data, in which case they must
@@ -1175,15 +1170,11 @@ EXPORT_SYMBOL(scsi_setup_blk_pc_cmnd);
*/
int scsi_setup_fs_cmnd(struct scsi_device *sdev, struct request *req)
{
- struct scsi_cmnd *cmd;
- int ret = scsi_prep_state_check(sdev, req);
-
- if (ret != BLKPREP_OK)
- return ret;
+ struct scsi_cmnd *cmd = req->special;
if (unlikely(sdev->scsi_dh_data && sdev->scsi_dh_data->scsi_dh
&& sdev->scsi_dh_data->scsi_dh->prep_fn)) {
- ret = sdev->scsi_dh_data->scsi_dh->prep_fn(sdev, req);
+ int ret = sdev->scsi_dh_data->scsi_dh->prep_fn(sdev, req);
if (ret != BLKPREP_OK)
return ret;
}
@@ -1193,16 +1184,13 @@ int scsi_setup_fs_cmnd(struct scsi_device *sdev, struct request *req)
*/
BUG_ON(!req->nr_phys_segments);
- cmd = scsi_get_cmd_from_req(sdev, req);
- if (unlikely(!cmd))
- return BLKPREP_DEFER;
-
memset(cmd->cmnd, 0, BLK_MAX_CDB);
return scsi_init_io(cmd, GFP_ATOMIC);
}
EXPORT_SYMBOL(scsi_setup_fs_cmnd);
-int scsi_prep_state_check(struct scsi_device *sdev, struct request *req)
+static int
+scsi_prep_state_check(struct scsi_device *sdev, struct request *req)
{
int ret = BLKPREP_OK;
@@ -1254,9 +1242,9 @@ int scsi_prep_state_check(struct scsi_device *sdev, struct request *req)
}
return ret;
}
-EXPORT_SYMBOL(scsi_prep_state_check);
-int scsi_prep_return(struct request_queue *q, struct request *req, int ret)
+static int
+scsi_prep_return(struct request_queue *q, struct request *req, int ret)
{
struct scsi_device *sdev = q->queuedata;
@@ -1287,18 +1275,33 @@ int scsi_prep_return(struct request_queue *q, struct request *req, int ret)
return ret;
}
-EXPORT_SYMBOL(scsi_prep_return);
-int scsi_prep_fn(struct request_queue *q, struct request *req)
+static int scsi_prep_fn(struct request_queue *q, struct request *req)
{
struct scsi_device *sdev = q->queuedata;
- int ret = BLKPREP_KILL;
+ struct scsi_cmnd *cmd;
+ int ret;
- if (req->cmd_type == REQ_TYPE_BLOCK_PC)
+ ret = scsi_prep_state_check(sdev, req);
+ if (ret != BLKPREP_OK)
+ goto out;
+
+ cmd = scsi_get_cmd_from_req(sdev, req);
+ if (unlikely(!cmd)) {
+ ret = BLKPREP_DEFER;
+ goto out;
+ }
+
+ if (req->cmd_type == REQ_TYPE_FS)
+ ret = scsi_cmd_to_driver(cmd)->init_command(cmd);
+ else if (req->cmd_type == REQ_TYPE_BLOCK_PC)
ret = scsi_setup_blk_pc_cmnd(sdev, req);
+ else
+ ret = BLKPREP_KILL;
+
+out:
return scsi_prep_return(q, req, ret);
}
-EXPORT_SYMBOL(scsi_prep_fn);
/*
* scsi_dev_queue_ready: if we can send requests to sdev, return 1 else
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 470954a..33e349b 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -109,6 +109,8 @@ static int sd_suspend_system(struct device *);
static int sd_suspend_runtime(struct device *);
static int sd_resume(struct device *);
static void sd_rescan(struct device *);
+static int sd_init_command(struct scsi_cmnd *SCpnt);
+static void sd_uninit_command(struct scsi_cmnd *SCpnt);
static int sd_done(struct scsi_cmnd *);
static int sd_eh_action(struct scsi_cmnd *, int);
static void sd_read_capacity(struct scsi_disk *sdkp, unsigned char *buffer);
@@ -503,6 +505,8 @@ static struct scsi_driver sd_template = {
.pm = &sd_pm_ops,
},
.rescan = sd_rescan,
+ .init_command = sd_init_command,
+ .uninit_command = sd_uninit_command,
.done = sd_done,
.eh_action = sd_eh_action,
};
@@ -838,9 +842,9 @@ static int scsi_setup_flush_cmnd(struct scsi_device *sdp, struct request *rq)
return scsi_setup_blk_pc_cmnd(sdp, rq);
}
-static void sd_unprep_fn(struct request_queue *q, struct request *rq)
+static void sd_uninit_command(struct scsi_cmnd *SCpnt)
{
- struct scsi_cmnd *SCpnt = rq->special;
+ struct request *rq = SCpnt->request;
if (rq->cmd_flags & REQ_DISCARD) {
free_page((unsigned long)rq->buffer);
@@ -853,18 +857,10 @@ static void sd_unprep_fn(struct request_queue *q, struct request *rq)
}
}
-/**
- * sd_prep_fn - build a scsi (read or write) command from
- * information in the request structure.
- * @SCpnt: pointer to mid-level's per scsi command structure that
- * contains request and into which the scsi command is written
- *
- * Returns 1 if successful and 0 if error (or cannot be done now).
- **/
-static int sd_prep_fn(struct request_queue *q, struct request *rq)
+static int sd_init_command(struct scsi_cmnd *SCpnt)
{
- struct scsi_cmnd *SCpnt;
- struct scsi_device *sdp = q->queuedata;
+ struct request *rq = SCpnt->request;
+ struct scsi_device *sdp = SCpnt->device;
struct gendisk *disk = rq->rq_disk;
struct scsi_disk *sdkp;
sector_t block = blk_rq_pos(rq);
@@ -886,12 +882,6 @@ static int sd_prep_fn(struct request_queue *q, struct request *rq)
} else if (rq->cmd_flags & REQ_FLUSH) {
ret = scsi_setup_flush_cmnd(sdp, rq);
goto out;
- } else if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
- ret = scsi_setup_blk_pc_cmnd(sdp, rq);
- goto out;
- } else if (rq->cmd_type != REQ_TYPE_FS) {
- ret = BLKPREP_KILL;
- goto out;
}
ret = scsi_setup_fs_cmnd(sdp, rq);
if (ret != BLKPREP_OK)
@@ -903,11 +893,10 @@ static int sd_prep_fn(struct request_queue *q, struct request *rq)
* is used for a killable error condition */
ret = BLKPREP_KILL;
- SCSI_LOG_HLQUEUE(1, scmd_printk(KERN_INFO, SCpnt,
- "sd_prep_fn: block=%llu, "
- "count=%d\n",
- (unsigned long long)block,
- this_count));
+ SCSI_LOG_HLQUEUE(1,
+ scmd_printk(KERN_INFO, SCpnt,
+ "%s: block=%llu, count=%d\n",
+ __func__, (unsigned long long)block, this_count));
if (!sdp || !scsi_device_online(sdp) ||
block + blk_rq_sectors(rq) > get_capacity(disk)) {
@@ -1127,7 +1116,7 @@ static int sd_prep_fn(struct request_queue *q, struct request *rq)
*/
ret = BLKPREP_OK;
out:
- return scsi_prep_return(q, rq, ret);
+ return ret;
}
/**
@@ -1663,6 +1652,8 @@ static int sd_done(struct scsi_cmnd *SCpnt)
unsigned char op = SCpnt->cmnd[0];
unsigned char unmap = SCpnt->cmnd[1] & 8;
+ sd_uninit_command(SCpnt);
+
if (req->cmd_flags & REQ_DISCARD || req->cmd_flags & REQ_WRITE_SAME) {
if (!result) {
good_bytes = blk_rq_bytes(req);
@@ -2872,9 +2863,6 @@ static void sd_probe_async(void *data, async_cookie_t cookie)
sd_revalidate_disk(gd);
- blk_queue_prep_rq(sdp->request_queue, sd_prep_fn);
- blk_queue_unprep_rq(sdp->request_queue, sd_unprep_fn);
-
gd->driverfs_dev = &sdp->sdev_gendev;
gd->flags = GENHD_FL_EXT_DEVT;
if (sdp->removable) {
@@ -3021,8 +3009,6 @@ static int sd_remove(struct device *dev)
scsi_autopm_get_device(sdkp->device);
async_synchronize_full_domain(&scsi_sd_probe_domain);
- blk_queue_prep_rq(sdkp->device->request_queue, scsi_prep_fn);
- blk_queue_unprep_rq(sdkp->device->request_queue, NULL);
device_del(&sdkp->dev);
del_gendisk(sdkp->disk);
sd_shutdown(dev);
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index 40d8592..93cbd36 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -79,6 +79,7 @@ MODULE_ALIAS_SCSI_DEVICE(TYPE_WORM);
static DEFINE_MUTEX(sr_mutex);
static int sr_probe(struct device *);
static int sr_remove(struct device *);
+static int sr_init_command(struct scsi_cmnd *SCpnt);
static int sr_done(struct scsi_cmnd *);
static int sr_runtime_suspend(struct device *dev);
@@ -94,6 +95,7 @@ static struct scsi_driver sr_template = {
.remove = sr_remove,
.pm = &sr_pm_ops,
},
+ .init_command = sr_init_command,
.done = sr_done,
};
@@ -378,21 +380,14 @@ static int sr_done(struct scsi_cmnd *SCpnt)
return good_bytes;
}
-static int sr_prep_fn(struct request_queue *q, struct request *rq)
+static int sr_init_command(struct scsi_cmnd *SCpnt)
{
int block = 0, this_count, s_size;
struct scsi_cd *cd;
- struct scsi_cmnd *SCpnt;
- struct scsi_device *sdp = q->queuedata;
+ struct request *rq = SCpnt->request;
+ struct scsi_device *sdp = SCpnt->device;
int ret;
- if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
- ret = scsi_setup_blk_pc_cmnd(sdp, rq);
- goto out;
- } else if (rq->cmd_type != REQ_TYPE_FS) {
- ret = BLKPREP_KILL;
- goto out;
- }
ret = scsi_setup_fs_cmnd(sdp, rq);
if (ret != BLKPREP_OK)
goto out;
@@ -517,7 +512,7 @@ static int sr_prep_fn(struct request_queue *q, struct request *rq)
*/
ret = BLKPREP_OK;
out:
- return scsi_prep_return(q, rq, ret);
+ return ret;
}
static int sr_block_open(struct block_device *bdev, fmode_t mode)
@@ -718,7 +713,6 @@ static int sr_probe(struct device *dev)
/* FIXME: need to handle a get_capabilities failure properly ?? */
get_capabilities(cd);
- blk_queue_prep_rq(sdev->request_queue, sr_prep_fn);
sr_vendor_init(cd);
disk->driverfs_dev = &sdev->sdev_gendev;
@@ -993,7 +987,6 @@ static int sr_remove(struct device *dev)
scsi_autopm_get_device(cd->device);
- blk_queue_prep_rq(cd->device->request_queue, scsi_prep_fn);
del_gendisk(cd->disk);
mutex_lock(&sr_ref_mutex);
diff --git a/include/scsi/scsi_driver.h b/include/scsi/scsi_driver.h
index 20fdfc2..b507729 100644
--- a/include/scsi/scsi_driver.h
+++ b/include/scsi/scsi_driver.h
@@ -6,15 +6,14 @@
struct module;
struct scsi_cmnd;
struct scsi_device;
-struct request;
-struct request_queue;
-
struct scsi_driver {
struct module *owner;
struct device_driver gendrv;
void (*rescan)(struct device *);
+ int (*init_command)(struct scsi_cmnd *);
+ void (*uninit_command)(struct scsi_cmnd *);
int (*done)(struct scsi_cmnd *);
int (*eh_action)(struct scsi_cmnd *, int);
};
@@ -31,8 +30,5 @@ extern int scsi_register_interface(struct class_interface *);
int scsi_setup_blk_pc_cmnd(struct scsi_device *sdev, struct request *req);
int scsi_setup_fs_cmnd(struct scsi_device *sdev, struct request *req);
-int scsi_prep_state_check(struct scsi_device *sdev, struct request *req);
-int scsi_prep_return(struct request_queue *q, struct request *req, int ret);
-int scsi_prep_fn(struct request_queue *, struct request *);
#endif /* _SCSI_SCSI_DRIVER_H */
--
1.7.10.4
^ permalink raw reply related [flat|nested] 31+ messages in thread* [PATCH 07/15] block: remove unprep_rq_fn
2014-02-05 12:41 [PATCH 00/15] A different approach for using blk-mq in the SCSI layer Christoph Hellwig
` (5 preceding siblings ...)
2014-02-05 12:41 ` [PATCH 06/15] scsi: reintroduce scsi_driver.init_command Christoph Hellwig
@ 2014-02-05 12:41 ` Christoph Hellwig
2014-02-05 12:41 ` [PATCH 08/15] scsi: cleanup scsi_end_request calling conventions Christoph Hellwig
` (9 subsequent siblings)
16 siblings, 0 replies; 31+ messages in thread
From: Christoph Hellwig @ 2014-02-05 12:41 UTC (permalink / raw)
To: Jens Axboe, James Bottomley, Nicholas Bellinger; +Cc: linux-scsi
[-- Attachment #1: 0007-block-remove-unprep_rq_fn.patch --]
[-- Type: text/plain, Size: 3764 bytes --]
Now that SCSI doesn't use it anymore, there's no user left.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
block/blk-core.c | 11 ++---------
block/blk-settings.c | 17 -----------------
include/linux/blkdev.h | 3 ---
3 files changed, 2 insertions(+), 29 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index 853f927..59a4207 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -722,7 +722,6 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
q->request_fn = rfn;
q->prep_rq_fn = NULL;
- q->unprep_rq_fn = NULL;
q->queue_flags |= QUEUE_FLAG_DEFAULT;
/* Override internal queue lock with supplied lock pointer */
@@ -2486,18 +2485,12 @@ static bool blk_update_bidi_request(struct request *rq, int error,
* @req: the request
*
* This function makes a request ready for complete resubmission (or
- * completion). It happens only after all error handling is complete,
- * so represents the appropriate moment to deallocate any resources
- * that were allocated to the request in the prep_rq_fn. The queue
- * lock is held when calling this.
+ * completion). It happens only after all error handling is complete.
+ * The queue lock is held when calling this.
*/
void blk_unprep_request(struct request *req)
{
- struct request_queue *q = req->q;
-
req->cmd_flags &= ~REQ_DONTPREP;
- if (q->unprep_rq_fn)
- q->unprep_rq_fn(q, req);
}
EXPORT_SYMBOL_GPL(blk_unprep_request);
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 5d21239..47a266c 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -37,23 +37,6 @@ void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn)
EXPORT_SYMBOL(blk_queue_prep_rq);
/**
- * blk_queue_unprep_rq - set an unprepare_request function for queue
- * @q: queue
- * @ufn: unprepare_request function
- *
- * It's possible for a queue to register an unprepare_request callback
- * which is invoked before the request is finally completed. The goal
- * of the function is to deallocate any data that was allocated in the
- * prepare_request callback.
- *
- */
-void blk_queue_unprep_rq(struct request_queue *q, unprep_rq_fn *ufn)
-{
- q->unprep_rq_fn = ufn;
-}
-EXPORT_SYMBOL(blk_queue_unprep_rq);
-
-/**
* blk_queue_merge_bvec - set a merge_bvec function for queue
* @q: queue
* @mbfn: merge_bvec_fn
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6f02524..67c21a4 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -223,7 +223,6 @@ struct blk_queue_ctx;
typedef void (request_fn_proc) (struct request_queue *q);
typedef void (make_request_fn) (struct request_queue *q, struct bio *bio);
typedef int (prep_rq_fn) (struct request_queue *, struct request *);
-typedef void (unprep_rq_fn) (struct request_queue *, struct request *);
struct bio_vec;
struct bvec_merge_data {
@@ -312,7 +311,6 @@ struct request_queue {
request_fn_proc *request_fn;
make_request_fn *make_request_fn;
prep_rq_fn *prep_rq_fn;
- unprep_rq_fn *unprep_rq_fn;
merge_bvec_fn *merge_bvec_fn;
softirq_done_fn *softirq_done_fn;
rq_timed_out_fn *rq_timed_out_fn;
@@ -985,7 +983,6 @@ extern int blk_queue_dma_drain(struct request_queue *q,
extern void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn);
extern void blk_queue_segment_boundary(struct request_queue *, unsigned long);
extern void blk_queue_prep_rq(struct request_queue *, prep_rq_fn *pfn);
-extern void blk_queue_unprep_rq(struct request_queue *, unprep_rq_fn *ufn);
extern void blk_queue_merge_bvec(struct request_queue *, merge_bvec_fn *);
extern void blk_queue_dma_alignment(struct request_queue *, int);
extern void blk_queue_update_dma_alignment(struct request_queue *, int);
--
1.7.10.4
^ permalink raw reply related [flat|nested] 31+ messages in thread* [PATCH 08/15] scsi: cleanup scsi_end_request calling conventions
2014-02-05 12:41 [PATCH 00/15] A different approach for using blk-mq in the SCSI layer Christoph Hellwig
` (6 preceding siblings ...)
2014-02-05 12:41 ` [PATCH 07/15] block: remove unprep_rq_fn Christoph Hellwig
@ 2014-02-05 12:41 ` Christoph Hellwig
2014-02-05 12:41 ` [PATCH 09/15] scsi: centralize command re-queueing in scsi_dispatch_fn Christoph Hellwig
` (8 subsequent siblings)
16 siblings, 0 replies; 31+ messages in thread
From: Christoph Hellwig @ 2014-02-05 12:41 UTC (permalink / raw)
To: Jens Axboe, James Bottomley, Nicholas Bellinger; +Cc: linux-scsi
[-- Attachment #1: 0008-scsi-cleanup-scsi_end_request-calling-conventions.patch --]
[-- Type: text/plain, Size: 2704 bytes --]
Don't bother returning a stale pointer when a bool does the work much
better.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
drivers/scsi/scsi_lib.c | 34 ++++++++++++++++------------------
1 file changed, 16 insertions(+), 18 deletions(-)
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index a206da5..c6fc552 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -531,17 +531,16 @@ static void __scsi_release_buffers(struct scsi_cmnd *, int);
*
* Lock status: Assumed that lock is not held upon entry.
*
- * Returns: cmd if requeue required, NULL otherwise.
+ * Returns: false if requeue required, true otherwise.
*
* Notes: This is called for block device requests in order to
* mark some number of sectors as complete.
*
* We are guaranteeing that the request queue will be goosed
* at some point during this call.
- * Notes: If cmd was requeued, upon return it will be a stale pointer.
*/
-static struct scsi_cmnd *scsi_end_request(struct scsi_cmnd *cmd, int error,
- int bytes, int requeue)
+static bool scsi_end_request(struct scsi_cmnd *cmd, int error, int bytes,
+ int requeue)
{
struct request_queue *q = cmd->device->request_queue;
struct request *req = cmd->request;
@@ -554,19 +553,18 @@ static struct scsi_cmnd *scsi_end_request(struct scsi_cmnd *cmd, int error,
/* kill remainder if no retrys */
if (error && scsi_noretry_cmd(cmd))
blk_end_request_all(req, error);
- else {
- if (requeue) {
- /*
- * Bleah. Leftovers again. Stick the
- * leftovers in the front of the
- * queue, and goose the queue again.
- */
- scsi_release_buffers(cmd);
- scsi_requeue_command(q, cmd);
- cmd = NULL;
- }
- return cmd;
+ else if (requeue) {
+ /*
+ * Bleah. Leftovers again. Stick the
+ * leftovers in the front of the
+ * queue, and goose the queue again.
+ */
+ scsi_release_buffers(cmd);
+ scsi_requeue_command(q, cmd);
+ return true;
}
+
+ return false;
}
/*
@@ -575,7 +573,7 @@ static struct scsi_cmnd *scsi_end_request(struct scsi_cmnd *cmd, int error,
*/
__scsi_release_buffers(cmd, 0);
scsi_next_command(cmd);
- return NULL;
+ return true;
}
static inline unsigned int scsi_sgtable_index(unsigned short nents)
@@ -847,7 +845,7 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
* are leftovers and there is some kind of error
* (result != 0), retry the rest.
*/
- if (scsi_end_request(cmd, error, good_bytes, result == 0) == NULL)
+ if (scsi_end_request(cmd, error, good_bytes, result == 0))
return;
error = __scsi_error_from_host_byte(cmd, result);
--
1.7.10.4
^ permalink raw reply related [flat|nested] 31+ messages in thread* [PATCH 09/15] scsi: centralize command re-queueing in scsi_dispatch_fn
2014-02-05 12:41 [PATCH 00/15] A different approach for using blk-mq in the SCSI layer Christoph Hellwig
` (7 preceding siblings ...)
2014-02-05 12:41 ` [PATCH 08/15] scsi: cleanup scsi_end_request calling conventions Christoph Hellwig
@ 2014-02-05 12:41 ` Christoph Hellwig
2014-02-05 12:41 ` [PATCH 10/15] scsi: split __scsi_queue_insert Christoph Hellwig
` (7 subsequent siblings)
16 siblings, 0 replies; 31+ messages in thread
From: Christoph Hellwig @ 2014-02-05 12:41 UTC (permalink / raw)
To: Jens Axboe, James Bottomley, Nicholas Bellinger; +Cc: linux-scsi
[-- Attachment #1: 0009-scsi-centralize-command-re-queueing-in-scsi_dispatch.patch --]
[-- Type: text/plain, Size: 3126 bytes --]
Make sure we only have the logic for requeueing commands in one place.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
drivers/scsi/scsi.c | 36 +++++++++++++-----------------------
drivers/scsi/scsi_lib.c | 6 ++++--
2 files changed, 17 insertions(+), 25 deletions(-)
diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
index 5cb935a..adb8bfb 100644
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -620,9 +620,7 @@ int scsi_dispatch_cmd(struct scsi_cmnd *cmd)
* returns an immediate error upwards, and signals
* that the device is no longer present */
cmd->result = DID_NO_CONNECT << 16;
- scsi_done(cmd);
- /* return 0 (because the command has been processed) */
- goto out;
+ goto done;
}
/* Check to see if the scsi lld made this device blocked. */
@@ -634,16 +632,8 @@ int scsi_dispatch_cmd(struct scsi_cmnd *cmd)
* occur until the device transitions out of the
* suspend state.
*/
-
- scsi_queue_insert(cmd, SCSI_MLQUEUE_DEVICE_BUSY);
-
SCSI_LOG_MLQUEUE(3, printk("queuecommand : device blocked \n"));
-
- /*
- * NOTE: rtn is still zero here because we don't need the
- * queue to be plugged on return (it's already stopped)
- */
- goto out;
+ return SCSI_MLQUEUE_DEVICE_BUSY;
}
/*
@@ -667,35 +657,35 @@ int scsi_dispatch_cmd(struct scsi_cmnd *cmd)
"cdb_size=%d host->max_cmd_len=%d\n",
cmd->cmd_len, cmd->device->host->max_cmd_len));
cmd->result = (DID_ABORT << 16);
-
- scsi_done(cmd);
- goto out;
+ goto done;
}
if (unlikely(host->shost_state == SHOST_DEL)) {
cmd->result = (DID_NO_CONNECT << 16);
- scsi_done(cmd);
- } else {
- trace_scsi_dispatch_cmd_start(cmd);
- cmd->scsi_done = scsi_done;
- rtn = host->hostt->queuecommand(host, cmd);
+ goto done;
+
}
+ trace_scsi_dispatch_cmd_start(cmd);
+
+ cmd->scsi_done = scsi_done;
+ rtn = host->hostt->queuecommand(host, cmd);
if (rtn) {
trace_scsi_dispatch_cmd_error(cmd, rtn);
if (rtn != SCSI_MLQUEUE_DEVICE_BUSY &&
rtn != SCSI_MLQUEUE_TARGET_BUSY)
rtn = SCSI_MLQUEUE_HOST_BUSY;
- scsi_queue_insert(cmd, rtn);
-
SCSI_LOG_MLQUEUE(3,
printk("queuecommand : request rejected\n"));
}
- out:
SCSI_LOG_MLQUEUE(3, printk("leaving scsi_dispatch_cmnd()\n"));
return rtn;
+ done:
+ SCSI_LOG_MLQUEUE(3, printk("scsi_dispatch_cmnd() failed\n"));
+ scsi_done(cmd);
+ return 0;
}
/**
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index c6fc552..5f4b70b 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1633,8 +1633,10 @@ static void scsi_request_fn(struct request_queue *q)
*/
rtn = scsi_dispatch_cmd(cmd);
spin_lock_irq(q->queue_lock);
- if (rtn)
+ if (rtn) {
+ scsi_queue_insert(cmd, rtn);
goto out_delay;
+ }
}
return;
@@ -1654,7 +1656,7 @@ static void scsi_request_fn(struct request_queue *q)
blk_requeue_request(q, req);
atomic_dec(&sdev->device_busy);
out_delay:
- if (atomic_read(&sdev->device_busy) == 0)
+ if (atomic_read(&sdev->device_busy) == 0 && !scsi_device_blocked(sdev))
blk_delay_queue(q, SCSI_QUEUE_DELAY);
}
--
1.7.10.4
^ permalink raw reply related [flat|nested] 31+ messages in thread* [PATCH 10/15] scsi: split __scsi_queue_insert
2014-02-05 12:41 [PATCH 00/15] A different approach for using blk-mq in the SCSI layer Christoph Hellwig
` (8 preceding siblings ...)
2014-02-05 12:41 ` [PATCH 09/15] scsi: centralize command re-queueing in scsi_dispatch_fn Christoph Hellwig
@ 2014-02-05 12:41 ` Christoph Hellwig
2014-02-05 12:41 ` [PATCH 11/15] scsi: factor out __scsi_init_queue Christoph Hellwig
` (6 subsequent siblings)
16 siblings, 0 replies; 31+ messages in thread
From: Christoph Hellwig @ 2014-02-05 12:41 UTC (permalink / raw)
To: Jens Axboe, James Bottomley, Nicholas Bellinger; +Cc: linux-scsi
[-- Attachment #1: 0010-scsi-split-__scsi_queue_insert.patch --]
[-- Type: text/plain, Size: 2535 bytes --]
Factor out a helper to set the _blocked values, which we'll reuse for the
blk-mq code path.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
drivers/scsi/scsi_lib.c | 44 ++++++++++++++++++++++++++------------------
1 file changed, 26 insertions(+), 18 deletions(-)
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 5f4b70b..3b3c3ab 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -75,28 +75,12 @@ struct kmem_cache *scsi_sdb_cache;
*/
#define SCSI_QUEUE_DELAY 3
-/**
- * __scsi_queue_insert - private queue insertion
- * @cmd: The SCSI command being requeued
- * @reason: The reason for the requeue
- * @unbusy: Whether the queue should be unbusied
- *
- * This is a private queue insertion. The public interface
- * scsi_queue_insert() always assumes the queue should be unbusied
- * because it's always called before the completion. This function is
- * for a requeue after completion, which should only occur in this
- * file.
- */
-static void __scsi_queue_insert(struct scsi_cmnd *cmd, int reason, int unbusy)
+static void
+scsi_set_blocked(struct scsi_cmnd *cmd, int reason)
{
struct Scsi_Host *host = cmd->device->host;
struct scsi_device *device = cmd->device;
struct scsi_target *starget = scsi_target(device);
- struct request_queue *q = device->request_queue;
- unsigned long flags;
-
- SCSI_LOG_MLQUEUE(1,
- printk("Inserting command %p into mlqueue\n", cmd));
/*
* Set the appropriate busy bit for the device/host.
@@ -125,6 +109,30 @@ static void __scsi_queue_insert(struct scsi_cmnd *cmd, int reason, int unbusy)
starget->max_target_blocked);
break;
}
+}
+
+/**
+ * __scsi_queue_insert - private queue insertion
+ * @cmd: The SCSI command being requeued
+ * @reason: The reason for the requeue
+ * @unbusy: Whether the queue should be unbusied
+ *
+ * This is a private queue insertion. The public interface
+ * scsi_queue_insert() always assumes the queue should be unbusied
+ * because it's always called before the completion. This function is
+ * for a requeue after completion, which should only occur in this
+ * file.
+ */
+static void __scsi_queue_insert(struct scsi_cmnd *cmd, int reason, int unbusy)
+{
+ struct scsi_device *device = cmd->device;
+ struct request_queue *q = device->request_queue;
+ unsigned long flags;
+
+ SCSI_LOG_MLQUEUE(1,
+ printk("Inserting command %p into mlqueue\n", cmd));
+
+ scsi_set_blocked(cmd, reason);
/*
* Decrement the counters, since these commands are no longer
--
1.7.10.4
^ permalink raw reply related [flat|nested] 31+ messages in thread* [PATCH 11/15] scsi: factor out __scsi_init_queue
2014-02-05 12:41 [PATCH 00/15] A different approach for using blk-mq in the SCSI layer Christoph Hellwig
` (9 preceding siblings ...)
2014-02-05 12:41 ` [PATCH 10/15] scsi: split __scsi_queue_insert Christoph Hellwig
@ 2014-02-05 12:41 ` Christoph Hellwig
2014-02-05 12:41 ` [PATCH 12/15] scsi: initial blk-mq support Christoph Hellwig
` (5 subsequent siblings)
16 siblings, 0 replies; 31+ messages in thread
From: Christoph Hellwig @ 2014-02-05 12:41 UTC (permalink / raw)
To: Jens Axboe, James Bottomley, Nicholas Bellinger; +Cc: linux-scsi
[-- Attachment #1: 0011-scsi-factor-out-__scsi_init_queue.patch --]
[-- Type: text/plain, Size: 1440 bytes --]
Factor common code out so that it can be reused independently for the blk-mq
code path.
Split from a bigger patch by Nicholas Bellinger.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
drivers/scsi/scsi_lib.c | 18 +++++++++++-------
1 file changed, 11 insertions(+), 7 deletions(-)
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 3b3c3ab..e67950c 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1690,16 +1690,10 @@ u64 scsi_calculate_bounce_limit(struct Scsi_Host *shost)
}
EXPORT_SYMBOL(scsi_calculate_bounce_limit);
-struct request_queue *__scsi_alloc_queue(struct Scsi_Host *shost,
- request_fn_proc *request_fn)
+static void __scsi_init_queue(struct Scsi_Host *shost, struct request_queue *q)
{
- struct request_queue *q;
struct device *dev = shost->dma_dev;
- q = blk_init_queue(request_fn, NULL);
- if (!q)
- return NULL;
-
/*
* this limit is imposed by hardware restrictions
*/
@@ -1730,7 +1724,17 @@ struct request_queue *__scsi_alloc_queue(struct Scsi_Host *shost,
* blk_queue_update_dma_alignment() later.
*/
blk_queue_dma_alignment(q, 0x03);
+}
+struct request_queue *__scsi_alloc_queue(struct Scsi_Host *shost,
+ request_fn_proc *request_fn)
+{
+ struct request_queue *q;
+
+ q = blk_init_queue(request_fn, NULL);
+ if (!q)
+ return NULL;
+ __scsi_init_queue(shost, q);
return q;
}
EXPORT_SYMBOL(__scsi_alloc_queue);
--
1.7.10.4
^ permalink raw reply related [flat|nested] 31+ messages in thread* [PATCH 12/15] scsi: initial blk-mq support
2014-02-05 12:41 [PATCH 00/15] A different approach for using blk-mq in the SCSI layer Christoph Hellwig
` (10 preceding siblings ...)
2014-02-05 12:41 ` [PATCH 11/15] scsi: factor out __scsi_init_queue Christoph Hellwig
@ 2014-02-05 12:41 ` Christoph Hellwig
2014-02-06 8:38 ` Sagi Grimberg
2014-02-06 22:11 ` Nicholas A. Bellinger
2014-02-05 12:41 ` [PATCH 13/15] scsi: partially stub out scsi_adjust_queue_depth when using blk-mq Christoph Hellwig
` (4 subsequent siblings)
16 siblings, 2 replies; 31+ messages in thread
From: Christoph Hellwig @ 2014-02-05 12:41 UTC (permalink / raw)
To: Jens Axboe, James Bottomley, Nicholas Bellinger; +Cc: linux-scsi
[-- Attachment #1: 0012-scsi-initial-blk-mq-support.patch --]
[-- Type: text/plain, Size: 11992 bytes --]
Add support for using the blk-mq code to submit requests to SCSI
drivers. There is very little blk-mq specific code, but that's
partially because important functionality like partial completions
and request requeueing is still missing in blk-mq. I hope to keep
most of the additions for these in the blk-mq core instead of the
SCSI layer, though.
Based on the earlier scsi-mq prototype by Nicholas Bellinger, although
not a whole lot of actual code is left.
Not-quite-signed-off-yet-by: Christoph Hellwig <hch@lst.de>
---
drivers/scsi/scsi.c | 36 ++++++-
drivers/scsi/scsi_lib.c | 244 ++++++++++++++++++++++++++++++++++++++++++++--
drivers/scsi/scsi_priv.h | 2 +
drivers/scsi/scsi_scan.c | 5 +-
include/scsi/scsi_host.h | 3 +
5 files changed, 278 insertions(+), 12 deletions(-)
diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
index adb8bfb..cf5c110 100644
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -44,6 +44,7 @@
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
#include <linux/delay.h>
#include <linux/init.h>
#include <linux/completion.h>
@@ -688,6 +689,33 @@ int scsi_dispatch_cmd(struct scsi_cmnd *cmd)
return 0;
}
+static void scsi_softirq_done_remote(void *data)
+{
+ return scsi_softirq_done(data);
+}
+
+static void scsi_mq_done(struct request *req)
+{
+ int cpu;
+
+#if 0
+ if (!ctx->ipi_redirect)
+ return scsi_softirq_done(cmd);
+#endif
+
+ cpu = get_cpu();
+ if (cpu != req->cpu && cpu_online(req->cpu)) {
+ req->csd.func = scsi_softirq_done_remote;
+ req->csd.info = req;
+ req->csd.flags = 0;
+ __smp_call_function_single(req->cpu, &req->csd, 0);
+ } else {
+ scsi_softirq_done(req);
+ }
+
+ put_cpu();
+}
+
/**
* scsi_done - Invoke completion on finished SCSI command.
* @cmd: The SCSI Command for which a low-level device driver (LLDD) gives
@@ -701,8 +729,14 @@ int scsi_dispatch_cmd(struct scsi_cmnd *cmd)
*/
static void scsi_done(struct scsi_cmnd *cmd)
{
+ struct request *req = cmd->request;
+
trace_scsi_dispatch_cmd_done(cmd);
- blk_complete_request(cmd->request);
+
+ if (req->mq_ctx)
+ scsi_mq_done(req);
+ else
+ blk_complete_request(req);
}
/**
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index e67950c..8dd8893 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -20,6 +20,7 @@
#include <linux/delay.h>
#include <linux/hardirq.h>
#include <linux/scatterlist.h>
+#include <linux/blk-mq.h>
#include <scsi/scsi.h>
#include <scsi/scsi_cmnd.h>
@@ -554,6 +555,15 @@ static bool scsi_end_request(struct scsi_cmnd *cmd, int error, int bytes,
struct request *req = cmd->request;
/*
+ * XXX: need to handle partial completions and retries here.
+ */
+ if (req->mq_ctx) {
+ blk_mq_end_io(req, error);
+ put_device(&cmd->device->sdev_gendev);
+ return true;
+ }
+
+ /*
* If there are blocks left over at the end, set up the command
* to queue the remainder of them.
*/
@@ -1014,12 +1024,15 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb,
{
int count;
- /*
- * If sg table allocation fails, requeue request later.
- */
- if (unlikely(scsi_alloc_sgtable(sdb, req->nr_phys_segments,
- gfp_mask))) {
- return BLKPREP_DEFER;
+ BUG_ON(req->nr_phys_segments > SCSI_MAX_SG_SEGMENTS);
+
+ if (!req->mq_ctx) {
+ /*
+ * If sg table allocation fails, requeue request later.
+ */
+ if (unlikely(scsi_alloc_sgtable(sdb, req->nr_phys_segments,
+ gfp_mask)))
+ return BLKPREP_DEFER;
}
req->buffer = NULL;
@@ -1075,9 +1088,11 @@ int scsi_init_io(struct scsi_cmnd *cmd, gfp_t gfp_mask)
BUG_ON(prot_sdb == NULL);
ivecs = blk_rq_count_integrity_sg(rq->q, rq->bio);
- if (scsi_alloc_sgtable(prot_sdb, ivecs, gfp_mask)) {
- error = BLKPREP_DEFER;
- goto err_exit;
+ if (!rq->mq_ctx) {
+ if (scsi_alloc_sgtable(prot_sdb, ivecs, gfp_mask)) {
+ error = BLKPREP_DEFER;
+ goto err_exit;
+ }
}
count = blk_rq_map_integrity_sg(rq->q, rq->bio,
@@ -1505,7 +1520,7 @@ static void scsi_kill_request(struct request *req, struct request_queue *q)
blk_complete_request(req);
}
-static void scsi_softirq_done(struct request *rq)
+void scsi_softirq_done(struct request *rq)
{
struct scsi_cmnd *cmd = rq->special;
unsigned long wait_for = (cmd->allowed + 1) * rq->timeout;
@@ -1533,9 +1548,11 @@ static void scsi_softirq_done(struct request *rq)
scsi_finish_command(cmd);
break;
case NEEDS_RETRY:
+ WARN_ON(rq->mq_ctx);
scsi_queue_insert(cmd, SCSI_MLQUEUE_EH_RETRY);
break;
case ADD_TO_MLQUEUE:
+ WARN_ON(rq->mq_ctx);
scsi_queue_insert(cmd, SCSI_MLQUEUE_DEVICE_BUSY);
break;
default:
@@ -1668,6 +1685,120 @@ out_delay:
blk_delay_queue(q, SCSI_QUEUE_DELAY);
}
+static int scsi_mq_prep_fn(struct request *req)
+{
+ struct scsi_cmnd *cmd = req->special;
+ int ret;
+
+ ret = scsi_prep_state_check(cmd->device, req);
+ if (ret != BLKPREP_OK)
+ goto out;
+
+ if (req->cmd_type == REQ_TYPE_FS)
+ ret = scsi_cmd_to_driver(cmd)->init_command(cmd);
+ else if (req->cmd_type == REQ_TYPE_BLOCK_PC)
+ ret = scsi_setup_blk_pc_cmnd(cmd->device, req);
+ else
+ ret = BLKPREP_KILL;
+
+out:
+ switch (ret) {
+ case BLKPREP_OK:
+ return 0;
+ case BLKPREP_DEFER:
+ return BLK_MQ_RQ_QUEUE_BUSY;
+ default:
+ req->errors = DID_NO_CONNECT << 16;
+ return BLK_MQ_RQ_QUEUE_ERROR;
+ }
+}
+
+static int scsi_mq_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
+{
+ struct request_queue *q = rq->q;
+ struct scsi_device *sdev = q->queuedata;
+ struct Scsi_Host *shost = sdev->host;
+ struct scsi_cmnd *cmd = rq->special;
+ unsigned char *sense_buf = cmd->sense_buffer;
+ struct scatterlist *sg;
+ int ret = BLK_MQ_RQ_QUEUE_BUSY;
+ int reason;
+
+ /*
+ * blk-mq stores this in the mq_ctx, which can't be derferenced by
+ * drivers. For now use the old per-request field, but there must be
+ * a better way.
+ */
+ rq->cpu = raw_smp_processor_id();
+
+ if (!get_device(&sdev->sdev_gendev))
+ goto out;
+
+ if (!scsi_dev_queue_ready(q, sdev))
+ goto out_put_device;
+ if (!scsi_target_queue_ready(shost, sdev))
+ goto out_dec_device_busy;
+ if (!scsi_host_queue_ready(q, shost, sdev))
+ goto out_dec_target_busy;
+
+ memset(sense_buf, 0, SCSI_SENSE_BUFFERSIZE);
+ memset(cmd, 0, sizeof(struct scsi_cmnd));
+
+ cmd->request = rq;
+ cmd->device = sdev;
+ cmd->sense_buffer = sense_buf;
+
+ cmd->tag = rq->tag;
+ cmd->cmnd = rq->cmd;
+ cmd->prot_op = SCSI_PROT_NORMAL;
+
+ sg = (void *)cmd + sizeof(struct scsi_cmnd) + shost->hostt->cmd_size;
+
+ if (rq->nr_phys_segments) {
+ cmd->sdb.table.sgl = sg;
+ cmd->sdb.table.nents = rq->nr_phys_segments;
+ sg_init_table(cmd->sdb.table.sgl, rq->nr_phys_segments);
+ }
+
+ if (scsi_host_get_prot(shost)) {
+ cmd->prot_sdb = (void *)sg +
+ shost->sg_tablesize * sizeof(struct scatterlist);
+ memset(cmd->prot_sdb, 0, sizeof(struct scsi_data_buffer));
+
+ cmd->prot_sdb->table.sgl =
+ (struct scatterlist *)(cmd->prot_sdb + 1);
+ }
+
+ ret = scsi_mq_prep_fn(rq);
+ if (ret)
+ goto out_dec_host_busy;
+
+ scsi_init_cmd_errh(cmd);
+
+ reason = scsi_dispatch_cmd(cmd);
+ if (reason) {
+ scsi_set_blocked(cmd, reason);
+ goto out_uninit;
+ }
+
+ return BLK_MQ_RQ_QUEUE_OK;
+
+out_uninit:
+ if (rq->cmd_type == REQ_TYPE_FS)
+ scsi_cmd_to_driver(cmd)->uninit_command(cmd);
+out_dec_host_busy:
+ atomic_dec(&shost->host_busy);
+out_dec_target_busy:
+ atomic_dec(&scsi_target(sdev)->target_busy);
+out_dec_device_busy:
+ atomic_dec(&sdev->device_busy);
+ /* XXX: delay queue if device_busy == 0 */
+out_put_device:
+ put_device(&sdev->sdev_gendev);
+out:
+ return ret;
+}
+
u64 scsi_calculate_bounce_limit(struct Scsi_Host *shost)
{
struct device *host_dev;
@@ -1754,6 +1885,99 @@ struct request_queue *scsi_alloc_queue(struct scsi_device *sdev)
return q;
}
+static struct blk_mq_ops scsi_mq_ops = {
+ .queue_rq = scsi_mq_queue_rq,
+ .map_queue = blk_mq_map_queue,
+ .alloc_hctx = blk_mq_alloc_single_hw_queue,
+ .free_hctx = blk_mq_free_single_hw_queue,
+};
+
+struct request_queue *scsi_mq_alloc_queue(struct scsi_device *sdev)
+{
+ struct Scsi_Host *shost = sdev->host;
+ struct blk_mq_hw_ctx *hctx;
+ struct request_queue *q;
+ struct request *rq;
+ struct scsi_cmnd *cmd;
+ struct blk_mq_reg reg;
+ int i, j, sgl_size;
+
+ memset(&reg, 0, sizeof(reg));
+ reg.ops = &scsi_mq_ops;
+ reg.queue_depth = shost->cmd_per_lun;
+ if (!reg.queue_depth)
+ reg.queue_depth = 1;
+
+ /* XXX: what to do about chained S/G lists? */
+ if (shost->hostt->sg_tablesize > SCSI_MAX_SG_SEGMENTS)
+ shost->sg_tablesize = SCSI_MAX_SG_SEGMENTS;
+ sgl_size = shost->sg_tablesize * sizeof(struct scatterlist);
+
+ reg.cmd_size = sizeof(struct scsi_cmnd) +
+ sgl_size +
+ shost->hostt->cmd_size;
+ if (scsi_host_get_prot(shost))
+ reg.cmd_size += sizeof(struct scsi_data_buffer) + sgl_size;
+ reg.numa_node = NUMA_NO_NODE;
+ reg.nr_hw_queues = 1;
+ reg.flags = BLK_MQ_F_SHOULD_MERGE;
+
+ q = blk_mq_init_queue(&reg, sdev);
+ if (IS_ERR(q)) {
+ printk("blk_mq_init_queue failed\n");
+ return NULL;
+ }
+
+ blk_queue_prep_rq(q, scsi_prep_fn);
+ sdev->request_queue = q;
+ q->queuedata = sdev;
+
+ __scsi_init_queue(shost, q);
+
+ /*
+ * XXX: figure out if we can get alignment right to allocate the sense
+ * buffer with the other chunks of memory.
+ *
+ * If not we'll need to find a way to have the blk-mq core call us to
+ * allocate/free commands so that we can properly clean up the
+ * allocation instead of leaking it.
+ */
+ queue_for_each_hw_ctx(q, hctx, i) {
+ for (j = 0; j < hctx->queue_depth; j++) {
+ rq = hctx->rqs[j];
+ cmd = rq->special;
+
+ cmd->sense_buffer = kzalloc_node(SCSI_SENSE_BUFFERSIZE,
+ GFP_KERNEL, reg.numa_node);
+ if (!cmd->sense_buffer)
+ goto out_free_sense_buffers;
+ }
+ }
+
+ rq = q->flush_rq;
+ cmd = blk_mq_rq_to_pdu(rq);
+
+ cmd->sense_buffer = kzalloc_node(SCSI_SENSE_BUFFERSIZE,
+ GFP_KERNEL, reg.numa_node);
+ if (!cmd->sense_buffer)
+ goto out_free_sense_buffers;
+
+ return q;
+
+out_free_sense_buffers:
+ queue_for_each_hw_ctx(q, hctx, i) {
+ for (j = 0; j < hctx->queue_depth; j++) {
+ rq = hctx->rqs[j];
+ cmd = rq->special;
+
+ kfree(cmd->sense_buffer);
+ }
+ }
+
+ blk_cleanup_queue(q);
+ return NULL;
+}
+
/*
* Function: scsi_block_requests()
*
diff --git a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h
index f079a59..712cec2 100644
--- a/drivers/scsi/scsi_priv.h
+++ b/drivers/scsi/scsi_priv.h
@@ -88,8 +88,10 @@ extern void scsi_next_command(struct scsi_cmnd *cmd);
extern void scsi_io_completion(struct scsi_cmnd *, unsigned int);
extern void scsi_run_host_queues(struct Scsi_Host *shost);
extern struct request_queue *scsi_alloc_queue(struct scsi_device *sdev);
+extern struct request_queue *scsi_mq_alloc_queue(struct scsi_device *sdev);
extern int scsi_init_queue(void);
extern void scsi_exit_queue(void);
+extern void scsi_softirq_done(struct request *rq);
struct request_queue;
struct request;
extern struct kmem_cache *scsi_sdb_cache;
diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
index 307a811..c807bc2 100644
--- a/drivers/scsi/scsi_scan.c
+++ b/drivers/scsi/scsi_scan.c
@@ -277,7 +277,10 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget,
*/
sdev->borken = 1;
- sdev->request_queue = scsi_alloc_queue(sdev);
+ if (shost->hostt->use_blk_mq)
+ sdev->request_queue = scsi_mq_alloc_queue(sdev);
+ else
+ sdev->request_queue = scsi_alloc_queue(sdev);
if (!sdev->request_queue) {
/* release fn is set up in scsi_sysfs_device_initialise, so
* have to free and put manually here */
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index c4e4875..d2661cb 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -531,6 +531,9 @@ struct scsi_host_template {
*/
unsigned int cmd_size;
struct scsi_host_cmd_pool *cmd_pool;
+
+ /* temporary flag to use blk-mq I/O path */
+ bool use_blk_mq;
};
/*
--
1.7.10.4
^ permalink raw reply related [flat|nested] 31+ messages in thread* Re: [PATCH 12/15] scsi: initial blk-mq support
2014-02-05 12:41 ` [PATCH 12/15] scsi: initial blk-mq support Christoph Hellwig
@ 2014-02-06 8:38 ` Sagi Grimberg
2014-02-06 16:16 ` Christoph Hellwig
2014-02-06 22:11 ` Nicholas A. Bellinger
1 sibling, 1 reply; 31+ messages in thread
From: Sagi Grimberg @ 2014-02-06 8:38 UTC (permalink / raw)
To: Christoph Hellwig, Jens Axboe, James Bottomley,
Nicholas Bellinger
Cc: linux-scsi
On 2/5/2014 2:41 PM, Christoph Hellwig wrote:
> Add support for using the blk-mq code to submit requests to SCSI
> drivers. There is very little blk-mq specific code, but that's
> partially because important functionality like partial completions
> and request requeueing is still missing in blk-mq. I hope to keep
> most of the additions for these in the blk-mq core instead of the
> SCSI layer, though.
>
> Based on the earlier scsi-mq prototype by Nicholas Bellinger, although
> not a whole lot of actual code is left.
>
> Not-quite-signed-off-yet-by: Christoph Hellwig <hch@lst.de>
> ---
> drivers/scsi/scsi.c | 36 ++++++-
> drivers/scsi/scsi_lib.c | 244 ++++++++++++++++++++++++++++++++++++++++++++--
> drivers/scsi/scsi_priv.h | 2 +
> drivers/scsi/scsi_scan.c | 5 +-
> include/scsi/scsi_host.h | 3 +
> 5 files changed, 278 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
> index adb8bfb..cf5c110 100644
> --- a/drivers/scsi/scsi.c
> +++ b/drivers/scsi/scsi.c
> @@ -44,6 +44,7 @@
> #include <linux/string.h>
> #include <linux/slab.h>
> #include <linux/blkdev.h>
> +#include <linux/blk-mq.h>
> #include <linux/delay.h>
> #include <linux/init.h>
> #include <linux/completion.h>
> @@ -688,6 +689,33 @@ int scsi_dispatch_cmd(struct scsi_cmnd *cmd)
> return 0;
> }
>
> +static void scsi_softirq_done_remote(void *data)
> +{
> + return scsi_softirq_done(data);
> +}
> +
> +static void scsi_mq_done(struct request *req)
> +{
> + int cpu;
> +
> +#if 0
> + if (!ctx->ipi_redirect)
> + return scsi_softirq_done(cmd);
> +#endif
> +
> + cpu = get_cpu();
> + if (cpu != req->cpu && cpu_online(req->cpu)) {
> + req->csd.func = scsi_softirq_done_remote;
> + req->csd.info = req;
> + req->csd.flags = 0;
> + __smp_call_function_single(req->cpu, &req->csd, 0);
> + } else {
> + scsi_softirq_done(req);
> + }
> +
> + put_cpu();
> +}
> +
> /**
> * scsi_done - Invoke completion on finished SCSI command.
> * @cmd: The SCSI Command for which a low-level device driver (LLDD) gives
> @@ -701,8 +729,14 @@ int scsi_dispatch_cmd(struct scsi_cmnd *cmd)
> */
> static void scsi_done(struct scsi_cmnd *cmd)
> {
> + struct request *req = cmd->request;
> +
> trace_scsi_dispatch_cmd_done(cmd);
> - blk_complete_request(cmd->request);
> +
> + if (req->mq_ctx)
> + scsi_mq_done(req);
> + else
> + blk_complete_request(req);
> }
>
> /**
> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
> index e67950c..8dd8893 100644
> --- a/drivers/scsi/scsi_lib.c
> +++ b/drivers/scsi/scsi_lib.c
> @@ -20,6 +20,7 @@
> #include <linux/delay.h>
> #include <linux/hardirq.h>
> #include <linux/scatterlist.h>
> +#include <linux/blk-mq.h>
>
> #include <scsi/scsi.h>
> #include <scsi/scsi_cmnd.h>
> @@ -554,6 +555,15 @@ static bool scsi_end_request(struct scsi_cmnd *cmd, int error, int bytes,
> struct request *req = cmd->request;
>
> /*
> + * XXX: need to handle partial completions and retries here.
> + */
> + if (req->mq_ctx) {
> + blk_mq_end_io(req, error);
> + put_device(&cmd->device->sdev_gendev);
> + return true;
> + }
> +
> + /*
> * If there are blocks left over at the end, set up the command
> * to queue the remainder of them.
> */
> @@ -1014,12 +1024,15 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb,
> {
> int count;
>
> - /*
> - * If sg table allocation fails, requeue request later.
> - */
> - if (unlikely(scsi_alloc_sgtable(sdb, req->nr_phys_segments,
> - gfp_mask))) {
> - return BLKPREP_DEFER;
> + BUG_ON(req->nr_phys_segments > SCSI_MAX_SG_SEGMENTS);
> +
> + if (!req->mq_ctx) {
> + /*
> + * If sg table allocation fails, requeue request later.
> + */
> + if (unlikely(scsi_alloc_sgtable(sdb, req->nr_phys_segments,
> + gfp_mask)))
> + return BLKPREP_DEFER;
> }
>
> req->buffer = NULL;
> @@ -1075,9 +1088,11 @@ int scsi_init_io(struct scsi_cmnd *cmd, gfp_t gfp_mask)
> BUG_ON(prot_sdb == NULL);
> ivecs = blk_rq_count_integrity_sg(rq->q, rq->bio);
>
> - if (scsi_alloc_sgtable(prot_sdb, ivecs, gfp_mask)) {
> - error = BLKPREP_DEFER;
> - goto err_exit;
> + if (!rq->mq_ctx) {
> + if (scsi_alloc_sgtable(prot_sdb, ivecs, gfp_mask)) {
> + error = BLKPREP_DEFER;
> + goto err_exit;
> + }
> }
>
> count = blk_rq_map_integrity_sg(rq->q, rq->bio,
> @@ -1505,7 +1520,7 @@ static void scsi_kill_request(struct request *req, struct request_queue *q)
> blk_complete_request(req);
> }
>
> -static void scsi_softirq_done(struct request *rq)
> +void scsi_softirq_done(struct request *rq)
> {
> struct scsi_cmnd *cmd = rq->special;
> unsigned long wait_for = (cmd->allowed + 1) * rq->timeout;
> @@ -1533,9 +1548,11 @@ static void scsi_softirq_done(struct request *rq)
> scsi_finish_command(cmd);
> break;
> case NEEDS_RETRY:
> + WARN_ON(rq->mq_ctx);
> scsi_queue_insert(cmd, SCSI_MLQUEUE_EH_RETRY);
> break;
> case ADD_TO_MLQUEUE:
> + WARN_ON(rq->mq_ctx);
> scsi_queue_insert(cmd, SCSI_MLQUEUE_DEVICE_BUSY);
> break;
> default:
> @@ -1668,6 +1685,120 @@ out_delay:
> blk_delay_queue(q, SCSI_QUEUE_DELAY);
> }
>
> +static int scsi_mq_prep_fn(struct request *req)
> +{
> + struct scsi_cmnd *cmd = req->special;
> + int ret;
> +
> + ret = scsi_prep_state_check(cmd->device, req);
> + if (ret != BLKPREP_OK)
> + goto out;
> +
> + if (req->cmd_type == REQ_TYPE_FS)
> + ret = scsi_cmd_to_driver(cmd)->init_command(cmd);
> + else if (req->cmd_type == REQ_TYPE_BLOCK_PC)
> + ret = scsi_setup_blk_pc_cmnd(cmd->device, req);
> + else
> + ret = BLKPREP_KILL;
> +
> +out:
> + switch (ret) {
> + case BLKPREP_OK:
> + return 0;
> + case BLKPREP_DEFER:
> + return BLK_MQ_RQ_QUEUE_BUSY;
> + default:
> + req->errors = DID_NO_CONNECT << 16;
> + return BLK_MQ_RQ_QUEUE_ERROR;
> + }
> +}
> +
> +static int scsi_mq_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
> +{
> + struct request_queue *q = rq->q;
> + struct scsi_device *sdev = q->queuedata;
> + struct Scsi_Host *shost = sdev->host;
> + struct scsi_cmnd *cmd = rq->special;
> + unsigned char *sense_buf = cmd->sense_buffer;
> + struct scatterlist *sg;
> + int ret = BLK_MQ_RQ_QUEUE_BUSY;
> + int reason;
> +
> + /*
> + * blk-mq stores this in the mq_ctx, which can't be dereferenced by
> + * drivers. For now use the old per-request field, but there must be
> + * a better way.
> + */
> + rq->cpu = raw_smp_processor_id();
> +
> + if (!get_device(&sdev->sdev_gendev))
> + goto out;
> +
> + if (!scsi_dev_queue_ready(q, sdev))
> + goto out_put_device;
> + if (!scsi_target_queue_ready(shost, sdev))
> + goto out_dec_device_busy;
> + if (!scsi_host_queue_ready(q, shost, sdev))
> + goto out_dec_target_busy;
> +
> + memset(sense_buf, 0, SCSI_SENSE_BUFFERSIZE);
> + memset(cmd, 0, sizeof(struct scsi_cmnd));
> +
> + cmd->request = rq;
> + cmd->device = sdev;
> + cmd->sense_buffer = sense_buf;
> +
> + cmd->tag = rq->tag;
> + cmd->cmnd = rq->cmd;
> + cmd->prot_op = SCSI_PROT_NORMAL;
> +
> + sg = (void *)cmd + sizeof(struct scsi_cmnd) + shost->hostt->cmd_size;
> +
> + if (rq->nr_phys_segments) {
> + cmd->sdb.table.sgl = sg;
> + cmd->sdb.table.nents = rq->nr_phys_segments;
> + sg_init_table(cmd->sdb.table.sgl, rq->nr_phys_segments);
> + }
> +
> + if (scsi_host_get_prot(shost)) {
> + cmd->prot_sdb = (void *)sg +
> + shost->sg_tablesize * sizeof(struct scatterlist);
> + memset(cmd->prot_sdb, 0, sizeof(struct scsi_data_buffer));
> +
> + cmd->prot_sdb->table.sgl =
> + (struct scatterlist *)(cmd->prot_sdb + 1);
> + }
> +
> + ret = scsi_mq_prep_fn(rq);
> + if (ret)
> + goto out_dec_host_busy;
> +
> + scsi_init_cmd_errh(cmd);
> +
> + reason = scsi_dispatch_cmd(cmd);
> + if (reason) {
> + scsi_set_blocked(cmd, reason);
> + goto out_uninit;
> + }
> +
> + return BLK_MQ_RQ_QUEUE_OK;
> +
> +out_uninit:
> + if (rq->cmd_type == REQ_TYPE_FS)
> + scsi_cmd_to_driver(cmd)->uninit_command(cmd);
> +out_dec_host_busy:
> + atomic_dec(&shost->host_busy);
> +out_dec_target_busy:
> + atomic_dec(&scsi_target(sdev)->target_busy);
> +out_dec_device_busy:
> + atomic_dec(&sdev->device_busy);
> + /* XXX: delay queue if device_busy == 0 */
> +out_put_device:
> + put_device(&sdev->sdev_gendev);
> +out:
> + return ret;
> +}
> +
> u64 scsi_calculate_bounce_limit(struct Scsi_Host *shost)
> {
> struct device *host_dev;
> @@ -1754,6 +1885,99 @@ struct request_queue *scsi_alloc_queue(struct scsi_device *sdev)
> return q;
> }
>
> +static struct blk_mq_ops scsi_mq_ops = {
> + .queue_rq = scsi_mq_queue_rq,
> + .map_queue = blk_mq_map_queue,
> + .alloc_hctx = blk_mq_alloc_single_hw_queue,
> + .free_hctx = blk_mq_free_single_hw_queue,
> +};
> +
> +struct request_queue *scsi_mq_alloc_queue(struct scsi_device *sdev)
> +{
> + struct Scsi_Host *shost = sdev->host;
> + struct blk_mq_hw_ctx *hctx;
> + struct request_queue *q;
> + struct request *rq;
> + struct scsi_cmnd *cmd;
> + struct blk_mq_reg reg;
> + int i, j, sgl_size;
> +
> + memset(&reg, 0, sizeof(reg));
> + reg.ops = &scsi_mq_ops;
> + reg.queue_depth = shost->cmd_per_lun;
> + if (!reg.queue_depth)
> + reg.queue_depth = 1;
> +
> + /* XXX: what to do about chained S/G lists? */
> + if (shost->hostt->sg_tablesize > SCSI_MAX_SG_SEGMENTS)
> + shost->sg_tablesize = SCSI_MAX_SG_SEGMENTS;
> + sgl_size = shost->sg_tablesize * sizeof(struct scatterlist);
> +
> + reg.cmd_size = sizeof(struct scsi_cmnd) +
> + sgl_size +
> + shost->hostt->cmd_size;
> + if (scsi_host_get_prot(shost))
> + reg.cmd_size += sizeof(struct scsi_data_buffer) + sgl_size;
> + reg.numa_node = NUMA_NO_NODE;
> + reg.nr_hw_queues = 1;
Hey Christoph,
I just started to look at mq on Nic's WIP branch. I have a pretty basic
question.
Both you and Nic offer a single HW queue per sdev.
I'm wondering if that should be the LLD's decision (if it chooses to use
multiple queues)?
Trying to understand how LLDs will fit in a way they exploit multi-queue
and actually
maintain multiple queues. SRP/iSER for example maintain a single queue
per connection
(or session in iSCSI). Now with multi-queue all requests of that shost
will eventually
boil-down to posting on a single queue which might transition the
bottleneck to the LLDs.
I noticed virtio_scsi implementation is choosing a queue per command
based on current
processor id without any explicit mapping (unless I missed it).
I guess my question is where do (or should) LLDs plug-in to this mq scheme?
Thanks,
Sagi.
> + reg.flags = BLK_MQ_F_SHOULD_MERGE;
> +
> + q = blk_mq_init_queue(&reg, sdev);
> + if (IS_ERR(q)) {
> + printk("blk_mq_init_queue failed\n");
> + return NULL;
> + }
> +
> + blk_queue_prep_rq(q, scsi_prep_fn);
> + sdev->request_queue = q;
> + q->queuedata = sdev;
> +
> + __scsi_init_queue(shost, q);
> +
> + /*
> + * XXX: figure out if we can get alignment right to allocate the sense
> + * buffer with the other chunks of memory.
> + *
> + * If not we'll need to find a way to have the blk-mq core call us to
> + * allocate/free commands so that we can properly clean up the
> + * allocation instead of leaking it.
> + */
> + queue_for_each_hw_ctx(q, hctx, i) {
> + for (j = 0; j < hctx->queue_depth; j++) {
> + rq = hctx->rqs[j];
> + cmd = rq->special;
> +
> + cmd->sense_buffer = kzalloc_node(SCSI_SENSE_BUFFERSIZE,
> + GFP_KERNEL, reg.numa_node);
> + if (!cmd->sense_buffer)
> + goto out_free_sense_buffers;
> + }
> + }
> +
> + rq = q->flush_rq;
> + cmd = blk_mq_rq_to_pdu(rq);
> +
> + cmd->sense_buffer = kzalloc_node(SCSI_SENSE_BUFFERSIZE,
> + GFP_KERNEL, reg.numa_node);
> + if (!cmd->sense_buffer)
> + goto out_free_sense_buffers;
> +
> + return q;
> +
> +out_free_sense_buffers:
> + queue_for_each_hw_ctx(q, hctx, i) {
> + for (j = 0; j < hctx->queue_depth; j++) {
> + rq = hctx->rqs[j];
> + cmd = rq->special;
> +
> + kfree(cmd->sense_buffer);
> + }
> + }
> +
> + blk_cleanup_queue(q);
> + return NULL;
> +}
> +
> /*
> * Function: scsi_block_requests()
> *
> diff --git a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h
> index f079a59..712cec2 100644
> --- a/drivers/scsi/scsi_priv.h
> +++ b/drivers/scsi/scsi_priv.h
> @@ -88,8 +88,10 @@ extern void scsi_next_command(struct scsi_cmnd *cmd);
> extern void scsi_io_completion(struct scsi_cmnd *, unsigned int);
> extern void scsi_run_host_queues(struct Scsi_Host *shost);
> extern struct request_queue *scsi_alloc_queue(struct scsi_device *sdev);
> +extern struct request_queue *scsi_mq_alloc_queue(struct scsi_device *sdev);
> extern int scsi_init_queue(void);
> extern void scsi_exit_queue(void);
> +extern void scsi_softirq_done(struct request *rq);
> struct request_queue;
> struct request;
> extern struct kmem_cache *scsi_sdb_cache;
> diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
> index 307a811..c807bc2 100644
> --- a/drivers/scsi/scsi_scan.c
> +++ b/drivers/scsi/scsi_scan.c
> @@ -277,7 +277,10 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget,
> */
> sdev->borken = 1;
>
> - sdev->request_queue = scsi_alloc_queue(sdev);
> + if (shost->hostt->use_blk_mq)
> + sdev->request_queue = scsi_mq_alloc_queue(sdev);
> + else
> + sdev->request_queue = scsi_alloc_queue(sdev);
> if (!sdev->request_queue) {
> /* release fn is set up in scsi_sysfs_device_initialise, so
> * have to free and put manually here */
> diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
> index c4e4875..d2661cb 100644
> --- a/include/scsi/scsi_host.h
> +++ b/include/scsi/scsi_host.h
> @@ -531,6 +531,9 @@ struct scsi_host_template {
> */
> unsigned int cmd_size;
> struct scsi_host_cmd_pool *cmd_pool;
> +
> + /* temporary flag to use blk-mq I/O path */
> + bool use_blk_mq;
> };
>
> /*
^ permalink raw reply [flat|nested] 31+ messages in thread* Re: [PATCH 12/15] scsi: initial blk-mq support
2014-02-06 8:38 ` Sagi Grimberg
@ 2014-02-06 16:16 ` Christoph Hellwig
0 siblings, 0 replies; 31+ messages in thread
From: Christoph Hellwig @ 2014-02-06 16:16 UTC (permalink / raw)
To: Sagi Grimberg; +Cc: Jens Axboe, James Bottomley, Nicholas Bellinger, linux-scsi
On Thu, Feb 06, 2014 at 10:38:17AM +0200, Sagi Grimberg wrote:
> Both you and Nic offer a single HW queue per sdev.
> I'm wondering if that should be the LLD's decision (if it chooses to
> use multiple queues)?
>
> Trying to understand how LLDs will fit in a way they exploit
> multi-queue and actually
> maintain multiple queues. SRP/iSER for example maintain a single
> queue per connection
> (or session in iSCSI). Now with multi-queue all requests of that
> shost will eventually
> boil-down to posting on a single queue which might transition the
> bottleneck to the LLDs.
>
> I noticed virtio_scsi implementation is choosing a queue per command
> based on current
> processor id without any explicit mapping (unless I missed it).
>
> I guess my question is where do (or should) LLDs plug-in to this mq scheme?
Just using blk-mq helps with lock contention and cacheline issues, while
being conceptually simple, that's why it's the priority. See the
proposal I sent before the patch series for more details.
That being said if you have simple enough patches for real multiqueue
support I'd be more than happy to carry them along.
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: [PATCH 12/15] scsi: initial blk-mq support
2014-02-05 12:41 ` [PATCH 12/15] scsi: initial blk-mq support Christoph Hellwig
2014-02-06 8:38 ` Sagi Grimberg
@ 2014-02-06 22:11 ` Nicholas A. Bellinger
2014-02-07 8:45 ` Mike Christie
2014-02-07 12:51 ` Christoph Hellwig
1 sibling, 2 replies; 31+ messages in thread
From: Nicholas A. Bellinger @ 2014-02-06 22:11 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Jens Axboe, James Bottomley, linux-scsi
On Wed, 2014-02-05 at 04:41 -0800, Christoph Hellwig wrote:
> plain text document attachment
> (0012-scsi-initial-blk-mq-support.patch)
> Add support for using the blk-mq code to submit requests to SCSI
> drivers. There is very little blk-mq specific code, but that's
> partially because important functionality like partial completions
> and request requeueing is still missing in blk-mq. I hope to keep
> most of the additions for these in the blk-mq core instead of the
> SCSI layer, though.
>
> Based on the earlier scsi-mq prototype by Nicholas Bellinger, although
> not a whole lot of actual code is left.
>
> Not-quite-signed-off-yet-by: Christoph Hellwig <hch@lst.de>
> ---
> drivers/scsi/scsi.c | 36 ++++++-
> drivers/scsi/scsi_lib.c | 244 ++++++++++++++++++++++++++++++++++++++++++++--
> drivers/scsi/scsi_priv.h | 2 +
> drivers/scsi/scsi_scan.c | 5 +-
> include/scsi/scsi_host.h | 3 +
> 5 files changed, 278 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
> index adb8bfb..cf5c110 100644
> --- a/drivers/scsi/scsi.c
> +++ b/drivers/scsi/scsi.c
> @@ -44,6 +44,7 @@
> #include <linux/string.h>
> #include <linux/slab.h>
> #include <linux/blkdev.h>
> +#include <linux/blk-mq.h>
> #include <linux/delay.h>
> #include <linux/init.h>
> #include <linux/completion.h>
> @@ -688,6 +689,33 @@ int scsi_dispatch_cmd(struct scsi_cmnd *cmd)
> return 0;
> }
>
> +static void scsi_softirq_done_remote(void *data)
> +{
> + return scsi_softirq_done(data);
> +}
> +
> +static void scsi_mq_done(struct request *req)
> +{
> + int cpu;
> +
> +#if 0
> + if (!ctx->ipi_redirect)
> + return scsi_softirq_done(cmd);
> +#endif
> +
> + cpu = get_cpu();
> + if (cpu != req->cpu && cpu_online(req->cpu)) {
> + req->csd.func = scsi_softirq_done_remote;
> + req->csd.info = req;
> + req->csd.flags = 0;
> + __smp_call_function_single(req->cpu, &req->csd, 0);
> + } else {
> + scsi_softirq_done(req);
> + }
> +
> + put_cpu();
> +}
> +
> /**
> * scsi_done - Invoke completion on finished SCSI command.
> * @cmd: The SCSI Command for which a low-level device driver (LLDD) gives
> @@ -701,8 +729,14 @@ int scsi_dispatch_cmd(struct scsi_cmnd *cmd)
> */
> static void scsi_done(struct scsi_cmnd *cmd)
> {
> + struct request *req = cmd->request;
> +
> trace_scsi_dispatch_cmd_done(cmd);
> - blk_complete_request(cmd->request);
> +
> + if (req->mq_ctx)
> + scsi_mq_done(req);
> + else
> + blk_complete_request(req);
> }
>
Is there extra scsi_mq_done() part that does IPI here even necessary
anymore..?
I was under the assumption that blk_mq_end_io() is already taking care
of this..?
> /**
> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
> index e67950c..8dd8893 100644
> --- a/drivers/scsi/scsi_lib.c
> +++ b/drivers/scsi/scsi_lib.c
> @@ -20,6 +20,7 @@
> #include <linux/delay.h>
> #include <linux/hardirq.h>
> #include <linux/scatterlist.h>
> +#include <linux/blk-mq.h>
>
> #include <scsi/scsi.h>
> #include <scsi/scsi_cmnd.h>
> @@ -554,6 +555,15 @@ static bool scsi_end_request(struct scsi_cmnd *cmd, int error, int bytes,
> struct request *req = cmd->request;
>
> /*
> + * XXX: need to handle partial completions and retries here.
> + */
> + if (req->mq_ctx) {
> + blk_mq_end_io(req, error);
> + put_device(&cmd->device->sdev_gendev);
> + return true;
> + }
> +
> + /*
> * If there are blocks left over at the end, set up the command
> * to queue the remainder of them.
> */
> @@ -1014,12 +1024,15 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb,
> {
> int count;
>
> - /*
> - * If sg table allocation fails, requeue request later.
> - */
> - if (unlikely(scsi_alloc_sgtable(sdb, req->nr_phys_segments,
> - gfp_mask))) {
> - return BLKPREP_DEFER;
> + BUG_ON(req->nr_phys_segments > SCSI_MAX_SG_SEGMENTS);
> +
> + if (!req->mq_ctx) {
> + /*
> + * If sg table allocation fails, requeue request later.
> + */
> + if (unlikely(scsi_alloc_sgtable(sdb, req->nr_phys_segments,
> + gfp_mask)))
> + return BLKPREP_DEFER;
> }
>
> req->buffer = NULL;
> @@ -1075,9 +1088,11 @@ int scsi_init_io(struct scsi_cmnd *cmd, gfp_t gfp_mask)
> BUG_ON(prot_sdb == NULL);
> ivecs = blk_rq_count_integrity_sg(rq->q, rq->bio);
>
> - if (scsi_alloc_sgtable(prot_sdb, ivecs, gfp_mask)) {
> - error = BLKPREP_DEFER;
> - goto err_exit;
> + if (!rq->mq_ctx) {
> + if (scsi_alloc_sgtable(prot_sdb, ivecs, gfp_mask)) {
> + error = BLKPREP_DEFER;
> + goto err_exit;
> + }
> }
>
> count = blk_rq_map_integrity_sg(rq->q, rq->bio,
> @@ -1505,7 +1520,7 @@ static void scsi_kill_request(struct request *req, struct request_queue *q)
> blk_complete_request(req);
> }
>
> -static void scsi_softirq_done(struct request *rq)
> +void scsi_softirq_done(struct request *rq)
> {
> struct scsi_cmnd *cmd = rq->special;
> unsigned long wait_for = (cmd->allowed + 1) * rq->timeout;
> @@ -1533,9 +1548,11 @@ static void scsi_softirq_done(struct request *rq)
> scsi_finish_command(cmd);
> break;
> case NEEDS_RETRY:
> + WARN_ON(rq->mq_ctx);
> scsi_queue_insert(cmd, SCSI_MLQUEUE_EH_RETRY);
> break;
> case ADD_TO_MLQUEUE:
> + WARN_ON(rq->mq_ctx);
> scsi_queue_insert(cmd, SCSI_MLQUEUE_DEVICE_BUSY);
> break;
> default:
> @@ -1668,6 +1685,120 @@ out_delay:
> blk_delay_queue(q, SCSI_QUEUE_DELAY);
> }
>
> +static int scsi_mq_prep_fn(struct request *req)
> +{
> + struct scsi_cmnd *cmd = req->special;
> + int ret;
> +
> + ret = scsi_prep_state_check(cmd->device, req);
> + if (ret != BLKPREP_OK)
> + goto out;
> +
> + if (req->cmd_type == REQ_TYPE_FS)
> + ret = scsi_cmd_to_driver(cmd)->init_command(cmd);
> + else if (req->cmd_type == REQ_TYPE_BLOCK_PC)
> + ret = scsi_setup_blk_pc_cmnd(cmd->device, req);
> + else
> + ret = BLKPREP_KILL;
> +
> +out:
> + switch (ret) {
> + case BLKPREP_OK:
> + return 0;
> + case BLKPREP_DEFER:
> + return BLK_MQ_RQ_QUEUE_BUSY;
> + default:
> + req->errors = DID_NO_CONNECT << 16;
> + return BLK_MQ_RQ_QUEUE_ERROR;
> + }
> +}
> +
> +static int scsi_mq_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
> +{
> + struct request_queue *q = rq->q;
> + struct scsi_device *sdev = q->queuedata;
> + struct Scsi_Host *shost = sdev->host;
> + struct scsi_cmnd *cmd = rq->special;
> + unsigned char *sense_buf = cmd->sense_buffer;
> + struct scatterlist *sg;
> + int ret = BLK_MQ_RQ_QUEUE_BUSY;
> + int reason;
> +
> + /*
> + * blk-mq stores this in the mq_ctx, which can't be dereferenced by
> + * drivers. For now use the old per-request field, but there must be
> + * a better way.
> + */
> + rq->cpu = raw_smp_processor_id();
> +
> + if (!get_device(&sdev->sdev_gendev))
> + goto out;
> +
> + if (!scsi_dev_queue_ready(q, sdev))
> + goto out_put_device;
> + if (!scsi_target_queue_ready(shost, sdev))
> + goto out_dec_device_busy;
> + if (!scsi_host_queue_ready(q, shost, sdev))
> + goto out_dec_target_busy;
> +
> + memset(sense_buf, 0, SCSI_SENSE_BUFFERSIZE);
> + memset(cmd, 0, sizeof(struct scsi_cmnd));
> +
> + cmd->request = rq;
> + cmd->device = sdev;
> + cmd->sense_buffer = sense_buf;
> +
> + cmd->tag = rq->tag;
> + cmd->cmnd = rq->cmd;
> + cmd->prot_op = SCSI_PROT_NORMAL;
> +
> + sg = (void *)cmd + sizeof(struct scsi_cmnd) + shost->hostt->cmd_size;
> +
> + if (rq->nr_phys_segments) {
> + cmd->sdb.table.sgl = sg;
> + cmd->sdb.table.nents = rq->nr_phys_segments;
> + sg_init_table(cmd->sdb.table.sgl, rq->nr_phys_segments);
> + }
> +
> + if (scsi_host_get_prot(shost)) {
> + cmd->prot_sdb = (void *)sg +
> + shost->sg_tablesize * sizeof(struct scatterlist);
> + memset(cmd->prot_sdb, 0, sizeof(struct scsi_data_buffer));
> +
> + cmd->prot_sdb->table.sgl =
> + (struct scatterlist *)(cmd->prot_sdb + 1);
> + }
> +
> + ret = scsi_mq_prep_fn(rq);
> + if (ret)
> + goto out_dec_host_busy;
> +
> + scsi_init_cmd_errh(cmd);
> +
> + reason = scsi_dispatch_cmd(cmd);
> + if (reason) {
> + scsi_set_blocked(cmd, reason);
> + goto out_uninit;
> + }
> +
> + return BLK_MQ_RQ_QUEUE_OK;
> +
> +out_uninit:
> + if (rq->cmd_type == REQ_TYPE_FS)
> + scsi_cmd_to_driver(cmd)->uninit_command(cmd);
> +out_dec_host_busy:
> + atomic_dec(&shost->host_busy);
> +out_dec_target_busy:
> + atomic_dec(&scsi_target(sdev)->target_busy);
> +out_dec_device_busy:
> + atomic_dec(&sdev->device_busy);
> + /* XXX: delay queue if device_busy == 0 */
> +out_put_device:
> + put_device(&sdev->sdev_gendev);
> +out:
> + return ret;
> +}
> +
> u64 scsi_calculate_bounce_limit(struct Scsi_Host *shost)
> {
> struct device *host_dev;
> @@ -1754,6 +1885,99 @@ struct request_queue *scsi_alloc_queue(struct scsi_device *sdev)
> return q;
> }
>
> +static struct blk_mq_ops scsi_mq_ops = {
> + .queue_rq = scsi_mq_queue_rq,
> + .map_queue = blk_mq_map_queue,
> + .alloc_hctx = blk_mq_alloc_single_hw_queue,
> + .free_hctx = blk_mq_free_single_hw_queue,
> +};
> +
> +struct request_queue *scsi_mq_alloc_queue(struct scsi_device *sdev)
> +{
> + struct Scsi_Host *shost = sdev->host;
> + struct blk_mq_hw_ctx *hctx;
> + struct request_queue *q;
> + struct request *rq;
> + struct scsi_cmnd *cmd;
> + struct blk_mq_reg reg;
> + int i, j, sgl_size;
> +
> + memset(&reg, 0, sizeof(reg));
> + reg.ops = &scsi_mq_ops;
> + reg.queue_depth = shost->cmd_per_lun;
> + if (!reg.queue_depth)
> + reg.queue_depth = 1;
> +
> + /* XXX: what to do about chained S/G lists? */
> + if (shost->hostt->sg_tablesize > SCSI_MAX_SG_SEGMENTS)
> + shost->sg_tablesize = SCSI_MAX_SG_SEGMENTS;
> + sgl_size = shost->sg_tablesize * sizeof(struct scatterlist);
> +
> + reg.cmd_size = sizeof(struct scsi_cmnd) +
> + sgl_size +
> + shost->hostt->cmd_size;
> + if (scsi_host_get_prot(shost))
> + reg.cmd_size += sizeof(struct scsi_data_buffer) + sgl_size;
OK, so your in-lining the allocation of data + protection SGLs from
blk-mq..
The original prototype code was doing these allocations separately below
for each pre-allocated cmd, and offering LLD's to optionally
pre-allocate their own descriptors using sh->hostt->cmd_size if
necessary..
This was necessary to eliminate all fast-path allocations for
virtio-scsi, and I'd like to see something similar here as an optional
feature as well.
--nab
> + reg.numa_node = NUMA_NO_NODE;
> + reg.nr_hw_queues = 1;
> + reg.flags = BLK_MQ_F_SHOULD_MERGE;
> +
> + q = blk_mq_init_queue(&reg, sdev);
> + if (IS_ERR(q)) {
> + printk("blk_mq_init_queue failed\n");
> + return NULL;
> + }
> +
> + blk_queue_prep_rq(q, scsi_prep_fn);
> + sdev->request_queue = q;
> + q->queuedata = sdev;
> +
> + __scsi_init_queue(shost, q);
> +
> + /*
> + * XXX: figure out if we can get alignment right to allocate the sense
> + * buffer with the other chunks of memory.
> + *
> + * If not we'll need to find a way to have the blk-mq core call us to
> + * allocate/free commands so that we can properly clean up the
> + * allocation instead of leaking it.
> + */
> + queue_for_each_hw_ctx(q, hctx, i) {
> + for (j = 0; j < hctx->queue_depth; j++) {
> + rq = hctx->rqs[j];
> + cmd = rq->special;
> +
> + cmd->sense_buffer = kzalloc_node(SCSI_SENSE_BUFFERSIZE,
> + GFP_KERNEL, reg.numa_node);
> + if (!cmd->sense_buffer)
> + goto out_free_sense_buffers;
> + }
> + }
> +
> + rq = q->flush_rq;
> + cmd = blk_mq_rq_to_pdu(rq);
> +
> + cmd->sense_buffer = kzalloc_node(SCSI_SENSE_BUFFERSIZE,
> + GFP_KERNEL, reg.numa_node);
> + if (!cmd->sense_buffer)
> + goto out_free_sense_buffers;
> +
> + return q;
> +
> +out_free_sense_buffers:
> + queue_for_each_hw_ctx(q, hctx, i) {
> + for (j = 0; j < hctx->queue_depth; j++) {
> + rq = hctx->rqs[j];
> + cmd = rq->special;
> +
> + kfree(cmd->sense_buffer);
> + }
> + }
> +
> + blk_cleanup_queue(q);
> + return NULL;
> +}
> +
> /*
> * Function: scsi_block_requests()
> *
> diff --git a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h
> index f079a59..712cec2 100644
> --- a/drivers/scsi/scsi_priv.h
> +++ b/drivers/scsi/scsi_priv.h
> @@ -88,8 +88,10 @@ extern void scsi_next_command(struct scsi_cmnd *cmd);
> extern void scsi_io_completion(struct scsi_cmnd *, unsigned int);
> extern void scsi_run_host_queues(struct Scsi_Host *shost);
> extern struct request_queue *scsi_alloc_queue(struct scsi_device *sdev);
> +extern struct request_queue *scsi_mq_alloc_queue(struct scsi_device *sdev);
> extern int scsi_init_queue(void);
> extern void scsi_exit_queue(void);
> +extern void scsi_softirq_done(struct request *rq);
> struct request_queue;
> struct request;
> extern struct kmem_cache *scsi_sdb_cache;
> diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
> index 307a811..c807bc2 100644
> --- a/drivers/scsi/scsi_scan.c
> +++ b/drivers/scsi/scsi_scan.c
> @@ -277,7 +277,10 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget,
> */
> sdev->borken = 1;
>
> - sdev->request_queue = scsi_alloc_queue(sdev);
> + if (shost->hostt->use_blk_mq)
> + sdev->request_queue = scsi_mq_alloc_queue(sdev);
> + else
> + sdev->request_queue = scsi_alloc_queue(sdev);
> if (!sdev->request_queue) {
> /* release fn is set up in scsi_sysfs_device_initialise, so
> * have to free and put manually here */
> diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
> index c4e4875..d2661cb 100644
> --- a/include/scsi/scsi_host.h
> +++ b/include/scsi/scsi_host.h
> @@ -531,6 +531,9 @@ struct scsi_host_template {
> */
> unsigned int cmd_size;
> struct scsi_host_cmd_pool *cmd_pool;
> +
> + /* temporary flag to use blk-mq I/O path */
> + bool use_blk_mq;
> };
>
> /*
^ permalink raw reply [flat|nested] 31+ messages in thread* Re: [PATCH 12/15] scsi: initial blk-mq support
2014-02-06 22:11 ` Nicholas A. Bellinger
@ 2014-02-07 8:45 ` Mike Christie
2014-02-07 12:42 ` Christoph Hellwig
2014-02-07 12:51 ` Christoph Hellwig
1 sibling, 1 reply; 31+ messages in thread
From: Mike Christie @ 2014-02-07 8:45 UTC (permalink / raw)
To: Nicholas A. Bellinger
Cc: Christoph Hellwig, Jens Axboe, James Bottomley, linux-scsi
[-- Attachment #1: Type: text/plain, Size: 2072 bytes --]
On 02/06/2014 04:11 PM, Nicholas A. Bellinger wrote:
>> +struct request_queue *scsi_mq_alloc_queue(struct scsi_device *sdev)
>> > +{
>> > + struct Scsi_Host *shost = sdev->host;
>> > + struct blk_mq_hw_ctx *hctx;
>> > + struct request_queue *q;
>> > + struct request *rq;
>> > + struct scsi_cmnd *cmd;
>> > + struct blk_mq_reg reg;
>> > + int i, j, sgl_size;
>> > +
>> > + memset(®, 0, sizeof(reg));
>> > + reg.ops = &scsi_mq_ops;
>> > + reg.queue_depth = shost->cmd_per_lun;
>> > + if (!reg.queue_depth)
>> > + reg.queue_depth = 1;
>> > +
>> > + /* XXX: what to do about chained S/G lists? */
>> > + if (shost->hostt->sg_tablesize > SCSI_MAX_SG_SEGMENTS)
>> > + shost->sg_tablesize = SCSI_MAX_SG_SEGMENTS;
>> > + sgl_size = shost->sg_tablesize * sizeof(struct scatterlist);
>> > +
>> > + reg.cmd_size = sizeof(struct scsi_cmnd) +
>> > + sgl_size +
>> > + shost->hostt->cmd_size;
>> > + if (scsi_host_get_prot(shost))
>> > + reg.cmd_size += sizeof(struct scsi_data_buffer) + sgl_size;
> OK, so you're in-lining the allocation of data + protection SGLs from
> blk-mq..
>
> The original prototype code was doing these allocations separately below
> for each pre-allocated cmd, and offering LLD's to optionally
> pre-allocate their own descriptors using sh->hostt->cmd_size if
> necessary..
>
> This was necessary to eliminate all fast-path allocations for
> virtio-scsi, and I'd like to see something similar here as an optional
> feature as well.
Yeah, it would be nice if like in Nick's patches, the driver could just
set the scsi_host_template->cmd_size then when the scsi_cmnd got to the
driver's queuecommand, the driver could just get its internal cmd struct
from the scsi_cmnd struct (for example in Nick's patch it was off the
SCp.ptr).
I started converting my iscsi mq patch from Nick's code to Christoph's
and am currently trying to figure out how to setup the
scsi_host_template->cmd_pool.
Current iscsi patch is attached if anyone cares.
However, one question I had with both approaches is how to deal with per
cmd pci/dma memory and preallocations.
[-- Attachment #2: iscsi-mq-cmd-size.patch --]
[-- Type: text/plain, Size: 28244 bytes --]
diff --git a/drivers/scsi/be2iscsi/be_main.c b/drivers/scsi/be2iscsi/be_main.c
index bc77a6f..1d0857c 100644
--- a/drivers/scsi/be2iscsi/be_main.c
+++ b/drivers/scsi/be2iscsi/be_main.c
@@ -320,10 +320,10 @@ static int beiscsi_eh_device_reset(struct scsi_cmnd *sc)
memset(inv_tbl, 0x0, sizeof(*inv_tbl) * BE2_CMDS_PER_CXN);
num_invalidate = 0;
for (i = 0; i < conn->session->cmds_max; i++) {
- abrt_task = conn->session->cmds[i];
- abrt_io_task = abrt_task->dd_data;
- if (!abrt_task->sc || abrt_task->state == ISCSI_TASK_FREE)
+ abrt_task = conn->session->task_map[i];
+ if (!abrt_task || abrt_task->state != ISCSI_TASK_RUNNING)
continue;
+ abrt_io_task = abrt_task->dd_data;
if (sc->device->lun != abrt_task->sc->device->lun)
continue;
diff --git a/drivers/scsi/bnx2i/bnx2i_iscsi.c b/drivers/scsi/bnx2i/bnx2i_iscsi.c
index c00642f..34289b4 100644
--- a/drivers/scsi/bnx2i/bnx2i_iscsi.c
+++ b/drivers/scsi/bnx2i/bnx2i_iscsi.c
@@ -447,7 +447,7 @@ static int bnx2i_alloc_bdt(struct bnx2i_hba *hba, struct iscsi_session *session,
io->bd_tbl = dma_alloc_coherent(&hba->pcidev->dev,
ISCSI_MAX_BDS_PER_CMD * sizeof(*bd),
- &io->bd_tbl_dma, GFP_KERNEL);
+ &io->bd_tbl_dma, GFP_ATOMIC);
if (!io->bd_tbl) {
iscsi_session_printk(KERN_ERR, session, "Could not "
"allocate bdt.\n");
@@ -458,61 +458,6 @@ static int bnx2i_alloc_bdt(struct bnx2i_hba *hba, struct iscsi_session *session,
}
/**
- * bnx2i_destroy_cmd_pool - destroys iscsi command pool and release BD table
- * @hba: adapter instance pointer
- * @session: iscsi session pointer
- * @cmd: iscsi command structure
- */
-static void bnx2i_destroy_cmd_pool(struct bnx2i_hba *hba,
- struct iscsi_session *session)
-{
- int i;
-
- for (i = 0; i < session->cmds_max; i++) {
- struct iscsi_task *task = session->cmds[i];
- struct bnx2i_cmd *cmd = task->dd_data;
-
- if (cmd->io_tbl.bd_tbl)
- dma_free_coherent(&hba->pcidev->dev,
- ISCSI_MAX_BDS_PER_CMD *
- sizeof(struct iscsi_bd),
- cmd->io_tbl.bd_tbl,
- cmd->io_tbl.bd_tbl_dma);
- }
-
-}
-
-
-/**
- * bnx2i_setup_cmd_pool - sets up iscsi command pool for the session
- * @hba: adapter instance pointer
- * @session: iscsi session pointer
- */
-static int bnx2i_setup_cmd_pool(struct bnx2i_hba *hba,
- struct iscsi_session *session)
-{
- int i;
-
- for (i = 0; i < session->cmds_max; i++) {
- struct iscsi_task *task = session->cmds[i];
- struct bnx2i_cmd *cmd = task->dd_data;
-
- task->hdr = &cmd->hdr;
- task->hdr_max = sizeof(struct iscsi_hdr);
-
- if (bnx2i_alloc_bdt(hba, session, cmd))
- goto free_bdts;
- }
-
- return 0;
-
-free_bdts:
- bnx2i_destroy_cmd_pool(hba, session);
- return -ENOMEM;
-}
-
-
-/**
* bnx2i_setup_mp_bdt - allocate BD table resources
* @hba: pointer to adapter structure
*
@@ -1157,7 +1102,14 @@ static void bnx2i_cleanup_task(struct iscsi_task *task)
struct iscsi_conn *conn = task->conn;
struct bnx2i_conn *bnx2i_conn = conn->dd_data;
struct bnx2i_hba *hba = bnx2i_conn->hba;
+ struct bnx2i_cmd *cmd = task->dd_data;
+ if (cmd->io_tbl.bd_tbl)
+ dma_free_coherent(&hba->pcidev->dev,
+ ISCSI_MAX_BDS_PER_CMD *
+ sizeof(struct iscsi_bd),
+ cmd->io_tbl.bd_tbl,
+ cmd->io_tbl.bd_tbl_dma);
/*
* mgmt task or cmd was never sent to us to transmit.
*/
@@ -1178,6 +1130,29 @@ static void bnx2i_cleanup_task(struct iscsi_task *task)
}
/**
+ * bnx2i_alloc_pdu - setup task/pdu and its bdt
+ * @task: transport layer command structure pointer
+ * @opcode: iscsi opcode for task/pdu to setup for
+ *
+ * The bdt will be freed in bnx2i_cleanup_task.
+ */
+static int bnx2i_alloc_pdu(struct iscsi_task *task, uint8_t opcode)
+{
+ struct bnx2i_cmd *cmd = task->dd_data;
+ struct iscsi_conn *conn = task->conn;
+ struct bnx2i_conn *bnx2i_conn = conn->dd_data;
+ struct bnx2i_hba *hba = bnx2i_conn->hba;
+
+ task->hdr = &cmd->hdr;
+ task->hdr_max = sizeof(struct iscsi_hdr);
+
+ if (bnx2i_alloc_bdt(hba, conn->session, cmd))
+ return -ENOMEM;
+
+ return 0;
+}
+
+/**
* bnx2i_mtask_xmit - transmit mtask to chip for further processing
* @conn: transport layer conn structure pointer
* @task: transport layer command structure pointer
@@ -1284,7 +1259,6 @@ bnx2i_session_create(struct iscsi_endpoint *ep,
uint32_t initial_cmdsn)
{
struct Scsi_Host *shost;
- struct iscsi_cls_session *cls_session;
struct bnx2i_hba *hba;
struct bnx2i_endpoint *bnx2i_ep;
@@ -1308,40 +1282,11 @@ bnx2i_session_create(struct iscsi_endpoint *ep,
else if (cmds_max < BNX2I_SQ_WQES_MIN)
cmds_max = BNX2I_SQ_WQES_MIN;
- cls_session = iscsi_session_setup(&bnx2i_iscsi_transport, shost,
- cmds_max, 0, sizeof(struct bnx2i_cmd),
- initial_cmdsn, ISCSI_MAX_TARGET);
- if (!cls_session)
- return NULL;
-
- if (bnx2i_setup_cmd_pool(hba, cls_session->dd_data))
- goto session_teardown;
- return cls_session;
-
-session_teardown:
- iscsi_session_teardown(cls_session);
- return NULL;
-}
-
-
-/**
- * bnx2i_session_destroy - destroys iscsi session
- * @cls_session: pointer to iscsi cls session
- *
- * Destroys previously created iSCSI session instance and releases
- * all resources held by it
- */
-static void bnx2i_session_destroy(struct iscsi_cls_session *cls_session)
-{
- struct iscsi_session *session = cls_session->dd_data;
- struct Scsi_Host *shost = iscsi_session_to_shost(cls_session);
- struct bnx2i_hba *hba = iscsi_host_priv(shost);
-
- bnx2i_destroy_cmd_pool(hba, session);
- iscsi_session_teardown(cls_session);
+ return iscsi_session_setup(&bnx2i_iscsi_transport, shost,
+ cmds_max, 0, sizeof(struct bnx2i_cmd),
+ initial_cmdsn, ISCSI_MAX_TARGET);
}
-
/**
* bnx2i_conn_create - create iscsi connection instance
* @cls_session: pointer to iscsi cls session
@@ -2273,7 +2218,7 @@ struct iscsi_transport bnx2i_iscsi_transport = {
CAP_DATA_PATH_OFFLOAD |
CAP_TEXT_NEGO,
.create_session = bnx2i_session_create,
- .destroy_session = bnx2i_session_destroy,
+ .destroy_session = iscsi_session_teardown,
.create_conn = bnx2i_conn_create,
.bind_conn = bnx2i_conn_bind,
.destroy_conn = bnx2i_conn_destroy,
@@ -2284,6 +2229,7 @@ struct iscsi_transport bnx2i_iscsi_transport = {
.get_host_param = bnx2i_host_get_param,
.start_conn = bnx2i_conn_start,
.stop_conn = iscsi_conn_stop,
+ .alloc_pdu = bnx2i_alloc_pdu,
.send_pdu = iscsi_conn_send_pdu,
.xmit_task = bnx2i_task_xmit,
.get_stats = bnx2i_conn_get_stats,
diff --git a/drivers/scsi/cxgbi/libcxgbi.c b/drivers/scsi/cxgbi/libcxgbi.c
index b44c1cf..9b64ddc 100644
--- a/drivers/scsi/cxgbi/libcxgbi.c
+++ b/drivers/scsi/cxgbi/libcxgbi.c
@@ -2306,7 +2306,6 @@ struct iscsi_cls_session *cxgbi_create_session(struct iscsi_endpoint *ep,
struct cxgbi_hba *chba;
struct Scsi_Host *shost;
struct iscsi_cls_session *cls_session;
- struct iscsi_session *session;
if (!ep) {
pr_err("missing endpoint.\n");
@@ -2327,17 +2326,9 @@ struct iscsi_cls_session *cxgbi_create_session(struct iscsi_endpoint *ep,
if (!cls_session)
return NULL;
- session = cls_session->dd_data;
- if (iscsi_tcp_r2tpool_alloc(session))
- goto remove_session;
-
log_debug(1 << CXGBI_DBG_ISCSI,
"ep 0x%p, cls sess 0x%p.\n", ep, cls_session);
return cls_session;
-
-remove_session:
- iscsi_session_teardown(cls_session);
- return NULL;
}
EXPORT_SYMBOL_GPL(cxgbi_create_session);
@@ -2346,7 +2337,6 @@ void cxgbi_destroy_session(struct iscsi_cls_session *cls_session)
log_debug(1 << CXGBI_DBG_ISCSI,
"cls sess 0x%p.\n", cls_session);
- iscsi_tcp_r2tpool_free(cls_session->dd_data);
iscsi_session_teardown(cls_session);
}
EXPORT_SYMBOL_GPL(cxgbi_destroy_session);
diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c
index ad5244d..efb8d75 100644
--- a/drivers/scsi/iscsi_tcp.c
+++ b/drivers/scsi/iscsi_tcp.c
@@ -850,12 +850,8 @@ iscsi_sw_tcp_session_create(struct iscsi_endpoint *ep, uint16_t cmds_max,
tcp_sw_host->session = session;
shost->can_queue = session->scsi_cmds_max;
- if (iscsi_tcp_r2tpool_alloc(session))
- goto remove_session;
return cls_session;
-remove_session:
- iscsi_session_teardown(cls_session);
remove_host:
iscsi_host_remove(shost);
free_host:
@@ -867,7 +863,6 @@ static void iscsi_sw_tcp_session_destroy(struct iscsi_cls_session *cls_session)
{
struct Scsi_Host *shost = iscsi_session_to_shost(cls_session);
- iscsi_tcp_r2tpool_free(cls_session->dd_data);
iscsi_session_teardown(cls_session);
iscsi_host_remove(shost);
@@ -961,6 +956,9 @@ static struct scsi_host_template iscsi_sw_tcp_sht = {
.proc_name = "iscsi_tcp",
.this_id = -1,
.use_blk_mq = true,
+ .cmd_size = sizeof(struct iscsi_task) +
+ sizeof(struct iscsi_tcp_task) +
+ sizeof(struct iscsi_sw_tcp_hdrbuf),
};
static struct iscsi_transport iscsi_sw_tcp_transport = {
diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c
index 63032d3..24b2c3f 100644
--- a/drivers/scsi/libiscsi.c
+++ b/drivers/scsi/libiscsi.c
@@ -426,6 +426,12 @@ static int iscsi_prep_scsi_cmd_pdu(struct iscsi_task *task)
r2t->data_offset = task->imm_count;
r2t->ttt = cpu_to_be32(ISCSI_RESERVED_TAG);
r2t->exp_statsn = cpu_to_be32(conn->exp_statsn);
+ /*
+ * make sure if xmit thread is handling multiple tasks
+ * it sees all these updated
+ */
+ smp_wmb();
+ r2t->sent = 0;
}
if (!task->unsol_r2t.data_length)
@@ -496,9 +502,12 @@ static void iscsi_free_task(struct iscsi_task *task)
if (conn->login_task == task)
return;
- kfifo_in(&session->cmdpool.queue, (void*)&task, sizeof(void*));
+ session->task_map[task->itt] = NULL;
+ percpu_ida_free(&session->itts, task->itt);
- if (sc) {
+ if (!sc) {
+ kfifo_in(&session->mgmt_pool.queue, (void*)&task, sizeof(void*));
+ } else {
/* SCSI eh reuses commands to verify us */
sc->SCp.ptr = NULL;
/*
@@ -595,7 +604,7 @@ EXPORT_SYMBOL_GPL(iscsi_complete_scsi_task);
/*
- * session back_lock must be held and if not called for a task that is
+ * session frwd_lock must be held and if not called for a task that is
* still pending or from the xmit thread, then xmit thread must
* be suspended.
*/
@@ -696,6 +705,7 @@ __iscsi_conn_send_pdu(struct iscsi_conn *conn, struct iscsi_hdr *hdr,
uint8_t opcode = hdr->opcode & ISCSI_OPCODE_MASK;
struct iscsi_task *task;
itt_t itt;
+ int tag;
if (session->state == ISCSI_STATE_TERMINATE)
return NULL;
@@ -721,10 +731,20 @@ __iscsi_conn_send_pdu(struct iscsi_conn *conn, struct iscsi_hdr *hdr,
BUG_ON(conn->c_stage == ISCSI_CONN_INITIAL_STAGE);
BUG_ON(conn->c_stage == ISCSI_CONN_STOPPED);
- if (!kfifo_out(&session->cmdpool.queue,
+ if (!kfifo_out(&session->mgmt_pool.queue,
(void*)&task, sizeof(void*)))
return NULL;
+
+ tag = percpu_ida_alloc(&session->itts, GFP_ATOMIC);
+ if (tag < 0) {
+ kfifo_in(&session->mgmt_pool.queue, (void*)&task,
+ sizeof(void*));
+ return NULL;
+ }
+ task->itt = tag;
+ session->task_map[tag] = task;
}
+
/*
* released in complete pdu for task we expect a response for, and
* released by the lld when it has transmitted the task for
@@ -735,6 +755,7 @@ __iscsi_conn_send_pdu(struct iscsi_conn *conn, struct iscsi_hdr *hdr,
task->sc = NULL;
INIT_LIST_HEAD(&task->running);
task->state = ISCSI_TASK_PENDING;
+ task->dd_data = &task[1];
if (data_size) {
memcpy(task->data, data, data_size);
@@ -1095,7 +1116,7 @@ struct iscsi_task *iscsi_itt_to_task(struct iscsi_conn *conn, itt_t itt)
if (i >= session->cmds_max)
return NULL;
- return session->cmds[i];
+ return session->task_map[i];
}
EXPORT_SYMBOL_GPL(iscsi_itt_to_task);
@@ -1563,19 +1584,23 @@ static void iscsi_xmitworker(struct work_struct *work)
} while (rc >= 0 || rc == -EAGAIN);
}
-static inline struct iscsi_task *iscsi_alloc_task(struct iscsi_conn *conn,
- struct scsi_cmnd *sc)
+static inline struct iscsi_task *iscsi_init_task(struct iscsi_conn *conn,
+ struct scsi_cmnd *sc)
{
- struct iscsi_task *task;
-
- if (!kfifo_out(&conn->session->cmdpool.queue,
- (void *) &task, sizeof(void *)))
- return NULL;
+ struct iscsi_session *session = conn->session;
+ struct iscsi_task *task = scsi_get_drv_cmd(sc); /*TODO - how to go from cmd to driver struct in hch's patches */
+ int tag;
- sc->SCp.phase = conn->session->age;
+ sc->SCp.phase = session->age;
sc->SCp.ptr = (char *) task;
+ tag = percpu_ida_alloc(&session->itts, GFP_ATOMIC);
+ if (tag < 0)
+ return NULL;
+
+ task->dd_data = &task[1];
atomic_set(&task->refcount, 1);
+ task->itt = tag;
task->state = ISCSI_TASK_PENDING;
task->conn = conn;
task->sc = sc;
@@ -1583,6 +1608,7 @@ static inline struct iscsi_task *iscsi_alloc_task(struct iscsi_conn *conn,
task->last_timeout = jiffies;
task->last_xfer = jiffies;
INIT_LIST_HEAD(&task->running);
+ session->task_map[tag] = task;
return task;
}
@@ -1673,7 +1699,7 @@ int iscsi_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *sc)
goto reject;
}
- task = iscsi_alloc_task(conn, sc);
+ task = iscsi_init_task(conn, sc);
if (!task) {
reason = FAILURE_OOM;
goto reject;
@@ -1824,8 +1850,8 @@ static int iscsi_exec_task_mgmt_fn(struct iscsi_conn *conn,
}
/*
- * Fail commands. session lock held and recv side suspended and xmit
- * thread flushed
+ * Fail commands. session frwd_lock must be held and recv side suspended and
+ * xmit thread flushed
*/
static void fail_scsi_tasks(struct iscsi_conn *conn, unsigned lun,
int error)
@@ -1834,8 +1860,8 @@ static void fail_scsi_tasks(struct iscsi_conn *conn, unsigned lun,
int i;
for (i = 0; i < conn->session->cmds_max; i++) {
- task = conn->session->cmds[i];
- if (!task->sc || task->state == ISCSI_TASK_FREE)
+ task = conn->session->task_map[i];
+ if (!task || !task->sc || task->state == ISCSI_TASK_FREE)
continue;
if (lun != -1 && lun != task->sc->device->lun)
@@ -1978,9 +2004,10 @@ static enum blk_eh_timer_return iscsi_eh_cmd_timed_out(struct scsi_cmnd *sc)
}
for (i = 0; i < conn->session->cmds_max; i++) {
- running_task = conn->session->cmds[i];
- if (!running_task->sc || running_task == task ||
- running_task->state != ISCSI_TASK_RUNNING)
+ running_task = conn->session->task_map[i];
+ if (!running_task || !running_task->sc ||
+ running_task == task ||
+ running_task->state != ISCSI_TASK_RUNNING)
continue;
/*
@@ -2697,7 +2724,7 @@ iscsi_session_setup(struct iscsi_transport *iscsit, struct Scsi_Host *shost,
struct iscsi_host *ihost = shost_priv(shost);
struct iscsi_session *session;
struct iscsi_cls_session *cls_session;
- int cmd_i, scsi_cmds, total_cmds = cmds_max;
+ int scsi_cmds, total_cmds = cmds_max;
unsigned long flags;
spin_lock_irqsave(&ihost->lock, flags);
@@ -2766,22 +2793,23 @@ iscsi_session_setup(struct iscsi_transport *iscsit, struct Scsi_Host *shost,
spin_lock_init(&session->frwd_lock);
spin_lock_init(&session->back_lock);
- /* initialize SCSI PDU commands pool */
- if (iscsi_pool_init(&session->cmdpool, session->cmds_max,
- (void***)&session->cmds,
- cmd_task_size + sizeof(struct iscsi_task)))
- goto cmdpool_alloc_fail;
+ /*
+ * TODO: make block layer handle this like was done for
+ * the non-mq host wide tagging.
+ */
+ if (percpu_ida_init(&session->itts, total_cmds))
+ goto itts_init;
- /* pre-format cmds pool with ITT */
- for (cmd_i = 0; cmd_i < session->cmds_max; cmd_i++) {
- struct iscsi_task *task = session->cmds[cmd_i];
+ /* initialize LOGIN/NOP/TMF PDU pool */
+ if (iscsi_pool_init(&session->mgmt_pool, ISCSI_MGMT_CMDS_MAX,
+ (void***)&session->mgmt_cmds,
+ cmd_task_size + sizeof(struct iscsi_task)))
+ goto mgmtpool_alloc_fail;
- if (cmd_task_size)
- task->dd_data = &task[1];
- task->itt = cmd_i;
- task->state = ISCSI_TASK_FREE;
- INIT_LIST_HEAD(&task->running);
- }
+ session->task_map = kzalloc(sizeof(struct iscsi_task *) *
+ session->cmds_max, GFP_KERNEL);
+ if (!session->task_map)
+ goto task_map_alloc_fail;
if (!try_module_get(iscsit->owner))
goto module_get_fail;
@@ -2794,8 +2822,12 @@ iscsi_session_setup(struct iscsi_transport *iscsit, struct Scsi_Host *shost,
cls_session_fail:
module_put(iscsit->owner);
module_get_fail:
- iscsi_pool_free(&session->cmdpool);
-cmdpool_alloc_fail:
+ kfree(session->task_map);
+task_map_alloc_fail:
+ iscsi_pool_free(&session->mgmt_pool);
+mgmtpool_alloc_fail:
+ percpu_ida_destroy(&session->itts);
+itts_init:
iscsi_free_session(cls_session);
dec_session_count:
iscsi_host_dec_session_cnt(shost);
@@ -2816,7 +2848,7 @@ void iscsi_session_teardown(struct iscsi_cls_session *cls_session)
struct module *owner = cls_session->transport->owner;
struct Scsi_Host *shost = session->host;
- iscsi_pool_free(&session->cmdpool);
+ iscsi_pool_free(&session->mgmt_pool);
kfree(session->password);
kfree(session->password_in);
@@ -2831,6 +2863,9 @@ void iscsi_session_teardown(struct iscsi_cls_session *cls_session)
kfree(session->ifacename);
kfree(session->portal_type);
kfree(session->discovery_parent_type);
+ kfree(session->task_map);
+
+ percpu_ida_destroy(&session->itts);
iscsi_destroy_session(cls_session);
iscsi_host_dec_session_cnt(shost);
@@ -2852,6 +2887,7 @@ iscsi_conn_setup(struct iscsi_cls_session *cls_session, int dd_size,
struct iscsi_conn *conn;
struct iscsi_cls_conn *cls_conn;
char *data;
+ int tag;
cls_conn = iscsi_create_conn(cls_session, sizeof(*conn) + dd_size,
conn_idx);
@@ -2879,7 +2915,7 @@ iscsi_conn_setup(struct iscsi_cls_session *cls_session, int dd_size,
/* allocate login_task used for the login/text sequences */
spin_lock_bh(&session->frwd_lock);
- if (!kfifo_out(&session->cmdpool.queue,
+ if (!kfifo_out(&session->mgmt_pool.queue,
(void*)&conn->login_task,
sizeof(void*))) {
spin_unlock_bh(&session->frwd_lock);
@@ -2893,13 +2929,22 @@ iscsi_conn_setup(struct iscsi_cls_session *cls_session, int dd_size,
goto login_task_data_alloc_fail;
conn->login_task->data = conn->data = data;
+ tag = percpu_ida_alloc(&session->itts, GFP_KERNEL);
+ if (tag < 0)
+ goto login_itt_fail;
+
+ conn->login_task->itt = tag;
+printk(KERN_ERR "iscsi login task !!!!!!!!!!!!!!!!!! %d\n", conn->login_task->itt);
+ session->task_map[tag] = conn->login_task;
init_timer(&conn->tmf_timer);
init_waitqueue_head(&conn->ehwait);
return cls_conn;
+login_itt_fail:
+ kfree(data);
login_task_data_alloc_fail:
- kfifo_in(&session->cmdpool.queue, (void*)&conn->login_task,
+ kfifo_in(&session->mgmt_pool.queue, (void*)&conn->login_task,
sizeof(void*));
login_task_alloc_fail:
iscsi_destroy_conn(cls_conn);
@@ -2965,7 +3010,8 @@ void iscsi_conn_teardown(struct iscsi_cls_conn *cls_conn)
kfree(conn->local_ipaddr);
/* regular RX path uses back_lock */
spin_lock_bh(&session->back_lock);
- kfifo_in(&session->cmdpool.queue, (void*)&conn->login_task,
+ percpu_ida_free(&session->itts, conn->login_task->itt);
+ kfifo_in(&session->mgmt_pool.queue, (void*)&conn->login_task,
sizeof(void*));
spin_unlock_bh(&session->back_lock);
if (session->leadconn == conn)
@@ -3051,8 +3097,8 @@ fail_mgmt_tasks(struct iscsi_session *session, struct iscsi_conn *conn)
int i, state;
for (i = 0; i < conn->session->cmds_max; i++) {
- task = conn->session->cmds[i];
- if (task->sc)
+ task = conn->session->task_map[i];
+ if (!task || task->sc)
continue;
if (task->state == ISCSI_TASK_FREE)
diff --git a/drivers/scsi/libiscsi_tcp.c b/drivers/scsi/libiscsi_tcp.c
index e4bec58..3c95032 100644
--- a/drivers/scsi/libiscsi_tcp.c
+++ b/drivers/scsi/libiscsi_tcp.c
@@ -451,27 +451,13 @@ iscsi_tcp_data_recv_prep(struct iscsi_tcp_conn *tcp_conn)
void iscsi_tcp_cleanup_task(struct iscsi_task *task)
{
struct iscsi_tcp_task *tcp_task = task->dd_data;
- struct iscsi_r2t_info *r2t;
/* nothing to do for mgmt */
if (!task->sc)
return;
- spin_lock_bh(&tcp_task->queue2pool);
- /* flush task's r2t queues */
- while (kfifo_out(&tcp_task->r2tqueue, (void*)&r2t, sizeof(void*))) {
- kfifo_in(&tcp_task->r2tpool.queue, (void*)&r2t,
- sizeof(void*));
- ISCSI_DBG_TCP(task->conn, "pending r2t dropped\n");
- }
-
- r2t = tcp_task->r2t;
- if (r2t != NULL) {
- kfifo_in(&tcp_task->r2tpool.queue, (void*)&r2t,
- sizeof(void*));
- tcp_task->r2t = NULL;
- }
- spin_unlock_bh(&tcp_task->queue2pool);
+ tcp_task->r2t.data_length = 0;
+ tcp_task->r2t.sent = 0;
}
EXPORT_SYMBOL_GPL(iscsi_tcp_cleanup_task);
@@ -529,11 +515,10 @@ static int iscsi_tcp_r2t_rsp(struct iscsi_conn *conn, struct iscsi_task *task)
struct iscsi_tcp_task *tcp_task = task->dd_data;
struct iscsi_tcp_conn *tcp_conn = conn->dd_data;
struct iscsi_r2t_rsp *rhdr = (struct iscsi_r2t_rsp *)tcp_conn->in.hdr;
- struct iscsi_r2t_info *r2t;
+ struct iscsi_r2t_info *r2t = &tcp_task->r2t;
int r2tsn = be32_to_cpu(rhdr->r2tsn);
u32 data_length;
u32 data_offset;
- int rc;
if (tcp_conn->in.datalen) {
iscsi_conn_printk(KERN_ERR, conn,
@@ -579,28 +564,21 @@ static int iscsi_tcp_r2t_rsp(struct iscsi_conn *conn, struct iscsi_task *task)
return ISCSI_ERR_DATALEN;
}
- spin_lock(&tcp_task->pool2queue);
- rc = kfifo_out(&tcp_task->r2tpool.queue, (void*)&r2t, sizeof(void*));
- if (!rc) {
- iscsi_conn_printk(KERN_ERR, conn, "Could not allocate R2T. "
- "Target has sent more R2Ts than it "
- "negotiated for or driver has leaked.\n");
- spin_unlock(&tcp_task->pool2queue);
- return ISCSI_ERR_PROTO;
- }
-
r2t->exp_statsn = rhdr->statsn;
r2t->data_length = data_length;
r2t->data_offset = data_offset;
-
r2t->ttt = rhdr->ttt; /* no flip */
r2t->datasn = 0;
+ /*
+ * TODO: I think this is needed to make sure if the xmit thread
+ * is handling multiple tasks at a time then we want to make sure
+ * it sees these fields updated
+ */
+ smp_wmb();
r2t->sent = 0;
tcp_task->exp_datasn = r2tsn + 1;
- kfifo_in(&tcp_task->r2tqueue, (void*)&r2t, sizeof(void*));
conn->r2t_pdus_cnt++;
- spin_unlock(&tcp_task->pool2queue);
iscsi_requeue_task(task);
return 0;
@@ -971,7 +949,6 @@ int iscsi_tcp_task_init(struct iscsi_task *task)
return conn->session->tt->init_pdu(task, 0, task->data_count);
}
- BUG_ON(kfifo_len(&tcp_task->r2tqueue));
tcp_task->exp_datasn = 0;
/* Prepare PDU, optionally w/ immediate data */
@@ -989,37 +966,17 @@ EXPORT_SYMBOL_GPL(iscsi_tcp_task_init);
static struct iscsi_r2t_info *iscsi_tcp_get_curr_r2t(struct iscsi_task *task)
{
struct iscsi_tcp_task *tcp_task = task->dd_data;
- struct iscsi_r2t_info *r2t = NULL;
-
- if (iscsi_task_has_unsol_data(task))
- r2t = &task->unsol_r2t;
- else {
- spin_lock_bh(&tcp_task->queue2pool);
- if (tcp_task->r2t) {
- r2t = tcp_task->r2t;
- /* Continue with this R2T? */
- if (r2t->data_length <= r2t->sent) {
- ISCSI_DBG_TCP(task->conn,
- " done with r2t %p\n", r2t);
- kfifo_in(&tcp_task->r2tpool.queue,
- (void *)&tcp_task->r2t,
- sizeof(void *));
- tcp_task->r2t = r2t = NULL;
- }
- }
-
- if (r2t == NULL) {
- if (kfifo_out(&tcp_task->r2tqueue,
- (void *)&tcp_task->r2t, sizeof(void *)) !=
- sizeof(void *))
- r2t = NULL;
- else
- r2t = tcp_task->r2t;
- }
- spin_unlock_bh(&tcp_task->queue2pool);
- }
- return r2t;
+ if (task->unsol_r2t.sent == 0 && iscsi_task_has_unsol_data(task)) {
+ /* Make sure we see the queue sides update of these */
+ smp_rmb();
+ return &task->unsol_r2t;
+ } else if (tcp_task->r2t.data_length > tcp_task->r2t.sent) {
+ /* Make sure we see the recv sides update of these */
+ smp_rmb();
+ return &tcp_task->r2t;
+ } else
+ return NULL;
}
/**
@@ -1115,84 +1072,17 @@ void iscsi_tcp_conn_teardown(struct iscsi_cls_conn *cls_conn)
}
EXPORT_SYMBOL_GPL(iscsi_tcp_conn_teardown);
-int iscsi_tcp_r2tpool_alloc(struct iscsi_session *session)
-{
- int i;
- int cmd_i;
-
- /*
- * initialize per-task: R2T pool and xmit queue
- */
- for (cmd_i = 0; cmd_i < session->cmds_max; cmd_i++) {
- struct iscsi_task *task = session->cmds[cmd_i];
- struct iscsi_tcp_task *tcp_task = task->dd_data;
-
- /*
- * pre-allocated x2 as much r2ts to handle race when
- * target acks DataOut faster than we data_xmit() queues
- * could replenish r2tqueue.
- */
-
- /* R2T pool */
- if (iscsi_pool_init(&tcp_task->r2tpool,
- session->max_r2t * 2, NULL,
- sizeof(struct iscsi_r2t_info))) {
- goto r2t_alloc_fail;
- }
-
- /* R2T xmit queue */
- if (kfifo_alloc(&tcp_task->r2tqueue,
- session->max_r2t * 4 * sizeof(void*), GFP_KERNEL)) {
- iscsi_pool_free(&tcp_task->r2tpool);
- goto r2t_alloc_fail;
- }
- spin_lock_init(&tcp_task->pool2queue);
- spin_lock_init(&tcp_task->queue2pool);
- }
-
- return 0;
-
-r2t_alloc_fail:
- for (i = 0; i < cmd_i; i++) {
- struct iscsi_task *task = session->cmds[i];
- struct iscsi_tcp_task *tcp_task = task->dd_data;
-
- kfifo_free(&tcp_task->r2tqueue);
- iscsi_pool_free(&tcp_task->r2tpool);
- }
- return -ENOMEM;
-}
-EXPORT_SYMBOL_GPL(iscsi_tcp_r2tpool_alloc);
-
-void iscsi_tcp_r2tpool_free(struct iscsi_session *session)
-{
- int i;
-
- for (i = 0; i < session->cmds_max; i++) {
- struct iscsi_task *task = session->cmds[i];
- struct iscsi_tcp_task *tcp_task = task->dd_data;
-
- kfifo_free(&tcp_task->r2tqueue);
- iscsi_pool_free(&tcp_task->r2tpool);
- }
-}
-EXPORT_SYMBOL_GPL(iscsi_tcp_r2tpool_free);
-
int iscsi_tcp_set_max_r2t(struct iscsi_conn *conn, char *buf)
{
struct iscsi_session *session = conn->session;
unsigned short r2ts = 0;
sscanf(buf, "%hu", &r2ts);
- if (session->max_r2t == r2ts)
- return 0;
-
- if (!r2ts || !is_power_of_2(r2ts))
+ if (r2ts != 1)
return -EINVAL;
session->max_r2t = r2ts;
- iscsi_tcp_r2tpool_free(session);
- return iscsi_tcp_r2tpool_alloc(session);
+ return 0;
}
EXPORT_SYMBOL_GPL(iscsi_tcp_set_max_r2t);
diff --git a/include/scsi/libiscsi.h b/include/scsi/libiscsi.h
index 7221a24..1ef3aa0 100644
--- a/include/scsi/libiscsi.h
+++ b/include/scsi/libiscsi.h
@@ -29,6 +29,7 @@
#include <linux/timer.h>
#include <linux/workqueue.h>
#include <linux/kfifo.h>
+#include <linux/percpu_ida.h>
#include <scsi/iscsi_proto.h>
#include <scsi/iscsi_if.h>
#include <scsi/scsi_transport_iscsi.h>
@@ -335,18 +336,18 @@ struct iscsi_session {
spinlock_t frwd_lock; /* protects session state, *
* cmdsn, queued_cmdsn *
* session resources: *
- * - cmdpool kfifo_out , *
- * - mgmtpool, */
+ * - mgmt_pool kfifo, */
spinlock_t back_lock; /* protects cmdsn_exp *
- * cmdsn_max, *
- * cmdpool kfifo_in */
+ * cmdsn_max, */
int state; /* session state */
int age; /* counts session re-opens */
- int scsi_cmds_max; /* max scsi commands */
+ int scsi_cmds_max; /* max scsi commands */
+ struct percpu_ida itts; /* itt ida */
+ struct iscsi_task **task_map; /* itt to task map */
int cmds_max; /* size of cmds array */
- struct iscsi_task **cmds; /* Original Cmds arr */
- struct iscsi_pool cmdpool; /* PDU's pool */
+ struct iscsi_task **mgmt_cmds; /* Mgmt cmds ar */
+ struct iscsi_pool mgmt_pool; /* Mgmt PDU pool */
void *dd_data; /* LLD private data */
};
diff --git a/include/scsi/libiscsi_tcp.h b/include/scsi/libiscsi_tcp.h
index 2a7aa75..c494660 100644
--- a/include/scsi/libiscsi_tcp.h
+++ b/include/scsi/libiscsi_tcp.h
@@ -79,12 +79,8 @@ struct iscsi_tcp_conn {
struct iscsi_tcp_task {
uint32_t exp_datasn; /* expected target's R2TSN/DataSN */
int data_offset;
- struct iscsi_r2t_info *r2t; /* in progress solict R2T */
- struct iscsi_pool r2tpool;
- struct kfifo r2tqueue;
+ struct iscsi_r2t_info r2t;
void *dd_data;
- spinlock_t pool2queue;
- spinlock_t queue2pool;
};
enum {
@@ -128,8 +124,6 @@ iscsi_tcp_conn_setup(struct iscsi_cls_session *cls_session, int dd_data_size,
extern void iscsi_tcp_conn_teardown(struct iscsi_cls_conn *cls_conn);
/* misc helpers */
-extern int iscsi_tcp_r2tpool_alloc(struct iscsi_session *session);
-extern void iscsi_tcp_r2tpool_free(struct iscsi_session *session);
extern int iscsi_tcp_set_max_r2t(struct iscsi_conn *conn, char *buf);
extern void iscsi_tcp_conn_get_stats(struct iscsi_cls_conn *cls_conn,
struct iscsi_stats *stats);
^ permalink raw reply related [flat|nested] 31+ messages in thread* Re: [PATCH 12/15] scsi: initial blk-mq support
2014-02-07 8:45 ` Mike Christie
@ 2014-02-07 12:42 ` Christoph Hellwig
0 siblings, 0 replies; 31+ messages in thread
From: Christoph Hellwig @ 2014-02-07 12:42 UTC (permalink / raw)
To: Mike Christie
Cc: Nicholas A. Bellinger, Christoph Hellwig, Jens Axboe,
James Bottomley, linux-scsi
On Fri, Feb 07, 2014 at 02:45:18AM -0600, Mike Christie wrote:
> Yeah, it would be nice if like in Nick's patches, the driver could just
> set the scsi_host_template->cmd_size then when the scsi_cmnd got to the
> driver's queuecommand, the driver could just get its internal cmd struct
> from the scsi_cmnd struct (for example in Nick's patch it was off the
> SCp.ptr).
You can do this with my series. The cmd_size support is actually
added in the first series and still works here.
Instead of needing SCp.ptr you just grab the memory right behind
the command. Right now virtio_scsi opencodes this, but I should
probably add a helper for the next iteration.
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: [PATCH 12/15] scsi: initial blk-mq support
2014-02-06 22:11 ` Nicholas A. Bellinger
2014-02-07 8:45 ` Mike Christie
@ 2014-02-07 12:51 ` Christoph Hellwig
1 sibling, 0 replies; 31+ messages in thread
From: Christoph Hellwig @ 2014-02-07 12:51 UTC (permalink / raw)
To: Nicholas A. Bellinger
Cc: Christoph Hellwig, Jens Axboe, James Bottomley, linux-scsi
> Is there extra scsi_mq_done() part that does IPI here even necessary
> anymore..?
>
> I was under the assumption that blk_mq_end_io() is already taking care
> of this..?
blk_mq_end_io does it, but given that the SCSI-specific I/O completion
path is non-trivial I'd rather run it on the indicated CPU as well,
similar to how the old blk-softirq code works. I'll send out a patch
series that should make this area a lot cleaner and avoid having to
implement the IPIs in each driver that wants or needs it soon.
> OK, so you're in-lining the allocation of data + protection SGLs from
> blk-mq..
>
> The original prototype code was doing these allocations separately below
> for each pre-allocated cmd, and offering LLD's to optionally
> pre-allocate their own descripts using sh->hostt->cmd_size if
> necessary..
>
> This was necessary to eliminate all fast-path allocations for
> virtio-scsi, and I'd like to see something similar here as an optional
> feature as well.
It's present. The cmd_size host_template parameter is added in the
previous series, and we still take it into account.
^ permalink raw reply [flat|nested] 31+ messages in thread
* [PATCH 13/15] scsi: partially stub out scsi_adjust_queue_depth when using blk-mq
2014-02-05 12:41 [PATCH 00/15] A different approach for using blk-mq in the SCSI layer Christoph Hellwig
` (11 preceding siblings ...)
2014-02-05 12:41 ` [PATCH 12/15] scsi: initial blk-mq support Christoph Hellwig
@ 2014-02-05 12:41 ` Christoph Hellwig
2014-02-05 12:41 ` [PATCH 14/15] iscsi_tcp: use blk_mq Christoph Hellwig
` (3 subsequent siblings)
16 siblings, 0 replies; 31+ messages in thread
From: Christoph Hellwig @ 2014-02-05 12:41 UTC (permalink / raw)
To: Jens Axboe, James Bottomley, Nicholas Bellinger; +Cc: linux-scsi
[-- Attachment #1: 0013-scsi-partially-stub-out-scsi_adjust_queue_depth-when.patch --]
[-- Type: text/plain, Size: 796 bytes --]
This will have to be funnelled to blk-mq directly, but skip it for now.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
drivers/scsi/scsi.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
index cf5c110..014b642 100644
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -837,7 +837,7 @@ void scsi_adjust_queue_depth(struct scsi_device *sdev, int tagged, int tags)
* is more IO than the LLD's can_queue (so there are not enuogh
* tags) request_fn's host queue ready check will handle it.
*/
- if (!sdev->host->bqt) {
+ if (!sdev->host->hostt->use_blk_mq && !sdev->host->bqt) {
if (blk_queue_tagged(sdev->request_queue) &&
blk_queue_resize_tags(sdev->request_queue, tags) != 0)
goto out;
--
1.7.10.4
^ permalink raw reply related [flat|nested] 31+ messages in thread
* [PATCH 14/15] iscsi_tcp: use blk_mq
2014-02-05 12:41 [PATCH 00/15] A different approach for using blk-mq in the SCSI layer Christoph Hellwig
` (12 preceding siblings ...)
2014-02-05 12:41 ` [PATCH 13/15] scsi: partially stub out scsi_adjust_queue_depth when using blk-mq Christoph Hellwig
@ 2014-02-05 12:41 ` Christoph Hellwig
2014-02-05 12:41 ` [PATCH 15/15] virtio_scsi: " Christoph Hellwig
` (2 subsequent siblings)
16 siblings, 0 replies; 31+ messages in thread
From: Christoph Hellwig @ 2014-02-05 12:41 UTC (permalink / raw)
To: Jens Axboe, James Bottomley, Nicholas Bellinger; +Cc: linux-scsi
[-- Attachment #1: 0014-iscsi_tcp-use-blk_mq.patch --]
[-- Type: text/plain, Size: 537 bytes --]
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
drivers/scsi/iscsi_tcp.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c
index add6d15..44aae3d 100644
--- a/drivers/scsi/iscsi_tcp.c
+++ b/drivers/scsi/iscsi_tcp.c
@@ -957,6 +957,7 @@ static struct scsi_host_template iscsi_sw_tcp_sht = {
.target_alloc = iscsi_target_alloc,
.proc_name = "iscsi_tcp",
.this_id = -1,
+ .use_blk_mq = true,
};
static struct iscsi_transport iscsi_sw_tcp_transport = {
--
1.7.10.4
^ permalink raw reply related [flat|nested] 31+ messages in thread
* [PATCH 15/15] virtio_scsi: use blk_mq
2014-02-05 12:41 [PATCH 00/15] A different approach for using blk-mq in the SCSI layer Christoph Hellwig
` (13 preceding siblings ...)
2014-02-05 12:41 ` [PATCH 14/15] iscsi_tcp: use blk_mq Christoph Hellwig
@ 2014-02-05 12:41 ` Christoph Hellwig
2014-02-10 11:35 ` [PATCH 00/15] A different approach for using blk-mq in the SCSI layer Christoph Hellwig
2014-02-24 14:46 ` Christoph Hellwig
16 siblings, 0 replies; 31+ messages in thread
From: Christoph Hellwig @ 2014-02-05 12:41 UTC (permalink / raw)
To: Jens Axboe, James Bottomley, Nicholas Bellinger; +Cc: linux-scsi
[-- Attachment #1: 0015-virtio_scsi-use-blk_mq.patch --]
[-- Type: text/plain, Size: 890 bytes --]
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
drivers/scsi/virtio_scsi.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index d9a6074..59b030c 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -680,6 +680,7 @@ static struct scsi_host_template virtscsi_host_template_single = {
.use_clustering = ENABLE_CLUSTERING,
.target_alloc = virtscsi_target_alloc,
.target_destroy = virtscsi_target_destroy,
+ .use_blk_mq = true,
};
static struct scsi_host_template virtscsi_host_template_multi = {
@@ -697,6 +698,7 @@ static struct scsi_host_template virtscsi_host_template_multi = {
.use_clustering = ENABLE_CLUSTERING,
.target_alloc = virtscsi_target_alloc,
.target_destroy = virtscsi_target_destroy,
+ .use_blk_mq = true,
};
#define virtscsi_config_get(vdev, fld) \
--
1.7.10.4
^ permalink raw reply related [flat|nested] 31+ messages in thread
* Re: [PATCH 00/15] A different approach for using blk-mq in the SCSI layer
2014-02-05 12:41 [PATCH 00/15] A different approach for using blk-mq in the SCSI layer Christoph Hellwig
` (14 preceding siblings ...)
2014-02-05 12:41 ` [PATCH 15/15] virtio_scsi: " Christoph Hellwig
@ 2014-02-10 11:35 ` Christoph Hellwig
2014-02-10 19:38 ` Nicholas A. Bellinger
2014-02-24 14:46 ` Christoph Hellwig
16 siblings, 1 reply; 31+ messages in thread
From: Christoph Hellwig @ 2014-02-10 11:35 UTC (permalink / raw)
To: Jens Axboe, James Bottomley, Nicholas Bellinger; +Cc: linux-scsi
On Wed, Feb 05, 2014 at 04:41:18AM -0800, Christoph Hellwig wrote:
> A git tree is also available at
>
> git://git.infradead.org/users/hch/scsi.git#scsi-mq-wip
I've pushed various fixes to the branch, most importantly the ata_piix
driver including CDROM support has now been tested with blk-mq, and
timeouts and basic EH now work properly.
^ permalink raw reply [flat|nested] 31+ messages in thread* Re: [PATCH 00/15] A different approach for using blk-mq in the SCSI layer
2014-02-10 11:35 ` [PATCH 00/15] A different approach for using blk-mq in the SCSI layer Christoph Hellwig
@ 2014-02-10 19:38 ` Nicholas A. Bellinger
0 siblings, 0 replies; 31+ messages in thread
From: Nicholas A. Bellinger @ 2014-02-10 19:38 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Jens Axboe, James Bottomley, linux-scsi
On Mon, 2014-02-10 at 03:35 -0800, Christoph Hellwig wrote:
> On Wed, Feb 05, 2014 at 04:41:18AM -0800, Christoph Hellwig wrote:
> > A git tree is also available at
> >
> > git://git.infradead.org/users/hch/scsi.git#scsi-mq-wip
>
> I've pushed various fixes to the branch, most importantly the ata_piix
> driver including CDROM support has now been tested with blk-mq, and
> timeouts and basic EH now work properly.
Nice work hch!
--nab
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: [PATCH 00/15] A different approach for using blk-mq in the SCSI layer
2014-02-05 12:41 [PATCH 00/15] A different approach for using blk-mq in the SCSI layer Christoph Hellwig
` (15 preceding siblings ...)
2014-02-10 11:35 ` [PATCH 00/15] A different approach for using blk-mq in the SCSI layer Christoph Hellwig
@ 2014-02-24 14:46 ` Christoph Hellwig
16 siblings, 0 replies; 31+ messages in thread
From: Christoph Hellwig @ 2014-02-24 14:46 UTC (permalink / raw)
To: Jens Axboe, James Bottomley, Nicholas Bellinger; +Cc: linux-scsi
On Wed, Feb 05, 2014 at 04:41:18AM -0800, Christoph Hellwig wrote:
> There are still lots of limits mostly due to the lack of functionality in
> blk-mq, and I will try to address these in the blk-mq core mostly.
>
> A git tree is also available at
>
> git://git.infradead.org/users/hch/scsi.git#scsi-mq-wip
I've pushed out a new tree that has been rebased ontop of Linux 3.14-rc3,
contains various fixes and supports partial completions as well as all
forms of requeueing.
This tree can be found at:
git://git.infradead.org/users/hch/scsi.git#scsi-mq-wip.2
^ permalink raw reply [flat|nested] 31+ messages in thread