Re: [PATCH 12/15] scsi: initial blk-mq support

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Sagi Grimberg <sagig@dev.mellanox.co.il>
To: Christoph Hellwig <hch@infradead.org>,
	Jens Axboe <axboe@kernel.dk>,
	James Bottomley <JBottomley@Parallels.com>,
	Nicholas Bellinger <nab@linux-iscsi.org>
Cc: linux-scsi@vger.kernel.org
Subject: Re: [PATCH 12/15] scsi: initial blk-mq support
Date: Thu, 06 Feb 2014 10:38:17 +0200	[thread overview]
Message-ID: <52F349F9.7090509@dev.mellanox.co.il> (raw)
In-Reply-To: <20140205124154.337539740@bombadil.infradead.org>

On 2/5/2014 2:41 PM, Christoph Hellwig wrote:
> Add support for using the blk-mq code to submit requests to SCSI
> drivers.  There is very little blk-mq specific code, but that's
> partially because important functionality like partial completions
> and request requeueing is still missing in blk-mq.  I hope to keep
> most of the additions for these in the blk-mq core instead of the
> SCSI layer, though.
>
> Based on the earlier scsi-mq prototype by Nicholas Bellinger, although
> not a whole lot of actual code is left.
>
> Not-quite-signed-off-yet-by: Christoph Hellwig <hch@lst.de>
> ---
>   drivers/scsi/scsi.c      |   36 ++++++-
>   drivers/scsi/scsi_lib.c  |  244 ++++++++++++++++++++++++++++++++++++++++++++--
>   drivers/scsi/scsi_priv.h |    2 +
>   drivers/scsi/scsi_scan.c |    5 +-
>   include/scsi/scsi_host.h |    3 +
>   5 files changed, 278 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
> index adb8bfb..cf5c110 100644
> --- a/drivers/scsi/scsi.c
> +++ b/drivers/scsi/scsi.c
> @@ -44,6 +44,7 @@
>   #include <linux/string.h>
>   #include <linux/slab.h>
>   #include <linux/blkdev.h>
> +#include <linux/blk-mq.h>
>   #include <linux/delay.h>
>   #include <linux/init.h>
>   #include <linux/completion.h>
> @@ -688,6 +689,33 @@ int scsi_dispatch_cmd(struct scsi_cmnd *cmd)
>   	return 0;
>   }
>   
> +static void scsi_softirq_done_remote(void *data)
> +{
> +	return scsi_softirq_done(data);
> +}
> +
> +static void scsi_mq_done(struct request *req)
> +{
> +	int cpu;
> +
> +#if 0
> +	if (!ctx->ipi_redirect)
> +		return scsi_softirq_done(cmd);
> +#endif
> +
> +	cpu = get_cpu();
> +	if (cpu != req->cpu && cpu_online(req->cpu)) {
> +		req->csd.func = scsi_softirq_done_remote;
> +		req->csd.info = req;
> +		req->csd.flags = 0;
> +		__smp_call_function_single(req->cpu, &req->csd, 0);
> +	} else {
> +		scsi_softirq_done(req);
> +	}
> +
> +	put_cpu();
> +}
> +
>   /**
>    * scsi_done - Invoke completion on finished SCSI command.
>    * @cmd: The SCSI Command for which a low-level device driver (LLDD) gives
> @@ -701,8 +729,14 @@ int scsi_dispatch_cmd(struct scsi_cmnd *cmd)
>    */
>   static void scsi_done(struct scsi_cmnd *cmd)
>   {
> +	struct request *req = cmd->request;
> +
>   	trace_scsi_dispatch_cmd_done(cmd);
> -	blk_complete_request(cmd->request);
> +
> +	if (req->mq_ctx)
> +		scsi_mq_done(req);
> +	else
> +		blk_complete_request(req);
>   }
>   
>   /**
> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
> index e67950c..8dd8893 100644
> --- a/drivers/scsi/scsi_lib.c
> +++ b/drivers/scsi/scsi_lib.c
> @@ -20,6 +20,7 @@
>   #include <linux/delay.h>
>   #include <linux/hardirq.h>
>   #include <linux/scatterlist.h>
> +#include <linux/blk-mq.h>
>   
>   #include <scsi/scsi.h>
>   #include <scsi/scsi_cmnd.h>
> @@ -554,6 +555,15 @@ static bool scsi_end_request(struct scsi_cmnd *cmd, int error, int bytes,
>   	struct request *req = cmd->request;
>   
>   	/*
> +	 * XXX: need to handle partial completions and retries here.
> +	 */
> +	if (req->mq_ctx) {
> +		blk_mq_end_io(req, error);
> +		put_device(&cmd->device->sdev_gendev);
> +		return true;
> +	}
> +
> +	/*
>   	 * If there are blocks left over at the end, set up the command
>   	 * to queue the remainder of them.
>   	 */
> @@ -1014,12 +1024,15 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb,
>   {
>   	int count;
>   
> -	/*
> -	 * If sg table allocation fails, requeue request later.
> -	 */
> -	if (unlikely(scsi_alloc_sgtable(sdb, req->nr_phys_segments,
> -					gfp_mask))) {
> -		return BLKPREP_DEFER;
> +	BUG_ON(req->nr_phys_segments > SCSI_MAX_SG_SEGMENTS);
> +
> +	if (!req->mq_ctx) {
> +		/*
> +		 * If sg table allocation fails, requeue request later.
> +		 */
> +		if (unlikely(scsi_alloc_sgtable(sdb, req->nr_phys_segments,
> +						gfp_mask)))
> +			return BLKPREP_DEFER;
>   	}
>   
>   	req->buffer = NULL;
> @@ -1075,9 +1088,11 @@ int scsi_init_io(struct scsi_cmnd *cmd, gfp_t gfp_mask)
>   		BUG_ON(prot_sdb == NULL);
>   		ivecs = blk_rq_count_integrity_sg(rq->q, rq->bio);
>   
> -		if (scsi_alloc_sgtable(prot_sdb, ivecs, gfp_mask)) {
> -			error = BLKPREP_DEFER;
> -			goto err_exit;
> +		if (!rq->mq_ctx) {
> +			if (scsi_alloc_sgtable(prot_sdb, ivecs, gfp_mask)) {
> +				error = BLKPREP_DEFER;
> +				goto err_exit;
> +			}
>   		}
>   
>   		count = blk_rq_map_integrity_sg(rq->q, rq->bio,
> @@ -1505,7 +1520,7 @@ static void scsi_kill_request(struct request *req, struct request_queue *q)
>   	blk_complete_request(req);
>   }
>   
> -static void scsi_softirq_done(struct request *rq)
> +void scsi_softirq_done(struct request *rq)
>   {
>   	struct scsi_cmnd *cmd = rq->special;
>   	unsigned long wait_for = (cmd->allowed + 1) * rq->timeout;
> @@ -1533,9 +1548,11 @@ static void scsi_softirq_done(struct request *rq)
>   			scsi_finish_command(cmd);
>   			break;
>   		case NEEDS_RETRY:
> +			WARN_ON(rq->mq_ctx);
>   			scsi_queue_insert(cmd, SCSI_MLQUEUE_EH_RETRY);
>   			break;
>   		case ADD_TO_MLQUEUE:
> +			WARN_ON(rq->mq_ctx);
>   			scsi_queue_insert(cmd, SCSI_MLQUEUE_DEVICE_BUSY);
>   			break;
>   		default:
> @@ -1668,6 +1685,120 @@ out_delay:
>   		blk_delay_queue(q, SCSI_QUEUE_DELAY);
>   }
>   
> +static int scsi_mq_prep_fn(struct request *req)
> +{
> +	struct scsi_cmnd *cmd = req->special;
> +	int ret;
> +
> +	ret = scsi_prep_state_check(cmd->device, req);
> +	if (ret != BLKPREP_OK)
> +		goto out;
> +
> +	if (req->cmd_type == REQ_TYPE_FS)
> +		ret = scsi_cmd_to_driver(cmd)->init_command(cmd);
> +	else if (req->cmd_type == REQ_TYPE_BLOCK_PC)
> +		ret = scsi_setup_blk_pc_cmnd(cmd->device, req);
> +	else
> +		ret = BLKPREP_KILL;
> +
> +out:
> +	switch (ret) {
> +	case BLKPREP_OK:
> +		return 0;
> +	case BLKPREP_DEFER:
> +		return BLK_MQ_RQ_QUEUE_BUSY;
> +	default:
> +		req->errors = DID_NO_CONNECT << 16;
> +		return BLK_MQ_RQ_QUEUE_ERROR;
> +	}
> +}
> +
> +static int scsi_mq_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
> +{
> +	struct request_queue *q = rq->q;
> +	struct scsi_device *sdev = q->queuedata;
> +	struct Scsi_Host *shost = sdev->host;
> +	struct scsi_cmnd *cmd = rq->special;
> +	unsigned char *sense_buf = cmd->sense_buffer;
> +	struct scatterlist *sg;
> +	int ret = BLK_MQ_RQ_QUEUE_BUSY;
> +	int reason;
> +
> +	/*
> +	 * blk-mq stores this in the mq_ctx, which can't be derferenced by
> +	 * drivers.  For now use the old per-request field, but there must be
> +	 * a better way.
> +	 */
> +	rq->cpu = raw_smp_processor_id();
> +
> +	if (!get_device(&sdev->sdev_gendev))
> +		goto out;
> +
> +	if (!scsi_dev_queue_ready(q, sdev))
> +		goto out_put_device;
> +	if (!scsi_target_queue_ready(shost, sdev))
> +		goto out_dec_device_busy;
> +	if (!scsi_host_queue_ready(q, shost, sdev))
> +		goto out_dec_target_busy;
> +
> +	memset(sense_buf, 0, SCSI_SENSE_BUFFERSIZE);
> +	memset(cmd, 0, sizeof(struct scsi_cmnd));
> +
> +	cmd->request = rq;
> +	cmd->device = sdev;
> +	cmd->sense_buffer = sense_buf;
> +
> +	cmd->tag = rq->tag;
> +	cmd->cmnd = rq->cmd;
> +	cmd->prot_op = SCSI_PROT_NORMAL;
> +
> +	sg = (void *)cmd + sizeof(struct scsi_cmnd) + shost->hostt->cmd_size;
> +
> +	if (rq->nr_phys_segments) {
> +		cmd->sdb.table.sgl = sg;
> +		cmd->sdb.table.nents = rq->nr_phys_segments;
> +		sg_init_table(cmd->sdb.table.sgl, rq->nr_phys_segments);
> +	}
> +
> +	if (scsi_host_get_prot(shost)) {
> +		cmd->prot_sdb = (void *)sg +
> +			shost->sg_tablesize * sizeof(struct scatterlist);
> +		memset(cmd->prot_sdb, 0, sizeof(struct scsi_data_buffer));
> +
> +		cmd->prot_sdb->table.sgl =
> +			(struct scatterlist *)(cmd->prot_sdb + 1);
> +	}
> +
> +	ret = scsi_mq_prep_fn(rq);
> +	if (ret)
> +		goto out_dec_host_busy;
> +
> +	scsi_init_cmd_errh(cmd);
> +
> +	reason = scsi_dispatch_cmd(cmd);
> +	if (reason) {
> +		scsi_set_blocked(cmd, reason);
> +		goto out_uninit;
> +	}
> +
> +	return BLK_MQ_RQ_QUEUE_OK;
> +
> +out_uninit:
> +	if (rq->cmd_type == REQ_TYPE_FS)
> +		scsi_cmd_to_driver(cmd)->uninit_command(cmd);
> +out_dec_host_busy:
> +	atomic_dec(&shost->host_busy);
> +out_dec_target_busy:
> +	atomic_dec(&scsi_target(sdev)->target_busy);
> +out_dec_device_busy:
> +	atomic_dec(&sdev->device_busy);
> +	/* XXX: delay queue if device_busy == 0 */
> +out_put_device:
> +	put_device(&sdev->sdev_gendev);
> +out:
> +	return ret;
> +}
> +
>   u64 scsi_calculate_bounce_limit(struct Scsi_Host *shost)
>   {
>   	struct device *host_dev;
> @@ -1754,6 +1885,99 @@ struct request_queue *scsi_alloc_queue(struct scsi_device *sdev)
>   	return q;
>   }
>   
> +static struct blk_mq_ops scsi_mq_ops = {
> +	.queue_rq	= scsi_mq_queue_rq,
> +	.map_queue	= blk_mq_map_queue,
> +	.alloc_hctx	= blk_mq_alloc_single_hw_queue,
> +	.free_hctx	= blk_mq_free_single_hw_queue,
> +};
> +
> +struct request_queue *scsi_mq_alloc_queue(struct scsi_device *sdev)
> +{
> +	struct Scsi_Host *shost = sdev->host;
> +	struct blk_mq_hw_ctx *hctx;
> +	struct request_queue *q;
> +	struct request *rq;
> +	struct scsi_cmnd *cmd;
> +	struct blk_mq_reg reg;
> +	int i, j, sgl_size;
> +
> +	memset(&reg, 0, sizeof(reg));
> +	reg.ops = &scsi_mq_ops;
> +	reg.queue_depth = shost->cmd_per_lun;
> +	if (!reg.queue_depth)
> +		reg.queue_depth = 1;
> +
> +	/* XXX: what to do about chained S/G lists? */
> +	if (shost->hostt->sg_tablesize > SCSI_MAX_SG_SEGMENTS)
> +		shost->sg_tablesize = SCSI_MAX_SG_SEGMENTS;
> +	sgl_size = shost->sg_tablesize * sizeof(struct scatterlist);
> +
> +	reg.cmd_size = sizeof(struct scsi_cmnd) +
> +			sgl_size +
> +			shost->hostt->cmd_size;
> +	if (scsi_host_get_prot(shost))
> +		reg.cmd_size += sizeof(struct scsi_data_buffer) + sgl_size;
> +	reg.numa_node = NUMA_NO_NODE;
> +	reg.nr_hw_queues = 1;

Hey Christoph,

I just started to look at mq on Nic's WIP branch. I have a pretty basic 
question.

Both you and Nic offer a single HW queue per sdev.
I'm wandering if that should be the LLD's decision (if chooses to use 
multiple queues)?

Trying to understand how LLDs will fit in a way they exploit multi-queue 
and actually
maintain multiple queues. SRP/iSER for example maintain a single queue 
per connection
(or session in iSCSI). Now with multi-queue all requests of that shost 
will eventually
boil-down to posting on a single queue which might transition the 
bottleneck to the LLDs.

I noticed virtio_scsi implementation is choosing a queue per command 
based on current
processor id without any explicit mapping (unless I missed it).

I guess my question is where do (or should) LLDs plug-in to this mq scheme?

Thanks,
Sagi.

> +	reg.flags = BLK_MQ_F_SHOULD_MERGE;
> +
> +	q = blk_mq_init_queue(&reg, sdev);
> +	if (IS_ERR(q)) {
> +		printk("blk_mq_init_queue failed\n");
> +		return NULL;
> +	}
> +
> +	blk_queue_prep_rq(q, scsi_prep_fn);
> +	sdev->request_queue = q;
> +	q->queuedata = sdev;
> +
> +	__scsi_init_queue(shost, q);
> +
> +	/*
> +	 * XXX: figure out if we can get alignment right to allocate the sense
> +	 * buffer with the other chunks of memory.
> +	 *
> +	 * If not we'll need to find a way to have the blk-mq core call us to
> +	 * allocate/free commands so that we can properly clean up the
> +	 * allocation instead of leaking it.
> +	 */
> +	queue_for_each_hw_ctx(q, hctx, i) {
> +		for (j = 0; j < hctx->queue_depth; j++) {
> +			rq = hctx->rqs[j];
> +			cmd = rq->special;
> +
> +			cmd->sense_buffer = kzalloc_node(SCSI_SENSE_BUFFERSIZE,
> +					   GFP_KERNEL, reg.numa_node);
> +			if (!cmd->sense_buffer)
> +				goto out_free_sense_buffers;
> +		}
> +	}
> +
> +	rq = q->flush_rq;
> +	cmd = blk_mq_rq_to_pdu(rq);
> +
> +	cmd->sense_buffer = kzalloc_node(SCSI_SENSE_BUFFERSIZE,
> +					   GFP_KERNEL, reg.numa_node);
> +	if (!cmd->sense_buffer)
> +		goto out_free_sense_buffers;
> +
> +	return q;
> +
> +out_free_sense_buffers:
> +	queue_for_each_hw_ctx(q, hctx, i) {
> +		for (j = 0; j < hctx->queue_depth; j++) {
> +			rq = hctx->rqs[j];
> +			cmd = rq->special;
> +
> +			kfree(cmd->sense_buffer);
> +		}
> +	}
> +
> +	blk_cleanup_queue(q);
> +	return NULL;
> +}
> +
>   /*
>    * Function:    scsi_block_requests()
>    *
> diff --git a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h
> index f079a59..712cec2 100644
> --- a/drivers/scsi/scsi_priv.h
> +++ b/drivers/scsi/scsi_priv.h
> @@ -88,8 +88,10 @@ extern void scsi_next_command(struct scsi_cmnd *cmd);
>   extern void scsi_io_completion(struct scsi_cmnd *, unsigned int);
>   extern void scsi_run_host_queues(struct Scsi_Host *shost);
>   extern struct request_queue *scsi_alloc_queue(struct scsi_device *sdev);
> +extern struct request_queue *scsi_mq_alloc_queue(struct scsi_device *sdev);
>   extern int scsi_init_queue(void);
>   extern void scsi_exit_queue(void);
> +extern void scsi_softirq_done(struct request *rq);
>   struct request_queue;
>   struct request;
>   extern struct kmem_cache *scsi_sdb_cache;
> diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
> index 307a811..c807bc2 100644
> --- a/drivers/scsi/scsi_scan.c
> +++ b/drivers/scsi/scsi_scan.c
> @@ -277,7 +277,10 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget,
>   	 */
>   	sdev->borken = 1;
>   
> -	sdev->request_queue = scsi_alloc_queue(sdev);
> +	if (shost->hostt->use_blk_mq)
> +		sdev->request_queue = scsi_mq_alloc_queue(sdev);
> +	else
> +		sdev->request_queue = scsi_alloc_queue(sdev);
>   	if (!sdev->request_queue) {
>   		/* release fn is set up in scsi_sysfs_device_initialise, so
>   		 * have to free and put manually here */
> diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
> index c4e4875..d2661cb 100644
> --- a/include/scsi/scsi_host.h
> +++ b/include/scsi/scsi_host.h
> @@ -531,6 +531,9 @@ struct scsi_host_template {
>   	 */
>   	unsigned int cmd_size;
>   	struct scsi_host_cmd_pool *cmd_pool;
> +
> +	/* temporary flag to use blk-mq I/O path */
> +	bool use_blk_mq;
>   };
>   
>   /*

next prev parent reply	other threads:[~2014-02-06  8:38 UTC|newest]

Thread overview: 31+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-02-05 12:41 [PATCH 00/15] A different approach for using blk-mq in the SCSI layer Christoph Hellwig
2014-02-05 12:41 ` [PATCH 01/15] block: rework flush sequencing for blk-mq Christoph Hellwig
2014-02-06  2:08   ` Muthu Kumar
2014-02-06 16:18     ` Christoph Hellwig
2014-02-05 12:41 ` [PATCH 02/15] blk-mq: support at_head inserations for blk_execute_rq Christoph Hellwig
2014-02-06  2:27   ` Muthu Kumar
2014-02-06 16:17     ` Christoph Hellwig
2014-02-06 17:05       ` Muthu Kumar
2014-02-06 17:10         ` Christoph Hellwig
2014-02-05 12:41 ` [PATCH 03/15] blk-mq: divert __blk_put_request for MQ ops Christoph Hellwig
2014-02-05 12:41 ` [PATCH 04/15] blk-mq: handle dma_drain_size Christoph Hellwig
2014-02-05 12:41 ` [PATCH 05/15] blk-mq: initialize sg_reserved_size Christoph Hellwig
2014-02-05 12:41 ` [PATCH 06/15] scsi: reintroduce scsi_driver.init_command Christoph Hellwig
2014-02-05 12:41 ` [PATCH 07/15] block: remove unprep_rq_fn Christoph Hellwig
2014-02-05 12:41 ` [PATCH 08/15] scsi: cleanup scsi_end_request calling conventions Christoph Hellwig
2014-02-05 12:41 ` [PATCH 09/15] scsi: centralize command re-queueing in scsi_dispatch_fn Christoph Hellwig
2014-02-05 12:41 ` [PATCH 10/15] scsi: split __scsi_queue_insert Christoph Hellwig
2014-02-05 12:41 ` [PATCH 11/15] scsi: factor out __scsi_init_queue Christoph Hellwig
2014-02-05 12:41 ` [PATCH 12/15] scsi: initial blk-mq support Christoph Hellwig
2014-02-06  8:38   ` Sagi Grimberg [this message]
2014-02-06 16:16     ` Christoph Hellwig
2014-02-06 22:11   ` Nicholas A. Bellinger
2014-02-07  8:45     ` Mike Christie
2014-02-07 12:42       ` Christoph Hellwig
2014-02-07 12:51     ` Christoph Hellwig
2014-02-05 12:41 ` [PATCH 13/15] scsi: partially stub out scsi_adjust_queue_depth when using blk-mq Christoph Hellwig
2014-02-05 12:41 ` [PATCH 14/15] iscsi_tcp: use blk_mq Christoph Hellwig
2014-02-05 12:41 ` [PATCH 15/15] virtio_scsi: " Christoph Hellwig
2014-02-10 11:35 ` [PATCH 00/15] A different approach for using blk-mq in the SCSI layer Christoph Hellwig
2014-02-10 19:38   ` Nicholas A. Bellinger
2014-02-24 14:46 ` Christoph Hellwig

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=52F349F9.7090509@dev.mellanox.co.il \
    --to=sagig@dev.mellanox.co.il \
    --cc=JBottomley@Parallels.com \
    --cc=axboe@kernel.dk \
    --cc=hch@infradead.org \
    --cc=linux-scsi@vger.kernel.org \
    --cc=nab@linux-iscsi.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.