[PATCH v10] NVMe: Convert to blk-mq

All of lore.kernel.org
 help / color / mirror / Atom feed

From: hch@infradead.org (Christoph Hellwig)
Subject: [PATCH v10] NVMe: Convert to blk-mq
Date: Mon, 14 Jul 2014 05:41:31 -0700	[thread overview]
Message-ID: <20140714124131.GA311@infradead.org> (raw)
In-Reply-To: <1404226382-7179-2-git-send-email-m@bjorling.me>

> +static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
> +			  unsigned int hctx_idx)
> +	struct nvme_queue *nvmeq = dev->queues[
> +					(hctx_idx % dev->queue_count) + 1];
> +
> +	/* nvmeq queues are shared between namespaces. We assume here that
> +	 * blk-mq map the tags so they match up with the nvme queue tags */
> +	if (!nvmeq->hctx)
> +		nvmeq->hctx = hctx;
> +	else
> +		WARN_ON(nvmeq->hctx->tags != hctx->tags);


This wrong to me, as you're overwriting the value of nvmeq->hctx for each
new requeust_queue.  But nothing but ->tagsis ever used from nvmeq->hctx,
so you shold rather set up nvmeq->tags in nvme_dev_add.

> +static int nvme_init_request(void *data, struct request *req,
> +				unsigned int hctx_idx, unsigned int rq_idx,
> +				unsigned int numa_node)
> +{
> +	struct nvme_dev *dev = data;
> +	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
> +	struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];
> +
> +	WARN_ON(!nvmeq);
> +	cmd->nvmeq = nvmeq;

Shouldn't this fail instead of the warn_on?

> +static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
>  {
> +	struct nvme_ns *ns = hctx->queue->queuedata;
> +	struct nvme_queue *nvmeq = hctx->driver_data;
> +	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
>  	struct nvme_iod *iod;
> +	enum dma_data_direction dma_dir;
> +	int psegs = req->nr_phys_segments;
> +	int result = BLK_MQ_RQ_QUEUE_BUSY;
> +	/*
> +	 * Requeued IO has already been prepped
> +	 */
> +	iod = req->special;
> +	if (iod)
> +		goto submit_iod;
>  
> +	iod = nvme_alloc_iod(psegs, blk_rq_bytes(req), GFP_ATOMIC);
>  	if (!iod)
> +		return result;

So there's still a memory allocation for each request here.  Any reason
this can't be preallocated at least for reasonable sized I/O?

No need for GFP_ATOMIC here either, and you probably need a mempool to
guarantee forward progress.

> +	if (req->cmd_flags & REQ_DISCARD) {
>  		void *range;
>  		/*
>  		 * We reuse the small pool to allocate the 16-byte range here
> @@ -752,33 +602,53 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
>  		range = dma_pool_alloc(nvmeq->dev->prp_small_pool,
>  						GFP_ATOMIC,
>  						&iod->first_dma);
> +		if (!range)
> +			goto finish_cmd;
>  		iod_list(iod)[0] = (__le64 *)range;
>  		iod->npages = 0;
>  	} else if (psegs) {
> +		dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
> +
> +		sg_init_table(iod->sg, psegs);
> +		iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
> +		if (!iod->nents) {
> +			result = BLK_MQ_RQ_QUEUE_ERROR;
> +			goto finish_cmd;
>  		}
> +
> +		if (!dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir))
> +			goto finish_cmd;
> +
> +		if (blk_rq_bytes(req) != nvme_setup_prps(nvmeq->dev, iod,
> +						blk_rq_bytes(req), GFP_ATOMIC))
> +			goto finish_cmd;
> +	}

Would be nice to factor these two into helpers, that could also avoid
the submid_iod goto..

> +
> +	if (req->cmd_flags & REQ_DISCARD) {
> +		nvme_submit_discard(nvmeq, ns, req, iod);
> +		goto queued;
> +	}
> +
> +	if (req->cmd_flags & REQ_FLUSH) {
> +		nvme_submit_flush(nvmeq, ns, req->tag);
> +		goto queued;
>  	}
> -	return 0;
>  
> +	nvme_submit_iod(nvmeq, iod, ns);
> + queued:

A simple

	if (req->cmd_flags & REQ_DISCARD)
		nvme_submit_discard(nvmeq, ns, req, iod);
	else if if (req->cmd_flags & REQ_FLUSH)
		nvme_submit_flush(nvmeq, ns, req->tag);
	else
		nvme_submit_iod(nvmeq, iod, ns);

seems preferable here.

> +static void nvme_cancel_queue_ios(void *data, unsigned long *tag_map)
>  {
> +	struct nvme_queue *nvmeq = data;
> +	struct blk_mq_hw_ctx *hctx = nvmeq->hctx;
> +	unsigned int tag = 0;
>  
> +	tag = 0;
> +	do {
> +		struct request *req;
>  		void *ctx;
>  		nvme_completion_fn fn;
> +		struct nvme_cmd_info *cmd;
>  		static struct nvme_completion cqe = {
>  			.status = cpu_to_le16(NVME_SC_ABORT_REQ << 1),
>  		};
> +		int qdepth = nvmeq == nvmeq->dev->queues[0] ?
> +					nvmeq->dev->admin_tagset.queue_depth :
> +					nvmeq->dev->tagset.queue_depth;
>  
> +		/* zero'd bits are free tags */
> +		tag = find_next_zero_bit(tag_map, qdepth, tag);
> +		if (tag >= qdepth)
> +			break;
> +
> +		req = blk_mq_tag_to_rq(hctx->tags, tag++);
> +		cmd = blk_mq_rq_to_pdu(req);

Seems like blk-mq would make your life easier by exporting an iterator
that goes over each in-use request instead of the current
blk_mq_tag_busy_iter prototype.  blk_mq_timeout_check would also be able
to make use of that, so maybe that would be a good preparatory patch?

> +static enum blk_eh_timer_return nvme_timeout(struct request *req)
>  {
> +	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
> +	struct nvme_queue *nvmeq = cmd->nvmeq;
>  
> +	dev_warn(nvmeq->q_dmadev, "Timeout I/O %d QID %d\n", req->tag,
> +							nvmeq->qid);
> +	if (nvmeq->dev->initialized)
> +		nvme_abort_req(req);
>  
> +	return BLK_EH_RESET_TIMER;
> +}

Aborting a request but then resetting the timer looks wrong to me.

If that's indeed the intended behavior please add a comment explaining
it.

> +
> +static int nvme_alloc_admin_tags(struct nvme_dev *dev)
> +{
> +	if (!dev->admin_q) {

When would it be non-NULL?  Seems like the resume case might be the
case, but I suspect the code could be restructured to avoid even calling
nvme_alloc_admin_tags for that case.

> +static void nvme_free_admin_tags(struct nvme_dev *dev)
> +{
> +	if (dev->admin_q)
> +		blk_mq_free_tag_set(&dev->admin_tagset);
> +}

When would this be called with the admin queue not initialized?

> +static void nvme_dev_remove_admin(struct nvme_dev *dev)
> +{
> +	if (dev->admin_q && !blk_queue_dying(dev->admin_q))
> +		blk_cleanup_queue(dev->admin_q);
> +}

Same here.

WARNING: multiple messages have this Message-ID (diff)

From: Christoph Hellwig <hch@infradead.org>
To: Matias Bj??rling <m@bjorling.me>
Cc: willy@linux.intel.com, keith.busch@intel.com,
	sbradshaw@micron.com, axboe@fb.com, tom.leiming@gmail.com,
	hch@infradead.org, rlnelson@google.com,
	linux-kernel@vger.kernel.org, linux-nvme@lists.infradead.org
Subject: Re: [PATCH v10] NVMe: Convert to blk-mq
Date: Mon, 14 Jul 2014 05:41:31 -0700	[thread overview]
Message-ID: <20140714124131.GA311@infradead.org> (raw)
In-Reply-To: <1404226382-7179-2-git-send-email-m@bjorling.me>

> +static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
> +			  unsigned int hctx_idx)
> +	struct nvme_queue *nvmeq = dev->queues[
> +					(hctx_idx % dev->queue_count) + 1];
> +
> +	/* nvmeq queues are shared between namespaces. We assume here that
> +	 * blk-mq map the tags so they match up with the nvme queue tags */
> +	if (!nvmeq->hctx)
> +		nvmeq->hctx = hctx;
> +	else
> +		WARN_ON(nvmeq->hctx->tags != hctx->tags);


This wrong to me, as you're overwriting the value of nvmeq->hctx for each
new requeust_queue.  But nothing but ->tagsis ever used from nvmeq->hctx,
so you shold rather set up nvmeq->tags in nvme_dev_add.

> +static int nvme_init_request(void *data, struct request *req,
> +				unsigned int hctx_idx, unsigned int rq_idx,
> +				unsigned int numa_node)
> +{
> +	struct nvme_dev *dev = data;
> +	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
> +	struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];
> +
> +	WARN_ON(!nvmeq);
> +	cmd->nvmeq = nvmeq;

Shouldn't this fail instead of the warn_on?

> +static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
>  {
> +	struct nvme_ns *ns = hctx->queue->queuedata;
> +	struct nvme_queue *nvmeq = hctx->driver_data;
> +	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
>  	struct nvme_iod *iod;
> +	enum dma_data_direction dma_dir;
> +	int psegs = req->nr_phys_segments;
> +	int result = BLK_MQ_RQ_QUEUE_BUSY;
> +	/*
> +	 * Requeued IO has already been prepped
> +	 */
> +	iod = req->special;
> +	if (iod)
> +		goto submit_iod;
>  
> +	iod = nvme_alloc_iod(psegs, blk_rq_bytes(req), GFP_ATOMIC);
>  	if (!iod)
> +		return result;

So there's still a memory allocation for each request here.  Any reason
this can't be preallocated at least for reasonable sized I/O?

No need for GFP_ATOMIC here either, and you probably need a mempool to
guarantee forward progress.

> +	if (req->cmd_flags & REQ_DISCARD) {
>  		void *range;
>  		/*
>  		 * We reuse the small pool to allocate the 16-byte range here
> @@ -752,33 +602,53 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
>  		range = dma_pool_alloc(nvmeq->dev->prp_small_pool,
>  						GFP_ATOMIC,
>  						&iod->first_dma);
> +		if (!range)
> +			goto finish_cmd;
>  		iod_list(iod)[0] = (__le64 *)range;
>  		iod->npages = 0;
>  	} else if (psegs) {
> +		dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
> +
> +		sg_init_table(iod->sg, psegs);
> +		iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
> +		if (!iod->nents) {
> +			result = BLK_MQ_RQ_QUEUE_ERROR;
> +			goto finish_cmd;
>  		}
> +
> +		if (!dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir))
> +			goto finish_cmd;
> +
> +		if (blk_rq_bytes(req) != nvme_setup_prps(nvmeq->dev, iod,
> +						blk_rq_bytes(req), GFP_ATOMIC))
> +			goto finish_cmd;
> +	}

Would be nice to factor these two into helpers, that could also avoid
the submid_iod goto..

> +
> +	if (req->cmd_flags & REQ_DISCARD) {
> +		nvme_submit_discard(nvmeq, ns, req, iod);
> +		goto queued;
> +	}
> +
> +	if (req->cmd_flags & REQ_FLUSH) {
> +		nvme_submit_flush(nvmeq, ns, req->tag);
> +		goto queued;
>  	}
> -	return 0;
>  
> +	nvme_submit_iod(nvmeq, iod, ns);
> + queued:

A simple

	if (req->cmd_flags & REQ_DISCARD)
		nvme_submit_discard(nvmeq, ns, req, iod);
	else if if (req->cmd_flags & REQ_FLUSH)
		nvme_submit_flush(nvmeq, ns, req->tag);
	else
		nvme_submit_iod(nvmeq, iod, ns);

seems preferable here.

> +static void nvme_cancel_queue_ios(void *data, unsigned long *tag_map)
>  {
> +	struct nvme_queue *nvmeq = data;
> +	struct blk_mq_hw_ctx *hctx = nvmeq->hctx;
> +	unsigned int tag = 0;
>  
> +	tag = 0;
> +	do {
> +		struct request *req;
>  		void *ctx;
>  		nvme_completion_fn fn;
> +		struct nvme_cmd_info *cmd;
>  		static struct nvme_completion cqe = {
>  			.status = cpu_to_le16(NVME_SC_ABORT_REQ << 1),
>  		};
> +		int qdepth = nvmeq == nvmeq->dev->queues[0] ?
> +					nvmeq->dev->admin_tagset.queue_depth :
> +					nvmeq->dev->tagset.queue_depth;
>  
> +		/* zero'd bits are free tags */
> +		tag = find_next_zero_bit(tag_map, qdepth, tag);
> +		if (tag >= qdepth)
> +			break;
> +
> +		req = blk_mq_tag_to_rq(hctx->tags, tag++);
> +		cmd = blk_mq_rq_to_pdu(req);

Seems like blk-mq would make your life easier by exporting an iterator
that goes over each in-use request instead of the current
blk_mq_tag_busy_iter prototype.  blk_mq_timeout_check would also be able
to make use of that, so maybe that would be a good preparatory patch?

> +static enum blk_eh_timer_return nvme_timeout(struct request *req)
>  {
> +	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
> +	struct nvme_queue *nvmeq = cmd->nvmeq;
>  
> +	dev_warn(nvmeq->q_dmadev, "Timeout I/O %d QID %d\n", req->tag,
> +							nvmeq->qid);
> +	if (nvmeq->dev->initialized)
> +		nvme_abort_req(req);
>  
> +	return BLK_EH_RESET_TIMER;
> +}

Aborting a request but then resetting the timer looks wrong to me.

If that's indeed the intended behavior please add a comment explaining
it.

> +
> +static int nvme_alloc_admin_tags(struct nvme_dev *dev)
> +{
> +	if (!dev->admin_q) {

When would it be non-NULL?  Seems like the resume case might be the
case, but I suspect the code could be restructured to avoid even calling
nvme_alloc_admin_tags for that case.

> +static void nvme_free_admin_tags(struct nvme_dev *dev)
> +{
> +	if (dev->admin_q)
> +		blk_mq_free_tag_set(&dev->admin_tagset);
> +}

When would this be called with the admin queue not initialized?

> +static void nvme_dev_remove_admin(struct nvme_dev *dev)
> +{
> +	if (dev->admin_q && !blk_queue_dying(dev->admin_q))
> +		blk_cleanup_queue(dev->admin_q);
> +}

Same here.

next prev parent reply	other threads:[~2014-07-14 12:41 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-07-01 14:53 [PATCH v10] Convert NVMe driver to blk-mq Matias Bjørling
2014-07-01 14:53 ` Matias Bjørling
2014-07-01 14:53 ` [PATCH v10] NVMe: Convert " Matias Bjørling
2014-07-01 14:53   ` Matias Bjørling
2014-07-14 12:41   ` Christoph Hellwig [this message]
2014-07-14 12:41     ` Christoph Hellwig
2014-07-23 18:58     ` Matias Bjorling
2014-07-23 18:58       ` Matias Bjorling
2014-07-23 19:07       ` Jens Axboe
2014-07-23 19:07         ` Jens Axboe

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20140714124131.GA311@infradead.org \
    --to=hch@infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.