From: Christoph Hellwig <hch@infradead.org>
To: Matias Bj??rling <m@bjorling.me>
Cc: willy@linux.intel.com, keith.busch@intel.com,
sbradshaw@micron.com, axboe@fb.com, tom.leiming@gmail.com,
hch@infradead.org, rlnelson@google.com,
linux-kernel@vger.kernel.org, linux-nvme@lists.infradead.org
Subject: Re: [PATCH v10] NVMe: Convert to blk-mq
Date: Mon, 14 Jul 2014 05:41:31 -0700 [thread overview]
Message-ID: <20140714124131.GA311@infradead.org> (raw)
In-Reply-To: <1404226382-7179-2-git-send-email-m@bjorling.me>
> +static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
> + unsigned int hctx_idx)
> + struct nvme_queue *nvmeq = dev->queues[
> + (hctx_idx % dev->queue_count) + 1];
> +
> + /* nvmeq queues are shared between namespaces. We assume here that
> + * blk-mq map the tags so they match up with the nvme queue tags */
> + if (!nvmeq->hctx)
> + nvmeq->hctx = hctx;
> + else
> + WARN_ON(nvmeq->hctx->tags != hctx->tags);
This wrong to me, as you're overwriting the value of nvmeq->hctx for each
new requeust_queue. But nothing but ->tagsis ever used from nvmeq->hctx,
so you shold rather set up nvmeq->tags in nvme_dev_add.
> +static int nvme_init_request(void *data, struct request *req,
> + unsigned int hctx_idx, unsigned int rq_idx,
> + unsigned int numa_node)
> +{
> + struct nvme_dev *dev = data;
> + struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
> + struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];
> +
> + WARN_ON(!nvmeq);
> + cmd->nvmeq = nvmeq;
Shouldn't this fail instead of the warn_on?
> +static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
> {
> + struct nvme_ns *ns = hctx->queue->queuedata;
> + struct nvme_queue *nvmeq = hctx->driver_data;
> + struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
> struct nvme_iod *iod;
> + enum dma_data_direction dma_dir;
> + int psegs = req->nr_phys_segments;
> + int result = BLK_MQ_RQ_QUEUE_BUSY;
> + /*
> + * Requeued IO has already been prepped
> + */
> + iod = req->special;
> + if (iod)
> + goto submit_iod;
>
> + iod = nvme_alloc_iod(psegs, blk_rq_bytes(req), GFP_ATOMIC);
> if (!iod)
> + return result;
So there's still a memory allocation for each request here. Any reason
this can't be preallocated at least for reasonable sized I/O?
No need for GFP_ATOMIC here either, and you probably need a mempool to
guarantee forward progress.
> + if (req->cmd_flags & REQ_DISCARD) {
> void *range;
> /*
> * We reuse the small pool to allocate the 16-byte range here
> @@ -752,33 +602,53 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
> range = dma_pool_alloc(nvmeq->dev->prp_small_pool,
> GFP_ATOMIC,
> &iod->first_dma);
> + if (!range)
> + goto finish_cmd;
> iod_list(iod)[0] = (__le64 *)range;
> iod->npages = 0;
> } else if (psegs) {
> + dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
> +
> + sg_init_table(iod->sg, psegs);
> + iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
> + if (!iod->nents) {
> + result = BLK_MQ_RQ_QUEUE_ERROR;
> + goto finish_cmd;
> }
> +
> + if (!dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir))
> + goto finish_cmd;
> +
> + if (blk_rq_bytes(req) != nvme_setup_prps(nvmeq->dev, iod,
> + blk_rq_bytes(req), GFP_ATOMIC))
> + goto finish_cmd;
> + }
Would be nice to factor these two into helpers, that could also avoid
the submid_iod goto..
> +
> + if (req->cmd_flags & REQ_DISCARD) {
> + nvme_submit_discard(nvmeq, ns, req, iod);
> + goto queued;
> + }
> +
> + if (req->cmd_flags & REQ_FLUSH) {
> + nvme_submit_flush(nvmeq, ns, req->tag);
> + goto queued;
> }
> - return 0;
>
> + nvme_submit_iod(nvmeq, iod, ns);
> + queued:
A simple
if (req->cmd_flags & REQ_DISCARD)
nvme_submit_discard(nvmeq, ns, req, iod);
else if if (req->cmd_flags & REQ_FLUSH)
nvme_submit_flush(nvmeq, ns, req->tag);
else
nvme_submit_iod(nvmeq, iod, ns);
seems preferable here.
> +static void nvme_cancel_queue_ios(void *data, unsigned long *tag_map)
> {
> + struct nvme_queue *nvmeq = data;
> + struct blk_mq_hw_ctx *hctx = nvmeq->hctx;
> + unsigned int tag = 0;
>
> + tag = 0;
> + do {
> + struct request *req;
> void *ctx;
> nvme_completion_fn fn;
> + struct nvme_cmd_info *cmd;
> static struct nvme_completion cqe = {
> .status = cpu_to_le16(NVME_SC_ABORT_REQ << 1),
> };
> + int qdepth = nvmeq == nvmeq->dev->queues[0] ?
> + nvmeq->dev->admin_tagset.queue_depth :
> + nvmeq->dev->tagset.queue_depth;
>
> + /* zero'd bits are free tags */
> + tag = find_next_zero_bit(tag_map, qdepth, tag);
> + if (tag >= qdepth)
> + break;
> +
> + req = blk_mq_tag_to_rq(hctx->tags, tag++);
> + cmd = blk_mq_rq_to_pdu(req);
Seems like blk-mq would make your life easier by exporting an iterator
that goes over each in-use request instead of the current
blk_mq_tag_busy_iter prototype. blk_mq_timeout_check would also be able
to make use of that, so maybe that would be a good preparatory patch?
> +static enum blk_eh_timer_return nvme_timeout(struct request *req)
> {
> + struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
> + struct nvme_queue *nvmeq = cmd->nvmeq;
>
> + dev_warn(nvmeq->q_dmadev, "Timeout I/O %d QID %d\n", req->tag,
> + nvmeq->qid);
> + if (nvmeq->dev->initialized)
> + nvme_abort_req(req);
>
> + return BLK_EH_RESET_TIMER;
> +}
Aborting a request but then resetting the timer looks wrong to me.
If that's indeed the intended behavior please add a comment explaining
it.
> +
> +static int nvme_alloc_admin_tags(struct nvme_dev *dev)
> +{
> + if (!dev->admin_q) {
When would it be non-NULL? Seems like the resume case might be the
case, but I suspect the code could be restructured to avoid even calling
nvme_alloc_admin_tags for that case.
> +static void nvme_free_admin_tags(struct nvme_dev *dev)
> +{
> + if (dev->admin_q)
> + blk_mq_free_tag_set(&dev->admin_tagset);
> +}
When would this be called with the admin queue not initialized?
> +static void nvme_dev_remove_admin(struct nvme_dev *dev)
> +{
> + if (dev->admin_q && !blk_queue_dying(dev->admin_q))
> + blk_cleanup_queue(dev->admin_q);
> +}
Same here.
next prev parent reply other threads:[~2014-07-14 12:41 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
2014-07-01 14:53 [PATCH v10] Convert NVMe driver to blk-mq Matias Bjørling
2014-07-01 14:53 ` [PATCH v10] NVMe: Convert " Matias Bjørling
2014-07-14 12:41 ` Christoph Hellwig [this message]
2014-07-23 18:58 ` Matias Bjorling
2014-07-23 19:07 ` Jens Axboe
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20140714124131.GA311@infradead.org \
--to=hch@infradead.org \
--cc=axboe@fb.com \
--cc=keith.busch@intel.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-nvme@lists.infradead.org \
--cc=m@bjorling.me \
--cc=rlnelson@google.com \
--cc=sbradshaw@micron.com \
--cc=tom.leiming@gmail.com \
--cc=willy@linux.intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox