* [PATCH RFC] nvme: rewrite discard support
@ 2016-03-16 23:15 Ming Lin
2016-03-17 15:33 ` Keith Busch
0 siblings, 1 reply; 7+ messages in thread
From: Ming Lin @ 2016-03-16 23:15 UTC (permalink / raw)
From: Ming Lin <ming.l@ssi.samsung.com>
This rewrites nvme_setup_discard() similar as sd_setup_discard_cmnd().
And moves it to common code so fabrics driver can also use it.
Signed-off-by: Ming Lin <ming.l at ssi.samsung.com>
---
drivers/nvme/host/nvme.h | 26 ++++++++++++++++++++
drivers/nvme/host/pci.c | 64 +++++++++++++++++++++---------------------------
2 files changed, 54 insertions(+), 36 deletions(-)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 75982b9..c757746 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -231,6 +231,32 @@ static inline void nvme_setup_rw(struct nvme_ns *ns, struct request *req,
cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
}
+static inline int nvme_setup_discard(struct nvme_ns *ns,
+ struct request *req, struct nvme_command *cmnd)
+{
+ struct nvme_dsm_range *range;
+ struct page *page;
+
+ page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
+ if (!page)
+ return BLKPREP_DEFER;
+
+ range = page_address(page);
+ range->cattr = cpu_to_le32(0);
+ range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift);
+ range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
+
+ memset(cmnd, 0, sizeof(*cmnd));
+ cmnd->dsm.opcode = nvme_cmd_dsm;
+ cmnd->dsm.nsid = cpu_to_le32(ns->ns_id);
+ cmnd->dsm.nr = 0;
+ cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
+
+ req->completion_data = page;
+ blk_add_request_payload(req, page, sizeof(struct nvme_dsm_range));
+
+ return 0;
+}
static inline int nvme_error_status(u16 status)
{
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 4301584..eb593e6 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -382,6 +382,9 @@ static void nvme_free_iod(struct nvme_dev *dev, struct request *req)
__le64 **list = iod_list(req);
dma_addr_t prp_dma = iod->first_dma;
+ if (req->cmd_flags & REQ_DISCARD)
+ __free_page(req->completion_data);
+
if (iod->npages == 0)
dma_pool_free(dev->prp_small_pool, list[0], prp_dma);
for (i = 0; i < iod->npages; i++) {
@@ -610,37 +613,6 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
}
/*
- * We reuse the small pool to allocate the 16-byte range here as it is not
- * worth having a special pool for these or additional cases to handle freeing
- * the iod.
- */
-static int nvme_setup_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
- struct request *req, struct nvme_command *cmnd)
-{
- struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- struct nvme_dsm_range *range;
-
- range = dma_pool_alloc(nvmeq->dev->prp_small_pool, GFP_ATOMIC,
- &iod->first_dma);
- if (!range)
- return BLK_MQ_RQ_QUEUE_BUSY;
- iod_list(req)[0] = (__le64 *)range;
- iod->npages = 0;
-
- range->cattr = cpu_to_le32(0);
- range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift);
- range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
-
- memset(cmnd, 0, sizeof(*cmnd));
- cmnd->dsm.opcode = nvme_cmd_dsm;
- cmnd->dsm.nsid = cpu_to_le32(ns->ns_id);
- cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma);
- cmnd->dsm.nr = 0;
- cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
- return BLK_MQ_RQ_QUEUE_OK;
-}
-
-/*
* NOTE: ns is NULL when called on the admin queue.
*/
static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
@@ -671,7 +643,27 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
return ret;
if (req->cmd_flags & REQ_DISCARD) {
- ret = nvme_setup_discard(nvmeq, ns, req, &cmnd);
+ unsigned int nr_bytes = blk_rq_bytes(req);
+
+ ret = nvme_setup_discard(ns, req, &cmnd);
+ if (ret)
+ goto out;
+ ret = nvme_map_data(dev, req, &cmnd);
+ if (ret) {
+ __free_page(req->completion_data);
+ goto out;
+ }
+
+ /*
+ * Initially __data_len is set to the amount of data that needs
+ * to be transferred to the namespace. This amount depends on
+ * whether DISCARD is being used. After the scatterlist has been
+ * mapped by nvme_map_data() we set __data_len to the size of
+ * the area to be discarded on disk. This allows us to report
+ * completion on the full amount of blocks described by the
+ * request.
+ */
+ req->__data_len = nr_bytes;
} else {
if (req->cmd_type == REQ_TYPE_DRV_PRIV)
memcpy(&cmnd, req->cmd, sizeof(cmnd));
@@ -680,13 +672,13 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
else
nvme_setup_rw(ns, req, &cmnd);
- if (req->nr_phys_segments)
+ if (req->nr_phys_segments) {
ret = nvme_map_data(dev, req, &cmnd);
+ if (ret)
+ goto out;
+ }
}
- if (ret)
- goto out;
-
cmnd.common.command_id = req->tag;
blk_mq_start_request(req);
--
1.9.1
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH RFC] nvme: rewrite discard support
2016-03-16 23:15 [PATCH RFC] nvme: rewrite discard support Ming Lin
@ 2016-03-17 15:33 ` Keith Busch
2016-03-17 17:27 ` Ming Lin
2016-03-21 14:47 ` Christoph Hellwig
0 siblings, 2 replies; 7+ messages in thread
From: Keith Busch @ 2016-03-17 15:33 UTC (permalink / raw)
On Wed, Mar 16, 2016@04:15:32PM -0700, Ming Lin wrote:
> From: Ming Lin <ming.l at ssi.samsung.com>
>
> This rewrites nvme_setup_discard() similar as sd_setup_discard_cmnd().
> And moves it to common code so fabrics driver can also use it.
Initial thoughts on this, I see the appeal to having a generic NVMe
discard setup, but I liked the existing implementation not needing a
special case for discard check on every IO in the completion handler.
Allocating a page for just 16 bytes of data seems a bit wasteful even
if it is short-lived. When it was allocating 256b from the DMA pool,
the smaller amount of additional data seemed worth it for not creating
a conditional completion.
^ permalink raw reply [flat|nested] 7+ messages in thread
* [PATCH RFC] nvme: rewrite discard support
2016-03-17 15:33 ` Keith Busch
@ 2016-03-17 17:27 ` Ming Lin
2016-03-21 14:47 ` Christoph Hellwig
1 sibling, 0 replies; 7+ messages in thread
From: Ming Lin @ 2016-03-17 17:27 UTC (permalink / raw)
On Thu, 2016-03-17@15:33 +0000, Keith Busch wrote:
> On Wed, Mar 16, 2016@04:15:32PM -0700, Ming Lin wrote:
> > From: Ming Lin <ming.l at ssi.samsung.com>
> >
> > This rewrites nvme_setup_discard() similar as sd_setup_discard_cmnd().
> > And moves it to common code so fabrics driver can also use it.
>
> Initial thoughts on this, I see the appeal to having a generic NVMe
> discard setup, but I liked the existing implementation not needing a
> special case for discard check on every IO in the completion handler.
>
> Allocating a page for just 16 bytes of data seems a bit wasteful even
> if it is short-lived. When it was allocating 256b from the DMA pool,
> the smaller amount of additional data seemed worth it for not creating
> a conditional completion.
How about moving the prp_small_pool to the core(and rename it to
nvme_small_pool)?
Then DISCARD can also use this pool. So we don't need a special case
check in the completion handler.
^ permalink raw reply [flat|nested] 7+ messages in thread
* [PATCH RFC] nvme: rewrite discard support
2016-03-17 15:33 ` Keith Busch
2016-03-17 17:27 ` Ming Lin
@ 2016-03-21 14:47 ` Christoph Hellwig
2016-03-21 15:51 ` Keith Busch
1 sibling, 1 reply; 7+ messages in thread
From: Christoph Hellwig @ 2016-03-21 14:47 UTC (permalink / raw)
On Thu, Mar 17, 2016@03:33:35PM +0000, Keith Busch wrote:
> Initial thoughts on this, I see the appeal to having a generic NVMe
> discard setup, but I liked the existing implementation not needing a
> special case for discard check on every IO in the completion handler.
This looks fine now, although if we'd ever support SGLs for PCIe
it wouldn't work anymore either way.
> Allocating a page for just 16 bytes of data seems a bit wasteful even
> if it is short-lived. When it was allocating 256b from the DMA pool,
> the smaller amount of additional data seemed worth it for not creating
> a conditional completion.
I think we should just do a 16 byte (or rather cache line size) kmalloc,
as there is no need for a mempool for discards. Would a version of
that be fine with you? Because discard is currently the only thing
preventing us from having a split between transport and commanset specific
setup for NVMe. And as things like Fabrics and LighNVM show I think
a clean split there would be rather benefitical in the long run.
^ permalink raw reply [flat|nested] 7+ messages in thread
* [PATCH RFC] nvme: rewrite discard support
2016-03-21 14:47 ` Christoph Hellwig
@ 2016-03-21 15:51 ` Keith Busch
2016-03-21 15:57 ` Christoph Hellwig
0 siblings, 1 reply; 7+ messages in thread
From: Keith Busch @ 2016-03-21 15:51 UTC (permalink / raw)
On Mon, Mar 21, 2016@03:47:59PM +0100, Christoph Hellwig wrote:
> I think we should just do a 16 byte (or rather cache line size) kmalloc,
> as there is no need for a mempool for discards. Would a version of
> that be fine with you? Because discard is currently the only thing
> preventing us from having a split between transport and commanset specific
> setup for NVMe. And as things like Fabrics and LighNVM show I think
> a clean split there would be rather benefitical in the long run.
We only used the mempool for the driver to leverage normal IO clean up
so that discard isn't special.
It looks like SCSI's special handling lead to what the code comments
describe as a "horrible hack", so that's not convincing this is a good
example to follow.
^ permalink raw reply [flat|nested] 7+ messages in thread
* [PATCH RFC] nvme: rewrite discard support
2016-03-21 15:51 ` Keith Busch
@ 2016-03-21 15:57 ` Christoph Hellwig
2016-03-21 16:29 ` Keith Busch
0 siblings, 1 reply; 7+ messages in thread
From: Christoph Hellwig @ 2016-03-21 15:57 UTC (permalink / raw)
On Mon, Mar 21, 2016@03:51:39PM +0000, Keith Busch wrote:
> It looks like SCSI's special handling lead to what the code comments
> describe as a "horrible hack", so that's not convincing this is a good
> example to follow.
Nah, the "horrible hack" is adding a payload to the request. We went
over and over this and everyone but Kent agrees it's the least horrible
option to handle discards in generic code. NVMe side steps this a bit
by not trying to handle this in common code, but that only works as
long as it's a single tightly integrated driver, not for a layer
architecture like SCSI, or what NVMe is increasingly moving to.
^ permalink raw reply [flat|nested] 7+ messages in thread
* [PATCH RFC] nvme: rewrite discard support
2016-03-21 15:57 ` Christoph Hellwig
@ 2016-03-21 16:29 ` Keith Busch
0 siblings, 0 replies; 7+ messages in thread
From: Keith Busch @ 2016-03-21 16:29 UTC (permalink / raw)
On Mon, Mar 21, 2016@04:57:36PM +0100, Christoph Hellwig wrote:
> On Mon, Mar 21, 2016@03:51:39PM +0000, Keith Busch wrote:
> > It looks like SCSI's special handling lead to what the code comments
> > describe as a "horrible hack", so that's not convincing this is a good
> > example to follow.
>
> Nah, the "horrible hack" is adding a payload to the request. We went
> over and over this and everyone but Kent agrees it's the least horrible
> option to handle discards in generic code. NVMe side steps this a bit
> by not trying to handle this in common code, but that only works as
> long as it's a single tightly integrated driver, not for a layer
> architecture like SCSI, or what NVMe is increasingly moving to.
Fair enough. Every discard payload is specific to the protocol, so
understood on the need. General consensus prevails.
Let's see the next version. It'd be nice if we can get away with
allocating only the necessary amount for the payload.
Final comment on the original patch: it shouldn't return BLKPREP_DEFER
when out of memory as it's not applicable for blk-mq drivers.
^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2016-03-21 16:29 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2016-03-16 23:15 [PATCH RFC] nvme: rewrite discard support Ming Lin
2016-03-17 15:33 ` Keith Busch
2016-03-17 17:27 ` Ming Lin
2016-03-21 14:47 ` Christoph Hellwig
2016-03-21 15:51 ` Keith Busch
2016-03-21 15:57 ` Christoph Hellwig
2016-03-21 16:29 ` Keith Busch
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.