From: Klaus Jensen <its@irrelevant.dk>
To: qemu-devel@nongnu.org
Cc: Kevin Wolf <kwolf@redhat.com>,
qemu-block@nongnu.org, Klaus Jensen <k.jensen@samsung.com>,
Max Reitz <mreitz@redhat.com>, Keith Busch <kbusch@kernel.org>,
Klaus Jensen <its@irrelevant.dk>
Subject: [PATCH v4 2/2] hw/block/nvme: add the dataset management command
Date: Thu, 22 Oct 2020 09:33:13 +0200 [thread overview]
Message-ID: <20201022073313.143794-3-its@irrelevant.dk> (raw)
In-Reply-To: <20201022073313.143794-1-its@irrelevant.dk>
From: Klaus Jensen <k.jensen@samsung.com>
Add support for the Dataset Management command and the Deallocate
attribute. Deallocation results in discards being sent to the underlying
block device. Whether of not the blocks are actually deallocated is
affected by the same factors as Write Zeroes (see previous commit).
format | discard | dsm (512b) dsm (4kb) dsm (64kb)
------------------------------------------------------
qcow2 ignore n n n
qcow2 unmap n n y
raw ignore n n n
raw unmap n y y
Again, a raw format and 4kb LBAs are preferable.
In order to set the Namespace Preferred Deallocate Granularity and
Alignment fields (NPDG and NPDA), choose a sane minimum discard
granularity of 4kb. If we are using a passthru device supporting discard
at a 512b granularity, user should set the discard_granularity property
explicitly. NPDG and NPDA will also account for the cluster_size of the
block driver if required (i.e. for QCOW2).
See NVM Express 1.3d, Section 6.7 ("Dataset Management command").
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
hw/block/nvme.h | 2 +
include/block/nvme.h | 7 ++-
hw/block/nvme-ns.c | 36 ++++++++++++++--
hw/block/nvme.c | 100 ++++++++++++++++++++++++++++++++++++++++++-
4 files changed, 139 insertions(+), 6 deletions(-)
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index e080a2318a50..574333caa3f9 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -28,6 +28,7 @@ typedef struct NvmeRequest {
struct NvmeNamespace *ns;
BlockAIOCB *aiocb;
uint16_t status;
+ void *opaque;
NvmeCqe cqe;
NvmeCmd cmd;
BlockAcctCookie acct;
@@ -60,6 +61,7 @@ static inline const char *nvme_io_opc_str(uint8_t opc)
case NVME_CMD_WRITE: return "NVME_NVM_CMD_WRITE";
case NVME_CMD_READ: return "NVME_NVM_CMD_READ";
case NVME_CMD_WRITE_ZEROES: return "NVME_NVM_CMD_WRITE_ZEROES";
+ case NVME_CMD_DSM: return "NVME_NVM_CMD_DSM";
default: return "NVME_NVM_CMD_UNKNOWN";
}
}
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 966c3bb304bd..e95ff6ca9b37 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -990,7 +990,12 @@ typedef struct QEMU_PACKED NvmeIdNs {
uint16_t nabspf;
uint16_t noiob;
uint8_t nvmcap[16];
- uint8_t rsvd64[40];
+ uint16_t npwg;
+ uint16_t npwa;
+ uint16_t npdg;
+ uint16_t npda;
+ uint16_t nows;
+ uint8_t rsvd74[30];
uint8_t nguid[16];
uint64_t eui64;
NvmeLBAF lbaf[16];
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index f1cc734c60f5..840651db7256 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -28,10 +28,14 @@
#include "nvme.h"
#include "nvme-ns.h"
-static void nvme_ns_init(NvmeNamespace *ns)
+#define MIN_DISCARD_GRANULARITY (4 * KiB)
+
+static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
{
+ BlockDriverInfo bdi;
NvmeIdNs *id_ns = &ns->id_ns;
int lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
+ int npdg, ret;
ns->id_ns.dlfeat = 0x9;
@@ -43,8 +47,25 @@ static void nvme_ns_init(NvmeNamespace *ns)
id_ns->ncap = id_ns->nsze;
id_ns->nuse = id_ns->ncap;
- /* support DULBE */
- id_ns->nsfeat |= 0x4;
+ /* support DULBE and I/O optimization fields */
+ id_ns->nsfeat |= (0x4 | 0x10);
+
+ npdg = ns->blkconf.discard_granularity / ns->blkconf.logical_block_size;
+
+ ret = bdrv_get_info(blk_bs(ns->blkconf.blk), &bdi);
+ if (ret < 0) {
+ error_setg_errno(errp, -ret, "could not get block driver info");
+ return ret;
+ }
+
+ if (bdi.cluster_size &&
+ bdi.cluster_size > ns->blkconf.discard_granularity) {
+ npdg = bdi.cluster_size / ns->blkconf.logical_block_size;
+ }
+
+ id_ns->npda = id_ns->npdg = npdg - 1;
+
+ return 0;
}
static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
@@ -59,6 +80,11 @@ static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
return -1;
}
+ if (ns->blkconf.discard_granularity == -1) {
+ ns->blkconf.discard_granularity =
+ MAX(ns->blkconf.logical_block_size, MIN_DISCARD_GRANULARITY);
+ }
+
ns->size = blk_getlength(ns->blkconf.blk);
if (ns->size < 0) {
error_setg_errno(errp, -ns->size, "could not get blockdev size");
@@ -92,7 +118,9 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
return -1;
}
- nvme_ns_init(ns);
+ if (nvme_ns_init(ns, errp)) {
+ return -1;
+ }
if (nvme_register_namespace(n, ns, errp)) {
return -1;
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 68fa5aa0b1c0..8b3f4d24968b 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -959,6 +959,102 @@ static void nvme_rw_cb(void *opaque, int ret)
nvme_enqueue_req_completion(nvme_cq(req), req);
}
+static void nvme_aio_discard_cb(void *opaque, int ret)
+{
+ NvmeRequest *req = opaque;
+ int *discards = req->opaque;
+
+ trace_pci_nvme_aio_discard_cb(nvme_cid(req));
+
+ if (ret) {
+ req->status = NVME_INTERNAL_DEV_ERROR;
+ trace_pci_nvme_err_aio(nvme_cid(req), strerror(ret),
+ req->status);
+ }
+
+ if (discards && --(*discards) > 0) {
+ return;
+ }
+
+ g_free(req->opaque);
+ req->opaque = NULL;
+
+ nvme_enqueue_req_completion(nvme_cq(req), req);
+}
+
+static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
+{
+ NvmeNamespace *ns = req->ns;
+ NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
+ NvmeDsmRange *range = NULL;
+ int *discards = NULL;
+
+ uint32_t attr = le32_to_cpu(dsm->attributes);
+ uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
+
+ uint16_t status = NVME_SUCCESS;
+
+ trace_pci_nvme_dsm(nvme_cid(req), nvme_nsid(ns), nr, attr);
+
+ if (attr & NVME_DSMGMT_AD) {
+ int64_t offset;
+ size_t len;
+
+ range = g_new(NvmeDsmRange, nr);
+
+ status = nvme_dma(n, (uint8_t *)range, nr * sizeof(NvmeDsmRange),
+ DMA_DIRECTION_TO_DEVICE, req);
+ if (status) {
+ goto out;
+ }
+
+ discards = g_new(int, 1);
+ *discards = 1;
+ req->opaque = discards;
+
+ for (int i = 0; i < nr; i++) {
+ uint64_t slba = le64_to_cpu(range[i].slba);
+ uint32_t nlb = le32_to_cpu(range[i].nlb);
+
+ if (nvme_check_bounds(n, ns, slba, nlb)) {
+ trace_pci_nvme_err_invalid_lba_range(slba, nlb,
+ ns->id_ns.nsze);
+ continue;
+ }
+
+ trace_pci_nvme_dsm_deallocate(nvme_cid(req), nvme_nsid(ns), slba,
+ nlb);
+
+ offset = nvme_l2b(ns, slba);
+ len = nvme_l2b(ns, nlb);
+
+ while (len) {
+ size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len);
+
+ (*discards)++;
+
+ blk_aio_pdiscard(ns->blkconf.blk, offset, bytes,
+ nvme_aio_discard_cb, req);
+
+ offset += bytes;
+ len -= bytes;
+ }
+ }
+
+ if (--(*discards)) {
+ status = NVME_NO_COMPLETE;
+ } else {
+ g_free(discards);
+ req->opaque = NULL;
+ }
+ }
+
+out:
+ g_free(range);
+
+ return status;
+}
+
static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
{
block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0,
@@ -1088,6 +1184,8 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
case NVME_CMD_WRITE:
case NVME_CMD_READ:
return nvme_rw(n, req);
+ case NVME_CMD_DSM:
+ return nvme_dsm(n, req);
default:
trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
return NVME_INVALID_OPCODE | NVME_DNR;
@@ -2811,7 +2909,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
id->cqes = (0x4 << 4) | 0x4;
id->nn = cpu_to_le32(n->num_namespaces);
id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
- NVME_ONCS_FEATURES);
+ NVME_ONCS_FEATURES | NVME_ONCS_DSM);
id->vwc = 0x1;
id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
--
2.28.0
next prev parent reply other threads:[~2020-10-22 7:51 UTC|newest]
Thread overview: 7+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-10-22 7:33 [PATCH v4 0/2] hw/block/nvme: dulbe and dsm support Klaus Jensen
2020-10-22 7:33 ` [PATCH v4 1/2] hw/block/nvme: add dulbe support Klaus Jensen
2020-10-22 7:33 ` Klaus Jensen [this message]
2020-10-22 15:01 ` [PATCH v4 2/2] hw/block/nvme: add the dataset management command Keith Busch
2020-10-22 17:43 ` Klaus Jensen
2020-10-22 17:50 ` Keith Busch
2020-10-22 18:43 ` Klaus Jensen
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20201022073313.143794-3-its@irrelevant.dk \
--to=its@irrelevant.dk \
--cc=k.jensen@samsung.com \
--cc=kbusch@kernel.org \
--cc=kwolf@redhat.com \
--cc=mreitz@redhat.com \
--cc=qemu-block@nongnu.org \
--cc=qemu-devel@nongnu.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).