From: Klaus Jensen <its@irrelevant.dk>
To: qemu-devel@nongnu.org, Peter Maydell <peter.maydell@linaro.org>
Cc: Fam Zheng <fam@euphon.net>, Kevin Wolf <kwolf@redhat.com>,
qemu-block@nongnu.org, Klaus Jensen <k.jensen@samsung.com>,
Max Reitz <mreitz@redhat.com>, Klaus Jensen <its@irrelevant.dk>,
Stefan Hajnoczi <stefanha@redhat.com>,
Keith Busch <kbusch@kernel.org>
Subject: [PULL 03/56] hw/block/nvme: add dulbe support
Date: Tue, 9 Feb 2021 08:30:08 +0100 [thread overview]
Message-ID: <20210209073101.548811-4-its@irrelevant.dk> (raw)
In-Reply-To: <20210209073101.548811-1-its@irrelevant.dk>
From: Klaus Jensen <k.jensen@samsung.com>
Add support for reporting the Deallocated or Unwritten Logical Block
Error (DULBE).
Rely on the block status flags reported by the block layer and consider
any block with the BDRV_BLOCK_ZERO flag to be deallocated.
Multiple factors affect when a Write Zeroes command result in
deallocation of blocks.
* the underlying file system block size
* the blockdev format
* the 'discard' and 'logical_block_size' parameters
format | discard | wz (512B) wz (4KiB) wz (64KiB)
-----------------------------------------------------
qcow2 ignore n n y
qcow2 unmap n n y
raw ignore n y y
raw unmap n y y
So, this works best with an image in raw format and 4KiB LBAs, since
holes can then be punched on a per-block basis (this assumes a file
system with a 4kb block size, YMMV). A qcow2 image, uses a cluster size
of 64KiB by default and blocks will only be marked deallocated if a full
cluster is zeroed or discarded. However, this *is* consistent with the
spec since Write Zeroes "should" deallocate the block if the Deallocate
attribute is set and "may" deallocate if the Deallocate attribute is not
set. Thus, we always try to deallocate (the BDRV_REQ_MAY_UNMAP flag is
always set).
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
---
hw/block/nvme-ns.h | 4 ++
include/block/nvme.h | 5 +++
hw/block/nvme-ns.c | 8 ++--
hw/block/nvme.c | 91 ++++++++++++++++++++++++++++++++++++++++++-
hw/block/trace-events | 4 ++
5 files changed, 107 insertions(+), 5 deletions(-)
diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 83734f4606e1..44bf6271b744 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -31,6 +31,10 @@ typedef struct NvmeNamespace {
NvmeIdNs id_ns;
NvmeNamespaceParams params;
+
+ struct {
+ uint32_t err_rec;
+ } features;
} NvmeNamespace;
static inline uint32_t nvme_nsid(NvmeNamespace *ns)
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 3e02d9ca9843..b663d11e60c1 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -693,6 +693,7 @@ enum NvmeStatusCodes {
NVME_E2E_REF_ERROR = 0x0284,
NVME_CMP_FAILURE = 0x0285,
NVME_ACCESS_DENIED = 0x0286,
+ NVME_DULB = 0x0287,
NVME_MORE = 0x2000,
NVME_DNR = 0x4000,
NVME_NO_COMPLETE = 0xffff,
@@ -909,6 +910,9 @@ enum NvmeIdCtrlLpa {
#define NVME_AEC_NS_ATTR(aec) ((aec >> 8) & 0x1)
#define NVME_AEC_FW_ACTIVATION(aec) ((aec >> 9) & 0x1)
+#define NVME_ERR_REC_TLER(err_rec) (err_rec & 0xffff)
+#define NVME_ERR_REC_DULBE(err_rec) (err_rec & 0x10000)
+
enum NvmeFeatureIds {
NVME_ARBITRATION = 0x1,
NVME_POWER_MANAGEMENT = 0x2,
@@ -1029,6 +1033,7 @@ enum NvmeNsIdentifierType {
#define NVME_ID_NS_NSFEAT_THIN(nsfeat) ((nsfeat & 0x1))
+#define NVME_ID_NS_NSFEAT_DULBE(nsfeat) ((nsfeat >> 2) & 0x1)
#define NVME_ID_NS_FLBAS_EXTENDED(flbas) ((flbas >> 4) & 0x1)
#define NVME_ID_NS_FLBAS_INDEX(flbas) ((flbas & 0xf))
#define NVME_ID_NS_MC_SEPARATE(mc) ((mc >> 1) & 0x1)
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 2670787d2630..53ded460348e 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -33,9 +33,7 @@ static void nvme_ns_init(NvmeNamespace *ns)
NvmeIdNs *id_ns = &ns->id_ns;
int lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
- if (blk_get_flags(ns->blkconf.blk) & BDRV_O_UNMAP) {
- ns->id_ns.dlfeat = 0x9;
- }
+ ns->id_ns.dlfeat = 0x9;
id_ns->lbaf[lba_index].ds = 31 - clz32(ns->blkconf.logical_block_size);
@@ -44,6 +42,9 @@ static void nvme_ns_init(NvmeNamespace *ns)
/* no thin provisioning */
id_ns->ncap = id_ns->nsze;
id_ns->nuse = id_ns->ncap;
+
+ /* support DULBE */
+ id_ns->nsfeat |= 0x4;
}
static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
@@ -93,6 +94,7 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
}
nvme_ns_init(ns);
+
if (nvme_register_namespace(n, ns, errp)) {
return -1;
}
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index a245ff8ceb2c..6e6bdb338ad7 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -105,6 +105,7 @@ static const bool nvme_feature_support[NVME_FID_MAX] = {
static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
[NVME_TEMPERATURE_THRESHOLD] = NVME_FEAT_CAP_CHANGE,
+ [NVME_ERROR_RECOVERY] = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
[NVME_VOLATILE_WRITE_CACHE] = NVME_FEAT_CAP_CHANGE,
[NVME_NUMBER_OF_QUEUES] = NVME_FEAT_CAP_CHANGE,
[NVME_ASYNCHRONOUS_EVENT_CONF] = NVME_FEAT_CAP_CHANGE,
@@ -878,6 +879,49 @@ static inline uint16_t nvme_check_bounds(NvmeNamespace *ns, uint64_t slba,
return NVME_SUCCESS;
}
+static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
+ uint32_t nlb)
+{
+ BlockDriverState *bs = blk_bs(ns->blkconf.blk);
+
+ int64_t pnum = 0, bytes = nvme_l2b(ns, nlb);
+ int64_t offset = nvme_l2b(ns, slba);
+ bool zeroed;
+ int ret;
+
+ Error *local_err = NULL;
+
+ /*
+ * `pnum` holds the number of bytes after offset that shares the same
+ * allocation status as the byte at offset. If `pnum` is different from
+ * `bytes`, we should check the allocation status of the next range and
+ * continue this until all bytes have been checked.
+ */
+ do {
+ bytes -= pnum;
+
+ ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL);
+ if (ret < 0) {
+ error_setg_errno(&local_err, -ret, "unable to get block status");
+ error_report_err(local_err);
+
+ return NVME_INTERNAL_DEV_ERROR;
+ }
+
+ zeroed = !!(ret & BDRV_BLOCK_ZERO);
+
+ trace_pci_nvme_block_status(offset, bytes, pnum, ret, zeroed);
+
+ if (zeroed) {
+ return NVME_DULB;
+ }
+
+ offset += pnum;
+ } while (pnum != bytes);
+
+ return NVME_SUCCESS;
+}
+
static void nvme_aio_err(NvmeRequest *req, int ret)
{
uint16_t status = NVME_SUCCESS;
@@ -996,6 +1040,15 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req)
goto invalid;
}
+ if (acct == BLOCK_ACCT_READ) {
+ if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
+ status = nvme_check_dulbe(ns, slba, nlb);
+ if (status) {
+ goto invalid;
+ }
+ }
+ }
+
status = nvme_map_dptr(n, data_size, req);
if (status) {
goto invalid;
@@ -1641,6 +1694,7 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
uint8_t fid = NVME_GETSETFEAT_FID(dw10);
NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10);
uint16_t iv;
+ NvmeNamespace *ns;
static const uint32_t nvme_feature_default[NVME_FID_MAX] = {
[NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT,
@@ -1703,6 +1757,18 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
}
return NVME_INVALID_FIELD | NVME_DNR;
+ case NVME_ERROR_RECOVERY:
+ if (!nvme_nsid_valid(n, nsid)) {
+ return NVME_INVALID_NSID | NVME_DNR;
+ }
+
+ ns = nvme_ns(n, nsid);
+ if (unlikely(!ns)) {
+ return NVME_INVALID_FIELD | NVME_DNR;
+ }
+
+ result = ns->features.err_rec;
+ goto out;
case NVME_VOLATILE_WRITE_CACHE:
result = n->features.vwc;
trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
@@ -1775,7 +1841,7 @@ static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
{
- NvmeNamespace *ns;
+ NvmeNamespace *ns = NULL;
NvmeCmd *cmd = &req->cmd;
uint32_t dw10 = le32_to_cpu(cmd->cdw10);
@@ -1783,6 +1849,7 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
uint32_t nsid = le32_to_cpu(cmd->nsid);
uint8_t fid = NVME_GETSETFEAT_FID(dw10);
uint8_t save = NVME_SETFEAT_SAVE(dw10);
+ int i;
trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11);
@@ -1842,11 +1909,31 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
NVME_LOG_SMART_INFO);
}
+ break;
+ case NVME_ERROR_RECOVERY:
+ if (nsid == NVME_NSID_BROADCAST) {
+ for (i = 1; i <= n->num_namespaces; i++) {
+ ns = nvme_ns(n, i);
+
+ if (!ns) {
+ continue;
+ }
+
+ if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
+ ns->features.err_rec = dw11;
+ }
+ }
+
+ break;
+ }
+
+ assert(ns);
+ ns->features.err_rec = dw11;
break;
case NVME_VOLATILE_WRITE_CACHE:
n->features.vwc = dw11 & 0x1;
- for (int i = 1; i <= n->num_namespaces; i++) {
+ for (i = 1; i <= n->num_namespaces; i++) {
ns = nvme_ns(n, i);
if (!ns) {
continue;
diff --git a/hw/block/trace-events b/hw/block/trace-events
index c1537e3ac0b0..1ffe0b3f76b5 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -43,6 +43,10 @@ pci_nvme_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t opcode, const char *opna
pci_nvme_rw(uint16_t cid, const char *verb, uint32_t nsid, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" opname '%s' nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64""
pci_nvme_rw_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'"
pci_nvme_write_zeroes(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba %"PRIu64" nlb %"PRIu32""
+pci_nvme_block_status(int64_t offset, int64_t bytes, int64_t pnum, int ret, bool zeroed) "offset %"PRId64" bytes %"PRId64" pnum %"PRId64" ret 0x%x zeroed %d"
+pci_nvme_dsm(uint16_t cid, uint32_t nsid, uint32_t nr, uint32_t attr) "cid %"PRIu16" nsid %"PRIu32" nr %"PRIu32" attr 0x%"PRIx32""
+pci_nvme_dsm_deallocate(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba %"PRIu64" nlb %"PRIu32""
+pci_nvme_aio_discard_cb(uint16_t cid) "cid %"PRIu16""
pci_nvme_create_sq(uint64_t addr, uint16_t sqid, uint16_t cqid, uint16_t qsize, uint16_t qflags) "create submission queue, addr=0x%"PRIx64", sqid=%"PRIu16", cqid=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16""
pci_nvme_create_cq(uint64_t addr, uint16_t cqid, uint16_t vector, uint16_t size, uint16_t qflags, int ien) "create completion queue, addr=0x%"PRIx64", cqid=%"PRIu16", vector=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16", ien=%d"
pci_nvme_del_sq(uint16_t qid) "deleting submission queue sqid=%"PRIu16""
--
2.30.0
next prev parent reply other threads:[~2021-02-09 7:43 UTC|newest]
Thread overview: 62+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-02-09 7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
2021-02-09 7:30 ` [PULL 01/56] hw/block/nvme: remove superfluous NvmeCtrl parameter Klaus Jensen
2021-02-09 7:30 ` [PULL 02/56] hw/block/nvme: pull aio error handling Klaus Jensen
2021-02-09 7:30 ` Klaus Jensen [this message]
2021-02-09 7:30 ` [PULL 04/56] nvme: add namespace I/O optimization fields to shared header Klaus Jensen
2021-02-09 7:30 ` [PULL 05/56] hw/block/nvme: add the dataset management command Klaus Jensen
2021-02-09 7:30 ` [PULL 06/56] hw/block/nvme: add compare command Klaus Jensen
2021-02-09 7:30 ` [PULL 07/56] hw/block/nvme: fix bad clearing of CAP Klaus Jensen
2021-02-09 7:30 ` [PULL 08/56] hw/block/nvme: Process controller reset and shutdown differently Klaus Jensen
2021-02-09 7:30 ` [PULL 09/56] hw/block/nvme: Generate namespace UUIDs Klaus Jensen
2021-02-09 7:30 ` [PULL 10/56] hw/block/nvme: Separate read and write handlers Klaus Jensen
2021-02-09 7:30 ` [PULL 11/56] hw/block/nvme: Combine nvme_write_zeroes() and nvme_write() Klaus Jensen
2021-02-09 7:30 ` [PULL 12/56] hw/block/nvme: Add Commands Supported and Effects log Klaus Jensen
2021-02-09 7:30 ` [PULL 13/56] hw/block/nvme: Add support for Namespace Types Klaus Jensen
2021-02-09 7:30 ` [PULL 14/56] hw/block/nvme: Support allocated CNS command variants Klaus Jensen
2021-02-09 7:30 ` [PULL 15/56] nvme: Make ZNS-related definitions Klaus Jensen
2021-02-09 7:30 ` [PULL 16/56] hw/block/nvme: Support Zoned Namespace Command Set Klaus Jensen
2021-02-09 7:30 ` [PULL 17/56] hw/block/nvme: Introduce max active and open zone limits Klaus Jensen
2021-02-09 7:30 ` [PULL 18/56] hw/block/nvme: Support Zone Descriptor Extensions Klaus Jensen
2021-02-09 7:30 ` [PULL 19/56] hw/block/nvme: Document zoned parameters in usage text Klaus Jensen
2021-02-09 7:30 ` [PULL 20/56] hw/block/nvme: fix for non-msix machines Klaus Jensen
2021-02-09 7:30 ` [PULL 21/56] hw/block/nvme: conditionally enable DULBE for zoned namespaces Klaus Jensen
2021-02-09 7:30 ` [PULL 22/56] hw/block/nvme: fix shutdown/reset logic Klaus Jensen
2021-02-09 7:30 ` [PULL 23/56] hw/block/nvme: merge implicitly/explicitly opened processing masks Klaus Jensen
2021-02-09 7:30 ` [PULL 24/56] hw/block/nvme: enum style fix Klaus Jensen
2021-02-09 7:30 ` [PULL 25/56] hw/block/nvme: zero out zones on reset Klaus Jensen
2021-02-09 7:30 ` [PULL 26/56] hw/block/nvme: add missing string representations for commands Klaus Jensen
2021-02-09 7:30 ` [PULL 27/56] hw/block/nvme: remove unnecessary check for append Klaus Jensen
2021-02-09 7:30 ` [PULL 28/56] hw/block/nvme: Correct error status for unaligned ZA Klaus Jensen
2021-02-09 7:30 ` [PULL 29/56] hw/block/nvme: remove unused argument in nvme_ns_init_zoned Klaus Jensen
2021-02-09 7:30 ` [PULL 30/56] hw/block/nvme: open code for volatile write cache Klaus Jensen
2021-02-09 7:30 ` [PULL 31/56] hw/block/nvme: remove unused argument in nvme_ns_init_blk Klaus Jensen
2021-02-09 7:30 ` [PULL 32/56] hw/block/nvme: split setup and register for namespace Klaus Jensen
2021-02-11 9:53 ` Alexander Graf
2021-02-11 10:41 ` Klaus Jensen
2021-02-11 11:40 ` Philippe Mathieu-Daudé
2021-02-11 11:46 ` Klaus Jensen
2021-02-09 7:30 ` [PULL 33/56] hw/block/nvme: remove unused argument in nvme_ns_setup Klaus Jensen
2021-02-09 7:30 ` [PULL 34/56] hw/block/nvme: fix zone write finalize Klaus Jensen
2021-02-09 7:30 ` [PULL 35/56] nvme: introduce bit 5 for critical warning Klaus Jensen
2021-02-09 7:30 ` [PULL 36/56] hw/block/nvme: add smart_critical_warning property Klaus Jensen
2021-02-09 7:30 ` [PULL 37/56] hw/block/nvme: trigger async event during injecting smart warning Klaus Jensen
2021-02-09 7:30 ` [PULL 38/56] hw/block/nvme: add size to mmio read/write trace events Klaus Jensen
2021-02-09 7:30 ` [PULL 39/56] hw/block/nvme: fix 64 bit register hi/lo split writes Klaus Jensen
2021-02-09 7:30 ` [PULL 40/56] hw/block/nvme: indicate CMB support through controller capabilities register Klaus Jensen
2021-02-09 7:30 ` [PULL 41/56] hw/block/nvme: move msix table and pba to BAR 0 Klaus Jensen
2021-02-09 7:30 ` [PULL 42/56] hw/block/nvme: allow cmb and pmr to coexist Klaus Jensen
2021-02-09 7:30 ` [PULL 43/56] hw/block/nvme: rename PMR/CMB shift/mask fields Klaus Jensen
2021-02-09 7:30 ` [PULL 44/56] hw/block/nvme: remove redundant zeroing of PMR registers Klaus Jensen
2021-02-09 7:30 ` [PULL 45/56] hw/block/nvme: disable PMR at boot up Klaus Jensen
2021-02-09 7:30 ` [PULL 46/56] hw/block/nvme: add PMR RDS/WDS support Klaus Jensen
2021-02-09 7:30 ` [PULL 47/56] hw/block/nvme: move cmb logic to v1.4 Klaus Jensen
2021-02-09 7:30 ` [PULL 48/56] hw/block/nvme: bump " Klaus Jensen
2021-02-09 7:30 ` [PULL 49/56] hw/block/nvme: lift cmb restrictions Klaus Jensen
2021-02-09 7:30 ` [PULL 50/56] hw/block/nvme: error if drive less than a zone size Klaus Jensen
2021-02-09 7:30 ` [PULL 51/56] hw/block/nvme: fix set feature for error recovery Klaus Jensen
2021-02-09 7:30 ` [PULL 52/56] hw/block/nvme: fix set feature save field check Klaus Jensen
2021-02-09 7:30 ` [PULL 53/56] hw/block/nvme: align with existing style Klaus Jensen
2021-02-09 7:30 ` [PULL 54/56] hw/block/nvme: fix wrong parameter name 'cross_read' Klaus Jensen
2021-02-09 7:31 ` [PULL 55/56] hw/block/nvme: fix zone boundary check for append Klaus Jensen
2021-02-09 7:31 ` [PULL 56/56] hw/block/nvme: refactor the logic for zone write checks Klaus Jensen
2021-02-09 14:52 ` [PULL 00/56] emulated nvme patches Peter Maydell
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20210209073101.548811-4-its@irrelevant.dk \
--to=its@irrelevant.dk \
--cc=fam@euphon.net \
--cc=k.jensen@samsung.com \
--cc=kbusch@kernel.org \
--cc=kwolf@redhat.com \
--cc=mreitz@redhat.com \
--cc=peter.maydell@linaro.org \
--cc=qemu-block@nongnu.org \
--cc=qemu-devel@nongnu.org \
--cc=stefanha@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).