- * [PULL 01/56] hw/block/nvme: remove superfluous NvmeCtrl parameter
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 02/56] hw/block/nvme: pull aio error handling Klaus Jensen
                   ` (55 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Klaus Jensen, Max Reitz,
	Klaus Jensen, Minwoo Im, Stefan Hajnoczi, Keith Busch
From: Klaus Jensen <k.jensen@samsung.com>
nvme_check_bounds has no use of the NvmeCtrl parameter; remove it.
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
Reviewed-by: Minwoo Im <minwoo.im.dev@gmail.com>
---
 hw/block/nvme.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 27d2c72716eb..b0816d19eff4 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -866,8 +866,8 @@ static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len)
     return NVME_SUCCESS;
 }
 
-static inline uint16_t nvme_check_bounds(NvmeCtrl *n, NvmeNamespace *ns,
-                                         uint64_t slba, uint32_t nlb)
+static inline uint16_t nvme_check_bounds(NvmeNamespace *ns, uint64_t slba,
+                                         uint32_t nlb)
 {
     uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);
 
@@ -943,7 +943,7 @@ static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
 
     trace_pci_nvme_write_zeroes(nvme_cid(req), nvme_nsid(ns), slba, nlb);
 
-    status = nvme_check_bounds(n, ns, slba, nlb);
+    status = nvme_check_bounds(ns, slba, nlb);
     if (status) {
         trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
         return status;
@@ -979,7 +979,7 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req)
         goto invalid;
     }
 
-    status = nvme_check_bounds(n, ns, slba, nlb);
+    status = nvme_check_bounds(ns, slba, nlb);
     if (status) {
         trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
         goto invalid;
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 02/56] hw/block/nvme: pull aio error handling
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
  2021-02-09  7:30 ` [PULL 01/56] hw/block/nvme: remove superfluous NvmeCtrl parameter Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 03/56] hw/block/nvme: add dulbe support Klaus Jensen
                   ` (54 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Klaus Jensen, Max Reitz,
	Klaus Jensen, Stefan Hajnoczi, Keith Busch
From: Klaus Jensen <k.jensen@samsung.com>
Add a new function, nvme_aio_err, to handle errors resulting from AIOs
and use this from the callbacks.
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme.c | 61 +++++++++++++++++++++++++++++--------------------
 1 file changed, 36 insertions(+), 25 deletions(-)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index b0816d19eff4..a245ff8ceb2c 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -878,6 +878,41 @@ static inline uint16_t nvme_check_bounds(NvmeNamespace *ns, uint64_t slba,
     return NVME_SUCCESS;
 }
 
+static void nvme_aio_err(NvmeRequest *req, int ret)
+{
+    uint16_t status = NVME_SUCCESS;
+    Error *local_err = NULL;
+
+    switch (req->cmd.opcode) {
+    case NVME_CMD_READ:
+        status = NVME_UNRECOVERED_READ;
+        break;
+    case NVME_CMD_FLUSH:
+    case NVME_CMD_WRITE:
+    case NVME_CMD_WRITE_ZEROES:
+        status = NVME_WRITE_FAULT;
+        break;
+    default:
+        status = NVME_INTERNAL_DEV_ERROR;
+        break;
+    }
+
+    trace_pci_nvme_err_aio(nvme_cid(req), strerror(ret), status);
+
+    error_setg_errno(&local_err, -ret, "aio failed");
+    error_report_err(local_err);
+
+    /*
+     * Set the command status code to the first encountered error but allow a
+     * subsequent Internal Device Error to trump it.
+     */
+    if (req->status && status != NVME_INTERNAL_DEV_ERROR) {
+        return;
+    }
+
+    req->status = status;
+}
+
 static void nvme_rw_cb(void *opaque, int ret)
 {
     NvmeRequest *req = opaque;
@@ -887,37 +922,13 @@ static void nvme_rw_cb(void *opaque, int ret)
     BlockAcctCookie *acct = &req->acct;
     BlockAcctStats *stats = blk_get_stats(blk);
 
-    Error *local_err = NULL;
-
     trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));
 
     if (!ret) {
         block_acct_done(stats, acct);
     } else {
-        uint16_t status;
-
         block_acct_failed(stats, acct);
-
-        switch (req->cmd.opcode) {
-        case NVME_CMD_READ:
-            status = NVME_UNRECOVERED_READ;
-            break;
-        case NVME_CMD_FLUSH:
-        case NVME_CMD_WRITE:
-        case NVME_CMD_WRITE_ZEROES:
-            status = NVME_WRITE_FAULT;
-            break;
-        default:
-            status = NVME_INTERNAL_DEV_ERROR;
-            break;
-        }
-
-        trace_pci_nvme_err_aio(nvme_cid(req), strerror(ret), status);
-
-        error_setg_errno(&local_err, -ret, "aio failed");
-        error_report_err(local_err);
-
-        req->status = status;
+        nvme_aio_err(req, ret);
     }
 
     nvme_enqueue_req_completion(nvme_cq(req), req);
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 03/56] hw/block/nvme: add dulbe support
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
  2021-02-09  7:30 ` [PULL 01/56] hw/block/nvme: remove superfluous NvmeCtrl parameter Klaus Jensen
  2021-02-09  7:30 ` [PULL 02/56] hw/block/nvme: pull aio error handling Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 04/56] nvme: add namespace I/O optimization fields to shared header Klaus Jensen
                   ` (53 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Klaus Jensen, Max Reitz,
	Klaus Jensen, Stefan Hajnoczi, Keith Busch
From: Klaus Jensen <k.jensen@samsung.com>
Add support for reporting the Deallocated or Unwritten Logical Block
Error (DULBE).
Rely on the block status flags reported by the block layer and consider
any block with the BDRV_BLOCK_ZERO flag to be deallocated.
Multiple factors affect when a Write Zeroes command result in
deallocation of blocks.
  * the underlying file system block size
  * the blockdev format
  * the 'discard' and 'logical_block_size' parameters
     format | discard | wz (512B)  wz (4KiB)  wz (64KiB)
    -----------------------------------------------------
      qcow2    ignore   n          n          y
      qcow2    unmap    n          n          y
      raw      ignore   n          y          y
      raw      unmap    n          y          y
So, this works best with an image in raw format and 4KiB LBAs, since
holes can then be punched on a per-block basis (this assumes a file
system with a 4kb block size, YMMV). A qcow2 image, uses a cluster size
of 64KiB by default and blocks will only be marked deallocated if a full
cluster is zeroed or discarded. However, this *is* consistent with the
spec since Write Zeroes "should" deallocate the block if the Deallocate
attribute is set and "may" deallocate if the Deallocate attribute is not
set. Thus, we always try to deallocate (the BDRV_REQ_MAY_UNMAP flag is
always set).
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
---
 hw/block/nvme-ns.h    |  4 ++
 include/block/nvme.h  |  5 +++
 hw/block/nvme-ns.c    |  8 ++--
 hw/block/nvme.c       | 91 ++++++++++++++++++++++++++++++++++++++++++-
 hw/block/trace-events |  4 ++
 5 files changed, 107 insertions(+), 5 deletions(-)
diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 83734f4606e1..44bf6271b744 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -31,6 +31,10 @@ typedef struct NvmeNamespace {
     NvmeIdNs     id_ns;
 
     NvmeNamespaceParams params;
+
+    struct {
+        uint32_t err_rec;
+    } features;
 } NvmeNamespace;
 
 static inline uint32_t nvme_nsid(NvmeNamespace *ns)
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 3e02d9ca9843..b663d11e60c1 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -693,6 +693,7 @@ enum NvmeStatusCodes {
     NVME_E2E_REF_ERROR          = 0x0284,
     NVME_CMP_FAILURE            = 0x0285,
     NVME_ACCESS_DENIED          = 0x0286,
+    NVME_DULB                   = 0x0287,
     NVME_MORE                   = 0x2000,
     NVME_DNR                    = 0x4000,
     NVME_NO_COMPLETE            = 0xffff,
@@ -909,6 +910,9 @@ enum NvmeIdCtrlLpa {
 #define NVME_AEC_NS_ATTR(aec)       ((aec >> 8) & 0x1)
 #define NVME_AEC_FW_ACTIVATION(aec) ((aec >> 9) & 0x1)
 
+#define NVME_ERR_REC_TLER(err_rec)  (err_rec & 0xffff)
+#define NVME_ERR_REC_DULBE(err_rec) (err_rec & 0x10000)
+
 enum NvmeFeatureIds {
     NVME_ARBITRATION                = 0x1,
     NVME_POWER_MANAGEMENT           = 0x2,
@@ -1029,6 +1033,7 @@ enum NvmeNsIdentifierType {
 
 
 #define NVME_ID_NS_NSFEAT_THIN(nsfeat)      ((nsfeat & 0x1))
+#define NVME_ID_NS_NSFEAT_DULBE(nsfeat)     ((nsfeat >> 2) & 0x1)
 #define NVME_ID_NS_FLBAS_EXTENDED(flbas)    ((flbas >> 4) & 0x1)
 #define NVME_ID_NS_FLBAS_INDEX(flbas)       ((flbas & 0xf))
 #define NVME_ID_NS_MC_SEPARATE(mc)          ((mc >> 1) & 0x1)
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 2670787d2630..53ded460348e 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -33,9 +33,7 @@ static void nvme_ns_init(NvmeNamespace *ns)
     NvmeIdNs *id_ns = &ns->id_ns;
     int lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
 
-    if (blk_get_flags(ns->blkconf.blk) & BDRV_O_UNMAP) {
-        ns->id_ns.dlfeat = 0x9;
-    }
+    ns->id_ns.dlfeat = 0x9;
 
     id_ns->lbaf[lba_index].ds = 31 - clz32(ns->blkconf.logical_block_size);
 
@@ -44,6 +42,9 @@ static void nvme_ns_init(NvmeNamespace *ns)
     /* no thin provisioning */
     id_ns->ncap = id_ns->nsze;
     id_ns->nuse = id_ns->ncap;
+
+    /* support DULBE */
+    id_ns->nsfeat |= 0x4;
 }
 
 static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
@@ -93,6 +94,7 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
     }
 
     nvme_ns_init(ns);
+
     if (nvme_register_namespace(n, ns, errp)) {
         return -1;
     }
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index a245ff8ceb2c..6e6bdb338ad7 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -105,6 +105,7 @@ static const bool nvme_feature_support[NVME_FID_MAX] = {
 
 static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
     [NVME_TEMPERATURE_THRESHOLD]    = NVME_FEAT_CAP_CHANGE,
+    [NVME_ERROR_RECOVERY]           = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
     [NVME_VOLATILE_WRITE_CACHE]     = NVME_FEAT_CAP_CHANGE,
     [NVME_NUMBER_OF_QUEUES]         = NVME_FEAT_CAP_CHANGE,
     [NVME_ASYNCHRONOUS_EVENT_CONF]  = NVME_FEAT_CAP_CHANGE,
@@ -878,6 +879,49 @@ static inline uint16_t nvme_check_bounds(NvmeNamespace *ns, uint64_t slba,
     return NVME_SUCCESS;
 }
 
+static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
+                                 uint32_t nlb)
+{
+    BlockDriverState *bs = blk_bs(ns->blkconf.blk);
+
+    int64_t pnum = 0, bytes = nvme_l2b(ns, nlb);
+    int64_t offset = nvme_l2b(ns, slba);
+    bool zeroed;
+    int ret;
+
+    Error *local_err = NULL;
+
+    /*
+     * `pnum` holds the number of bytes after offset that shares the same
+     * allocation status as the byte at offset. If `pnum` is different from
+     * `bytes`, we should check the allocation status of the next range and
+     * continue this until all bytes have been checked.
+     */
+    do {
+        bytes -= pnum;
+
+        ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL);
+        if (ret < 0) {
+            error_setg_errno(&local_err, -ret, "unable to get block status");
+            error_report_err(local_err);
+
+            return NVME_INTERNAL_DEV_ERROR;
+        }
+
+        zeroed = !!(ret & BDRV_BLOCK_ZERO);
+
+        trace_pci_nvme_block_status(offset, bytes, pnum, ret, zeroed);
+
+        if (zeroed) {
+            return NVME_DULB;
+        }
+
+        offset += pnum;
+    } while (pnum != bytes);
+
+    return NVME_SUCCESS;
+}
+
 static void nvme_aio_err(NvmeRequest *req, int ret)
 {
     uint16_t status = NVME_SUCCESS;
@@ -996,6 +1040,15 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req)
         goto invalid;
     }
 
+    if (acct == BLOCK_ACCT_READ) {
+        if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
+            status = nvme_check_dulbe(ns, slba, nlb);
+            if (status) {
+                goto invalid;
+            }
+        }
+    }
+
     status = nvme_map_dptr(n, data_size, req);
     if (status) {
         goto invalid;
@@ -1641,6 +1694,7 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
     uint8_t fid = NVME_GETSETFEAT_FID(dw10);
     NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10);
     uint16_t iv;
+    NvmeNamespace *ns;
 
     static const uint32_t nvme_feature_default[NVME_FID_MAX] = {
         [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT,
@@ -1703,6 +1757,18 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
         }
 
         return NVME_INVALID_FIELD | NVME_DNR;
+    case NVME_ERROR_RECOVERY:
+        if (!nvme_nsid_valid(n, nsid)) {
+            return NVME_INVALID_NSID | NVME_DNR;
+        }
+
+        ns = nvme_ns(n, nsid);
+        if (unlikely(!ns)) {
+            return NVME_INVALID_FIELD | NVME_DNR;
+        }
+
+        result = ns->features.err_rec;
+        goto out;
     case NVME_VOLATILE_WRITE_CACHE:
         result = n->features.vwc;
         trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
@@ -1775,7 +1841,7 @@ static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
 
 static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
 {
-    NvmeNamespace *ns;
+    NvmeNamespace *ns = NULL;
 
     NvmeCmd *cmd = &req->cmd;
     uint32_t dw10 = le32_to_cpu(cmd->cdw10);
@@ -1783,6 +1849,7 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
     uint32_t nsid = le32_to_cpu(cmd->nsid);
     uint8_t fid = NVME_GETSETFEAT_FID(dw10);
     uint8_t save = NVME_SETFEAT_SAVE(dw10);
+    int i;
 
     trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11);
 
@@ -1842,11 +1909,31 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
                                NVME_LOG_SMART_INFO);
         }
 
+        break;
+    case NVME_ERROR_RECOVERY:
+        if (nsid == NVME_NSID_BROADCAST) {
+            for (i = 1; i <= n->num_namespaces; i++) {
+                ns = nvme_ns(n, i);
+
+                if (!ns) {
+                    continue;
+                }
+
+                if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
+                    ns->features.err_rec = dw11;
+                }
+            }
+
+            break;
+        }
+
+        assert(ns);
+        ns->features.err_rec = dw11;
         break;
     case NVME_VOLATILE_WRITE_CACHE:
         n->features.vwc = dw11 & 0x1;
 
-        for (int i = 1; i <= n->num_namespaces; i++) {
+        for (i = 1; i <= n->num_namespaces; i++) {
             ns = nvme_ns(n, i);
             if (!ns) {
                 continue;
diff --git a/hw/block/trace-events b/hw/block/trace-events
index c1537e3ac0b0..1ffe0b3f76b5 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -43,6 +43,10 @@ pci_nvme_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t opcode, const char *opna
 pci_nvme_rw(uint16_t cid, const char *verb, uint32_t nsid, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" opname '%s' nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64""
 pci_nvme_rw_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'"
 pci_nvme_write_zeroes(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba %"PRIu64" nlb %"PRIu32""
+pci_nvme_block_status(int64_t offset, int64_t bytes, int64_t pnum, int ret, bool zeroed) "offset %"PRId64" bytes %"PRId64" pnum %"PRId64" ret 0x%x zeroed %d"
+pci_nvme_dsm(uint16_t cid, uint32_t nsid, uint32_t nr, uint32_t attr) "cid %"PRIu16" nsid %"PRIu32" nr %"PRIu32" attr 0x%"PRIx32""
+pci_nvme_dsm_deallocate(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba %"PRIu64" nlb %"PRIu32""
+pci_nvme_aio_discard_cb(uint16_t cid) "cid %"PRIu16""
 pci_nvme_create_sq(uint64_t addr, uint16_t sqid, uint16_t cqid, uint16_t qsize, uint16_t qflags) "create submission queue, addr=0x%"PRIx64", sqid=%"PRIu16", cqid=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16""
 pci_nvme_create_cq(uint64_t addr, uint16_t cqid, uint16_t vector, uint16_t size, uint16_t qflags, int ien) "create completion queue, addr=0x%"PRIx64", cqid=%"PRIu16", vector=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16", ien=%d"
 pci_nvme_del_sq(uint16_t qid) "deleting submission queue sqid=%"PRIu16""
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 04/56] nvme: add namespace I/O optimization fields to shared header
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (2 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 03/56] hw/block/nvme: add dulbe support Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 05/56] hw/block/nvme: add the dataset management command Klaus Jensen
                   ` (52 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Klaus Jensen, Max Reitz,
	Klaus Jensen, Minwoo Im, Stefan Hajnoczi, Keith Busch
From: Klaus Jensen <k.jensen@samsung.com>
This adds the NPWG, NPWA, NPDG, NPDA and NOWS family of fields to the
shared nvme.h header for use by later patches.
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
Cc: Stefan Hajnoczi <stefanha@redhat.com>
Cc: Fam Zheng <fam@euphon.net>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Minwoo Im <minwoo.im.dev@gmail.com>
---
 include/block/nvme.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/include/block/nvme.h b/include/block/nvme.h
index b663d11e60c1..11ac1c2b7dfb 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -996,7 +996,12 @@ typedef struct QEMU_PACKED NvmeIdNs {
     uint16_t    nabspf;
     uint16_t    noiob;
     uint8_t     nvmcap[16];
-    uint8_t     rsvd64[40];
+    uint16_t    npwg;
+    uint16_t    npwa;
+    uint16_t    npdg;
+    uint16_t    npda;
+    uint16_t    nows;
+    uint8_t     rsvd74[30];
     uint8_t     nguid[16];
     uint64_t    eui64;
     NvmeLBAF    lbaf[16];
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 05/56] hw/block/nvme: add the dataset management command
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (3 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 04/56] nvme: add namespace I/O optimization fields to shared header Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 06/56] hw/block/nvme: add compare command Klaus Jensen
                   ` (51 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Klaus Jensen, Max Reitz,
	Klaus Jensen, Stefan Hajnoczi, Keith Busch
From: Klaus Jensen <k.jensen@samsung.com>
Add support for the Dataset Management command and the Deallocate
attribute. Deallocation results in discards being sent to the underlying
block device. Whether of not the blocks are actually deallocated is
affected by the same factors as Write Zeroes (see previous commit).
     format | discard | dsm (512B)  dsm (4KiB)  dsm (64KiB)
    --------------------------------------------------------
      qcow2    ignore   n           n           n
      qcow2    unmap    n           n           y
      raw      ignore   n           n           n
      raw      unmap    n           y           y
Again, a raw format and 4KiB LBAs are preferable.
In order to set the Namespace Preferred Deallocate Granularity and
Alignment fields (NPDG and NPDA), choose a sane minimum discard
granularity of 4KiB. If we are using a passthru device supporting
discard at a 512B granularity, user should set the discard_granularity
property explicitly. NPDG and NPDA will also account for the
cluster_size of the block driver if required (i.e. for QCOW2).
See NVM Express 1.3d, Section 6.7 ("Dataset Management command").
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
---
 hw/block/nvme.h    |  2 +
 hw/block/nvme-ns.c | 30 ++++++++++++--
 hw/block/nvme.c    | 98 +++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 125 insertions(+), 5 deletions(-)
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index e080a2318a50..574333caa3f9 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -28,6 +28,7 @@ typedef struct NvmeRequest {
     struct NvmeNamespace    *ns;
     BlockAIOCB              *aiocb;
     uint16_t                status;
+    void                    *opaque;
     NvmeCqe                 cqe;
     NvmeCmd                 cmd;
     BlockAcctCookie         acct;
@@ -60,6 +61,7 @@ static inline const char *nvme_io_opc_str(uint8_t opc)
     case NVME_CMD_WRITE:            return "NVME_NVM_CMD_WRITE";
     case NVME_CMD_READ:             return "NVME_NVM_CMD_READ";
     case NVME_CMD_WRITE_ZEROES:     return "NVME_NVM_CMD_WRITE_ZEROES";
+    case NVME_CMD_DSM:              return "NVME_NVM_CMD_DSM";
     default:                        return "NVME_NVM_CMD_UNKNOWN";
     }
 }
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 53ded460348e..37f95951a6b8 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -28,10 +28,14 @@
 #include "nvme.h"
 #include "nvme-ns.h"
 
-static void nvme_ns_init(NvmeNamespace *ns)
+#define MIN_DISCARD_GRANULARITY (4 * KiB)
+
+static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
 {
+    BlockDriverInfo bdi;
     NvmeIdNs *id_ns = &ns->id_ns;
     int lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
+    int npdg;
 
     ns->id_ns.dlfeat = 0x9;
 
@@ -43,8 +47,19 @@ static void nvme_ns_init(NvmeNamespace *ns)
     id_ns->ncap = id_ns->nsze;
     id_ns->nuse = id_ns->ncap;
 
-    /* support DULBE */
-    id_ns->nsfeat |= 0x4;
+    /* support DULBE and I/O optimization fields */
+    id_ns->nsfeat |= (0x4 | 0x10);
+
+    npdg = ns->blkconf.discard_granularity / ns->blkconf.logical_block_size;
+
+    if (bdrv_get_info(blk_bs(ns->blkconf.blk), &bdi) >= 0 &&
+        bdi.cluster_size > ns->blkconf.discard_granularity) {
+        npdg = bdi.cluster_size / ns->blkconf.logical_block_size;
+    }
+
+    id_ns->npda = id_ns->npdg = npdg - 1;
+
+    return 0;
 }
 
 static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
@@ -60,6 +75,11 @@ static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
         return -1;
     }
 
+    if (ns->blkconf.discard_granularity == -1) {
+        ns->blkconf.discard_granularity =
+            MAX(ns->blkconf.logical_block_size, MIN_DISCARD_GRANULARITY);
+    }
+
     ns->size = blk_getlength(ns->blkconf.blk);
     if (ns->size < 0) {
         error_setg_errno(errp, -ns->size, "could not get blockdev size");
@@ -93,7 +113,9 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
         return -1;
     }
 
-    nvme_ns_init(ns);
+    if (nvme_ns_init(ns, errp)) {
+        return -1;
+    }
 
     if (nvme_register_namespace(n, ns, errp)) {
         return -1;
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 6e6bdb338ad7..f019d43788ac 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -242,6 +242,7 @@ static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq)
 static void nvme_req_clear(NvmeRequest *req)
 {
     req->ns = NULL;
+    req->opaque = NULL;
     memset(&req->cqe, 0x0, sizeof(req->cqe));
     req->status = NVME_SUCCESS;
 }
@@ -978,6 +979,99 @@ static void nvme_rw_cb(void *opaque, int ret)
     nvme_enqueue_req_completion(nvme_cq(req), req);
 }
 
+static void nvme_aio_discard_cb(void *opaque, int ret)
+{
+    NvmeRequest *req = opaque;
+    uintptr_t *discards = (uintptr_t *)&req->opaque;
+
+    trace_pci_nvme_aio_discard_cb(nvme_cid(req));
+
+    if (ret) {
+        nvme_aio_err(req, ret);
+    }
+
+    (*discards)--;
+
+    if (*discards) {
+        return;
+    }
+
+    nvme_enqueue_req_completion(nvme_cq(req), req);
+}
+
+static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
+{
+    NvmeNamespace *ns = req->ns;
+    NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
+
+    uint32_t attr = le32_to_cpu(dsm->attributes);
+    uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
+
+    uint16_t status = NVME_SUCCESS;
+
+    trace_pci_nvme_dsm(nvme_cid(req), nvme_nsid(ns), nr, attr);
+
+    if (attr & NVME_DSMGMT_AD) {
+        int64_t offset;
+        size_t len;
+        NvmeDsmRange range[nr];
+        uintptr_t *discards = (uintptr_t *)&req->opaque;
+
+        status = nvme_dma(n, (uint8_t *)range, sizeof(range),
+                          DMA_DIRECTION_TO_DEVICE, req);
+        if (status) {
+            return status;
+        }
+
+        /*
+         * AIO callbacks may be called immediately, so initialize discards to 1
+         * to make sure the the callback does not complete the request before
+         * all discards have been issued.
+         */
+        *discards = 1;
+
+        for (int i = 0; i < nr; i++) {
+            uint64_t slba = le64_to_cpu(range[i].slba);
+            uint32_t nlb = le32_to_cpu(range[i].nlb);
+
+            if (nvme_check_bounds(ns, slba, nlb)) {
+                trace_pci_nvme_err_invalid_lba_range(slba, nlb,
+                                                     ns->id_ns.nsze);
+                continue;
+            }
+
+            trace_pci_nvme_dsm_deallocate(nvme_cid(req), nvme_nsid(ns), slba,
+                                          nlb);
+
+            offset = nvme_l2b(ns, slba);
+            len = nvme_l2b(ns, nlb);
+
+            while (len) {
+                size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len);
+
+                (*discards)++;
+
+                blk_aio_pdiscard(ns->blkconf.blk, offset, bytes,
+                                 nvme_aio_discard_cb, req);
+
+                offset += bytes;
+                len -= bytes;
+            }
+        }
+
+        /* account for the 1-initialization */
+        (*discards)--;
+
+        if (*discards) {
+            status = NVME_NO_COMPLETE;
+        } else {
+            status = req->status;
+        }
+    }
+
+    return status;
+}
+
 static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
 {
     block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0,
@@ -1107,6 +1201,8 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
     case NVME_CMD_WRITE:
     case NVME_CMD_READ:
         return nvme_rw(n, req);
+    case NVME_CMD_DSM:
+        return nvme_dsm(n, req);
     default:
         trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
         return NVME_INVALID_OPCODE | NVME_DNR;
@@ -2829,7 +2925,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
     id->cqes = (0x4 << 4) | 0x4;
     id->nn = cpu_to_le32(n->num_namespaces);
     id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
-                           NVME_ONCS_FEATURES);
+                           NVME_ONCS_FEATURES | NVME_ONCS_DSM);
 
     id->vwc = 0x1;
     id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 06/56] hw/block/nvme: add compare command
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (4 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 05/56] hw/block/nvme: add the dataset management command Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 07/56] hw/block/nvme: fix bad clearing of CAP Klaus Jensen
                   ` (50 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Klaus Jensen,
	Gollu Appalanaidu, Max Reitz, Klaus Jensen, Minwoo Im,
	Stefan Hajnoczi, Keith Busch
From: Gollu Appalanaidu <anaidu.gollu@samsung.com>
Add the Compare command.
This implementation uses a bounce buffer to read in the data from
storage and then compare with the host supplied buffer.
Signed-off-by: Gollu Appalanaidu <anaidu.gollu@samsung.com>
[k.jensen: rebased]
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
Reviewed-by: Minwoo Im <minwoo.im.dev@gmail.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
---
 hw/block/nvme.c       | 101 +++++++++++++++++++++++++++++++++++++++++-
 hw/block/trace-events |   2 +
 2 files changed, 102 insertions(+), 1 deletion(-)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index f019d43788ac..b9313fdc4762 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -999,6 +999,51 @@ static void nvme_aio_discard_cb(void *opaque, int ret)
     nvme_enqueue_req_completion(nvme_cq(req), req);
 }
 
+struct nvme_compare_ctx {
+    QEMUIOVector iov;
+    uint8_t *bounce;
+    size_t len;
+};
+
+static void nvme_compare_cb(void *opaque, int ret)
+{
+    NvmeRequest *req = opaque;
+    NvmeNamespace *ns = req->ns;
+    struct nvme_compare_ctx *ctx = req->opaque;
+    g_autofree uint8_t *buf = NULL;
+    uint16_t status;
+
+    trace_pci_nvme_compare_cb(nvme_cid(req));
+
+    if (!ret) {
+        block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct);
+    } else {
+        block_acct_failed(blk_get_stats(ns->blkconf.blk), &req->acct);
+        nvme_aio_err(req, ret);
+        goto out;
+    }
+
+    buf = g_malloc(ctx->len);
+
+    status = nvme_dma(nvme_ctrl(req), buf, ctx->len, DMA_DIRECTION_TO_DEVICE,
+                      req);
+    if (status) {
+        req->status = status;
+        goto out;
+    }
+
+    if (memcmp(buf, ctx->bounce, ctx->len)) {
+        req->status = NVME_CMP_FAILURE;
+    }
+
+out:
+    qemu_iovec_destroy(&ctx->iov);
+    g_free(ctx->bounce);
+    g_free(ctx);
+
+    nvme_enqueue_req_completion(nvme_cq(req), req);
+}
+
 static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
 {
     NvmeNamespace *ns = req->ns;
@@ -1072,6 +1117,57 @@ static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
     return status;
 }
 
+static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
+{
+    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
+    NvmeNamespace *ns = req->ns;
+    BlockBackend *blk = ns->blkconf.blk;
+    uint64_t slba = le64_to_cpu(rw->slba);
+    uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
+    size_t len = nvme_l2b(ns, nlb);
+    int64_t offset = nvme_l2b(ns, slba);
+    uint8_t *bounce = NULL;
+    struct nvme_compare_ctx *ctx = NULL;
+    uint16_t status;
+
+    trace_pci_nvme_compare(nvme_cid(req), nvme_nsid(ns), slba, nlb);
+
+    status = nvme_check_mdts(n, len);
+    if (status) {
+        trace_pci_nvme_err_mdts(nvme_cid(req), len);
+        return status;
+    }
+
+    status = nvme_check_bounds(ns, slba, nlb);
+    if (status) {
+        trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
+        return status;
+    }
+
+    if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
+        status = nvme_check_dulbe(ns, slba, nlb);
+        if (status) {
+            return status;
+        }
+    }
+
+    bounce = g_malloc(len);
+
+    ctx = g_new(struct nvme_compare_ctx, 1);
+    ctx->bounce = bounce;
+    ctx->len = len;
+
+    req->opaque = ctx;
+
+    qemu_iovec_init(&ctx->iov, 1);
+    qemu_iovec_add(&ctx->iov, bounce, len);
+
+    block_acct_start(blk_get_stats(blk), &req->acct, len, BLOCK_ACCT_READ);
+    blk_aio_preadv(blk, offset, &ctx->iov, 0, nvme_compare_cb, req);
+
+    return NVME_NO_COMPLETE;
+}
+
 static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
 {
     block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0,
@@ -1201,6 +1297,8 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
     case NVME_CMD_WRITE:
     case NVME_CMD_READ:
         return nvme_rw(n, req);
+    case NVME_CMD_COMPARE:
+        return nvme_compare(n, req);
     case NVME_CMD_DSM:
         return nvme_dsm(n, req);
     default:
@@ -2925,7 +3023,8 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
     id->cqes = (0x4 << 4) | 0x4;
     id->nn = cpu_to_le32(n->num_namespaces);
     id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
-                           NVME_ONCS_FEATURES | NVME_ONCS_DSM);
+                           NVME_ONCS_FEATURES | NVME_ONCS_DSM |
+                           NVME_ONCS_COMPARE);
 
     id->vwc = 0x1;
     id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
diff --git a/hw/block/trace-events b/hw/block/trace-events
index 1ffe0b3f76b5..68a4c8ed35e0 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -46,6 +46,8 @@ pci_nvme_write_zeroes(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb)
 pci_nvme_block_status(int64_t offset, int64_t bytes, int64_t pnum, int ret, bool zeroed) "offset %"PRId64" bytes %"PRId64" pnum %"PRId64" ret 0x%x zeroed %d"
 pci_nvme_dsm(uint16_t cid, uint32_t nsid, uint32_t nr, uint32_t attr) "cid %"PRIu16" nsid %"PRIu32" nr %"PRIu32" attr 0x%"PRIx32""
 pci_nvme_dsm_deallocate(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba %"PRIu64" nlb %"PRIu32""
+pci_nvme_compare(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba 0x%"PRIx64" nlb %"PRIu32""
+pci_nvme_compare_cb(uint16_t cid) "cid %"PRIu16""
 pci_nvme_aio_discard_cb(uint16_t cid) "cid %"PRIu16""
 pci_nvme_create_sq(uint64_t addr, uint16_t sqid, uint16_t cqid, uint16_t qsize, uint16_t qflags) "create submission queue, addr=0x%"PRIx64", sqid=%"PRIu16", cqid=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16""
 pci_nvme_create_cq(uint64_t addr, uint16_t cqid, uint16_t vector, uint16_t size, uint16_t qflags, int ien) "create completion queue, addr=0x%"PRIx64", cqid=%"PRIu16", vector=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16", ien=%d"
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 07/56] hw/block/nvme: fix bad clearing of CAP
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (5 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 06/56] hw/block/nvme: add compare command Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 08/56] hw/block/nvme: Process controller reset and shutdown differently Klaus Jensen
                   ` (49 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Klaus Jensen, Max Reitz,
	Klaus Jensen, Minwoo Im, Stefan Hajnoczi, Keith Busch
From: Klaus Jensen <k.jensen@samsung.com>
Commit 37712e00b1f0 ("hw/block/nvme: factor out pmr setup") changed the
control flow such that the CAP register is erronously cleared after
nvme_init_pmr() has configured it. Since the entire NvmeCtrl structure
is zero-filled initially, there is no need for the explicit clearing, so
just remove it.
Fixes: 37712e00b1f0 ("hw/block/nvme: factor out pmr setup")
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Minwoo Im <minwoo.im.dev@gmail.com>
---
 hw/block/nvme.c | 1 -
 1 file changed, 1 deletion(-)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index b9313fdc4762..de52487aaf06 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -3038,7 +3038,6 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
     id->psd[0].enlat = cpu_to_le32(0x10);
     id->psd[0].exlat = cpu_to_le32(0x4);
 
-    n->bar.cap = 0;
     NVME_CAP_SET_MQES(n->bar.cap, 0x7ff);
     NVME_CAP_SET_CQR(n->bar.cap, 1);
     NVME_CAP_SET_TO(n->bar.cap, 0xf);
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 08/56] hw/block/nvme: Process controller reset and shutdown differently
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (6 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 07/56] hw/block/nvme: fix bad clearing of CAP Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 09/56] hw/block/nvme: Generate namespace UUIDs Klaus Jensen
                   ` (48 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Dmitry Fomichev, Klaus Jensen,
	Max Reitz, Klaus Jensen, Stefan Hajnoczi, Keith Busch
From: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Controller reset ans subsystem shutdown are handled very much the same
in the current code, but some of the steps should be different in these
two cases.
Introduce two new functions, nvme_reset_ctrl() and nvme_shutdown_ctrl(),
to separate some portions of the code from nvme_clear_ctrl(). The steps
that are made different between reset and shutdown are that BAR.CC is not
reset to zero upon the shutdown and namespace data is flushed to
backing storage as a part of shutdown handling, but not upon reset.
Suggested-by: Klaus Jensen <k.jensen@samsung.com>
Signed-off-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme-ns.h |  2 +-
 hw/block/nvme-ns.c |  2 +-
 hw/block/nvme.c    | 24 ++++++++++++++++++------
 3 files changed, 20 insertions(+), 8 deletions(-)
diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 44bf6271b744..ed3d7e65d597 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -73,6 +73,6 @@ typedef struct NvmeCtrl NvmeCtrl;
 
 int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp);
 void nvme_ns_drain(NvmeNamespace *ns);
-void nvme_ns_flush(NvmeNamespace *ns);
+void nvme_ns_shutdown(NvmeNamespace *ns);
 
 #endif /* NVME_NS_H */
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 37f95951a6b8..a0de53e71878 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -129,7 +129,7 @@ void nvme_ns_drain(NvmeNamespace *ns)
     blk_drain(ns->blkconf.blk);
 }
 
-void nvme_ns_flush(NvmeNamespace *ns)
+void nvme_ns_shutdown(NvmeNamespace *ns)
 {
     blk_flush(ns->blkconf.blk);
 }
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index de52487aaf06..f54c5c6ea44d 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -2295,6 +2295,20 @@ static void nvme_clear_ctrl(NvmeCtrl *n)
     n->aer_queued = 0;
     n->outstanding_aers = 0;
     n->qs_created = false;
+}
+
+static void nvme_ctrl_reset(NvmeCtrl *n)
+{
+    nvme_clear_ctrl(n);
+    n->bar.cc = 0;
+}
+
+static void nvme_ctrl_shutdown(NvmeCtrl *n)
+{
+    NvmeNamespace *ns;
+    int i;
+
+    nvme_clear_ctrl(n);
 
     for (i = 1; i <= n->num_namespaces; i++) {
         ns = nvme_ns(n, i);
@@ -2302,10 +2316,8 @@ static void nvme_clear_ctrl(NvmeCtrl *n)
             continue;
         }
 
-        nvme_ns_flush(ns);
+        nvme_ns_shutdown(ns);
     }
-
-    n->bar.cc = 0;
 }
 
 static int nvme_start_ctrl(NvmeCtrl *n)
@@ -2472,12 +2484,12 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
             }
         } else if (!NVME_CC_EN(data) && NVME_CC_EN(n->bar.cc)) {
             trace_pci_nvme_mmio_stopped();
-            nvme_clear_ctrl(n);
+            nvme_ctrl_reset(n);
             n->bar.csts &= ~NVME_CSTS_READY;
         }
         if (NVME_CC_SHN(data) && !(NVME_CC_SHN(n->bar.cc))) {
             trace_pci_nvme_mmio_shutdown_set();
-            nvme_clear_ctrl(n);
+            nvme_ctrl_shutdown(n);
             n->bar.cc = data;
             n->bar.csts |= NVME_CSTS_SHST_COMPLETE;
         } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(n->bar.cc)) {
@@ -3088,7 +3100,7 @@ static void nvme_exit(PCIDevice *pci_dev)
 {
     NvmeCtrl *n = NVME(pci_dev);
 
-    nvme_clear_ctrl(n);
+    nvme_ctrl_shutdown(n);
     g_free(n->cq);
     g_free(n->sq);
     g_free(n->aer_reqs);
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 09/56] hw/block/nvme: Generate namespace UUIDs
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (7 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 08/56] hw/block/nvme: Process controller reset and shutdown differently Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 10/56] hw/block/nvme: Separate read and write handlers Klaus Jensen
                   ` (47 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Niklas Cassel, Dmitry Fomichev,
	Klaus Jensen, Max Reitz, Klaus Jensen, Stefan Hajnoczi,
	Keith Busch
From: Dmitry Fomichev <dmitry.fomichev@wdc.com>
In NVMe 1.4, a namespace must report an ID descriptor of UUID type
if it doesn't support EUI64 or NGUID. Add a new namespace property,
"uuid", that provides the user the option to either specify the UUID
explicitly or have a UUID generated automatically every time a
namespace is initialized.
Suggested-by: Klaus Jensen <k.jensen@samsung.com>
Signed-off-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Niklas Cassel <Niklas.Cassel@wdc.com>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme-ns.h | 1 +
 hw/block/nvme-ns.c | 1 +
 hw/block/nvme.c    | 9 +++++----
 3 files changed, 7 insertions(+), 4 deletions(-)
diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index ed3d7e65d597..aeca810fc7a8 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -21,6 +21,7 @@
 
 typedef struct NvmeNamespaceParams {
     uint32_t nsid;
+    QemuUUID uuid;
 } NvmeNamespaceParams;
 
 typedef struct NvmeNamespace {
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index a0de53e71878..f6d752b71467 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -151,6 +151,7 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp)
 static Property nvme_ns_props[] = {
     DEFINE_BLOCK_PROPERTIES(NvmeNamespace, blkconf),
     DEFINE_PROP_UINT32("nsid", NvmeNamespace, params.nsid, 0),
+    DEFINE_PROP_UUID("uuid", NvmeNamespace, params.uuid),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index f54c5c6ea44d..7b243a56efdf 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1765,6 +1765,7 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req)
 
 static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
 {
+    NvmeNamespace *ns;
     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
     uint32_t nsid = le32_to_cpu(c->nsid);
     uint8_t list[NVME_IDENTIFY_DATA_SIZE];
@@ -1784,7 +1785,8 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
         return NVME_INVALID_NSID | NVME_DNR;
     }
 
-    if (unlikely(!nvme_ns(n, nsid))) {
+    ns = nvme_ns(n, nsid);
+    if (unlikely(!ns)) {
         return NVME_INVALID_FIELD | NVME_DNR;
     }
 
@@ -1793,12 +1795,11 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
     /*
      * Because the NGUID and EUI64 fields are 0 in the Identify Namespace data
      * structure, a Namespace UUID (nidt = 0x3) must be reported in the
-     * Namespace Identification Descriptor. Add a very basic Namespace UUID
-     * here.
+     * Namespace Identification Descriptor. Add the namespace UUID here.
      */
     ns_descrs->uuid.hdr.nidt = NVME_NIDT_UUID;
     ns_descrs->uuid.hdr.nidl = NVME_NIDT_UUID_LEN;
-    stl_be_p(&ns_descrs->uuid.v, nsid);
+    memcpy(&ns_descrs->uuid.v, ns->params.uuid.data, NVME_NIDT_UUID_LEN);
 
     return nvme_dma(n, list, NVME_IDENTIFY_DATA_SIZE,
                     DMA_DIRECTION_FROM_DEVICE, req);
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 10/56] hw/block/nvme: Separate read and write handlers
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (8 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 09/56] hw/block/nvme: Generate namespace UUIDs Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 11/56] hw/block/nvme: Combine nvme_write_zeroes() and nvme_write() Klaus Jensen
                   ` (46 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Niklas Cassel, Dmitry Fomichev,
	Klaus Jensen, Max Reitz, Klaus Jensen, Stefan Hajnoczi,
	Keith Busch
From: Dmitry Fomichev <dmitry.fomichev@wdc.com>
The majority of code in nvme_rw() is becoming read- or write-specific.
Move these parts to two separate handlers, nvme_read() and nvme_write()
to make the code more readable and to remove multiple is_write checks
that has been present in the i/o path.
This is a refactoring patch, no change in functionality.
Signed-off-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Reviewed-by: Niklas Cassel <Niklas.Cassel@wdc.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme.c       | 107 ++++++++++++++++++++++++++++--------------
 hw/block/trace-events |   3 +-
 2 files changed, 74 insertions(+), 36 deletions(-)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 7b243a56efdf..905fd1ba93f5 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1176,6 +1176,61 @@ static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
     return NVME_NO_COMPLETE;
 }
 
+static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
+{
+    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
+    NvmeNamespace *ns = req->ns;
+    uint64_t slba = le64_to_cpu(rw->slba);
+    uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
+    uint64_t data_size = nvme_l2b(ns, nlb);
+    uint64_t data_offset;
+    BlockBackend *blk = ns->blkconf.blk;
+    uint16_t status;
+
+    trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, data_size, slba);
+
+    status = nvme_check_mdts(n, data_size);
+    if (status) {
+        trace_pci_nvme_err_mdts(nvme_cid(req), data_size);
+        goto invalid;
+    }
+
+    status = nvme_check_bounds(ns, slba, nlb);
+    if (status) {
+        trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
+        goto invalid;
+    }
+
+    status = nvme_map_dptr(n, data_size, req);
+    if (status) {
+        goto invalid;
+    }
+
+    if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
+        status = nvme_check_dulbe(ns, slba, nlb);
+        if (status) {
+            goto invalid;
+        }
+    }
+
+    data_offset = nvme_l2b(ns, slba);
+
+    block_acct_start(blk_get_stats(blk), &req->acct, data_size,
+                     BLOCK_ACCT_READ);
+    if (req->qsg.sg) {
+        req->aiocb = dma_blk_read(blk, &req->qsg, data_offset,
+                                  BDRV_SECTOR_SIZE, nvme_rw_cb, req);
+    } else {
+        req->aiocb = blk_aio_preadv(blk, data_offset, &req->iov, 0,
+                                    nvme_rw_cb, req);
+    }
+    return NVME_NO_COMPLETE;
+
+invalid:
+    block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ);
+    return status | NVME_DNR;
+}
+
 static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
 {
     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
@@ -1201,22 +1256,19 @@ static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
     return NVME_NO_COMPLETE;
 }
 
-static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req)
+static uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
 {
     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
     NvmeNamespace *ns = req->ns;
-    uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
     uint64_t slba = le64_to_cpu(rw->slba);
-
+    uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
     uint64_t data_size = nvme_l2b(ns, nlb);
-    uint64_t data_offset = nvme_l2b(ns, slba);
-    enum BlockAcctType acct = req->cmd.opcode == NVME_CMD_WRITE ?
-        BLOCK_ACCT_WRITE : BLOCK_ACCT_READ;
+    uint64_t data_offset;
     BlockBackend *blk = ns->blkconf.blk;
     uint16_t status;
 
-    trace_pci_nvme_rw(nvme_cid(req), nvme_io_opc_str(rw->opcode),
-                      nvme_nsid(ns), nlb, data_size, slba);
+    trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
+                         nvme_nsid(ns), nlb, data_size, slba);
 
     status = nvme_check_mdts(n, data_size);
     if (status) {
@@ -1230,43 +1282,27 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req)
         goto invalid;
     }
 
-    if (acct == BLOCK_ACCT_READ) {
-        if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
-            status = nvme_check_dulbe(ns, slba, nlb);
-            if (status) {
-                goto invalid;
-            }
-        }
-    }
-
     status = nvme_map_dptr(n, data_size, req);
     if (status) {
         goto invalid;
     }
 
-    block_acct_start(blk_get_stats(blk), &req->acct, data_size, acct);
+    data_offset = nvme_l2b(ns, slba);
+
+    block_acct_start(blk_get_stats(blk), &req->acct, data_size,
+                     BLOCK_ACCT_WRITE);
     if (req->qsg.sg) {
-        if (acct == BLOCK_ACCT_WRITE) {
-            req->aiocb = dma_blk_write(blk, &req->qsg, data_offset,
-                                       BDRV_SECTOR_SIZE, nvme_rw_cb, req);
-        } else {
-            req->aiocb = dma_blk_read(blk, &req->qsg, data_offset,
-                                      BDRV_SECTOR_SIZE, nvme_rw_cb, req);
-        }
+        req->aiocb = dma_blk_write(blk, &req->qsg, data_offset,
+                                   BDRV_SECTOR_SIZE, nvme_rw_cb, req);
     } else {
-        if (acct == BLOCK_ACCT_WRITE) {
-            req->aiocb = blk_aio_pwritev(blk, data_offset, &req->iov, 0,
-                                         nvme_rw_cb, req);
-        } else {
-            req->aiocb = blk_aio_preadv(blk, data_offset, &req->iov, 0,
-                                        nvme_rw_cb, req);
-        }
+        req->aiocb = blk_aio_pwritev(blk, data_offset, &req->iov, 0,
+                                     nvme_rw_cb, req);
     }
     return NVME_NO_COMPLETE;
 
 invalid:
-    block_acct_invalid(blk_get_stats(ns->blkconf.blk), acct);
-    return status;
+    block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_WRITE);
+    return status | NVME_DNR;
 }
 
 static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
@@ -1295,8 +1331,9 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
     case NVME_CMD_WRITE_ZEROES:
         return nvme_write_zeroes(n, req);
     case NVME_CMD_WRITE:
+        return nvme_write(n, req);
     case NVME_CMD_READ:
-        return nvme_rw(n, req);
+        return nvme_read(n, req);
     case NVME_CMD_COMPARE:
         return nvme_compare(n, req);
     case NVME_CMD_DSM:
diff --git a/hw/block/trace-events b/hw/block/trace-events
index 68a4c8ed35e0..ec1b43220eff 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -40,7 +40,8 @@ pci_nvme_map_prp(uint64_t trans_len, uint32_t len, uint64_t prp1, uint64_t prp2,
 pci_nvme_map_sgl(uint16_t cid, uint8_t typ, uint64_t len) "cid %"PRIu16" type 0x%"PRIx8" len %"PRIu64""
 pci_nvme_io_cmd(uint16_t cid, uint32_t nsid, uint16_t sqid, uint8_t opcode, const char *opname) "cid %"PRIu16" nsid %"PRIu32" sqid %"PRIu16" opc 0x%"PRIx8" opname '%s'"
 pci_nvme_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t opcode, const char *opname) "cid %"PRIu16" sqid %"PRIu16" opc 0x%"PRIx8" opname '%s'"
-pci_nvme_rw(uint16_t cid, const char *verb, uint32_t nsid, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" opname '%s' nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64""
+pci_nvme_read(uint16_t cid, uint32_t nsid, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64""
+pci_nvme_write(uint16_t cid, const char *verb, uint32_t nsid, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" opname '%s' nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64""
 pci_nvme_rw_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'"
 pci_nvme_write_zeroes(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba %"PRIu64" nlb %"PRIu32""
 pci_nvme_block_status(int64_t offset, int64_t bytes, int64_t pnum, int ret, bool zeroed) "offset %"PRId64" bytes %"PRId64" pnum %"PRId64" ret 0x%x zeroed %d"
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 11/56] hw/block/nvme: Combine nvme_write_zeroes() and nvme_write()
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (9 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 10/56] hw/block/nvme: Separate read and write handlers Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 12/56] hw/block/nvme: Add Commands Supported and Effects log Klaus Jensen
                   ` (45 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Niklas Cassel, Dmitry Fomichev,
	Klaus Jensen, Max Reitz, Klaus Jensen, Stefan Hajnoczi,
	Keith Busch
From: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Move write processing to nvme_do_write() that now handles both WRITE
and WRITE ZEROES. Both nvme_write() and nvme_write_zeroes() become
inline helper functions.
Signed-off-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Reviewed-by: Niklas Cassel <Niklas.Cassel@wdc.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme.c       | 78 ++++++++++++++++++++-----------------------
 hw/block/trace-events |  1 -
 2 files changed, 36 insertions(+), 43 deletions(-)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 905fd1ba93f5..4d1ca8c466c5 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1231,32 +1231,7 @@ invalid:
     return status | NVME_DNR;
 }
 
-static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
-{
-    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
-    NvmeNamespace *ns = req->ns;
-    uint64_t slba = le64_to_cpu(rw->slba);
-    uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
-    uint64_t offset = nvme_l2b(ns, slba);
-    uint32_t count = nvme_l2b(ns, nlb);
-    uint16_t status;
-
-    trace_pci_nvme_write_zeroes(nvme_cid(req), nvme_nsid(ns), slba, nlb);
-
-    status = nvme_check_bounds(ns, slba, nlb);
-    if (status) {
-        trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
-        return status;
-    }
-
-    block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0,
-                     BLOCK_ACCT_WRITE);
-    req->aiocb = blk_aio_pwrite_zeroes(req->ns->blkconf.blk, offset, count,
-                                       BDRV_REQ_MAY_UNMAP, nvme_rw_cb, req);
-    return NVME_NO_COMPLETE;
-}
-
-static uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
+static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool wrz)
 {
     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
     NvmeNamespace *ns = req->ns;
@@ -1270,10 +1245,12 @@ static uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
     trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
                          nvme_nsid(ns), nlb, data_size, slba);
 
-    status = nvme_check_mdts(n, data_size);
-    if (status) {
-        trace_pci_nvme_err_mdts(nvme_cid(req), data_size);
-        goto invalid;
+    if (!wrz) {
+        status = nvme_check_mdts(n, data_size);
+        if (status) {
+            trace_pci_nvme_err_mdts(nvme_cid(req), data_size);
+            goto invalid;
+        }
     }
 
     status = nvme_check_bounds(ns, slba, nlb);
@@ -1282,21 +1259,28 @@ static uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
         goto invalid;
     }
 
-    status = nvme_map_dptr(n, data_size, req);
-    if (status) {
-        goto invalid;
-    }
-
     data_offset = nvme_l2b(ns, slba);
 
-    block_acct_start(blk_get_stats(blk), &req->acct, data_size,
-                     BLOCK_ACCT_WRITE);
-    if (req->qsg.sg) {
-        req->aiocb = dma_blk_write(blk, &req->qsg, data_offset,
-                                   BDRV_SECTOR_SIZE, nvme_rw_cb, req);
+    if (!wrz) {
+        status = nvme_map_dptr(n, data_size, req);
+        if (status) {
+            goto invalid;
+        }
+
+        block_acct_start(blk_get_stats(blk), &req->acct, data_size,
+                         BLOCK_ACCT_WRITE);
+        if (req->qsg.sg) {
+            req->aiocb = dma_blk_write(blk, &req->qsg, data_offset,
+                                       BDRV_SECTOR_SIZE, nvme_rw_cb, req);
+        } else {
+            req->aiocb = blk_aio_pwritev(blk, data_offset, &req->iov, 0,
+                                         nvme_rw_cb, req);
+        }
     } else {
-        req->aiocb = blk_aio_pwritev(blk, data_offset, &req->iov, 0,
-                                     nvme_rw_cb, req);
+        block_acct_start(blk_get_stats(blk), &req->acct, 0, BLOCK_ACCT_WRITE);
+        req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size,
+                                           BDRV_REQ_MAY_UNMAP, nvme_rw_cb,
+                                           req);
     }
     return NVME_NO_COMPLETE;
 
@@ -1305,6 +1289,16 @@ invalid:
     return status | NVME_DNR;
 }
 
+static inline uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
+{
+    return nvme_do_write(n, req, false);
+}
+
+static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
+{
+    return nvme_do_write(n, req, true);
+}
+
 static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
 {
     uint32_t nsid = le32_to_cpu(req->cmd.nsid);
diff --git a/hw/block/trace-events b/hw/block/trace-events
index ec1b43220eff..60262b03c901 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -43,7 +43,6 @@ pci_nvme_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t opcode, const char *opna
 pci_nvme_read(uint16_t cid, uint32_t nsid, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64""
 pci_nvme_write(uint16_t cid, const char *verb, uint32_t nsid, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" opname '%s' nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64""
 pci_nvme_rw_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'"
-pci_nvme_write_zeroes(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba %"PRIu64" nlb %"PRIu32""
 pci_nvme_block_status(int64_t offset, int64_t bytes, int64_t pnum, int ret, bool zeroed) "offset %"PRId64" bytes %"PRId64" pnum %"PRId64" ret 0x%x zeroed %d"
 pci_nvme_dsm(uint16_t cid, uint32_t nsid, uint32_t nr, uint32_t attr) "cid %"PRIu16" nsid %"PRIu32" nr %"PRIu32" attr 0x%"PRIx32""
 pci_nvme_dsm_deallocate(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba %"PRIu64" nlb %"PRIu32""
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 12/56] hw/block/nvme: Add Commands Supported and Effects log
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (10 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 11/56] hw/block/nvme: Combine nvme_write_zeroes() and nvme_write() Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 13/56] hw/block/nvme: Add support for Namespace Types Klaus Jensen
                   ` (44 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Niklas Cassel, Dmitry Fomichev,
	Klaus Jensen, Max Reitz, Klaus Jensen, Stefan Hajnoczi,
	Keith Busch
From: Dmitry Fomichev <dmitry.fomichev@wdc.com>
This log page becomes necessary to implement to allow checking for
Zone Append command support in Zoned Namespace Command Set.
This commit adds the code to report this log page for NVM Command
Set only. The parts that are specific to zoned operation will be
added later in the series.
All incoming admin and i/o commands are now only processed if their
corresponding support bits are set in this log. This provides an
easy way to control what commands to support and what not to
depending on set CC.CSS.
Signed-off-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Reviewed-by: Niklas Cassel <Niklas.Cassel@wdc.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme-ns.h    |   1 +
 include/block/nvme.h  |  19 ++++++++
 hw/block/nvme.c       | 102 ++++++++++++++++++++++++++++++++++++++----
 hw/block/trace-events |   1 +
 4 files changed, 114 insertions(+), 9 deletions(-)
diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index aeca810fc7a8..bdeaf1c0de84 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -30,6 +30,7 @@ typedef struct NvmeNamespace {
     int32_t      bootindex;
     int64_t      size;
     NvmeIdNs     id_ns;
+    const uint32_t *iocs;
 
     NvmeNamespaceParams params;
 
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 11ac1c2b7dfb..397f7ca3b5cb 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -752,10 +752,27 @@ enum NvmeSmartWarn {
     NVME_SMART_FAILED_VOLATILE_MEDIA  = 1 << 4,
 };
 
+typedef struct NvmeEffectsLog {
+    uint32_t    acs[256];
+    uint32_t    iocs[256];
+    uint8_t     resv[2048];
+} NvmeEffectsLog;
+
+enum {
+    NVME_CMD_EFF_CSUPP      = 1 << 0,
+    NVME_CMD_EFF_LBCC       = 1 << 1,
+    NVME_CMD_EFF_NCC        = 1 << 2,
+    NVME_CMD_EFF_NIC        = 1 << 3,
+    NVME_CMD_EFF_CCC        = 1 << 4,
+    NVME_CMD_EFF_CSE_MASK   = 3 << 16,
+    NVME_CMD_EFF_UUID_SEL   = 1 << 19,
+};
+
 enum NvmeLogIdentifier {
     NVME_LOG_ERROR_INFO     = 0x01,
     NVME_LOG_SMART_INFO     = 0x02,
     NVME_LOG_FW_SLOT_INFO   = 0x03,
+    NVME_LOG_CMD_EFFECTS    = 0x05,
 };
 
 typedef struct QEMU_PACKED NvmePSD {
@@ -868,6 +885,7 @@ enum NvmeIdCtrlFrmw {
 
 enum NvmeIdCtrlLpa {
     NVME_LPA_NS_SMART = 1 << 0,
+    NVME_LPA_CSE      = 1 << 1,
     NVME_LPA_EXTENDED = 1 << 2,
 };
 
@@ -1076,6 +1094,7 @@ static inline void _nvme_check_size(void)
     QEMU_BUILD_BUG_ON(sizeof(NvmeErrorLog) != 64);
     QEMU_BUILD_BUG_ON(sizeof(NvmeFwSlotInfoLog) != 512);
     QEMU_BUILD_BUG_ON(sizeof(NvmeSmartLog) != 512);
+    QEMU_BUILD_BUG_ON(sizeof(NvmeEffectsLog) != 4096);
     QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrl) != 4096);
     QEMU_BUILD_BUG_ON(sizeof(NvmeIdNs) != 4096);
     QEMU_BUILD_BUG_ON(sizeof(NvmeSglDescriptor) != 16);
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 4d1ca8c466c5..05e799623c41 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -112,6 +112,30 @@ static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
     [NVME_TIMESTAMP]                = NVME_FEAT_CAP_CHANGE,
 };
 
+static const uint32_t nvme_cse_acs[256] = {
+    [NVME_ADM_CMD_DELETE_SQ]        = NVME_CMD_EFF_CSUPP,
+    [NVME_ADM_CMD_CREATE_SQ]        = NVME_CMD_EFF_CSUPP,
+    [NVME_ADM_CMD_GET_LOG_PAGE]     = NVME_CMD_EFF_CSUPP,
+    [NVME_ADM_CMD_DELETE_CQ]        = NVME_CMD_EFF_CSUPP,
+    [NVME_ADM_CMD_CREATE_CQ]        = NVME_CMD_EFF_CSUPP,
+    [NVME_ADM_CMD_IDENTIFY]         = NVME_CMD_EFF_CSUPP,
+    [NVME_ADM_CMD_ABORT]            = NVME_CMD_EFF_CSUPP,
+    [NVME_ADM_CMD_SET_FEATURES]     = NVME_CMD_EFF_CSUPP,
+    [NVME_ADM_CMD_GET_FEATURES]     = NVME_CMD_EFF_CSUPP,
+    [NVME_ADM_CMD_ASYNC_EV_REQ]     = NVME_CMD_EFF_CSUPP,
+};
+
+static const uint32_t nvme_cse_iocs_none[256];
+
+static const uint32_t nvme_cse_iocs_nvm[256] = {
+    [NVME_CMD_FLUSH]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
+    [NVME_CMD_WRITE_ZEROES]         = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
+    [NVME_CMD_WRITE]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
+    [NVME_CMD_READ]                 = NVME_CMD_EFF_CSUPP,
+    [NVME_CMD_DSM]                  = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
+    [NVME_CMD_COMPARE]              = NVME_CMD_EFF_CSUPP,
+};
+
 static void nvme_process_sq(void *opaque);
 
 static uint16_t nvme_cid(NvmeRequest *req)
@@ -1306,10 +1330,6 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
     trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req),
                           req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode));
 
-    if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_ADMIN_ONLY) {
-        return NVME_INVALID_OPCODE | NVME_DNR;
-    }
-
     if (!nvme_nsid_valid(n, nsid)) {
         return NVME_INVALID_NSID | NVME_DNR;
     }
@@ -1319,6 +1339,11 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
         return NVME_INVALID_FIELD | NVME_DNR;
     }
 
+    if (!(req->ns->iocs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
+        trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
+        return NVME_INVALID_OPCODE | NVME_DNR;
+    }
+
     switch (req->cmd.opcode) {
     case NVME_CMD_FLUSH:
         return nvme_flush(n, req);
@@ -1333,9 +1358,10 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
     case NVME_CMD_DSM:
         return nvme_dsm(n, req);
     default:
-        trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
-        return NVME_INVALID_OPCODE | NVME_DNR;
+        assert(false);
     }
+
+    return NVME_INVALID_OPCODE | NVME_DNR;
 }
 
 static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
@@ -1570,6 +1596,37 @@ static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
                     DMA_DIRECTION_FROM_DEVICE, req);
 }
 
+static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint32_t buf_len,
+                                 uint64_t off, NvmeRequest *req)
+{
+    NvmeEffectsLog log = {};
+    const uint32_t *src_iocs = NULL;
+    uint32_t trans_len;
+
+    if (off >= sizeof(log)) {
+        trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(log));
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    switch (NVME_CC_CSS(n->bar.cc)) {
+    case NVME_CC_CSS_NVM:
+        src_iocs = nvme_cse_iocs_nvm;
+    case NVME_CC_CSS_ADMIN_ONLY:
+        break;
+    }
+
+    memcpy(log.acs, nvme_cse_acs, sizeof(nvme_cse_acs));
+
+    if (src_iocs) {
+        memcpy(log.iocs, src_iocs, sizeof(log.iocs));
+    }
+
+    trans_len = MIN(sizeof(log) - off, buf_len);
+
+    return nvme_dma(n, ((uint8_t *)&log) + off, trans_len,
+                    DMA_DIRECTION_FROM_DEVICE, req);
+}
+
 static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
 {
     NvmeCmd *cmd = &req->cmd;
@@ -1613,6 +1670,8 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
         return nvme_smart_info(n, rae, len, off, req);
     case NVME_LOG_FW_SLOT_INFO:
         return nvme_fw_log_info(n, len, off, req);
+    case NVME_LOG_CMD_EFFECTS:
+        return nvme_cmd_effects(n, len, off, req);
     default:
         trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
         return NVME_INVALID_FIELD | NVME_DNR;
@@ -2229,6 +2288,11 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
     trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,
                              nvme_adm_opc_str(req->cmd.opcode));
 
+    if (!(nvme_cse_acs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
+        trace_pci_nvme_err_invalid_admin_opc(req->cmd.opcode);
+        return NVME_INVALID_OPCODE | NVME_DNR;
+    }
+
     switch (req->cmd.opcode) {
     case NVME_ADM_CMD_DELETE_SQ:
         return nvme_del_sq(n, req);
@@ -2251,9 +2315,10 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
     case NVME_ADM_CMD_ASYNC_EV_REQ:
         return nvme_aer(n, req);
     default:
-        trace_pci_nvme_err_invalid_admin_opc(req->cmd.opcode);
-        return NVME_INVALID_OPCODE | NVME_DNR;
+        assert(false);
     }
+
+    return NVME_INVALID_OPCODE | NVME_DNR;
 }
 
 static void nvme_process_sq(void *opaque)
@@ -2352,6 +2417,23 @@ static void nvme_ctrl_shutdown(NvmeCtrl *n)
     }
 }
 
+static void nvme_select_ns_iocs(NvmeCtrl *n)
+{
+    NvmeNamespace *ns;
+    int i;
+
+    for (i = 1; i <= n->num_namespaces; i++) {
+        ns = nvme_ns(n, i);
+        if (!ns) {
+            continue;
+        }
+        ns->iocs = nvme_cse_iocs_none;
+        if (NVME_CC_CSS(n->bar.cc) != NVME_CC_CSS_ADMIN_ONLY) {
+            ns->iocs = nvme_cse_iocs_nvm;
+        }
+    }
+}
+
 static int nvme_start_ctrl(NvmeCtrl *n)
 {
     uint32_t page_bits = NVME_CC_MPS(n->bar.cc) + 12;
@@ -2450,6 +2532,8 @@ static int nvme_start_ctrl(NvmeCtrl *n)
 
     QTAILQ_INIT(&n->aer_queue);
 
+    nvme_select_ns_iocs(n);
+
     return 0;
 }
 
@@ -3057,7 +3141,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
     id->acl = 3;
     id->aerl = n->params.aerl;
     id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO;
-    id->lpa = NVME_LPA_NS_SMART | NVME_LPA_EXTENDED;
+    id->lpa = NVME_LPA_NS_SMART | NVME_LPA_CSE | NVME_LPA_EXTENDED;
 
     /* recommended default value (~70 C) */
     id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING);
diff --git a/hw/block/trace-events b/hw/block/trace-events
index 60262b03c901..9e1a17e62711 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -108,6 +108,7 @@ pci_nvme_err_invalid_prp2_align(uint64_t prp2) "PRP2 is not page aligned: 0x%"PR
 pci_nvme_err_invalid_opc(uint8_t opc) "invalid opcode 0x%"PRIx8""
 pci_nvme_err_invalid_admin_opc(uint8_t opc) "invalid admin opcode 0x%"PRIx8""
 pci_nvme_err_invalid_lba_range(uint64_t start, uint64_t len, uint64_t limit) "Invalid LBA start=%"PRIu64" len=%"PRIu64" limit=%"PRIu64""
+pci_nvme_err_invalid_log_page_offset(uint64_t ofs, uint64_t size) "must be <= %"PRIu64", got %"PRIu64""
 pci_nvme_err_invalid_del_sq(uint16_t qid) "invalid submission queue deletion, sid=%"PRIu16""
 pci_nvme_err_invalid_create_sq_cqid(uint16_t cqid) "failed creating submission queue, invalid cqid=%"PRIu16""
 pci_nvme_err_invalid_create_sq_sqid(uint16_t sqid) "failed creating submission queue, invalid sqid=%"PRIu16""
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 13/56] hw/block/nvme: Add support for Namespace Types
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (11 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 12/56] hw/block/nvme: Add Commands Supported and Effects log Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 14/56] hw/block/nvme: Support allocated CNS command variants Klaus Jensen
                   ` (43 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Niklas Cassel, Dmitry Fomichev,
	Klaus Jensen, Max Reitz, Klaus Jensen, Stefan Hajnoczi,
	Keith Busch
From: Niklas Cassel <niklas.cassel@wdc.com>
Define the structures and constants required to implement
Namespace Types support.
Namespace Types introduce a new command set, "I/O Command Sets",
that allows the host to retrieve the command sets associated with
a namespace. Introduce support for the command set and enable
detection for the NVM Command Set.
The new workflows for identify commands rely heavily on zero-filled
identify structs. E.g., certain CNS commands are defined to return
a zero-filled identify struct when an inactive namespace NSID
is supplied.
Add a helper function in order to avoid code duplication when
reporting zero-filled identify structures.
Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
Signed-off-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme-ns.h    |   1 +
 include/block/nvme.h  |  64 ++++++++++----
 hw/block/nvme-ns.c    |   2 +
 hw/block/nvme.c       | 188 +++++++++++++++++++++++++++++++++++-------
 hw/block/trace-events |   6 ++
 5 files changed, 217 insertions(+), 44 deletions(-)
diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index bdeaf1c0de84..bdbc98c2ec17 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -31,6 +31,7 @@ typedef struct NvmeNamespace {
     int64_t      size;
     NvmeIdNs     id_ns;
     const uint32_t *iocs;
+    uint8_t      csi;
 
     NvmeNamespaceParams params;
 
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 397f7ca3b5cb..19347cf69e52 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -84,6 +84,7 @@ enum NvmeCapMask {
 
 enum NvmeCapCss {
     NVME_CAP_CSS_NVM        = 1 << 0,
+    NVME_CAP_CSS_CSI_SUPP   = 1 << 6,
     NVME_CAP_CSS_ADMIN_ONLY = 1 << 7,
 };
 
@@ -117,9 +118,25 @@ enum NvmeCcMask {
 
 enum NvmeCcCss {
     NVME_CC_CSS_NVM        = 0x0,
+    NVME_CC_CSS_CSI        = 0x6,
     NVME_CC_CSS_ADMIN_ONLY = 0x7,
 };
 
+#define NVME_SET_CC_EN(cc, val)     \
+    (cc |= (uint32_t)((val) & CC_EN_MASK) << CC_EN_SHIFT)
+#define NVME_SET_CC_CSS(cc, val)    \
+    (cc |= (uint32_t)((val) & CC_CSS_MASK) << CC_CSS_SHIFT)
+#define NVME_SET_CC_MPS(cc, val)    \
+    (cc |= (uint32_t)((val) & CC_MPS_MASK) << CC_MPS_SHIFT)
+#define NVME_SET_CC_AMS(cc, val)    \
+    (cc |= (uint32_t)((val) & CC_AMS_MASK) << CC_AMS_SHIFT)
+#define NVME_SET_CC_SHN(cc, val)    \
+    (cc |= (uint32_t)((val) & CC_SHN_MASK) << CC_SHN_SHIFT)
+#define NVME_SET_CC_IOSQES(cc, val) \
+    (cc |= (uint32_t)((val) & CC_IOSQES_MASK) << CC_IOSQES_SHIFT)
+#define NVME_SET_CC_IOCQES(cc, val) \
+    (cc |= (uint32_t)((val) & CC_IOCQES_MASK) << CC_IOCQES_SHIFT)
+
 enum NvmeCstsShift {
     CSTS_RDY_SHIFT      = 0,
     CSTS_CFS_SHIFT      = 1,
@@ -540,8 +557,13 @@ typedef struct QEMU_PACKED NvmeIdentify {
     uint64_t    rsvd2[2];
     uint64_t    prp1;
     uint64_t    prp2;
-    uint32_t    cns;
-    uint32_t    rsvd11[5];
+    uint8_t     cns;
+    uint8_t     rsvd10;
+    uint16_t    ctrlid;
+    uint16_t    nvmsetid;
+    uint8_t     rsvd11;
+    uint8_t     csi;
+    uint32_t    rsvd12[4];
 } NvmeIdentify;
 
 typedef struct QEMU_PACKED NvmeRwCmd {
@@ -662,6 +684,7 @@ enum NvmeStatusCodes {
     NVME_SGL_DESCR_TYPE_INVALID = 0x0011,
     NVME_INVALID_USE_OF_CMB     = 0x0012,
     NVME_INVALID_PRP_OFFSET     = 0x0013,
+    NVME_CMD_SET_CMB_REJECTED   = 0x002b,
     NVME_LBA_RANGE              = 0x0080,
     NVME_CAP_EXCEEDED           = 0x0081,
     NVME_NS_NOT_READY           = 0x0082,
@@ -789,11 +812,15 @@ typedef struct QEMU_PACKED NvmePSD {
 
 #define NVME_IDENTIFY_DATA_SIZE 4096
 
-enum {
-    NVME_ID_CNS_NS             = 0x0,
-    NVME_ID_CNS_CTRL           = 0x1,
-    NVME_ID_CNS_NS_ACTIVE_LIST = 0x2,
-    NVME_ID_CNS_NS_DESCR_LIST  = 0x3,
+enum NvmeIdCns {
+    NVME_ID_CNS_NS                = 0x00,
+    NVME_ID_CNS_CTRL              = 0x01,
+    NVME_ID_CNS_NS_ACTIVE_LIST    = 0x02,
+    NVME_ID_CNS_NS_DESCR_LIST     = 0x03,
+    NVME_ID_CNS_CS_NS             = 0x05,
+    NVME_ID_CNS_CS_CTRL           = 0x06,
+    NVME_ID_CNS_CS_NS_ACTIVE_LIST = 0x07,
+    NVME_ID_CNS_IO_COMMAND_SET    = 0x1c,
 };
 
 typedef struct QEMU_PACKED NvmeIdCtrl {
@@ -944,6 +971,7 @@ enum NvmeFeatureIds {
     NVME_WRITE_ATOMICITY            = 0xa,
     NVME_ASYNCHRONOUS_EVENT_CONF    = 0xb,
     NVME_TIMESTAMP                  = 0xe,
+    NVME_COMMAND_SET_PROFILE        = 0x19,
     NVME_SOFTWARE_PROGRESS_MARKER   = 0x80,
     NVME_FID_MAX                    = 0x100,
 };
@@ -1033,18 +1061,26 @@ typedef struct QEMU_PACKED NvmeIdNsDescr {
     uint8_t rsvd2[2];
 } NvmeIdNsDescr;
 
-enum {
-    NVME_NIDT_EUI64_LEN =  8,
-    NVME_NIDT_NGUID_LEN = 16,
-    NVME_NIDT_UUID_LEN  = 16,
+enum NvmeNsIdentifierLength {
+    NVME_NIDL_EUI64             = 8,
+    NVME_NIDL_NGUID             = 16,
+    NVME_NIDL_UUID              = 16,
+    NVME_NIDL_CSI               = 1,
 };
 
 enum NvmeNsIdentifierType {
-    NVME_NIDT_EUI64 = 0x1,
-    NVME_NIDT_NGUID = 0x2,
-    NVME_NIDT_UUID  = 0x3,
+    NVME_NIDT_EUI64             = 0x01,
+    NVME_NIDT_NGUID             = 0x02,
+    NVME_NIDT_UUID              = 0x03,
+    NVME_NIDT_CSI               = 0x04,
 };
 
+enum NvmeCsi {
+    NVME_CSI_NVM                = 0x00,
+};
+
+#define NVME_SET_CSI(vec, csi) (vec |= (uint8_t)(1 << (csi)))
+
 /*Deallocate Logical Block Features*/
 #define NVME_ID_NS_DLFEAT_GUARD_CRC(dlfeat)       ((dlfeat) & 0x10)
 #define NVME_ID_NS_DLFEAT_WRITE_ZEROES(dlfeat)    ((dlfeat) & 0x08)
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index f6d752b71467..f5f6284a4d48 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -43,6 +43,8 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
 
     id_ns->nsze = cpu_to_le64(nvme_ns_nlbas(ns));
 
+    ns->csi = NVME_CSI_NVM;
+
     /* no thin provisioning */
     id_ns->ncap = id_ns->nsze;
     id_ns->nuse = id_ns->ncap;
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 05e799623c41..27679c8be816 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1596,7 +1596,7 @@ static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
                     DMA_DIRECTION_FROM_DEVICE, req);
 }
 
-static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint32_t buf_len,
+static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len,
                                  uint64_t off, NvmeRequest *req)
 {
     NvmeEffectsLog log = {};
@@ -1611,8 +1611,15 @@ static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint32_t buf_len,
     switch (NVME_CC_CSS(n->bar.cc)) {
     case NVME_CC_CSS_NVM:
         src_iocs = nvme_cse_iocs_nvm;
+        /* fall through */
     case NVME_CC_CSS_ADMIN_ONLY:
         break;
+    case NVME_CC_CSS_CSI:
+        switch (csi) {
+        case NVME_CSI_NVM:
+            src_iocs = nvme_cse_iocs_nvm;
+            break;
+        }
     }
 
     memcpy(log.acs, nvme_cse_acs, sizeof(nvme_cse_acs));
@@ -1638,6 +1645,7 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
     uint8_t  lid = dw10 & 0xff;
     uint8_t  lsp = (dw10 >> 8) & 0xf;
     uint8_t  rae = (dw10 >> 15) & 0x1;
+    uint8_t  csi = le32_to_cpu(cmd->cdw14) >> 24;
     uint32_t numdl, numdu;
     uint64_t off, lpol, lpou;
     size_t   len;
@@ -1671,7 +1679,7 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
     case NVME_LOG_FW_SLOT_INFO:
         return nvme_fw_log_info(n, len, off, req);
     case NVME_LOG_CMD_EFFECTS:
-        return nvme_cmd_effects(n, len, off, req);
+        return nvme_cmd_effects(n, csi, len, off, req);
     default:
         trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
         return NVME_INVALID_FIELD | NVME_DNR;
@@ -1784,6 +1792,13 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req)
     return NVME_SUCCESS;
 }
 
+static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req)
+{
+    uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
+
+    return nvme_dma(n, id, sizeof(id), DMA_DIRECTION_FROM_DEVICE, req);
+}
+
 static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
 {
     trace_pci_nvme_identify_ctrl();
@@ -1792,11 +1807,23 @@ static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
                     DMA_DIRECTION_FROM_DEVICE, req);
 }
 
+static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req)
+{
+    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
+
+    trace_pci_nvme_identify_ctrl_csi(c->csi);
+
+    if (c->csi == NVME_CSI_NVM) {
+        return nvme_rpt_empty_id_struct(n, req);
+    }
+
+    return NVME_INVALID_FIELD | NVME_DNR;
+}
+
 static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req)
 {
     NvmeNamespace *ns;
     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
-    NvmeIdNs *id_ns, inactive = { 0 };
     uint32_t nsid = le32_to_cpu(c->nsid);
 
     trace_pci_nvme_identify_ns(nsid);
@@ -1807,23 +1834,46 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req)
 
     ns = nvme_ns(n, nsid);
     if (unlikely(!ns)) {
-        id_ns = &inactive;
-    } else {
-        id_ns = &ns->id_ns;
+        return nvme_rpt_empty_id_struct(n, req);
     }
 
-    return nvme_dma(n, (uint8_t *)id_ns, sizeof(NvmeIdNs),
+    return nvme_dma(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs),
                     DMA_DIRECTION_FROM_DEVICE, req);
 }
 
+static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req)
+{
+    NvmeNamespace *ns;
+    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
+    uint32_t nsid = le32_to_cpu(c->nsid);
+
+    trace_pci_nvme_identify_ns_csi(nsid, c->csi);
+
+    if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
+        return NVME_INVALID_NSID | NVME_DNR;
+    }
+
+    ns = nvme_ns(n, nsid);
+    if (unlikely(!ns)) {
+        return nvme_rpt_empty_id_struct(n, req);
+    }
+
+    if (c->csi == NVME_CSI_NVM) {
+        return nvme_rpt_empty_id_struct(n, req);
+    }
+
+    return NVME_INVALID_FIELD | NVME_DNR;
+}
+
 static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req)
 {
+    NvmeNamespace *ns;
     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
-    static const int data_len = NVME_IDENTIFY_DATA_SIZE;
     uint32_t min_nsid = le32_to_cpu(c->nsid);
-    uint32_t *list;
-    uint16_t ret;
-    int j = 0;
+    uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
+    static const int data_len = sizeof(list);
+    uint32_t *list_ptr = (uint32_t *)list;
+    int i, j = 0;
 
     trace_pci_nvme_identify_nslist(min_nsid);
 
@@ -1837,20 +1887,61 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req)
         return NVME_INVALID_NSID | NVME_DNR;
     }
 
-    list = g_malloc0(data_len);
-    for (int i = 1; i <= n->num_namespaces; i++) {
-        if (i <= min_nsid || !nvme_ns(n, i)) {
+    for (i = 1; i <= n->num_namespaces; i++) {
+        ns = nvme_ns(n, i);
+        if (!ns) {
             continue;
         }
-        list[j++] = cpu_to_le32(i);
+        if (ns->params.nsid <= min_nsid) {
+            continue;
+        }
+        list_ptr[j++] = cpu_to_le32(ns->params.nsid);
         if (j == data_len / sizeof(uint32_t)) {
             break;
         }
     }
-    ret = nvme_dma(n, (uint8_t *)list, data_len, DMA_DIRECTION_FROM_DEVICE,
-                   req);
-    g_free(list);
-    return ret;
+
+    return nvme_dma(n, list, data_len, DMA_DIRECTION_FROM_DEVICE, req);
+}
+
+static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req)
+{
+    NvmeNamespace *ns;
+    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
+    uint32_t min_nsid = le32_to_cpu(c->nsid);
+    uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
+    static const int data_len = sizeof(list);
+    uint32_t *list_ptr = (uint32_t *)list;
+    int i, j = 0;
+
+    trace_pci_nvme_identify_nslist_csi(min_nsid, c->csi);
+
+    /*
+     * Same as in nvme_identify_nslist(), 0xffffffff/0xfffffffe are invalid.
+     */
+    if (min_nsid >= NVME_NSID_BROADCAST - 1) {
+        return NVME_INVALID_NSID | NVME_DNR;
+    }
+
+    if (c->csi != NVME_CSI_NVM) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    for (i = 1; i <= n->num_namespaces; i++) {
+        ns = nvme_ns(n, i);
+        if (!ns) {
+            continue;
+        }
+        if (ns->params.nsid <= min_nsid) {
+            continue;
+        }
+        list_ptr[j++] = cpu_to_le32(ns->params.nsid);
+        if (j == data_len / sizeof(uint32_t)) {
+            break;
+        }
+    }
+
+    return nvme_dma(n, list, data_len, DMA_DIRECTION_FROM_DEVICE, req);
 }
 
 static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
@@ -1858,13 +1949,17 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
     NvmeNamespace *ns;
     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
     uint32_t nsid = le32_to_cpu(c->nsid);
-    uint8_t list[NVME_IDENTIFY_DATA_SIZE];
+    uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
 
     struct data {
         struct {
             NvmeIdNsDescr hdr;
-            uint8_t v[16];
+            uint8_t v[NVME_NIDL_UUID];
         } uuid;
+        struct {
+            NvmeIdNsDescr hdr;
+            uint8_t v;
+        } csi;
     };
 
     struct data *ns_descrs = (struct data *)list;
@@ -1880,19 +1975,31 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
         return NVME_INVALID_FIELD | NVME_DNR;
     }
 
-    memset(list, 0x0, sizeof(list));
-
     /*
      * Because the NGUID and EUI64 fields are 0 in the Identify Namespace data
      * structure, a Namespace UUID (nidt = 0x3) must be reported in the
      * Namespace Identification Descriptor. Add the namespace UUID here.
      */
     ns_descrs->uuid.hdr.nidt = NVME_NIDT_UUID;
-    ns_descrs->uuid.hdr.nidl = NVME_NIDT_UUID_LEN;
-    memcpy(&ns_descrs->uuid.v, ns->params.uuid.data, NVME_NIDT_UUID_LEN);
+    ns_descrs->uuid.hdr.nidl = NVME_NIDL_UUID;
+    memcpy(&ns_descrs->uuid.v, ns->params.uuid.data, NVME_NIDL_UUID);
 
-    return nvme_dma(n, list, NVME_IDENTIFY_DATA_SIZE,
-                    DMA_DIRECTION_FROM_DEVICE, req);
+    ns_descrs->csi.hdr.nidt = NVME_NIDT_CSI;
+    ns_descrs->csi.hdr.nidl = NVME_NIDL_CSI;
+    ns_descrs->csi.v = ns->csi;
+
+    return nvme_dma(n, list, sizeof(list), DMA_DIRECTION_FROM_DEVICE, req);
+}
+
+static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req)
+{
+    uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
+    static const int data_len = sizeof(list);
+
+    trace_pci_nvme_identify_cmd_set();
+
+    NVME_SET_CSI(*list, NVME_CSI_NVM);
+    return nvme_dma(n, list, data_len, DMA_DIRECTION_FROM_DEVICE, req);
 }
 
 static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req)
@@ -1902,12 +2009,20 @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req)
     switch (le32_to_cpu(c->cns)) {
     case NVME_ID_CNS_NS:
         return nvme_identify_ns(n, req);
+    case NVME_ID_CNS_CS_NS:
+        return nvme_identify_ns_csi(n, req);
     case NVME_ID_CNS_CTRL:
         return nvme_identify_ctrl(n, req);
+    case NVME_ID_CNS_CS_CTRL:
+        return nvme_identify_ctrl_csi(n, req);
     case NVME_ID_CNS_NS_ACTIVE_LIST:
         return nvme_identify_nslist(n, req);
+    case NVME_ID_CNS_CS_NS_ACTIVE_LIST:
+        return nvme_identify_nslist_csi(n, req);
     case NVME_ID_CNS_NS_DESCR_LIST:
         return nvme_identify_ns_descr_list(n, req);
+    case NVME_ID_CNS_IO_COMMAND_SET:
+        return nvme_identify_cmd_set(n, req);
     default:
         trace_pci_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns));
         return NVME_INVALID_FIELD | NVME_DNR;
@@ -2096,7 +2211,9 @@ defaults:
         if (iv == n->admin_cq.vector) {
             result |= NVME_INTVC_NOCOALESCING;
         }
-
+        break;
+    case NVME_COMMAND_SET_PROFILE:
+        result = 0;
         break;
     default:
         result = nvme_feature_default[fid];
@@ -2258,6 +2375,12 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
         break;
     case NVME_TIMESTAMP:
         return nvme_set_feature_timestamp(n, req);
+    case NVME_COMMAND_SET_PROFILE:
+        if (dw11 & 0x1ff) {
+            trace_pci_nvme_err_invalid_iocsci(dw11 & 0x1ff);
+            return NVME_CMD_SET_CMB_REJECTED | NVME_DNR;
+        }
+        break;
     default:
         return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
     }
@@ -2428,8 +2551,12 @@ static void nvme_select_ns_iocs(NvmeCtrl *n)
             continue;
         }
         ns->iocs = nvme_cse_iocs_none;
-        if (NVME_CC_CSS(n->bar.cc) != NVME_CC_CSS_ADMIN_ONLY) {
-            ns->iocs = nvme_cse_iocs_nvm;
+        switch (ns->csi) {
+        case NVME_CSI_NVM:
+            if (NVME_CC_CSS(n->bar.cc) != NVME_CC_CSS_ADMIN_ONLY) {
+                ns->iocs = nvme_cse_iocs_nvm;
+            }
+            break;
         }
     }
 }
@@ -3170,6 +3297,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
     NVME_CAP_SET_CQR(n->bar.cap, 1);
     NVME_CAP_SET_TO(n->bar.cap, 0xf);
     NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_NVM);
+    NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_CSI_SUPP);
     NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_ADMIN_ONLY);
     NVME_CAP_SET_MPSMAX(n->bar.cap, 4);
 
diff --git a/hw/block/trace-events b/hw/block/trace-events
index 9e1a17e62711..e56e2b66b075 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -54,8 +54,12 @@ pci_nvme_create_cq(uint64_t addr, uint16_t cqid, uint16_t vector, uint16_t size,
 pci_nvme_del_sq(uint16_t qid) "deleting submission queue sqid=%"PRIu16""
 pci_nvme_del_cq(uint16_t cqid) "deleted completion queue, cqid=%"PRIu16""
 pci_nvme_identify_ctrl(void) "identify controller"
+pci_nvme_identify_ctrl_csi(uint8_t csi) "identify controller, csi=0x%"PRIx8""
 pci_nvme_identify_ns(uint32_t ns) "nsid %"PRIu32""
+pci_nvme_identify_ns_csi(uint32_t ns, uint8_t csi) "nsid=%"PRIu32", csi=0x%"PRIx8""
 pci_nvme_identify_nslist(uint32_t ns) "nsid %"PRIu32""
+pci_nvme_identify_nslist_csi(uint16_t ns, uint8_t csi) "nsid=%"PRIu16", csi=0x%"PRIx8""
+pci_nvme_identify_cmd_set(void) "identify i/o command set"
 pci_nvme_identify_ns_descr_list(uint32_t ns) "nsid %"PRIu32""
 pci_nvme_get_log(uint16_t cid, uint8_t lid, uint8_t lsp, uint8_t rae, uint32_t len, uint64_t off) "cid %"PRIu16" lid 0x%"PRIx8" lsp 0x%"PRIx8" rae 0x%"PRIx8" len %"PRIu32" off %"PRIu64""
 pci_nvme_getfeat(uint16_t cid, uint32_t nsid, uint8_t fid, uint8_t sel, uint32_t cdw11) "cid %"PRIu16" nsid 0x%"PRIx32" fid 0x%"PRIx8" sel 0x%"PRIx8" cdw11 0x%"PRIx32""
@@ -109,6 +113,7 @@ pci_nvme_err_invalid_opc(uint8_t opc) "invalid opcode 0x%"PRIx8""
 pci_nvme_err_invalid_admin_opc(uint8_t opc) "invalid admin opcode 0x%"PRIx8""
 pci_nvme_err_invalid_lba_range(uint64_t start, uint64_t len, uint64_t limit) "Invalid LBA start=%"PRIu64" len=%"PRIu64" limit=%"PRIu64""
 pci_nvme_err_invalid_log_page_offset(uint64_t ofs, uint64_t size) "must be <= %"PRIu64", got %"PRIu64""
+pci_nvme_err_invalid_iocsci(uint32_t idx) "unsupported command set combination index %"PRIu32""
 pci_nvme_err_invalid_del_sq(uint16_t qid) "invalid submission queue deletion, sid=%"PRIu16""
 pci_nvme_err_invalid_create_sq_cqid(uint16_t cqid) "failed creating submission queue, invalid cqid=%"PRIu16""
 pci_nvme_err_invalid_create_sq_sqid(uint16_t sqid) "failed creating submission queue, invalid sqid=%"PRIu16""
@@ -165,6 +170,7 @@ pci_nvme_ub_db_wr_invalid_cq(uint32_t qid) "completion queue doorbell write for
 pci_nvme_ub_db_wr_invalid_cqhead(uint32_t qid, uint16_t new_head) "completion queue doorbell write value beyond queue size, cqid=%"PRIu32", new_head=%"PRIu16", ignoring"
 pci_nvme_ub_db_wr_invalid_sq(uint32_t qid) "submission queue doorbell write for nonexistent queue, sqid=%"PRIu32", ignoring"
 pci_nvme_ub_db_wr_invalid_sqtail(uint32_t qid, uint16_t new_tail) "submission queue doorbell write value beyond queue size, sqid=%"PRIu32", new_head=%"PRIu16", ignoring"
+pci_nvme_ub_unknown_css_value(void) "unknown value in cc.css field"
 
 # xen-block.c
 xen_block_realize(const char *type, uint32_t disk, uint32_t partition) "%s d%up%u"
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 14/56] hw/block/nvme: Support allocated CNS command variants
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (12 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 13/56] hw/block/nvme: Add support for Namespace Types Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 15/56] nvme: Make ZNS-related definitions Klaus Jensen
                   ` (42 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Niklas Cassel, Dmitry Fomichev,
	Klaus Jensen, Max Reitz, Klaus Jensen, Stefan Hajnoczi,
	Keith Busch
From: Niklas Cassel <niklas.cassel@wdc.com>
Many CNS commands have "allocated" command variants. These include
a namespace as long as it is allocated, that is a namespace is
included regardless if it is active (attached) or not.
While these commands are optional (they are mandatory for controllers
supporting the namespace attachment command), our QEMU implementation
is more complete by actually providing support for these CNS values.
However, since our QEMU model currently does not support the namespace
attachment command, these new allocated CNS commands will return the
same result as the active CNS command variants.
The reason for not hooking up this command completely is because the
NVMe specification requires the namespace management command to be
supported if the namespace attachment command is supported.
Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
Signed-off-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 include/block/nvme.h | 20 ++++++++++++--------
 hw/block/nvme.c      |  8 ++++++++
 2 files changed, 20 insertions(+), 8 deletions(-)
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 19347cf69e52..adb5806365a3 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -813,14 +813,18 @@ typedef struct QEMU_PACKED NvmePSD {
 #define NVME_IDENTIFY_DATA_SIZE 4096
 
 enum NvmeIdCns {
-    NVME_ID_CNS_NS                = 0x00,
-    NVME_ID_CNS_CTRL              = 0x01,
-    NVME_ID_CNS_NS_ACTIVE_LIST    = 0x02,
-    NVME_ID_CNS_NS_DESCR_LIST     = 0x03,
-    NVME_ID_CNS_CS_NS             = 0x05,
-    NVME_ID_CNS_CS_CTRL           = 0x06,
-    NVME_ID_CNS_CS_NS_ACTIVE_LIST = 0x07,
-    NVME_ID_CNS_IO_COMMAND_SET    = 0x1c,
+    NVME_ID_CNS_NS                    = 0x00,
+    NVME_ID_CNS_CTRL                  = 0x01,
+    NVME_ID_CNS_NS_ACTIVE_LIST        = 0x02,
+    NVME_ID_CNS_NS_DESCR_LIST         = 0x03,
+    NVME_ID_CNS_CS_NS                 = 0x05,
+    NVME_ID_CNS_CS_CTRL               = 0x06,
+    NVME_ID_CNS_CS_NS_ACTIVE_LIST     = 0x07,
+    NVME_ID_CNS_NS_PRESENT_LIST       = 0x10,
+    NVME_ID_CNS_NS_PRESENT            = 0x11,
+    NVME_ID_CNS_CS_NS_PRESENT_LIST    = 0x1a,
+    NVME_ID_CNS_CS_NS_PRESENT         = 0x1b,
+    NVME_ID_CNS_IO_COMMAND_SET        = 0x1c,
 };
 
 typedef struct QEMU_PACKED NvmeIdCtrl {
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 27679c8be816..f1cc66d381a1 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -2008,16 +2008,24 @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req)
 
     switch (le32_to_cpu(c->cns)) {
     case NVME_ID_CNS_NS:
+         /* fall through */
+    case NVME_ID_CNS_NS_PRESENT:
         return nvme_identify_ns(n, req);
     case NVME_ID_CNS_CS_NS:
+         /* fall through */
+    case NVME_ID_CNS_CS_NS_PRESENT:
         return nvme_identify_ns_csi(n, req);
     case NVME_ID_CNS_CTRL:
         return nvme_identify_ctrl(n, req);
     case NVME_ID_CNS_CS_CTRL:
         return nvme_identify_ctrl_csi(n, req);
     case NVME_ID_CNS_NS_ACTIVE_LIST:
+         /* fall through */
+    case NVME_ID_CNS_NS_PRESENT_LIST:
         return nvme_identify_nslist(n, req);
     case NVME_ID_CNS_CS_NS_ACTIVE_LIST:
+         /* fall through */
+    case NVME_ID_CNS_CS_NS_PRESENT_LIST:
         return nvme_identify_nslist_csi(n, req);
     case NVME_ID_CNS_NS_DESCR_LIST:
         return nvme_identify_ns_descr_list(n, req);
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 15/56] nvme: Make ZNS-related definitions
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (13 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 14/56] hw/block/nvme: Support allocated CNS command variants Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 16/56] hw/block/nvme: Support Zoned Namespace Command Set Klaus Jensen
                   ` (41 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Dmitry Fomichev, Klaus Jensen,
	Max Reitz, Klaus Jensen, Stefan Hajnoczi, Keith Busch
From: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Define values and structures that are needed to support Zoned
Namespace Command Set (NVMe TP 4053).
Signed-off-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 include/block/nvme.h | 114 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 113 insertions(+), 1 deletion(-)
diff --git a/include/block/nvme.h b/include/block/nvme.h
index adb5806365a3..9494246f1f59 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -489,6 +489,9 @@ enum NvmeIoCommands {
     NVME_CMD_COMPARE            = 0x05,
     NVME_CMD_WRITE_ZEROES       = 0x08,
     NVME_CMD_DSM                = 0x09,
+    NVME_CMD_ZONE_MGMT_SEND     = 0x79,
+    NVME_CMD_ZONE_MGMT_RECV     = 0x7a,
+    NVME_CMD_ZONE_APPEND        = 0x7d,
 };
 
 typedef struct QEMU_PACKED NvmeDeleteQ {
@@ -654,9 +657,13 @@ typedef struct QEMU_PACKED NvmeAerResult {
     uint8_t resv;
 } NvmeAerResult;
 
+typedef struct QEMU_PACKED NvmeZonedResult {
+    uint64_t slba;
+} NvmeZonedResult;
+
 typedef struct QEMU_PACKED NvmeCqe {
     uint32_t    result;
-    uint32_t    rsvd;
+    uint32_t    dw1;
     uint16_t    sq_head;
     uint16_t    sq_id;
     uint16_t    cid;
@@ -685,6 +692,7 @@ enum NvmeStatusCodes {
     NVME_INVALID_USE_OF_CMB     = 0x0012,
     NVME_INVALID_PRP_OFFSET     = 0x0013,
     NVME_CMD_SET_CMB_REJECTED   = 0x002b,
+    NVME_INVALID_CMD_SET        = 0x002c,
     NVME_LBA_RANGE              = 0x0080,
     NVME_CAP_EXCEEDED           = 0x0081,
     NVME_NS_NOT_READY           = 0x0082,
@@ -709,6 +717,14 @@ enum NvmeStatusCodes {
     NVME_CONFLICTING_ATTRS      = 0x0180,
     NVME_INVALID_PROT_INFO      = 0x0181,
     NVME_WRITE_TO_RO            = 0x0182,
+    NVME_ZONE_BOUNDARY_ERROR    = 0x01b8,
+    NVME_ZONE_FULL              = 0x01b9,
+    NVME_ZONE_READ_ONLY         = 0x01ba,
+    NVME_ZONE_OFFLINE           = 0x01bb,
+    NVME_ZONE_INVALID_WRITE     = 0x01bc,
+    NVME_ZONE_TOO_MANY_ACTIVE   = 0x01bd,
+    NVME_ZONE_TOO_MANY_OPEN     = 0x01be,
+    NVME_ZONE_INVAL_TRANSITION  = 0x01bf,
     NVME_WRITE_FAULT            = 0x0280,
     NVME_UNRECOVERED_READ       = 0x0281,
     NVME_E2E_GUARD_ERROR        = 0x0282,
@@ -894,6 +910,11 @@ typedef struct QEMU_PACKED NvmeIdCtrl {
     uint8_t     vs[1024];
 } NvmeIdCtrl;
 
+typedef struct NvmeIdCtrlZoned {
+    uint8_t     zasl;
+    uint8_t     rsvd1[4095];
+} NvmeIdCtrlZoned;
+
 enum NvmeIdCtrlOacs {
     NVME_OACS_SECURITY  = 1 << 0,
     NVME_OACS_FORMAT    = 1 << 1,
@@ -1022,6 +1043,12 @@ typedef struct QEMU_PACKED NvmeLBAF {
     uint8_t     rp;
 } NvmeLBAF;
 
+typedef struct QEMU_PACKED NvmeLBAFE {
+    uint64_t    zsze;
+    uint8_t     zdes;
+    uint8_t     rsvd9[7];
+} NvmeLBAFE;
+
 #define NVME_NSID_BROADCAST 0xffffffff
 
 typedef struct QEMU_PACKED NvmeIdNs {
@@ -1081,10 +1108,24 @@ enum NvmeNsIdentifierType {
 
 enum NvmeCsi {
     NVME_CSI_NVM                = 0x00,
+    NVME_CSI_ZONED              = 0x02,
 };
 
 #define NVME_SET_CSI(vec, csi) (vec |= (uint8_t)(1 << (csi)))
 
+typedef struct QEMU_PACKED NvmeIdNsZoned {
+    uint16_t    zoc;
+    uint16_t    ozcs;
+    uint32_t    mar;
+    uint32_t    mor;
+    uint32_t    rrl;
+    uint32_t    frl;
+    uint8_t     rsvd20[2796];
+    NvmeLBAFE   lbafe[16];
+    uint8_t     rsvd3072[768];
+    uint8_t     vs[256];
+} NvmeIdNsZoned;
+
 /*Deallocate Logical Block Features*/
 #define NVME_ID_NS_DLFEAT_GUARD_CRC(dlfeat)       ((dlfeat) & 0x10)
 #define NVME_ID_NS_DLFEAT_WRITE_ZEROES(dlfeat)    ((dlfeat) & 0x08)
@@ -1117,10 +1158,76 @@ enum NvmeIdNsDps {
     DPS_FIRST_EIGHT = 8,
 };
 
+enum NvmeZoneAttr {
+    NVME_ZA_FINISHED_BY_CTLR         = 1 << 0,
+    NVME_ZA_FINISH_RECOMMENDED       = 1 << 1,
+    NVME_ZA_RESET_RECOMMENDED        = 1 << 2,
+    NVME_ZA_ZD_EXT_VALID             = 1 << 7,
+};
+
+typedef struct QEMU_PACKED NvmeZoneReportHeader {
+    uint64_t    nr_zones;
+    uint8_t     rsvd[56];
+} NvmeZoneReportHeader;
+
+enum NvmeZoneReceiveAction {
+    NVME_ZONE_REPORT                 = 0,
+    NVME_ZONE_REPORT_EXTENDED        = 1,
+};
+
+enum NvmeZoneReportType {
+    NVME_ZONE_REPORT_ALL             = 0,
+    NVME_ZONE_REPORT_EMPTY           = 1,
+    NVME_ZONE_REPORT_IMPLICITLY_OPEN = 2,
+    NVME_ZONE_REPORT_EXPLICITLY_OPEN = 3,
+    NVME_ZONE_REPORT_CLOSED          = 4,
+    NVME_ZONE_REPORT_FULL            = 5,
+    NVME_ZONE_REPORT_READ_ONLY       = 6,
+    NVME_ZONE_REPORT_OFFLINE         = 7,
+};
+
+enum NvmeZoneType {
+    NVME_ZONE_TYPE_RESERVED          = 0x00,
+    NVME_ZONE_TYPE_SEQ_WRITE         = 0x02,
+};
+
+enum NvmeZoneSendAction {
+    NVME_ZONE_ACTION_RSD             = 0x00,
+    NVME_ZONE_ACTION_CLOSE           = 0x01,
+    NVME_ZONE_ACTION_FINISH          = 0x02,
+    NVME_ZONE_ACTION_OPEN            = 0x03,
+    NVME_ZONE_ACTION_RESET           = 0x04,
+    NVME_ZONE_ACTION_OFFLINE         = 0x05,
+    NVME_ZONE_ACTION_SET_ZD_EXT      = 0x10,
+};
+
+typedef struct QEMU_PACKED NvmeZoneDescr {
+    uint8_t     zt;
+    uint8_t     zs;
+    uint8_t     za;
+    uint8_t     rsvd3[5];
+    uint64_t    zcap;
+    uint64_t    zslba;
+    uint64_t    wp;
+    uint8_t     rsvd32[32];
+} NvmeZoneDescr;
+
+enum NvmeZoneState {
+    NVME_ZONE_STATE_RESERVED         = 0x00,
+    NVME_ZONE_STATE_EMPTY            = 0x01,
+    NVME_ZONE_STATE_IMPLICITLY_OPEN  = 0x02,
+    NVME_ZONE_STATE_EXPLICITLY_OPEN  = 0x03,
+    NVME_ZONE_STATE_CLOSED           = 0x04,
+    NVME_ZONE_STATE_READ_ONLY        = 0x0D,
+    NVME_ZONE_STATE_FULL             = 0x0E,
+    NVME_ZONE_STATE_OFFLINE          = 0x0F,
+};
+
 static inline void _nvme_check_size(void)
 {
     QEMU_BUILD_BUG_ON(sizeof(NvmeBar) != 4096);
     QEMU_BUILD_BUG_ON(sizeof(NvmeAerResult) != 4);
+    QEMU_BUILD_BUG_ON(sizeof(NvmeZonedResult) != 8);
     QEMU_BUILD_BUG_ON(sizeof(NvmeCqe) != 16);
     QEMU_BUILD_BUG_ON(sizeof(NvmeDsmRange) != 16);
     QEMU_BUILD_BUG_ON(sizeof(NvmeCmd) != 64);
@@ -1136,8 +1243,13 @@ static inline void _nvme_check_size(void)
     QEMU_BUILD_BUG_ON(sizeof(NvmeSmartLog) != 512);
     QEMU_BUILD_BUG_ON(sizeof(NvmeEffectsLog) != 4096);
     QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrl) != 4096);
+    QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrlZoned) != 4096);
+    QEMU_BUILD_BUG_ON(sizeof(NvmeLBAF) != 4);
+    QEMU_BUILD_BUG_ON(sizeof(NvmeLBAFE) != 16);
     QEMU_BUILD_BUG_ON(sizeof(NvmeIdNs) != 4096);
+    QEMU_BUILD_BUG_ON(sizeof(NvmeIdNsZoned) != 4096);
     QEMU_BUILD_BUG_ON(sizeof(NvmeSglDescriptor) != 16);
     QEMU_BUILD_BUG_ON(sizeof(NvmeIdNsDescr) != 4);
+    QEMU_BUILD_BUG_ON(sizeof(NvmeZoneDescr) != 64);
 }
 #endif
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 16/56] hw/block/nvme: Support Zoned Namespace Command Set
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (14 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 15/56] nvme: Make ZNS-related definitions Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 17/56] hw/block/nvme: Introduce max active and open zone limits Klaus Jensen
                   ` (40 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, Ajay Joshi, Chaitanya Kulkarni, qemu-block,
	Niklas Cassel, Dmitry Fomichev, Adam Manzanares, Aravind Ramesh,
	Max Reitz, Klaus Jensen, Hans Holmberg, Stefan Hajnoczi,
	Klaus Jensen, Keith Busch, Shin'ichiro Kawasaki,
	Matias Bjorling
From: Dmitry Fomichev <dmitry.fomichev@wdc.com>
The emulation code has been changed to advertise NVM Command Set when
"zoned" device property is not set (default) and Zoned Namespace
Command Set otherwise.
Define values and structures that are needed to support Zoned
Namespace Command Set (NVMe TP 4053) in PCI NVMe controller emulator.
Define trace events where needed in newly introduced code.
In order to improve scalability, all open, closed and full zones
are organized in separate linked lists. Consequently, almost all
zone operations don't require scanning of the entire zone array
(which potentially can be quite large) - it is only necessary to
enumerate one or more zone lists.
Handlers for three new NVMe commands introduced in Zoned Namespace
Command Set specification are added, namely for Zone Management
Receive, Zone Management Send and Zone Append.
Device initialization code has been extended to create a proper
configuration for zoned operation using device properties.
Read/Write command handler is modified to only allow writes at the
write pointer if the namespace is zoned. For Zone Append command,
writes implicitly happen at the write pointer and the starting write
pointer value is returned as the result of the command. Write Zeroes
handler is modified to add zoned checks that are identical to those
done as a part of Write flow.
Subsequent commits in this series add ZDE support and checks for
active and open zone limits.
Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
Signed-off-by: Ajay Joshi <ajay.joshi@wdc.com>
Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Signed-off-by: Matias Bjorling <matias.bjorling@wdc.com>
Signed-off-by: Aravind Ramesh <aravind.ramesh@wdc.com>
Signed-off-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
Signed-off-by: Adam Manzanares <adam.manzanares@wdc.com>
Signed-off-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Reviewed-by: Niklas Cassel <Niklas.Cassel@wdc.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme-ns.h    |  52 +++
 hw/block/nvme.h       |   6 +
 hw/block/nvme-ns.c    | 166 +++++++++
 hw/block/nvme.c       | 807 +++++++++++++++++++++++++++++++++++++++++-
 hw/block/trace-events |  17 +
 5 files changed, 1040 insertions(+), 8 deletions(-)
diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index bdbc98c2ec17..388381dda0df 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -19,9 +19,20 @@
 #define NVME_NS(obj) \
     OBJECT_CHECK(NvmeNamespace, (obj), TYPE_NVME_NS)
 
+typedef struct NvmeZone {
+    NvmeZoneDescr   d;
+    uint64_t        w_ptr;
+    QTAILQ_ENTRY(NvmeZone) entry;
+} NvmeZone;
+
 typedef struct NvmeNamespaceParams {
     uint32_t nsid;
     QemuUUID uuid;
+
+    bool     zoned;
+    bool     cross_zone_read;
+    uint64_t zone_size_bs;
+    uint64_t zone_cap_bs;
 } NvmeNamespaceParams;
 
 typedef struct NvmeNamespace {
@@ -33,6 +44,17 @@ typedef struct NvmeNamespace {
     const uint32_t *iocs;
     uint8_t      csi;
 
+    NvmeIdNsZoned   *id_ns_zoned;
+    NvmeZone        *zone_array;
+    QTAILQ_HEAD(, NvmeZone) exp_open_zones;
+    QTAILQ_HEAD(, NvmeZone) imp_open_zones;
+    QTAILQ_HEAD(, NvmeZone) closed_zones;
+    QTAILQ_HEAD(, NvmeZone) full_zones;
+    uint32_t        num_zones;
+    uint64_t        zone_size;
+    uint64_t        zone_capacity;
+    uint32_t        zone_size_log2;
+
     NvmeNamespaceParams params;
 
     struct {
@@ -74,8 +96,38 @@ static inline size_t nvme_l2b(NvmeNamespace *ns, uint64_t lba)
 
 typedef struct NvmeCtrl NvmeCtrl;
 
+static inline enum NvmeZoneState nvme_get_zone_state(NvmeZone *zone)
+{
+    return zone->d.zs >> 4;
+}
+
+static inline void nvme_set_zone_state(NvmeZone *zone, enum NvmeZoneState state)
+{
+    zone->d.zs = state << 4;
+}
+
+static inline uint64_t nvme_zone_rd_boundary(NvmeNamespace *ns, NvmeZone *zone)
+{
+    return zone->d.zslba + ns->zone_size;
+}
+
+static inline uint64_t nvme_zone_wr_boundary(NvmeZone *zone)
+{
+    return zone->d.zslba + zone->d.zcap;
+}
+
+static inline bool nvme_wp_is_valid(NvmeZone *zone)
+{
+    uint8_t st = nvme_get_zone_state(zone);
+
+    return st != NVME_ZONE_STATE_FULL &&
+           st != NVME_ZONE_STATE_READ_ONLY &&
+           st != NVME_ZONE_STATE_OFFLINE;
+}
+
 int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp);
 void nvme_ns_drain(NvmeNamespace *ns);
 void nvme_ns_shutdown(NvmeNamespace *ns);
+void nvme_ns_cleanup(NvmeNamespace *ns);
 
 #endif /* NVME_NS_H */
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index 574333caa3f9..b7fbcca39d9f 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -6,6 +6,9 @@
 
 #define NVME_MAX_NAMESPACES 256
 
+#define NVME_DEFAULT_ZONE_SIZE   (128 * MiB)
+#define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)
+
 typedef struct NvmeParams {
     char     *serial;
     uint32_t num_queues; /* deprecated since 5.1 */
@@ -16,6 +19,7 @@ typedef struct NvmeParams {
     uint32_t aer_max_queued;
     uint8_t  mdts;
     bool     use_intel_id;
+    uint32_t zasl_bs;
 } NvmeParams;
 
 typedef struct NvmeAsyncEvent {
@@ -149,6 +153,8 @@ typedef struct NvmeCtrl {
     QTAILQ_HEAD(, NvmeAsyncEvent) aer_queue;
     int         aer_queued;
 
+    uint8_t     zasl;
+
     NvmeNamespace   namespace;
     NvmeNamespace   *namespaces[NVME_MAX_NAMESPACES];
     NvmeSQueue      **sq;
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index f5f6284a4d48..d79452c627cf 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -25,6 +25,7 @@
 #include "hw/qdev-properties.h"
 #include "hw/qdev-core.h"
 
+#include "trace.h"
 #include "nvme.h"
 #include "nvme-ns.h"
 
@@ -95,6 +96,147 @@ static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
     return 0;
 }
 
+static int nvme_ns_zoned_check_calc_geometry(NvmeNamespace *ns, Error **errp)
+{
+    uint64_t zone_size, zone_cap;
+    uint32_t lbasz = ns->blkconf.logical_block_size;
+
+    /* Make sure that the values of ZNS properties are sane */
+    if (ns->params.zone_size_bs) {
+        zone_size = ns->params.zone_size_bs;
+    } else {
+        zone_size = NVME_DEFAULT_ZONE_SIZE;
+    }
+    if (ns->params.zone_cap_bs) {
+        zone_cap = ns->params.zone_cap_bs;
+    } else {
+        zone_cap = zone_size;
+    }
+    if (zone_cap > zone_size) {
+        error_setg(errp, "zone capacity %"PRIu64"B exceeds "
+                   "zone size %"PRIu64"B", zone_cap, zone_size);
+        return -1;
+    }
+    if (zone_size < lbasz) {
+        error_setg(errp, "zone size %"PRIu64"B too small, "
+                   "must be at least %"PRIu32"B", zone_size, lbasz);
+        return -1;
+    }
+    if (zone_cap < lbasz) {
+        error_setg(errp, "zone capacity %"PRIu64"B too small, "
+                   "must be at least %"PRIu32"B", zone_cap, lbasz);
+        return -1;
+    }
+
+    /*
+     * Save the main zone geometry values to avoid
+     * calculating them later again.
+     */
+    ns->zone_size = zone_size / lbasz;
+    ns->zone_capacity = zone_cap / lbasz;
+    ns->num_zones = ns->size / lbasz / ns->zone_size;
+    return 0;
+}
+
+static void nvme_ns_zoned_init_state(NvmeNamespace *ns)
+{
+    uint64_t start = 0, zone_size = ns->zone_size;
+    uint64_t capacity = ns->num_zones * zone_size;
+    NvmeZone *zone;
+    int i;
+
+    ns->zone_array = g_new0(NvmeZone, ns->num_zones);
+
+    QTAILQ_INIT(&ns->exp_open_zones);
+    QTAILQ_INIT(&ns->imp_open_zones);
+    QTAILQ_INIT(&ns->closed_zones);
+    QTAILQ_INIT(&ns->full_zones);
+
+    zone = ns->zone_array;
+    for (i = 0; i < ns->num_zones; i++, zone++) {
+        if (start + zone_size > capacity) {
+            zone_size = capacity - start;
+        }
+        zone->d.zt = NVME_ZONE_TYPE_SEQ_WRITE;
+        nvme_set_zone_state(zone, NVME_ZONE_STATE_EMPTY);
+        zone->d.za = 0;
+        zone->d.zcap = ns->zone_capacity;
+        zone->d.zslba = start;
+        zone->d.wp = start;
+        zone->w_ptr = start;
+        start += zone_size;
+    }
+
+    ns->zone_size_log2 = 0;
+    if (is_power_of_2(ns->zone_size)) {
+        ns->zone_size_log2 = 63 - clz64(ns->zone_size);
+    }
+}
+
+static void nvme_ns_init_zoned(NvmeCtrl *n, NvmeNamespace *ns, int lba_index)
+{
+    NvmeIdNsZoned *id_ns_z;
+
+    nvme_ns_zoned_init_state(ns);
+
+    id_ns_z = g_malloc0(sizeof(NvmeIdNsZoned));
+
+    /* MAR/MOR are zeroes-based, 0xffffffff means no limit */
+    id_ns_z->mar = 0xffffffff;
+    id_ns_z->mor = 0xffffffff;
+    id_ns_z->zoc = 0;
+    id_ns_z->ozcs = ns->params.cross_zone_read ? 0x01 : 0x00;
+
+    id_ns_z->lbafe[lba_index].zsze = cpu_to_le64(ns->zone_size);
+    id_ns_z->lbafe[lba_index].zdes = 0;
+
+    ns->csi = NVME_CSI_ZONED;
+    ns->id_ns.nsze = cpu_to_le64(ns->num_zones * ns->zone_size);
+    ns->id_ns.ncap = ns->id_ns.nsze;
+    ns->id_ns.nuse = ns->id_ns.ncap;
+
+    ns->id_ns_zoned = id_ns_z;
+}
+
+static void nvme_clear_zone(NvmeNamespace *ns, NvmeZone *zone)
+{
+    uint8_t state;
+
+    zone->w_ptr = zone->d.wp;
+    state = nvme_get_zone_state(zone);
+    if (zone->d.wp != zone->d.zslba) {
+        if (state != NVME_ZONE_STATE_CLOSED) {
+            trace_pci_nvme_clear_ns_close(state, zone->d.zslba);
+            nvme_set_zone_state(zone, NVME_ZONE_STATE_CLOSED);
+        }
+        QTAILQ_INSERT_HEAD(&ns->closed_zones, zone, entry);
+    } else {
+        trace_pci_nvme_clear_ns_reset(state, zone->d.zslba);
+        nvme_set_zone_state(zone, NVME_ZONE_STATE_EMPTY);
+    }
+}
+
+/*
+ * Close all the zones that are currently open.
+ */
+static void nvme_zoned_ns_shutdown(NvmeNamespace *ns)
+{
+    NvmeZone *zone, *next;
+
+    QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) {
+        QTAILQ_REMOVE(&ns->closed_zones, zone, entry);
+        nvme_clear_zone(ns, zone);
+    }
+    QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) {
+        QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
+        nvme_clear_zone(ns, zone);
+    }
+    QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) {
+        QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry);
+        nvme_clear_zone(ns, zone);
+    }
+}
+
 static int nvme_ns_check_constraints(NvmeNamespace *ns, Error **errp)
 {
     if (!ns->blkconf.blk) {
@@ -118,6 +260,12 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
     if (nvme_ns_init(ns, errp)) {
         return -1;
     }
+    if (ns->params.zoned) {
+        if (nvme_ns_zoned_check_calc_geometry(ns, errp) != 0) {
+            return -1;
+        }
+        nvme_ns_init_zoned(n, ns, 0);
+    }
 
     if (nvme_register_namespace(n, ns, errp)) {
         return -1;
@@ -134,6 +282,17 @@ void nvme_ns_drain(NvmeNamespace *ns)
 void nvme_ns_shutdown(NvmeNamespace *ns)
 {
     blk_flush(ns->blkconf.blk);
+    if (ns->params.zoned) {
+        nvme_zoned_ns_shutdown(ns);
+    }
+}
+
+void nvme_ns_cleanup(NvmeNamespace *ns)
+{
+    if (ns->params.zoned) {
+        g_free(ns->id_ns_zoned);
+        g_free(ns->zone_array);
+    }
 }
 
 static void nvme_ns_realize(DeviceState *dev, Error **errp)
@@ -154,6 +313,13 @@ static Property nvme_ns_props[] = {
     DEFINE_BLOCK_PROPERTIES(NvmeNamespace, blkconf),
     DEFINE_PROP_UINT32("nsid", NvmeNamespace, params.nsid, 0),
     DEFINE_PROP_UUID("uuid", NvmeNamespace, params.uuid),
+    DEFINE_PROP_BOOL("zoned", NvmeNamespace, params.zoned, false),
+    DEFINE_PROP_SIZE("zoned.zone_size", NvmeNamespace, params.zone_size_bs,
+                     NVME_DEFAULT_ZONE_SIZE),
+    DEFINE_PROP_SIZE("zoned.zone_capacity", NvmeNamespace, params.zone_cap_bs,
+                     0),
+    DEFINE_PROP_BOOL("zoned.cross_read", NvmeNamespace,
+                     params.cross_zone_read, false),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index f1cc66d381a1..be83be3fb5d4 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -136,6 +136,18 @@ static const uint32_t nvme_cse_iocs_nvm[256] = {
     [NVME_CMD_COMPARE]              = NVME_CMD_EFF_CSUPP,
 };
 
+static const uint32_t nvme_cse_iocs_zoned[256] = {
+    [NVME_CMD_FLUSH]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
+    [NVME_CMD_WRITE_ZEROES]         = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
+    [NVME_CMD_WRITE]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
+    [NVME_CMD_READ]                 = NVME_CMD_EFF_CSUPP,
+    [NVME_CMD_DSM]                  = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
+    [NVME_CMD_COMPARE]              = NVME_CMD_EFF_CSUPP,
+    [NVME_CMD_ZONE_APPEND]          = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
+    [NVME_CMD_ZONE_MGMT_SEND]       = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
+    [NVME_CMD_ZONE_MGMT_RECV]       = NVME_CMD_EFF_CSUPP,
+};
+
 static void nvme_process_sq(void *opaque);
 
 static uint16_t nvme_cid(NvmeRequest *req)
@@ -152,6 +164,48 @@ static uint16_t nvme_sqid(NvmeRequest *req)
     return le16_to_cpu(req->sq->sqid);
 }
 
+static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone,
+                                   enum NvmeZoneState state)
+{
+    if (QTAILQ_IN_USE(zone, entry)) {
+        switch (nvme_get_zone_state(zone)) {
+        case NVME_ZONE_STATE_EXPLICITLY_OPEN:
+            QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry);
+            break;
+        case NVME_ZONE_STATE_IMPLICITLY_OPEN:
+            QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
+            break;
+        case NVME_ZONE_STATE_CLOSED:
+            QTAILQ_REMOVE(&ns->closed_zones, zone, entry);
+            break;
+        case NVME_ZONE_STATE_FULL:
+            QTAILQ_REMOVE(&ns->full_zones, zone, entry);
+        default:
+            ;
+        }
+    }
+
+    nvme_set_zone_state(zone, state);
+
+    switch (state) {
+    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
+        QTAILQ_INSERT_TAIL(&ns->exp_open_zones, zone, entry);
+        break;
+    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
+        QTAILQ_INSERT_TAIL(&ns->imp_open_zones, zone, entry);
+        break;
+    case NVME_ZONE_STATE_CLOSED:
+        QTAILQ_INSERT_TAIL(&ns->closed_zones, zone, entry);
+        break;
+    case NVME_ZONE_STATE_FULL:
+        QTAILQ_INSERT_TAIL(&ns->full_zones, zone, entry);
+    case NVME_ZONE_STATE_READ_ONLY:
+        break;
+    default:
+        zone->d.za = 0;
+    }
+}
+
 static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
 {
     hwaddr low = n->ctrl_mem.addr;
@@ -959,6 +1013,7 @@ static void nvme_aio_err(NvmeRequest *req, int ret)
     case NVME_CMD_FLUSH:
     case NVME_CMD_WRITE:
     case NVME_CMD_WRITE_ZEROES:
+    case NVME_CMD_ZONE_APPEND:
         status = NVME_WRITE_FAULT;
         break;
     default:
@@ -982,6 +1037,201 @@ static void nvme_aio_err(NvmeRequest *req, int ret)
     req->status = status;
 }
 
+static inline uint32_t nvme_zone_idx(NvmeNamespace *ns, uint64_t slba)
+{
+    return ns->zone_size_log2 > 0 ? slba >> ns->zone_size_log2 :
+                                    slba / ns->zone_size;
+}
+
+static inline NvmeZone *nvme_get_zone_by_slba(NvmeNamespace *ns, uint64_t slba)
+{
+    uint32_t zone_idx = nvme_zone_idx(ns, slba);
+
+    assert(zone_idx < ns->num_zones);
+    return &ns->zone_array[zone_idx];
+}
+
+static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone)
+{
+    uint16_t status;
+
+    switch (nvme_get_zone_state(zone)) {
+    case NVME_ZONE_STATE_EMPTY:
+    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
+    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
+    case NVME_ZONE_STATE_CLOSED:
+        status = NVME_SUCCESS;
+        break;
+    case NVME_ZONE_STATE_FULL:
+        status = NVME_ZONE_FULL;
+        break;
+    case NVME_ZONE_STATE_OFFLINE:
+        status = NVME_ZONE_OFFLINE;
+        break;
+    case NVME_ZONE_STATE_READ_ONLY:
+        status = NVME_ZONE_READ_ONLY;
+        break;
+    default:
+        assert(false);
+    }
+
+    return status;
+}
+
+static uint16_t nvme_check_zone_write(NvmeCtrl *n, NvmeNamespace *ns,
+                                      NvmeZone *zone, uint64_t slba,
+                                      uint32_t nlb, bool append)
+{
+    uint16_t status;
+
+    if (unlikely((slba + nlb) > nvme_zone_wr_boundary(zone))) {
+        status = NVME_ZONE_BOUNDARY_ERROR;
+    } else {
+        status = nvme_check_zone_state_for_write(zone);
+    }
+
+    if (status != NVME_SUCCESS) {
+        trace_pci_nvme_err_zone_write_not_ok(slba, nlb, status);
+    } else {
+        assert(nvme_wp_is_valid(zone));
+        if (append) {
+            if (unlikely(slba != zone->d.zslba)) {
+                trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
+                status = NVME_ZONE_INVALID_WRITE;
+            }
+            if (nvme_l2b(ns, nlb) > (n->page_size << n->zasl)) {
+                trace_pci_nvme_err_append_too_large(slba, nlb, n->zasl);
+                status = NVME_INVALID_FIELD;
+            }
+        } else if (unlikely(slba != zone->w_ptr)) {
+            trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba,
+                                               zone->w_ptr);
+            status = NVME_ZONE_INVALID_WRITE;
+        }
+    }
+
+    return status;
+}
+
+static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone)
+{
+    uint16_t status;
+
+    switch (nvme_get_zone_state(zone)) {
+    case NVME_ZONE_STATE_EMPTY:
+    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
+    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
+    case NVME_ZONE_STATE_FULL:
+    case NVME_ZONE_STATE_CLOSED:
+    case NVME_ZONE_STATE_READ_ONLY:
+        status = NVME_SUCCESS;
+        break;
+    case NVME_ZONE_STATE_OFFLINE:
+        status = NVME_ZONE_OFFLINE;
+        break;
+    default:
+        assert(false);
+    }
+
+    return status;
+}
+
+static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba,
+                                     uint32_t nlb)
+{
+    NvmeZone *zone = nvme_get_zone_by_slba(ns, slba);
+    uint64_t bndry = nvme_zone_rd_boundary(ns, zone);
+    uint64_t end = slba + nlb;
+    uint16_t status;
+
+    status = nvme_check_zone_state_for_read(zone);
+    if (status != NVME_SUCCESS) {
+        ;
+    } else if (unlikely(end > bndry)) {
+        if (!ns->params.cross_zone_read) {
+            status = NVME_ZONE_BOUNDARY_ERROR;
+        } else {
+            /*
+             * Read across zone boundary - check that all subsequent
+             * zones that are being read have an appropriate state.
+             */
+            do {
+                zone++;
+                status = nvme_check_zone_state_for_read(zone);
+                if (status != NVME_SUCCESS) {
+                    break;
+                }
+            } while (end > nvme_zone_rd_boundary(ns, zone));
+        }
+    }
+
+    return status;
+}
+
+static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req,
+                                      bool failed)
+{
+    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
+    NvmeZone *zone;
+    NvmeZonedResult *res = (NvmeZonedResult *)&req->cqe;
+    uint64_t slba;
+    uint32_t nlb;
+
+    slba = le64_to_cpu(rw->slba);
+    nlb = le16_to_cpu(rw->nlb) + 1;
+    zone = nvme_get_zone_by_slba(ns, slba);
+
+    if (failed) {
+        res->slba = 0;
+        zone->d.wp += nlb;
+    } else if (zone->w_ptr == nvme_zone_wr_boundary(zone)) {
+        switch (nvme_get_zone_state(zone)) {
+        case NVME_ZONE_STATE_IMPLICITLY_OPEN:
+        case NVME_ZONE_STATE_EXPLICITLY_OPEN:
+        case NVME_ZONE_STATE_CLOSED:
+        case NVME_ZONE_STATE_EMPTY:
+            nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL);
+            /* fall through */
+        case NVME_ZONE_STATE_FULL:
+            break;
+        default:
+            assert(false);
+        }
+        zone->d.wp = zone->w_ptr;
+    } else {
+        zone->d.wp += nlb;
+    }
+}
+
+static uint64_t nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
+                                     uint32_t nlb)
+{
+    uint64_t result = zone->w_ptr;
+    uint8_t zs;
+
+    zone->w_ptr += nlb;
+
+    if (zone->w_ptr < nvme_zone_wr_boundary(zone)) {
+        zs = nvme_get_zone_state(zone);
+        switch (zs) {
+        case NVME_ZONE_STATE_EMPTY:
+        case NVME_ZONE_STATE_CLOSED:
+            nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN);
+        }
+    }
+
+    return result;
+}
+
+static inline bool nvme_is_write(NvmeRequest *req)
+{
+    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
+
+    return rw->opcode == NVME_CMD_WRITE ||
+           rw->opcode == NVME_CMD_ZONE_APPEND ||
+           rw->opcode == NVME_CMD_WRITE_ZEROES;
+}
+
 static void nvme_rw_cb(void *opaque, int ret)
 {
     NvmeRequest *req = opaque;
@@ -993,6 +1243,10 @@ static void nvme_rw_cb(void *opaque, int ret)
 
     trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));
 
+    if (ns->params.zoned && nvme_is_write(req)) {
+        nvme_finalize_zoned_write(ns, req, ret != 0);
+    }
+
     if (!ret) {
         block_acct_done(stats, acct);
     } else {
@@ -1225,6 +1479,14 @@ static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
         goto invalid;
     }
 
+    if (ns->params.zoned) {
+        status = nvme_check_zone_read(ns, slba, nlb);
+        if (status != NVME_SUCCESS) {
+            trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status);
+            goto invalid;
+        }
+    }
+
     status = nvme_map_dptr(n, data_size, req);
     if (status) {
         goto invalid;
@@ -1255,7 +1517,8 @@ invalid:
     return status | NVME_DNR;
 }
 
-static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool wrz)
+static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
+                              bool wrz)
 {
     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
     NvmeNamespace *ns = req->ns;
@@ -1263,6 +1526,8 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool wrz)
     uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
     uint64_t data_size = nvme_l2b(ns, nlb);
     uint64_t data_offset;
+    NvmeZone *zone;
+    NvmeZonedResult *res = (NvmeZonedResult *)&req->cqe;
     BlockBackend *blk = ns->blkconf.blk;
     uint16_t status;
 
@@ -1283,6 +1548,25 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool wrz)
         goto invalid;
     }
 
+    if (ns->params.zoned) {
+        zone = nvme_get_zone_by_slba(ns, slba);
+
+        status = nvme_check_zone_write(n, ns, zone, slba, nlb, append);
+        if (status != NVME_SUCCESS) {
+            goto invalid;
+        }
+
+        if (append) {
+            slba = zone->w_ptr;
+        }
+
+        res->slba = nvme_advance_zone_wp(ns, zone, nlb);
+    } else if (append) {
+        trace_pci_nvme_err_invalid_opc(rw->opcode);
+        status = NVME_INVALID_OPCODE;
+        goto invalid;
+    }
+
     data_offset = nvme_l2b(ns, slba);
 
     if (!wrz) {
@@ -1315,12 +1599,445 @@ invalid:
 
 static inline uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
 {
-    return nvme_do_write(n, req, false);
+    return nvme_do_write(n, req, false, false);
 }
 
 static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
 {
-    return nvme_do_write(n, req, true);
+    return nvme_do_write(n, req, false, true);
+}
+
+static inline uint16_t nvme_zone_append(NvmeCtrl *n, NvmeRequest *req)
+{
+    return nvme_do_write(n, req, true, false);
+}
+
+static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c,
+                                            uint64_t *slba, uint32_t *zone_idx)
+{
+    uint32_t dw10 = le32_to_cpu(c->cdw10);
+    uint32_t dw11 = le32_to_cpu(c->cdw11);
+
+    if (!ns->params.zoned) {
+        trace_pci_nvme_err_invalid_opc(c->opcode);
+        return NVME_INVALID_OPCODE | NVME_DNR;
+    }
+
+    *slba = ((uint64_t)dw11) << 32 | dw10;
+    if (unlikely(*slba >= ns->id_ns.nsze)) {
+        trace_pci_nvme_err_invalid_lba_range(*slba, 0, ns->id_ns.nsze);
+        *slba = 0;
+        return NVME_LBA_RANGE | NVME_DNR;
+    }
+
+    *zone_idx = nvme_zone_idx(ns, *slba);
+    assert(*zone_idx < ns->num_zones);
+
+    return NVME_SUCCESS;
+}
+
+typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *,
+                                 enum NvmeZoneState);
+
+enum NvmeZoneProcessingMask {
+    NVME_PROC_CURRENT_ZONE    = 0,
+    NVME_PROC_IMP_OPEN_ZONES  = 1 << 0,
+    NVME_PROC_EXP_OPEN_ZONES  = 1 << 1,
+    NVME_PROC_CLOSED_ZONES    = 1 << 2,
+    NVME_PROC_READ_ONLY_ZONES = 1 << 3,
+    NVME_PROC_FULL_ZONES      = 1 << 4,
+};
+
+static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
+                               enum NvmeZoneState state)
+{
+    switch (state) {
+    case NVME_ZONE_STATE_EMPTY:
+    case NVME_ZONE_STATE_CLOSED:
+    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
+        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EXPLICITLY_OPEN);
+        /* fall through */
+    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
+        return NVME_SUCCESS;
+    default:
+        return NVME_ZONE_INVAL_TRANSITION;
+    }
+}
+
+static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone,
+                                enum NvmeZoneState state)
+{
+    switch (state) {
+    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
+    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
+        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
+        /* fall through */
+    case NVME_ZONE_STATE_CLOSED:
+        return NVME_SUCCESS;
+    default:
+        return NVME_ZONE_INVAL_TRANSITION;
+    }
+}
+
+static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone,
+                                 enum NvmeZoneState state)
+{
+    switch (state) {
+    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
+    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
+    case NVME_ZONE_STATE_CLOSED:
+    case NVME_ZONE_STATE_EMPTY:
+        zone->w_ptr = nvme_zone_wr_boundary(zone);
+        zone->d.wp = zone->w_ptr;
+        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL);
+        /* fall through */
+    case NVME_ZONE_STATE_FULL:
+        return NVME_SUCCESS;
+    default:
+        return NVME_ZONE_INVAL_TRANSITION;
+    }
+}
+
+static uint16_t nvme_reset_zone(NvmeNamespace *ns, NvmeZone *zone,
+                                enum NvmeZoneState state)
+{
+    switch (state) {
+    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
+    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
+    case NVME_ZONE_STATE_CLOSED:
+    case NVME_ZONE_STATE_FULL:
+        zone->w_ptr = zone->d.zslba;
+        zone->d.wp = zone->w_ptr;
+        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
+        /* fall through */
+    case NVME_ZONE_STATE_EMPTY:
+        return NVME_SUCCESS;
+    default:
+        return NVME_ZONE_INVAL_TRANSITION;
+    }
+}
+
+static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone,
+                                  enum NvmeZoneState state)
+{
+    switch (state) {
+    case NVME_ZONE_STATE_READ_ONLY:
+        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_OFFLINE);
+        /* fall through */
+    case NVME_ZONE_STATE_OFFLINE:
+        return NVME_SUCCESS;
+    default:
+        return NVME_ZONE_INVAL_TRANSITION;
+    }
+}
+
+static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone,
+                                    enum NvmeZoneProcessingMask proc_mask,
+                                    op_handler_t op_hndlr)
+{
+    uint16_t status = NVME_SUCCESS;
+    enum NvmeZoneState zs = nvme_get_zone_state(zone);
+    bool proc_zone;
+
+    switch (zs) {
+    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
+        proc_zone = proc_mask & NVME_PROC_IMP_OPEN_ZONES;
+        break;
+    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
+        proc_zone = proc_mask & NVME_PROC_EXP_OPEN_ZONES;
+        break;
+    case NVME_ZONE_STATE_CLOSED:
+        proc_zone = proc_mask & NVME_PROC_CLOSED_ZONES;
+        break;
+    case NVME_ZONE_STATE_READ_ONLY:
+        proc_zone = proc_mask & NVME_PROC_READ_ONLY_ZONES;
+        break;
+    case NVME_ZONE_STATE_FULL:
+        proc_zone = proc_mask & NVME_PROC_FULL_ZONES;
+        break;
+    default:
+        proc_zone = false;
+    }
+
+    if (proc_zone) {
+        status = op_hndlr(ns, zone, zs);
+    }
+
+    return status;
+}
+
+static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone,
+                                enum NvmeZoneProcessingMask proc_mask,
+                                op_handler_t op_hndlr)
+{
+    NvmeZone *next;
+    uint16_t status = NVME_SUCCESS;
+    int i;
+
+    if (!proc_mask) {
+        status = op_hndlr(ns, zone, nvme_get_zone_state(zone));
+    } else {
+        if (proc_mask & NVME_PROC_CLOSED_ZONES) {
+            QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) {
+                status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr);
+                if (status != NVME_SUCCESS) {
+                    goto out;
+                }
+            }
+        }
+        if (proc_mask & NVME_PROC_IMP_OPEN_ZONES) {
+            QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) {
+                status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr);
+                if (status != NVME_SUCCESS) {
+                    goto out;
+                }
+            }
+        }
+        if (proc_mask & NVME_PROC_EXP_OPEN_ZONES) {
+            QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) {
+                status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr);
+                if (status != NVME_SUCCESS) {
+                    goto out;
+                }
+            }
+        }
+        if (proc_mask & NVME_PROC_FULL_ZONES) {
+            QTAILQ_FOREACH_SAFE(zone, &ns->full_zones, entry, next) {
+                status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr);
+                if (status != NVME_SUCCESS) {
+                    goto out;
+                }
+            }
+        }
+
+        if (proc_mask & NVME_PROC_READ_ONLY_ZONES) {
+            for (i = 0; i < ns->num_zones; i++, zone++) {
+                status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr);
+                if (status != NVME_SUCCESS) {
+                    goto out;
+                }
+            }
+        }
+    }
+
+out:
+    return status;
+}
+
+static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
+{
+    NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
+    NvmeNamespace *ns = req->ns;
+    NvmeZone *zone;
+    uint32_t dw13 = le32_to_cpu(cmd->cdw13);
+    uint64_t slba = 0;
+    uint32_t zone_idx = 0;
+    uint16_t status;
+    uint8_t action;
+    bool all;
+    enum NvmeZoneProcessingMask proc_mask = NVME_PROC_CURRENT_ZONE;
+
+    action = dw13 & 0xff;
+    all = dw13 & 0x100;
+
+    req->status = NVME_SUCCESS;
+
+    if (!all) {
+        status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
+        if (status) {
+            return status;
+        }
+    }
+
+    zone = &ns->zone_array[zone_idx];
+    if (slba != zone->d.zslba) {
+        trace_pci_nvme_err_unaligned_zone_cmd(action, slba, zone->d.zslba);
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    switch (action) {
+
+    case NVME_ZONE_ACTION_OPEN:
+        if (all) {
+            proc_mask = NVME_PROC_CLOSED_ZONES;
+        }
+        trace_pci_nvme_open_zone(slba, zone_idx, all);
+        status = nvme_do_zone_op(ns, zone, proc_mask, nvme_open_zone);
+        break;
+
+    case NVME_ZONE_ACTION_CLOSE:
+        if (all) {
+            proc_mask = NVME_PROC_IMP_OPEN_ZONES | NVME_PROC_EXP_OPEN_ZONES;
+        }
+        trace_pci_nvme_close_zone(slba, zone_idx, all);
+        status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone);
+        break;
+
+    case NVME_ZONE_ACTION_FINISH:
+        if (all) {
+            proc_mask = NVME_PROC_IMP_OPEN_ZONES | NVME_PROC_EXP_OPEN_ZONES |
+                        NVME_PROC_CLOSED_ZONES;
+        }
+        trace_pci_nvme_finish_zone(slba, zone_idx, all);
+        status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone);
+        break;
+
+    case NVME_ZONE_ACTION_RESET:
+        if (all) {
+            proc_mask = NVME_PROC_IMP_OPEN_ZONES | NVME_PROC_EXP_OPEN_ZONES |
+                        NVME_PROC_CLOSED_ZONES | NVME_PROC_FULL_ZONES;
+        }
+        trace_pci_nvme_reset_zone(slba, zone_idx, all);
+        status = nvme_do_zone_op(ns, zone, proc_mask, nvme_reset_zone);
+        break;
+
+    case NVME_ZONE_ACTION_OFFLINE:
+        if (all) {
+            proc_mask = NVME_PROC_READ_ONLY_ZONES;
+        }
+        trace_pci_nvme_offline_zone(slba, zone_idx, all);
+        status = nvme_do_zone_op(ns, zone, proc_mask, nvme_offline_zone);
+        break;
+
+    case NVME_ZONE_ACTION_SET_ZD_EXT:
+        trace_pci_nvme_set_descriptor_extension(slba, zone_idx);
+        return NVME_INVALID_FIELD | NVME_DNR;
+        break;
+
+    default:
+        trace_pci_nvme_err_invalid_mgmt_action(action);
+        status = NVME_INVALID_FIELD;
+    }
+
+    if (status == NVME_ZONE_INVAL_TRANSITION) {
+        trace_pci_nvme_err_invalid_zone_state_transition(action, slba,
+                                                         zone->d.za);
+    }
+    if (status) {
+        status |= NVME_DNR;
+    }
+
+    return status;
+}
+
+static bool nvme_zone_matches_filter(uint32_t zafs, NvmeZone *zl)
+{
+    enum NvmeZoneState zs = nvme_get_zone_state(zl);
+
+    switch (zafs) {
+    case NVME_ZONE_REPORT_ALL:
+        return true;
+    case NVME_ZONE_REPORT_EMPTY:
+        return zs == NVME_ZONE_STATE_EMPTY;
+    case NVME_ZONE_REPORT_IMPLICITLY_OPEN:
+        return zs == NVME_ZONE_STATE_IMPLICITLY_OPEN;
+    case NVME_ZONE_REPORT_EXPLICITLY_OPEN:
+        return zs == NVME_ZONE_STATE_EXPLICITLY_OPEN;
+    case NVME_ZONE_REPORT_CLOSED:
+        return zs == NVME_ZONE_STATE_CLOSED;
+    case NVME_ZONE_REPORT_FULL:
+        return zs == NVME_ZONE_STATE_FULL;
+    case NVME_ZONE_REPORT_READ_ONLY:
+        return zs == NVME_ZONE_STATE_READ_ONLY;
+    case NVME_ZONE_REPORT_OFFLINE:
+        return zs == NVME_ZONE_STATE_OFFLINE;
+    default:
+        return false;
+    }
+}
+
+static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
+{
+    NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
+    NvmeNamespace *ns = req->ns;
+    /* cdw12 is zero-based number of dwords to return. Convert to bytes */
+    uint32_t data_size = (le32_to_cpu(cmd->cdw12) + 1) << 2;
+    uint32_t dw13 = le32_to_cpu(cmd->cdw13);
+    uint32_t zone_idx, zra, zrasf, partial;
+    uint64_t max_zones, nr_zones = 0;
+    uint16_t status;
+    uint64_t slba, capacity = nvme_ns_nlbas(ns);
+    NvmeZoneDescr *z;
+    NvmeZone *zone;
+    NvmeZoneReportHeader *header;
+    void *buf, *buf_p;
+    size_t zone_entry_sz;
+
+    req->status = NVME_SUCCESS;
+
+    status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
+    if (status) {
+        return status;
+    }
+
+    zra = dw13 & 0xff;
+    if (zra != NVME_ZONE_REPORT) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    zrasf = (dw13 >> 8) & 0xff;
+    if (zrasf > NVME_ZONE_REPORT_OFFLINE) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    if (data_size < sizeof(NvmeZoneReportHeader)) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    status = nvme_check_mdts(n, data_size);
+    if (status) {
+        trace_pci_nvme_err_mdts(nvme_cid(req), data_size);
+        return status;
+    }
+
+    partial = (dw13 >> 16) & 0x01;
+
+    zone_entry_sz = sizeof(NvmeZoneDescr);
+
+    max_zones = (data_size - sizeof(NvmeZoneReportHeader)) / zone_entry_sz;
+    buf = g_malloc0(data_size);
+
+    zone = &ns->zone_array[zone_idx];
+    for (; slba < capacity; slba += ns->zone_size) {
+        if (partial && nr_zones >= max_zones) {
+            break;
+        }
+        if (nvme_zone_matches_filter(zrasf, zone++)) {
+            nr_zones++;
+        }
+    }
+    header = (NvmeZoneReportHeader *)buf;
+    header->nr_zones = cpu_to_le64(nr_zones);
+
+    buf_p = buf + sizeof(NvmeZoneReportHeader);
+    for (; zone_idx < ns->num_zones && max_zones > 0; zone_idx++) {
+        zone = &ns->zone_array[zone_idx];
+        if (nvme_zone_matches_filter(zrasf, zone)) {
+            z = (NvmeZoneDescr *)buf_p;
+            buf_p += sizeof(NvmeZoneDescr);
+
+            z->zt = zone->d.zt;
+            z->zs = zone->d.zs;
+            z->zcap = cpu_to_le64(zone->d.zcap);
+            z->zslba = cpu_to_le64(zone->d.zslba);
+            z->za = zone->d.za;
+
+            if (nvme_wp_is_valid(zone)) {
+                z->wp = cpu_to_le64(zone->d.wp);
+            } else {
+                z->wp = cpu_to_le64(~0ULL);
+            }
+
+            max_zones--;
+        }
+    }
+
+    status = nvme_dma(n, (uint8_t *)buf, data_size,
+                      DMA_DIRECTION_FROM_DEVICE, req);
+
+    g_free(buf);
+
+    return status;
 }
 
 static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
@@ -1349,6 +2066,8 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
         return nvme_flush(n, req);
     case NVME_CMD_WRITE_ZEROES:
         return nvme_write_zeroes(n, req);
+    case NVME_CMD_ZONE_APPEND:
+        return nvme_zone_append(n, req);
     case NVME_CMD_WRITE:
         return nvme_write(n, req);
     case NVME_CMD_READ:
@@ -1357,6 +2076,10 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
         return nvme_compare(n, req);
     case NVME_CMD_DSM:
         return nvme_dsm(n, req);
+    case NVME_CMD_ZONE_MGMT_SEND:
+        return nvme_zone_mgmt_send(n, req);
+    case NVME_CMD_ZONE_MGMT_RECV:
+        return nvme_zone_mgmt_recv(n, req);
     default:
         assert(false);
     }
@@ -1619,6 +2342,9 @@ static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len,
         case NVME_CSI_NVM:
             src_iocs = nvme_cse_iocs_nvm;
             break;
+        case NVME_CSI_ZONED:
+            src_iocs = nvme_cse_iocs_zoned;
+            break;
         }
     }
 
@@ -1799,6 +2525,16 @@ static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req)
     return nvme_dma(n, id, sizeof(id), DMA_DIRECTION_FROM_DEVICE, req);
 }
 
+static inline bool nvme_csi_has_nvm_support(NvmeNamespace *ns)
+{
+    switch (ns->csi) {
+    case NVME_CSI_NVM:
+    case NVME_CSI_ZONED:
+        return true;
+    }
+    return false;
+}
+
 static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
 {
     trace_pci_nvme_identify_ctrl();
@@ -1810,11 +2546,18 @@ static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
 static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req)
 {
     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
+    NvmeIdCtrlZoned id = {};
 
     trace_pci_nvme_identify_ctrl_csi(c->csi);
 
     if (c->csi == NVME_CSI_NVM) {
         return nvme_rpt_empty_id_struct(n, req);
+    } else if (c->csi == NVME_CSI_ZONED) {
+        if (n->params.zasl_bs) {
+            id.zasl = n->zasl;
+        }
+        return nvme_dma(n, (uint8_t *)&id, sizeof(id),
+                        DMA_DIRECTION_FROM_DEVICE, req);
     }
 
     return NVME_INVALID_FIELD | NVME_DNR;
@@ -1837,8 +2580,12 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req)
         return nvme_rpt_empty_id_struct(n, req);
     }
 
-    return nvme_dma(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs),
-                    DMA_DIRECTION_FROM_DEVICE, req);
+    if (c->csi == NVME_CSI_NVM && nvme_csi_has_nvm_support(ns)) {
+        return nvme_dma(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs),
+                        DMA_DIRECTION_FROM_DEVICE, req);
+    }
+
+    return NVME_INVALID_CMD_SET | NVME_DNR;
 }
 
 static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req)
@@ -1858,8 +2605,11 @@ static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req)
         return nvme_rpt_empty_id_struct(n, req);
     }
 
-    if (c->csi == NVME_CSI_NVM) {
+    if (c->csi == NVME_CSI_NVM && nvme_csi_has_nvm_support(ns)) {
         return nvme_rpt_empty_id_struct(n, req);
+    } else if (c->csi == NVME_CSI_ZONED && ns->csi == NVME_CSI_ZONED) {
+        return nvme_dma(n, (uint8_t *)ns->id_ns_zoned, sizeof(NvmeIdNsZoned),
+                        DMA_DIRECTION_FROM_DEVICE, req);
     }
 
     return NVME_INVALID_FIELD | NVME_DNR;
@@ -1923,7 +2673,7 @@ static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req)
         return NVME_INVALID_NSID | NVME_DNR;
     }
 
-    if (c->csi != NVME_CSI_NVM) {
+    if (c->csi != NVME_CSI_NVM && c->csi != NVME_CSI_ZONED) {
         return NVME_INVALID_FIELD | NVME_DNR;
     }
 
@@ -1932,7 +2682,7 @@ static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req)
         if (!ns) {
             continue;
         }
-        if (ns->params.nsid <= min_nsid) {
+        if (ns->params.nsid <= min_nsid || c->csi != ns->csi) {
             continue;
         }
         list_ptr[j++] = cpu_to_le32(ns->params.nsid);
@@ -1999,6 +2749,8 @@ static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req)
     trace_pci_nvme_identify_cmd_set();
 
     NVME_SET_CSI(*list, NVME_CSI_NVM);
+    NVME_SET_CSI(*list, NVME_CSI_ZONED);
+
     return nvme_dma(n, list, data_len, DMA_DIRECTION_FROM_DEVICE, req);
 }
 
@@ -2565,6 +3317,13 @@ static void nvme_select_ns_iocs(NvmeCtrl *n)
                 ns->iocs = nvme_cse_iocs_nvm;
             }
             break;
+        case NVME_CSI_ZONED:
+            if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_CSI) {
+                ns->iocs = nvme_cse_iocs_zoned;
+            } else if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_NVM) {
+                ns->iocs = nvme_cse_iocs_nvm;
+            }
+            break;
         }
     }
 }
@@ -2663,6 +3422,17 @@ static int nvme_start_ctrl(NvmeCtrl *n)
     nvme_init_sq(&n->admin_sq, n, n->bar.asq, 0, 0,
                  NVME_AQA_ASQS(n->bar.aqa) + 1);
 
+    if (!n->params.zasl_bs) {
+        n->zasl = n->params.mdts;
+    } else {
+        if (n->params.zasl_bs < n->page_size) {
+            trace_pci_nvme_err_startfail_zasl_too_small(n->params.zasl_bs,
+                                                        n->page_size);
+            return -1;
+        }
+        n->zasl = 31 - clz32(n->params.zasl_bs / n->page_size);
+    }
+
     nvme_set_timestamp(n, 0ULL);
 
     QTAILQ_INIT(&n->aer_queue);
@@ -3087,6 +3857,13 @@ static void nvme_check_constraints(NvmeCtrl *n, Error **errp)
 
         host_memory_backend_set_mapped(n->pmrdev, true);
     }
+
+    if (n->params.zasl_bs) {
+        if (!is_power_of_2(n->params.zasl_bs)) {
+            error_setg(errp, "zone append size limit has to be a power of 2");
+            return;
+        }
+    }
 }
 
 static void nvme_init_state(NvmeCtrl *n)
@@ -3351,8 +4128,20 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
 static void nvme_exit(PCIDevice *pci_dev)
 {
     NvmeCtrl *n = NVME(pci_dev);
+    NvmeNamespace *ns;
+    int i;
 
     nvme_ctrl_shutdown(n);
+
+    for (i = 1; i <= n->num_namespaces; i++) {
+        ns = nvme_ns(n, i);
+        if (!ns) {
+            continue;
+        }
+
+        nvme_ns_cleanup(ns);
+    }
+
     g_free(n->cq);
     g_free(n->sq);
     g_free(n->aer_reqs);
@@ -3380,6 +4169,8 @@ static Property nvme_props[] = {
     DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
     DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7),
     DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
+    DEFINE_PROP_SIZE32("zoned.append_size_limit", NvmeCtrl, params.zasl_bs,
+                       NVME_DEFAULT_MAX_ZA_SIZE),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/block/trace-events b/hw/block/trace-events
index e56e2b66b075..64366d89366c 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -95,6 +95,14 @@ pci_nvme_mmio_start_success(void) "setting controller enable bit succeeded"
 pci_nvme_mmio_stopped(void) "cleared controller enable bit"
 pci_nvme_mmio_shutdown_set(void) "shutdown bit set"
 pci_nvme_mmio_shutdown_cleared(void) "shutdown bit cleared"
+pci_nvme_open_zone(uint64_t slba, uint32_t zone_idx, int all) "open zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32""
+pci_nvme_close_zone(uint64_t slba, uint32_t zone_idx, int all) "close zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32""
+pci_nvme_finish_zone(uint64_t slba, uint32_t zone_idx, int all) "finish zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32""
+pci_nvme_reset_zone(uint64_t slba, uint32_t zone_idx, int all) "reset zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32""
+pci_nvme_offline_zone(uint64_t slba, uint32_t zone_idx, int all) "offline zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32""
+pci_nvme_set_descriptor_extension(uint64_t slba, uint32_t zone_idx) "set zone descriptor extension, slba=%"PRIu64", idx=%"PRIu32""
+pci_nvme_clear_ns_close(uint32_t state, uint64_t slba) "zone state=%"PRIu32", slba=%"PRIu64" transitioned to Closed state"
+pci_nvme_clear_ns_reset(uint32_t state, uint64_t slba) "zone state=%"PRIu32", slba=%"PRIu64" transitioned to Empty state"
 
 # nvme traces for error conditions
 pci_nvme_err_mdts(uint16_t cid, size_t len) "cid %"PRIu16" len %zu"
@@ -113,6 +121,13 @@ pci_nvme_err_invalid_opc(uint8_t opc) "invalid opcode 0x%"PRIx8""
 pci_nvme_err_invalid_admin_opc(uint8_t opc) "invalid admin opcode 0x%"PRIx8""
 pci_nvme_err_invalid_lba_range(uint64_t start, uint64_t len, uint64_t limit) "Invalid LBA start=%"PRIu64" len=%"PRIu64" limit=%"PRIu64""
 pci_nvme_err_invalid_log_page_offset(uint64_t ofs, uint64_t size) "must be <= %"PRIu64", got %"PRIu64""
+pci_nvme_err_unaligned_zone_cmd(uint8_t action, uint64_t slba, uint64_t zslba) "unaligned zone op 0x%"PRIx32", got slba=%"PRIu64", zslba=%"PRIu64""
+pci_nvme_err_invalid_zone_state_transition(uint8_t action, uint64_t slba, uint8_t attrs) "action=0x%"PRIx8", slba=%"PRIu64", attrs=0x%"PRIx32""
+pci_nvme_err_write_not_at_wp(uint64_t slba, uint64_t zone, uint64_t wp) "writing at slba=%"PRIu64", zone=%"PRIu64", but wp=%"PRIu64""
+pci_nvme_err_append_not_at_start(uint64_t slba, uint64_t zone) "appending at slba=%"PRIu64", but zone=%"PRIu64""
+pci_nvme_err_zone_write_not_ok(uint64_t slba, uint32_t nlb, uint16_t status) "slba=%"PRIu64", nlb=%"PRIu32", status=0x%"PRIx16""
+pci_nvme_err_zone_read_not_ok(uint64_t slba, uint32_t nlb, uint16_t status) "slba=%"PRIu64", nlb=%"PRIu32", status=0x%"PRIx16""
+pci_nvme_err_append_too_large(uint64_t slba, uint32_t nlb, uint8_t zasl) "slba=%"PRIu64", nlb=%"PRIu32", zasl=%"PRIu8""
 pci_nvme_err_invalid_iocsci(uint32_t idx) "unsupported command set combination index %"PRIu32""
 pci_nvme_err_invalid_del_sq(uint16_t qid) "invalid submission queue deletion, sid=%"PRIu16""
 pci_nvme_err_invalid_create_sq_cqid(uint16_t cqid) "failed creating submission queue, invalid cqid=%"PRIu16""
@@ -146,7 +161,9 @@ pci_nvme_err_startfail_sqent_too_large(uint8_t log2ps, uint8_t maxlog2ps) "nvme_
 pci_nvme_err_startfail_css(uint8_t css) "nvme_start_ctrl failed because invalid command set selected:%u"
 pci_nvme_err_startfail_asqent_sz_zero(void) "nvme_start_ctrl failed because the admin submission queue size is zero"
 pci_nvme_err_startfail_acqent_sz_zero(void) "nvme_start_ctrl failed because the admin completion queue size is zero"
+pci_nvme_err_startfail_zasl_too_small(uint32_t zasl, uint32_t pagesz) "nvme_start_ctrl failed because zone append size limit %"PRIu32" is too small, needs to be >= %"PRIu32""
 pci_nvme_err_startfail(void) "setting controller enable bit failed"
+pci_nvme_err_invalid_mgmt_action(uint8_t action) "action=0x%"PRIx8""
 
 # Traces for undefined behavior
 pci_nvme_ub_mmiowr_misaligned32(uint64_t offset) "MMIO write not 32-bit aligned, offset=0x%"PRIx64""
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 17/56] hw/block/nvme: Introduce max active and open zone limits
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (15 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 16/56] hw/block/nvme: Support Zoned Namespace Command Set Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 18/56] hw/block/nvme: Support Zone Descriptor Extensions Klaus Jensen
                   ` (39 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Niklas Cassel, Dmitry Fomichev,
	Klaus Jensen, Max Reitz, Klaus Jensen, Hans Holmberg,
	Stefan Hajnoczi, Keith Busch
From: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Add two module properties, "zoned.max_active" and "zoned.max_open"
to control the maximum number of zones that can be active or open.
Once these variables are set to non-default values, these limits are
checked during I/O and Too Many Active or Too Many Open command status
is returned if they are exceeded.
Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
Signed-off-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Reviewed-by: Niklas Cassel <Niklas.Cassel@wdc.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme-ns.h    | 41 +++++++++++++++++++
 hw/block/nvme-ns.c    | 31 ++++++++++++++-
 hw/block/nvme.c       | 92 +++++++++++++++++++++++++++++++++++++++++++
 hw/block/trace-events |  2 +
 4 files changed, 164 insertions(+), 2 deletions(-)
diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 388381dda0df..7e1fd26909ba 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -33,6 +33,8 @@ typedef struct NvmeNamespaceParams {
     bool     cross_zone_read;
     uint64_t zone_size_bs;
     uint64_t zone_cap_bs;
+    uint32_t max_active_zones;
+    uint32_t max_open_zones;
 } NvmeNamespaceParams;
 
 typedef struct NvmeNamespace {
@@ -54,6 +56,8 @@ typedef struct NvmeNamespace {
     uint64_t        zone_size;
     uint64_t        zone_capacity;
     uint32_t        zone_size_log2;
+    int32_t         nr_open_zones;
+    int32_t         nr_active_zones;
 
     NvmeNamespaceParams params;
 
@@ -125,6 +129,43 @@ static inline bool nvme_wp_is_valid(NvmeZone *zone)
            st != NVME_ZONE_STATE_OFFLINE;
 }
 
+static inline void nvme_aor_inc_open(NvmeNamespace *ns)
+{
+    assert(ns->nr_open_zones >= 0);
+    if (ns->params.max_open_zones) {
+        ns->nr_open_zones++;
+        assert(ns->nr_open_zones <= ns->params.max_open_zones);
+    }
+}
+
+static inline void nvme_aor_dec_open(NvmeNamespace *ns)
+{
+    if (ns->params.max_open_zones) {
+        assert(ns->nr_open_zones > 0);
+        ns->nr_open_zones--;
+    }
+    assert(ns->nr_open_zones >= 0);
+}
+
+static inline void nvme_aor_inc_active(NvmeNamespace *ns)
+{
+    assert(ns->nr_active_zones >= 0);
+    if (ns->params.max_active_zones) {
+        ns->nr_active_zones++;
+        assert(ns->nr_active_zones <= ns->params.max_active_zones);
+    }
+}
+
+static inline void nvme_aor_dec_active(NvmeNamespace *ns)
+{
+    if (ns->params.max_active_zones) {
+        assert(ns->nr_active_zones > 0);
+        ns->nr_active_zones--;
+        assert(ns->nr_active_zones >= ns->nr_open_zones);
+    }
+    assert(ns->nr_active_zones >= 0);
+}
+
 int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp);
 void nvme_ns_drain(NvmeNamespace *ns);
 void nvme_ns_shutdown(NvmeNamespace *ns);
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index d79452c627cf..c55afc1920a3 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -135,6 +135,21 @@ static int nvme_ns_zoned_check_calc_geometry(NvmeNamespace *ns, Error **errp)
     ns->zone_size = zone_size / lbasz;
     ns->zone_capacity = zone_cap / lbasz;
     ns->num_zones = ns->size / lbasz / ns->zone_size;
+
+    /* Do a few more sanity checks of ZNS properties */
+    if (ns->params.max_open_zones > ns->num_zones) {
+        error_setg(errp,
+                   "max_open_zones value %u exceeds the number of zones %u",
+                   ns->params.max_open_zones, ns->num_zones);
+        return -1;
+    }
+    if (ns->params.max_active_zones > ns->num_zones) {
+        error_setg(errp,
+                   "max_active_zones value %u exceeds the number of zones %u",
+                   ns->params.max_active_zones, ns->num_zones);
+        return -1;
+    }
+
     return 0;
 }
 
@@ -182,8 +197,8 @@ static void nvme_ns_init_zoned(NvmeCtrl *n, NvmeNamespace *ns, int lba_index)
     id_ns_z = g_malloc0(sizeof(NvmeIdNsZoned));
 
     /* MAR/MOR are zeroes-based, 0xffffffff means no limit */
-    id_ns_z->mar = 0xffffffff;
-    id_ns_z->mor = 0xffffffff;
+    id_ns_z->mar = cpu_to_le32(ns->params.max_active_zones - 1);
+    id_ns_z->mor = cpu_to_le32(ns->params.max_open_zones - 1);
     id_ns_z->zoc = 0;
     id_ns_z->ozcs = ns->params.cross_zone_read ? 0x01 : 0x00;
 
@@ -209,6 +224,7 @@ static void nvme_clear_zone(NvmeNamespace *ns, NvmeZone *zone)
             trace_pci_nvme_clear_ns_close(state, zone->d.zslba);
             nvme_set_zone_state(zone, NVME_ZONE_STATE_CLOSED);
         }
+        nvme_aor_inc_active(ns);
         QTAILQ_INSERT_HEAD(&ns->closed_zones, zone, entry);
     } else {
         trace_pci_nvme_clear_ns_reset(state, zone->d.zslba);
@@ -225,16 +241,23 @@ static void nvme_zoned_ns_shutdown(NvmeNamespace *ns)
 
     QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) {
         QTAILQ_REMOVE(&ns->closed_zones, zone, entry);
+        nvme_aor_dec_active(ns);
         nvme_clear_zone(ns, zone);
     }
     QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) {
         QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
+        nvme_aor_dec_open(ns);
+        nvme_aor_dec_active(ns);
         nvme_clear_zone(ns, zone);
     }
     QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) {
         QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry);
+        nvme_aor_dec_open(ns);
+        nvme_aor_dec_active(ns);
         nvme_clear_zone(ns, zone);
     }
+
+    assert(ns->nr_open_zones == 0);
 }
 
 static int nvme_ns_check_constraints(NvmeNamespace *ns, Error **errp)
@@ -320,6 +343,10 @@ static Property nvme_ns_props[] = {
                      0),
     DEFINE_PROP_BOOL("zoned.cross_read", NvmeNamespace,
                      params.cross_zone_read, false),
+    DEFINE_PROP_UINT32("zoned.max_active", NvmeNamespace,
+                       params.max_active_zones, 0),
+    DEFINE_PROP_UINT32("zoned.max_open", NvmeNamespace,
+                       params.max_open_zones, 0),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index be83be3fb5d4..c07dbcd2a809 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -206,6 +206,26 @@ static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone,
     }
 }
 
+/*
+ * Check if we can open a zone without exceeding open/active limits.
+ * AOR stands for "Active and Open Resources" (see TP 4053 section 2.5).
+ */
+static int nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn)
+{
+    if (ns->params.max_active_zones != 0 &&
+        ns->nr_active_zones + act > ns->params.max_active_zones) {
+        trace_pci_nvme_err_insuff_active_res(ns->params.max_active_zones);
+        return NVME_ZONE_TOO_MANY_ACTIVE | NVME_DNR;
+    }
+    if (ns->params.max_open_zones != 0 &&
+        ns->nr_open_zones + opn > ns->params.max_open_zones) {
+        trace_pci_nvme_err_insuff_open_res(ns->params.max_open_zones);
+        return NVME_ZONE_TOO_MANY_OPEN | NVME_DNR;
+    }
+
+    return NVME_SUCCESS;
+}
+
 static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
 {
     hwaddr low = n->ctrl_mem.addr;
@@ -1168,6 +1188,40 @@ static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba,
     return status;
 }
 
+static void nvme_auto_transition_zone(NvmeNamespace *ns)
+{
+    NvmeZone *zone;
+
+    if (ns->params.max_open_zones &&
+        ns->nr_open_zones == ns->params.max_open_zones) {
+        zone = QTAILQ_FIRST(&ns->imp_open_zones);
+        if (zone) {
+            /*
+             * Automatically close this implicitly open zone.
+             */
+            QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
+            nvme_aor_dec_open(ns);
+            nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
+        }
+    }
+}
+
+static uint16_t nvme_auto_open_zone(NvmeNamespace *ns, NvmeZone *zone)
+{
+    uint16_t status = NVME_SUCCESS;
+    uint8_t zs = nvme_get_zone_state(zone);
+
+    if (zs == NVME_ZONE_STATE_EMPTY) {
+        nvme_auto_transition_zone(ns);
+        status = nvme_aor_check(ns, 1, 1);
+    } else if (zs == NVME_ZONE_STATE_CLOSED) {
+        nvme_auto_transition_zone(ns);
+        status = nvme_aor_check(ns, 0, 1);
+    }
+
+    return status;
+}
+
 static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req,
                                       bool failed)
 {
@@ -1188,7 +1242,11 @@ static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req,
         switch (nvme_get_zone_state(zone)) {
         case NVME_ZONE_STATE_IMPLICITLY_OPEN:
         case NVME_ZONE_STATE_EXPLICITLY_OPEN:
+            nvme_aor_dec_open(ns);
+            /* fall through */
         case NVME_ZONE_STATE_CLOSED:
+            nvme_aor_dec_active(ns);
+            /* fall through */
         case NVME_ZONE_STATE_EMPTY:
             nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL);
             /* fall through */
@@ -1215,7 +1273,10 @@ static uint64_t nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
         zs = nvme_get_zone_state(zone);
         switch (zs) {
         case NVME_ZONE_STATE_EMPTY:
+            nvme_aor_inc_active(ns);
+            /* fall through */
         case NVME_ZONE_STATE_CLOSED:
+            nvme_aor_inc_open(ns);
             nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN);
         }
     }
@@ -1556,6 +1617,11 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
             goto invalid;
         }
 
+        status = nvme_auto_open_zone(ns, zone);
+        if (status != NVME_SUCCESS) {
+            goto invalid;
+        }
+
         if (append) {
             slba = zone->w_ptr;
         }
@@ -1651,9 +1717,26 @@ enum NvmeZoneProcessingMask {
 static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
                                enum NvmeZoneState state)
 {
+    uint16_t status;
+
     switch (state) {
     case NVME_ZONE_STATE_EMPTY:
+        status = nvme_aor_check(ns, 1, 0);
+        if (status != NVME_SUCCESS) {
+            return status;
+        }
+        nvme_aor_inc_active(ns);
+        /* fall through */
     case NVME_ZONE_STATE_CLOSED:
+        status = nvme_aor_check(ns, 0, 1);
+        if (status != NVME_SUCCESS) {
+            if (state == NVME_ZONE_STATE_EMPTY) {
+                nvme_aor_dec_active(ns);
+            }
+            return status;
+        }
+        nvme_aor_inc_open(ns);
+        /* fall through */
     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
         nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EXPLICITLY_OPEN);
         /* fall through */
@@ -1670,6 +1753,7 @@ static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone,
     switch (state) {
     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
+        nvme_aor_dec_open(ns);
         nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
         /* fall through */
     case NVME_ZONE_STATE_CLOSED:
@@ -1685,7 +1769,11 @@ static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone,
     switch (state) {
     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
+        nvme_aor_dec_open(ns);
+        /* fall through */
     case NVME_ZONE_STATE_CLOSED:
+        nvme_aor_dec_active(ns);
+        /* fall through */
     case NVME_ZONE_STATE_EMPTY:
         zone->w_ptr = nvme_zone_wr_boundary(zone);
         zone->d.wp = zone->w_ptr;
@@ -1704,7 +1792,11 @@ static uint16_t nvme_reset_zone(NvmeNamespace *ns, NvmeZone *zone,
     switch (state) {
     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
+        nvme_aor_dec_open(ns);
+        /* fall through */
     case NVME_ZONE_STATE_CLOSED:
+        nvme_aor_dec_active(ns);
+        /* fall through */
     case NVME_ZONE_STATE_FULL:
         zone->w_ptr = zone->d.zslba;
         zone->d.wp = zone->w_ptr;
diff --git a/hw/block/trace-events b/hw/block/trace-events
index 64366d89366c..bbc10bc7371a 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -128,6 +128,8 @@ pci_nvme_err_append_not_at_start(uint64_t slba, uint64_t zone) "appending at slb
 pci_nvme_err_zone_write_not_ok(uint64_t slba, uint32_t nlb, uint16_t status) "slba=%"PRIu64", nlb=%"PRIu32", status=0x%"PRIx16""
 pci_nvme_err_zone_read_not_ok(uint64_t slba, uint32_t nlb, uint16_t status) "slba=%"PRIu64", nlb=%"PRIu32", status=0x%"PRIx16""
 pci_nvme_err_append_too_large(uint64_t slba, uint32_t nlb, uint8_t zasl) "slba=%"PRIu64", nlb=%"PRIu32", zasl=%"PRIu8""
+pci_nvme_err_insuff_active_res(uint32_t max_active) "max_active=%"PRIu32" zone limit exceeded"
+pci_nvme_err_insuff_open_res(uint32_t max_open) "max_open=%"PRIu32" zone limit exceeded"
 pci_nvme_err_invalid_iocsci(uint32_t idx) "unsupported command set combination index %"PRIu32""
 pci_nvme_err_invalid_del_sq(uint16_t qid) "invalid submission queue deletion, sid=%"PRIu16""
 pci_nvme_err_invalid_create_sq_cqid(uint16_t cqid) "failed creating submission queue, invalid cqid=%"PRIu16""
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 18/56] hw/block/nvme: Support Zone Descriptor Extensions
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (16 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 17/56] hw/block/nvme: Introduce max active and open zone limits Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 19/56] hw/block/nvme: Document zoned parameters in usage text Klaus Jensen
                   ` (38 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Niklas Cassel, Dmitry Fomichev,
	Klaus Jensen, Max Reitz, Klaus Jensen, Hans Holmberg,
	Stefan Hajnoczi, Keith Busch
From: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Zone Descriptor Extension is a label that can be assigned to a zone.
It can be set to an Empty zone and it stays assigned until the zone
is reset.
This commit adds a new optional module property,
"zoned.descr_ext_size". Its value must be a multiple of 64 bytes.
If this value is non-zero, it becomes possible to assign extensions
of that size to any Empty zones. The default value for this property
is 0, therefore setting extensions is disabled by default.
Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
Signed-off-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Reviewed-by: Niklas Cassel <Niklas.Cassel@wdc.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme-ns.h    |  8 +++++++
 hw/block/nvme-ns.c    | 25 ++++++++++++++++++--
 hw/block/nvme.c       | 53 +++++++++++++++++++++++++++++++++++++++++--
 hw/block/trace-events |  2 ++
 4 files changed, 84 insertions(+), 4 deletions(-)
diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 7e1fd26909ba..f8f3c28c360b 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -35,6 +35,7 @@ typedef struct NvmeNamespaceParams {
     uint64_t zone_cap_bs;
     uint32_t max_active_zones;
     uint32_t max_open_zones;
+    uint32_t zd_extension_size;
 } NvmeNamespaceParams;
 
 typedef struct NvmeNamespace {
@@ -56,6 +57,7 @@ typedef struct NvmeNamespace {
     uint64_t        zone_size;
     uint64_t        zone_capacity;
     uint32_t        zone_size_log2;
+    uint8_t         *zd_extensions;
     int32_t         nr_open_zones;
     int32_t         nr_active_zones;
 
@@ -129,6 +131,12 @@ static inline bool nvme_wp_is_valid(NvmeZone *zone)
            st != NVME_ZONE_STATE_OFFLINE;
 }
 
+static inline uint8_t *nvme_get_zd_extension(NvmeNamespace *ns,
+                                             uint32_t zone_idx)
+{
+    return &ns->zd_extensions[zone_idx * ns->params.zd_extension_size];
+}
+
 static inline void nvme_aor_inc_open(NvmeNamespace *ns)
 {
     assert(ns->nr_open_zones >= 0);
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index c55afc1920a3..838b15c064f5 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -150,6 +150,18 @@ static int nvme_ns_zoned_check_calc_geometry(NvmeNamespace *ns, Error **errp)
         return -1;
     }
 
+    if (ns->params.zd_extension_size) {
+        if (ns->params.zd_extension_size & 0x3f) {
+            error_setg(errp,
+                "zone descriptor extension size must be a multiple of 64B");
+            return -1;
+        }
+        if ((ns->params.zd_extension_size >> 6) > 0xff) {
+            error_setg(errp, "zone descriptor extension size is too large");
+            return -1;
+        }
+    }
+
     return 0;
 }
 
@@ -161,6 +173,10 @@ static void nvme_ns_zoned_init_state(NvmeNamespace *ns)
     int i;
 
     ns->zone_array = g_new0(NvmeZone, ns->num_zones);
+    if (ns->params.zd_extension_size) {
+        ns->zd_extensions = g_malloc0(ns->params.zd_extension_size *
+                                      ns->num_zones);
+    }
 
     QTAILQ_INIT(&ns->exp_open_zones);
     QTAILQ_INIT(&ns->imp_open_zones);
@@ -203,7 +219,8 @@ static void nvme_ns_init_zoned(NvmeCtrl *n, NvmeNamespace *ns, int lba_index)
     id_ns_z->ozcs = ns->params.cross_zone_read ? 0x01 : 0x00;
 
     id_ns_z->lbafe[lba_index].zsze = cpu_to_le64(ns->zone_size);
-    id_ns_z->lbafe[lba_index].zdes = 0;
+    id_ns_z->lbafe[lba_index].zdes =
+        ns->params.zd_extension_size >> 6; /* Units of 64B */
 
     ns->csi = NVME_CSI_ZONED;
     ns->id_ns.nsze = cpu_to_le64(ns->num_zones * ns->zone_size);
@@ -219,7 +236,8 @@ static void nvme_clear_zone(NvmeNamespace *ns, NvmeZone *zone)
 
     zone->w_ptr = zone->d.wp;
     state = nvme_get_zone_state(zone);
-    if (zone->d.wp != zone->d.zslba) {
+    if (zone->d.wp != zone->d.zslba ||
+        (zone->d.za & NVME_ZA_ZD_EXT_VALID)) {
         if (state != NVME_ZONE_STATE_CLOSED) {
             trace_pci_nvme_clear_ns_close(state, zone->d.zslba);
             nvme_set_zone_state(zone, NVME_ZONE_STATE_CLOSED);
@@ -315,6 +333,7 @@ void nvme_ns_cleanup(NvmeNamespace *ns)
     if (ns->params.zoned) {
         g_free(ns->id_ns_zoned);
         g_free(ns->zone_array);
+        g_free(ns->zd_extensions);
     }
 }
 
@@ -347,6 +366,8 @@ static Property nvme_ns_props[] = {
                        params.max_active_zones, 0),
     DEFINE_PROP_UINT32("zoned.max_open", NvmeNamespace,
                        params.max_open_zones, 0),
+    DEFINE_PROP_UINT32("zoned.descr_ext_size", NvmeNamespace,
+                       params.zd_extension_size, 0),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index c07dbcd2a809..4bcc7660736b 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1823,6 +1823,25 @@ static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone,
     }
 }
 
+static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone)
+{
+    uint16_t status;
+    uint8_t state = nvme_get_zone_state(zone);
+
+    if (state == NVME_ZONE_STATE_EMPTY) {
+        status = nvme_aor_check(ns, 1, 0);
+        if (status != NVME_SUCCESS) {
+            return status;
+        }
+        nvme_aor_inc_active(ns);
+        zone->d.za |= NVME_ZA_ZD_EXT_VALID;
+        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
+        return NVME_SUCCESS;
+    }
+
+    return NVME_ZONE_INVAL_TRANSITION;
+}
+
 static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone,
                                     enum NvmeZoneProcessingMask proc_mask,
                                     op_handler_t op_hndlr)
@@ -1921,6 +1940,7 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
     NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
     NvmeNamespace *ns = req->ns;
     NvmeZone *zone;
+    uint8_t *zd_ext;
     uint32_t dw13 = le32_to_cpu(cmd->cdw13);
     uint64_t slba = 0;
     uint32_t zone_idx = 0;
@@ -1993,7 +2013,22 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
 
     case NVME_ZONE_ACTION_SET_ZD_EXT:
         trace_pci_nvme_set_descriptor_extension(slba, zone_idx);
-        return NVME_INVALID_FIELD | NVME_DNR;
+        if (all || !ns->params.zd_extension_size) {
+            return NVME_INVALID_FIELD | NVME_DNR;
+        }
+        zd_ext = nvme_get_zd_extension(ns, zone_idx);
+        status = nvme_dma(n, zd_ext, ns->params.zd_extension_size,
+                          DMA_DIRECTION_TO_DEVICE, req);
+        if (status) {
+            trace_pci_nvme_err_zd_extension_map_error(zone_idx);
+            return status;
+        }
+
+        status = nvme_set_zd_ext(ns, zone);
+        if (status == NVME_SUCCESS) {
+            trace_pci_nvme_zd_extension_set(zone_idx);
+            return status;
+        }
         break;
 
     default:
@@ -2063,7 +2098,10 @@ static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
     }
 
     zra = dw13 & 0xff;
-    if (zra != NVME_ZONE_REPORT) {
+    if (zra != NVME_ZONE_REPORT && zra != NVME_ZONE_REPORT_EXTENDED) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+    if (zra == NVME_ZONE_REPORT_EXTENDED && !ns->params.zd_extension_size) {
         return NVME_INVALID_FIELD | NVME_DNR;
     }
 
@@ -2085,6 +2123,9 @@ static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
     partial = (dw13 >> 16) & 0x01;
 
     zone_entry_sz = sizeof(NvmeZoneDescr);
+    if (zra == NVME_ZONE_REPORT_EXTENDED) {
+        zone_entry_sz += ns->params.zd_extension_size;
+    }
 
     max_zones = (data_size - sizeof(NvmeZoneReportHeader)) / zone_entry_sz;
     buf = g_malloc0(data_size);
@@ -2120,6 +2161,14 @@ static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
                 z->wp = cpu_to_le64(~0ULL);
             }
 
+            if (zra == NVME_ZONE_REPORT_EXTENDED) {
+                if (zone->d.za & NVME_ZA_ZD_EXT_VALID) {
+                    memcpy(buf_p, nvme_get_zd_extension(ns, zone_idx),
+                           ns->params.zd_extension_size);
+                }
+                buf_p += ns->params.zd_extension_size;
+            }
+
             max_zones--;
         }
     }
diff --git a/hw/block/trace-events b/hw/block/trace-events
index bbc10bc7371a..a455979e5911 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -101,6 +101,7 @@ pci_nvme_finish_zone(uint64_t slba, uint32_t zone_idx, int all) "finish zone, sl
 pci_nvme_reset_zone(uint64_t slba, uint32_t zone_idx, int all) "reset zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32""
 pci_nvme_offline_zone(uint64_t slba, uint32_t zone_idx, int all) "offline zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32""
 pci_nvme_set_descriptor_extension(uint64_t slba, uint32_t zone_idx) "set zone descriptor extension, slba=%"PRIu64", idx=%"PRIu32""
+pci_nvme_zd_extension_set(uint32_t zone_idx) "set descriptor extension for zone_idx=%"PRIu32""
 pci_nvme_clear_ns_close(uint32_t state, uint64_t slba) "zone state=%"PRIu32", slba=%"PRIu64" transitioned to Closed state"
 pci_nvme_clear_ns_reset(uint32_t state, uint64_t slba) "zone state=%"PRIu32", slba=%"PRIu64" transitioned to Empty state"
 
@@ -130,6 +131,7 @@ pci_nvme_err_zone_read_not_ok(uint64_t slba, uint32_t nlb, uint16_t status) "slb
 pci_nvme_err_append_too_large(uint64_t slba, uint32_t nlb, uint8_t zasl) "slba=%"PRIu64", nlb=%"PRIu32", zasl=%"PRIu8""
 pci_nvme_err_insuff_active_res(uint32_t max_active) "max_active=%"PRIu32" zone limit exceeded"
 pci_nvme_err_insuff_open_res(uint32_t max_open) "max_open=%"PRIu32" zone limit exceeded"
+pci_nvme_err_zd_extension_map_error(uint32_t zone_idx) "can't map descriptor extension for zone_idx=%"PRIu32""
 pci_nvme_err_invalid_iocsci(uint32_t idx) "unsupported command set combination index %"PRIu32""
 pci_nvme_err_invalid_del_sq(uint16_t qid) "invalid submission queue deletion, sid=%"PRIu16""
 pci_nvme_err_invalid_create_sq_cqid(uint16_t cqid) "failed creating submission queue, invalid cqid=%"PRIu16""
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 19/56] hw/block/nvme: Document zoned parameters in usage text
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (17 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 18/56] hw/block/nvme: Support Zone Descriptor Extensions Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 20/56] hw/block/nvme: fix for non-msix machines Klaus Jensen
                   ` (37 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Niklas Cassel, Dmitry Fomichev,
	Klaus Jensen, Max Reitz, Klaus Jensen, Stefan Hajnoczi,
	Keith Busch
From: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Added brief descriptions of the new device properties that are
now available to users to configure features of Zoned Namespace
Command Set in the emulator.
This patch is for documentation only, no functionality change.
Signed-off-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Reviewed-by: Niklas Cassel <Niklas.Cassel@wdc.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme.c | 43 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 38 insertions(+), 5 deletions(-)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 4bcc7660736b..f4f1487afeb1 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -9,7 +9,7 @@
  */
 
 /**
- * Reference Specs: http://www.nvmexpress.org, 1.2, 1.1, 1.0e
+ * Reference Specs: http://www.nvmexpress.org, 1.4, 1.3, 1.2, 1.1, 1.0e
  *
  *  https://nvmexpress.org/developers/nvme-specification/
  */
@@ -22,8 +22,9 @@
  *              [pmrdev=<mem_backend_file_id>,] \
  *              max_ioqpairs=<N[optional]>, \
  *              aerl=<N[optional]>, aer_max_queued=<N[optional]>, \
- *              mdts=<N[optional]>
- *      -device nvme-ns,drive=<drive_id>,bus=bus_name,nsid=<nsid>
+ *              mdts=<N[optional]>,zoned.append_size_limit=<N[optional]> \
+ *      -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=<nsid>,\
+ *              zoned=<true|false[optional]>
  *
  * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
  * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
@@ -41,14 +42,46 @@
  * ~~~~~~~~~~~~~~~~~~~~~~
  * - `aerl`
  *   The Asynchronous Event Request Limit (AERL). Indicates the maximum number
- *   of concurrently outstanding Asynchronous Event Request commands suppoert
+ *   of concurrently outstanding Asynchronous Event Request commands support
  *   by the controller. This is a 0's based value.
  *
  * - `aer_max_queued`
  *   This is the maximum number of events that the device will enqueue for
- *   completion when there are no oustanding AERs. When the maximum number of
+ *   completion when there are no outstanding AERs. When the maximum number of
  *   enqueued events are reached, subsequent events will be dropped.
  *
+ * - `zoned.append_size_limit`
+ *   The maximum I/O size in bytes that is allowed in Zone Append command.
+ *   The default is 128KiB. Since internally this this value is maintained as
+ *   ZASL = log2(<maximum append size> / <page size>), some values assigned
+ *   to this property may be rounded down and result in a lower maximum ZA
+ *   data size being in effect. By setting this property to 0, users can make
+ *   ZASL to be equal to MDTS. This property only affects zoned namespaces.
+ *
+ * Setting `zoned` to true selects Zoned Command Set at the namespace.
+ * In this case, the following namespace properties are available to configure
+ * zoned operation:
+ *     zoned.zone_size=<zone size in bytes, default: 128MiB>
+ *         The number may be followed by K, M, G as in kilo-, mega- or giga-.
+ *
+ *     zoned.zone_capacity=<zone capacity in bytes, default: zone size>
+ *         The value 0 (default) forces zone capacity to be the same as zone
+ *         size. The value of this property may not exceed zone size.
+ *
+ *     zoned.descr_ext_size=<zone descriptor extension size, default 0>
+ *         This value needs to be specified in 64B units. If it is zero,
+ *         namespace(s) will not support zone descriptor extensions.
+ *
+ *     zoned.max_active=<Maximum Active Resources (zones), default: 0>
+ *         The default value means there is no limit to the number of
+ *         concurrently active zones.
+ *
+ *     zoned.max_open=<Maximum Open Resources (zones), default: 0>
+ *         The default value means there is no limit to the number of
+ *         concurrently open zones.
+ *
+ *     zoned.cross_zone_read=<enable RAZB, default: false>
+ *         Setting this property to true enables Read Across Zone Boundaries.
  */
 
 #include "qemu/osdep.h"
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 20/56] hw/block/nvme: fix for non-msix machines
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (18 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 19/56] hw/block/nvme: Document zoned parameters in usage text Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 21/56] hw/block/nvme: conditionally enable DULBE for zoned namespaces Klaus Jensen
                   ` (36 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Klaus Jensen, Max Reitz,
	Klaus Jensen, Stefan Hajnoczi, Keith Busch,
	Philippe Mathieu-Daudé, Guenter Roeck
From: Klaus Jensen <k.jensen@samsung.com>
Commit 1c0c2163aa08 ("hw/block/nvme: verify msix_init_exclusive_bar()
return value") had the unintended effect of breaking support on
several platforms not supporting MSI-X.
Still check for errors, but only report that MSI-X is unsupported
instead of bailing out.
Fixes: 1c0c2163aa08 ("hw/block/nvme: verify msix_init_exclusive_bar() return value")
Fixes: fbf2e5375e33 ("hw/block/nvme: Verify msix_vector_use() returned value")
Reported-by: Guenter Roeck <linux@roeck-us.net>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme.c | 31 ++++++++++++++++++++++---------
 1 file changed, 22 insertions(+), 9 deletions(-)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index f4f1487afeb1..b0b7abf3312e 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -2590,7 +2590,9 @@ static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
 {
     n->cq[cq->cqid] = NULL;
     timer_free(cq->timer);
-    msix_vector_unuse(&n->parent_obj, cq->vector);
+    if (msix_enabled(&n->parent_obj)) {
+        msix_vector_unuse(&n->parent_obj, cq->vector);
+    }
     if (cq->cqid) {
         g_free(cq);
     }
@@ -2624,8 +2626,10 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
 {
     int ret;
 
-    ret = msix_vector_use(&n->parent_obj, vector);
-    assert(ret == 0);
+    if (msix_enabled(&n->parent_obj)) {
+        ret = msix_vector_use(&n->parent_obj, vector);
+        assert(ret == 0);
+    }
     cq->ctrl = n;
     cq->cqid = cqid;
     cq->size = size;
@@ -4161,9 +4165,12 @@ static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev)
                      PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmrdev->mr);
 }
 
-static void nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
+static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
 {
     uint8_t *pci_conf = pci_dev->config;
+    int ret;
+
+    Error *err = NULL;
 
     pci_conf[PCI_INTERRUPT_PIN] = 1;
     pci_config_set_prog_interface(pci_conf, 0x2);
@@ -4183,8 +4190,14 @@ static void nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
                           n->reg_size);
     pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
                      PCI_BASE_ADDRESS_MEM_TYPE_64, &n->iomem);
-    if (msix_init_exclusive_bar(pci_dev, n->params.msix_qsize, 4, errp)) {
-        return;
+    ret = msix_init_exclusive_bar(pci_dev, n->params.msix_qsize, 4, &err);
+    if (ret < 0) {
+        if (ret == -ENOTSUP) {
+            warn_report_err(err);
+        } else {
+            error_propagate(errp, err);
+            return ret;
+        }
     }
 
     if (n->params.cmb_size_mb) {
@@ -4192,6 +4205,8 @@ static void nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
     } else if (n->pmrdev) {
         nvme_init_pmr(n, pci_dev);
     }
+
+    return 0;
 }
 
 static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
@@ -4280,9 +4295,7 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
                         &pci_dev->qdev, n->parent_obj.qdev.id);
 
     nvme_init_state(n);
-    nvme_init_pci(n, pci_dev, &local_err);
-    if (local_err) {
-        error_propagate(errp, local_err);
+    if (nvme_init_pci(n, pci_dev, errp)) {
         return;
     }
 
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 21/56] hw/block/nvme: conditionally enable DULBE for zoned namespaces
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (19 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 20/56] hw/block/nvme: fix for non-msix machines Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 22/56] hw/block/nvme: fix shutdown/reset logic Klaus Jensen
                   ` (35 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Klaus Jensen, Max Reitz,
	Klaus Jensen, Stefan Hajnoczi, Keith Busch
From: Klaus Jensen <k.jensen@samsung.com>
The device uses the BDRV_BLOCK_ZERO flag to determine the "deallocated"
status of logical blocks. Since the zoned namespaces command set
specification defines that logical blocks SHALL be marked as deallocated
when the zone is in the Empty or Offline states, DULBE can only be
supported if the zone size is a multiple of the calculated deallocation
granularity (reported in NPDG) which depends on the underlying block
device cluster size (if applicable) or the configured
discard_granularity.
Reviewed-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme-ns.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 838b15c064f5..9be170abb78d 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -16,6 +16,7 @@
 #include "qemu/units.h"
 #include "qemu/cutils.h"
 #include "qemu/log.h"
+#include "qemu/error-report.h"
 #include "hw/block/block.h"
 #include "hw/pci/pci.h"
 #include "sysemu/sysemu.h"
@@ -227,6 +228,22 @@ static void nvme_ns_init_zoned(NvmeCtrl *n, NvmeNamespace *ns, int lba_index)
     ns->id_ns.ncap = ns->id_ns.nsze;
     ns->id_ns.nuse = ns->id_ns.ncap;
 
+    /*
+     * The device uses the BDRV_BLOCK_ZERO flag to determine the "deallocated"
+     * status of logical blocks. Since the spec defines that logical blocks
+     * SHALL be deallocated when then zone is in the Empty or Offline states,
+     * we can only support DULBE if the zone size is a multiple of the
+     * calculated NPDG.
+     */
+    if (ns->zone_size % (ns->id_ns.npdg + 1)) {
+        warn_report("the zone size (%"PRIu64" blocks) is not a multiple of "
+                    "the calculated deallocation granularity (%d blocks); "
+                    "DULBE support disabled",
+                    ns->zone_size, ns->id_ns.npdg + 1);
+
+        ns->id_ns.nsfeat &= ~0x4;
+    }
+
     ns->id_ns_zoned = id_ns_z;
 }
 
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 22/56] hw/block/nvme: fix shutdown/reset logic
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (20 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 21/56] hw/block/nvme: conditionally enable DULBE for zoned namespaces Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 23/56] hw/block/nvme: merge implicitly/explicitly opened processing masks Klaus Jensen
                   ` (34 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Dmitry Fomichev, Klaus Jensen,
	Max Reitz, Klaus Jensen, Stefan Hajnoczi, Keith Busch
From: Klaus Jensen <k.jensen@samsung.com>
A shutdown is only about flushing stuff. It is the host that should
delete any queues, so do not perform a reset here.
Also, on shutdown, make sure that the PMR is flushed if in use.
Fixes: 368f4e752cf9 ("hw/block/nvme: Process controller reset and shutdown differently")
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Tested-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Reviewed-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
---
 hw/block/nvme.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index b0b7abf3312e..551878338e5d 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -3419,7 +3419,7 @@ static void nvme_process_sq(void *opaque)
     }
 }
 
-static void nvme_clear_ctrl(NvmeCtrl *n)
+static void nvme_ctrl_reset(NvmeCtrl *n)
 {
     NvmeNamespace *ns;
     int i;
@@ -3453,11 +3453,7 @@ static void nvme_clear_ctrl(NvmeCtrl *n)
     n->aer_queued = 0;
     n->outstanding_aers = 0;
     n->qs_created = false;
-}
 
-static void nvme_ctrl_reset(NvmeCtrl *n)
-{
-    nvme_clear_ctrl(n);
     n->bar.cc = 0;
 }
 
@@ -3466,7 +3462,9 @@ static void nvme_ctrl_shutdown(NvmeCtrl *n)
     NvmeNamespace *ns;
     int i;
 
-    nvme_clear_ctrl(n);
+    if (n->pmrdev) {
+        memory_region_msync(&n->pmrdev->mr, 0, n->pmrdev->size);
+    }
 
     for (i = 1; i <= n->num_namespaces; i++) {
         ns = nvme_ns(n, i);
@@ -4318,7 +4316,7 @@ static void nvme_exit(PCIDevice *pci_dev)
     NvmeNamespace *ns;
     int i;
 
-    nvme_ctrl_shutdown(n);
+    nvme_ctrl_reset(n);
 
     for (i = 1; i <= n->num_namespaces; i++) {
         ns = nvme_ns(n, i);
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 23/56] hw/block/nvme: merge implicitly/explicitly opened processing masks
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (21 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 22/56] hw/block/nvme: fix shutdown/reset logic Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 24/56] hw/block/nvme: enum style fix Klaus Jensen
                   ` (33 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Dmitry Fomichev, Klaus Jensen,
	Max Reitz, Klaus Jensen, Stefan Hajnoczi, Keith Busch
From: Klaus Jensen <k.jensen@samsung.com>
Implicitly and explicitly opended zones are always bulk processed
together, so merge the two processing masks.
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
Tested-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Reviewed-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
---
 hw/block/nvme.c | 27 +++++++++++----------------
 1 file changed, 11 insertions(+), 16 deletions(-)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 551878338e5d..a7245a7e05a1 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1740,11 +1740,10 @@ typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *,
 
 enum NvmeZoneProcessingMask {
     NVME_PROC_CURRENT_ZONE    = 0,
-    NVME_PROC_IMP_OPEN_ZONES  = 1 << 0,
-    NVME_PROC_EXP_OPEN_ZONES  = 1 << 1,
-    NVME_PROC_CLOSED_ZONES    = 1 << 2,
-    NVME_PROC_READ_ONLY_ZONES = 1 << 3,
-    NVME_PROC_FULL_ZONES      = 1 << 4,
+    NVME_PROC_OPENED_ZONES    = 1 << 0,
+    NVME_PROC_CLOSED_ZONES    = 1 << 1,
+    NVME_PROC_READ_ONLY_ZONES = 1 << 2,
+    NVME_PROC_FULL_ZONES      = 1 << 3,
 };
 
 static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
@@ -1885,10 +1884,8 @@ static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone,
 
     switch (zs) {
     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
-        proc_zone = proc_mask & NVME_PROC_IMP_OPEN_ZONES;
-        break;
     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
-        proc_zone = proc_mask & NVME_PROC_EXP_OPEN_ZONES;
+        proc_zone = proc_mask & NVME_PROC_OPENED_ZONES;
         break;
     case NVME_ZONE_STATE_CLOSED:
         proc_zone = proc_mask & NVME_PROC_CLOSED_ZONES;
@@ -1929,15 +1926,14 @@ static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone,
                 }
             }
         }
-        if (proc_mask & NVME_PROC_IMP_OPEN_ZONES) {
+        if (proc_mask & NVME_PROC_OPENED_ZONES) {
             QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) {
                 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr);
                 if (status != NVME_SUCCESS) {
                     goto out;
                 }
             }
-        }
-        if (proc_mask & NVME_PROC_EXP_OPEN_ZONES) {
+
             QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) {
                 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr);
                 if (status != NVME_SUCCESS) {
@@ -2012,7 +2008,7 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
 
     case NVME_ZONE_ACTION_CLOSE:
         if (all) {
-            proc_mask = NVME_PROC_IMP_OPEN_ZONES | NVME_PROC_EXP_OPEN_ZONES;
+            proc_mask = NVME_PROC_OPENED_ZONES;
         }
         trace_pci_nvme_close_zone(slba, zone_idx, all);
         status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone);
@@ -2020,8 +2016,7 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
 
     case NVME_ZONE_ACTION_FINISH:
         if (all) {
-            proc_mask = NVME_PROC_IMP_OPEN_ZONES | NVME_PROC_EXP_OPEN_ZONES |
-                        NVME_PROC_CLOSED_ZONES;
+            proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES;
         }
         trace_pci_nvme_finish_zone(slba, zone_idx, all);
         status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone);
@@ -2029,8 +2024,8 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
 
     case NVME_ZONE_ACTION_RESET:
         if (all) {
-            proc_mask = NVME_PROC_IMP_OPEN_ZONES | NVME_PROC_EXP_OPEN_ZONES |
-                        NVME_PROC_CLOSED_ZONES | NVME_PROC_FULL_ZONES;
+            proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES |
+                NVME_PROC_FULL_ZONES;
         }
         trace_pci_nvme_reset_zone(slba, zone_idx, all);
         status = nvme_do_zone_op(ns, zone, proc_mask, nvme_reset_zone);
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 24/56] hw/block/nvme: enum style fix
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (22 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 23/56] hw/block/nvme: merge implicitly/explicitly opened processing masks Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 25/56] hw/block/nvme: zero out zones on reset Klaus Jensen
                   ` (32 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Dmitry Fomichev, Klaus Jensen,
	Max Reitz, Klaus Jensen, Stefan Hajnoczi, Keith Busch
From: Klaus Jensen <k.jensen@samsung.com>
Align with existing style and use a typedef for header-file enums.
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
Tested-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Reviewed-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
---
 hw/block/nvme-ns.h   |  4 ++--
 include/block/nvme.h |  4 ++--
 hw/block/nvme.c      | 19 +++++++++----------
 3 files changed, 13 insertions(+), 14 deletions(-)
diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index f8f3c28c360b..a0baa5f6d44c 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -102,12 +102,12 @@ static inline size_t nvme_l2b(NvmeNamespace *ns, uint64_t lba)
 
 typedef struct NvmeCtrl NvmeCtrl;
 
-static inline enum NvmeZoneState nvme_get_zone_state(NvmeZone *zone)
+static inline NvmeZoneState nvme_get_zone_state(NvmeZone *zone)
 {
     return zone->d.zs >> 4;
 }
 
-static inline void nvme_set_zone_state(NvmeZone *zone, enum NvmeZoneState state)
+static inline void nvme_set_zone_state(NvmeZone *zone, NvmeZoneState state)
 {
     zone->d.zs = state << 4;
 }
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 9494246f1f59..45b2678db1f0 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -1212,7 +1212,7 @@ typedef struct QEMU_PACKED NvmeZoneDescr {
     uint8_t     rsvd32[32];
 } NvmeZoneDescr;
 
-enum NvmeZoneState {
+typedef enum NvmeZoneState {
     NVME_ZONE_STATE_RESERVED         = 0x00,
     NVME_ZONE_STATE_EMPTY            = 0x01,
     NVME_ZONE_STATE_IMPLICITLY_OPEN  = 0x02,
@@ -1221,7 +1221,7 @@ enum NvmeZoneState {
     NVME_ZONE_STATE_READ_ONLY        = 0x0D,
     NVME_ZONE_STATE_FULL             = 0x0E,
     NVME_ZONE_STATE_OFFLINE          = 0x0F,
-};
+} NvmeZoneState;
 
 static inline void _nvme_check_size(void)
 {
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index a7245a7e05a1..a5cf798bbbaa 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -198,7 +198,7 @@ static uint16_t nvme_sqid(NvmeRequest *req)
 }
 
 static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone,
-                                   enum NvmeZoneState state)
+                                   NvmeZoneState state)
 {
     if (QTAILQ_IN_USE(zone, entry)) {
         switch (nvme_get_zone_state(zone)) {
@@ -1735,8 +1735,7 @@ static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c,
     return NVME_SUCCESS;
 }
 
-typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *,
-                                 enum NvmeZoneState);
+typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState);
 
 enum NvmeZoneProcessingMask {
     NVME_PROC_CURRENT_ZONE    = 0,
@@ -1747,7 +1746,7 @@ enum NvmeZoneProcessingMask {
 };
 
 static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
-                               enum NvmeZoneState state)
+                               NvmeZoneState state)
 {
     uint16_t status;
 
@@ -1780,7 +1779,7 @@ static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
 }
 
 static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone,
-                                enum NvmeZoneState state)
+                                NvmeZoneState state)
 {
     switch (state) {
     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
@@ -1796,7 +1795,7 @@ static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone,
 }
 
 static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone,
-                                 enum NvmeZoneState state)
+                                 NvmeZoneState state)
 {
     switch (state) {
     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
@@ -1819,7 +1818,7 @@ static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone,
 }
 
 static uint16_t nvme_reset_zone(NvmeNamespace *ns, NvmeZone *zone,
-                                enum NvmeZoneState state)
+                                NvmeZoneState state)
 {
     switch (state) {
     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
@@ -1842,7 +1841,7 @@ static uint16_t nvme_reset_zone(NvmeNamespace *ns, NvmeZone *zone,
 }
 
 static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone,
-                                  enum NvmeZoneState state)
+                                  NvmeZoneState state)
 {
     switch (state) {
     case NVME_ZONE_STATE_READ_ONLY:
@@ -1879,7 +1878,7 @@ static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone,
                                     op_handler_t op_hndlr)
 {
     uint16_t status = NVME_SUCCESS;
-    enum NvmeZoneState zs = nvme_get_zone_state(zone);
+    NvmeZoneState zs = nvme_get_zone_state(zone);
     bool proc_zone;
 
     switch (zs) {
@@ -2077,7 +2076,7 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
 
 static bool nvme_zone_matches_filter(uint32_t zafs, NvmeZone *zl)
 {
-    enum NvmeZoneState zs = nvme_get_zone_state(zl);
+    NvmeZoneState zs = nvme_get_zone_state(zl);
 
     switch (zafs) {
     case NVME_ZONE_REPORT_ALL:
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 25/56] hw/block/nvme: zero out zones on reset
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (23 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 24/56] hw/block/nvme: enum style fix Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 26/56] hw/block/nvme: add missing string representations for commands Klaus Jensen
                   ` (31 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Dmitry Fomichev, Klaus Jensen,
	Max Reitz, Klaus Jensen, Stefan Hajnoczi, Keith Busch
From: Klaus Jensen <k.jensen@samsung.com>
The zoned command set specification states that "All logical blocks in a
zone *shall* be marked as deallocated when [the zone is reset]". Since
the device guarantees 0x00 to be read from deallocated blocks we have to
issue a pwrite_zeroes since we cannot be sure that a discard will do
anything. But typically, this will be achieved with an efficient
unmap/discard operation.
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
Tested-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Reviewed-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
---
 hw/block/nvme.c       | 150 +++++++++++++++++++++++++++++++-----------
 hw/block/trace-events |   1 +
 2 files changed, 113 insertions(+), 38 deletions(-)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index a5cf798bbbaa..7222eff755ee 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1371,6 +1371,53 @@ static void nvme_aio_discard_cb(void *opaque, int ret)
     nvme_enqueue_req_completion(nvme_cq(req), req);
 }
 
+struct nvme_zone_reset_ctx {
+    NvmeRequest *req;
+    NvmeZone    *zone;
+};
+
+static void nvme_aio_zone_reset_cb(void *opaque, int ret)
+{
+    struct nvme_zone_reset_ctx *ctx = opaque;
+    NvmeRequest *req = ctx->req;
+    NvmeNamespace *ns = req->ns;
+    NvmeZone *zone = ctx->zone;
+    uintptr_t *resets = (uintptr_t *)&req->opaque;
+
+    g_free(ctx);
+
+    trace_pci_nvme_aio_zone_reset_cb(nvme_cid(req), zone->d.zslba);
+
+    if (!ret) {
+        switch (nvme_get_zone_state(zone)) {
+        case NVME_ZONE_STATE_EXPLICITLY_OPEN:
+        case NVME_ZONE_STATE_IMPLICITLY_OPEN:
+            nvme_aor_dec_open(ns);
+            /* fall through */
+        case NVME_ZONE_STATE_CLOSED:
+            nvme_aor_dec_active(ns);
+            /* fall through */
+        case NVME_ZONE_STATE_FULL:
+            zone->w_ptr = zone->d.zslba;
+            zone->d.wp = zone->w_ptr;
+            nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
+            /* fall through */
+        default:
+            break;
+        }
+    } else {
+        nvme_aio_err(req, ret);
+    }
+
+    (*resets)--;
+
+    if (*resets) {
+        return;
+    }
+
+    nvme_enqueue_req_completion(nvme_cq(req), req);
+}
+
 struct nvme_compare_ctx {
     QEMUIOVector iov;
     uint8_t *bounce;
@@ -1735,7 +1782,8 @@ static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c,
     return NVME_SUCCESS;
 }
 
-typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState);
+typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState,
+                                 NvmeRequest *);
 
 enum NvmeZoneProcessingMask {
     NVME_PROC_CURRENT_ZONE    = 0,
@@ -1746,7 +1794,7 @@ enum NvmeZoneProcessingMask {
 };
 
 static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
-                               NvmeZoneState state)
+                               NvmeZoneState state, NvmeRequest *req)
 {
     uint16_t status;
 
@@ -1779,7 +1827,7 @@ static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
 }
 
 static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone,
-                                NvmeZoneState state)
+                                NvmeZoneState state, NvmeRequest *req)
 {
     switch (state) {
     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
@@ -1795,7 +1843,7 @@ static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone,
 }
 
 static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone,
-                                 NvmeZoneState state)
+                                 NvmeZoneState state, NvmeRequest *req)
 {
     switch (state) {
     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
@@ -1818,30 +1866,42 @@ static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone,
 }
 
 static uint16_t nvme_reset_zone(NvmeNamespace *ns, NvmeZone *zone,
-                                NvmeZoneState state)
+                                NvmeZoneState state, NvmeRequest *req)
 {
+    uintptr_t *resets = (uintptr_t *)&req->opaque;
+    struct nvme_zone_reset_ctx *ctx;
+
     switch (state) {
-    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
-    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
-        nvme_aor_dec_open(ns);
-        /* fall through */
-    case NVME_ZONE_STATE_CLOSED:
-        nvme_aor_dec_active(ns);
-        /* fall through */
-    case NVME_ZONE_STATE_FULL:
-        zone->w_ptr = zone->d.zslba;
-        zone->d.wp = zone->w_ptr;
-        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
-        /* fall through */
     case NVME_ZONE_STATE_EMPTY:
         return NVME_SUCCESS;
+    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
+    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
+    case NVME_ZONE_STATE_CLOSED:
+    case NVME_ZONE_STATE_FULL:
+        break;
     default:
         return NVME_ZONE_INVAL_TRANSITION;
     }
+
+    /*
+     * The zone reset aio callback needs to know the zone that is being reset
+     * in order to transition the zone on completion.
+     */
+    ctx = g_new(struct nvme_zone_reset_ctx, 1);
+    ctx->req = req;
+    ctx->zone = zone;
+
+    (*resets)++;
+
+    blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_l2b(ns, zone->d.zslba),
+                          nvme_l2b(ns, ns->zone_size), BDRV_REQ_MAY_UNMAP,
+                          nvme_aio_zone_reset_cb, ctx);
+
+    return NVME_NO_COMPLETE;
 }
 
 static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone,
-                                  NvmeZoneState state)
+                                  NvmeZoneState state, NvmeRequest *req)
 {
     switch (state) {
     case NVME_ZONE_STATE_READ_ONLY:
@@ -1875,7 +1935,7 @@ static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone)
 
 static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone,
                                     enum NvmeZoneProcessingMask proc_mask,
-                                    op_handler_t op_hndlr)
+                                    op_handler_t op_hndlr, NvmeRequest *req)
 {
     uint16_t status = NVME_SUCCESS;
     NvmeZoneState zs = nvme_get_zone_state(zone);
@@ -1900,7 +1960,7 @@ static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone,
     }
 
     if (proc_zone) {
-        status = op_hndlr(ns, zone, zs);
+        status = op_hndlr(ns, zone, zs, req);
     }
 
     return status;
@@ -1908,42 +1968,46 @@ static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone,
 
 static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone,
                                 enum NvmeZoneProcessingMask proc_mask,
-                                op_handler_t op_hndlr)
+                                op_handler_t op_hndlr, NvmeRequest *req)
 {
     NvmeZone *next;
     uint16_t status = NVME_SUCCESS;
     int i;
 
     if (!proc_mask) {
-        status = op_hndlr(ns, zone, nvme_get_zone_state(zone));
+        status = op_hndlr(ns, zone, nvme_get_zone_state(zone), req);
     } else {
         if (proc_mask & NVME_PROC_CLOSED_ZONES) {
             QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) {
-                status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr);
-                if (status != NVME_SUCCESS) {
+                status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
+                                             req);
+                if (status && status != NVME_NO_COMPLETE) {
                     goto out;
                 }
             }
         }
         if (proc_mask & NVME_PROC_OPENED_ZONES) {
             QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) {
-                status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr);
-                if (status != NVME_SUCCESS) {
+                status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
+                                             req);
+                if (status && status != NVME_NO_COMPLETE) {
                     goto out;
                 }
             }
 
             QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) {
-                status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr);
-                if (status != NVME_SUCCESS) {
+                status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
+                                             req);
+                if (status && status != NVME_NO_COMPLETE) {
                     goto out;
                 }
             }
         }
         if (proc_mask & NVME_PROC_FULL_ZONES) {
             QTAILQ_FOREACH_SAFE(zone, &ns->full_zones, entry, next) {
-                status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr);
-                if (status != NVME_SUCCESS) {
+                status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
+                                             req);
+                if (status && status != NVME_NO_COMPLETE) {
                     goto out;
                 }
             }
@@ -1951,8 +2015,9 @@ static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone,
 
         if (proc_mask & NVME_PROC_READ_ONLY_ZONES) {
             for (i = 0; i < ns->num_zones; i++, zone++) {
-                status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr);
-                if (status != NVME_SUCCESS) {
+                status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
+                                             req);
+                if (status && status != NVME_NO_COMPLETE) {
                     goto out;
                 }
             }
@@ -1968,6 +2033,7 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
     NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
     NvmeNamespace *ns = req->ns;
     NvmeZone *zone;
+    uintptr_t *resets;
     uint8_t *zd_ext;
     uint32_t dw13 = le32_to_cpu(cmd->cdw13);
     uint64_t slba = 0;
@@ -2002,7 +2068,7 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
             proc_mask = NVME_PROC_CLOSED_ZONES;
         }
         trace_pci_nvme_open_zone(slba, zone_idx, all);
-        status = nvme_do_zone_op(ns, zone, proc_mask, nvme_open_zone);
+        status = nvme_do_zone_op(ns, zone, proc_mask, nvme_open_zone, req);
         break;
 
     case NVME_ZONE_ACTION_CLOSE:
@@ -2010,7 +2076,7 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
             proc_mask = NVME_PROC_OPENED_ZONES;
         }
         trace_pci_nvme_close_zone(slba, zone_idx, all);
-        status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone);
+        status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone, req);
         break;
 
     case NVME_ZONE_ACTION_FINISH:
@@ -2018,24 +2084,32 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
             proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES;
         }
         trace_pci_nvme_finish_zone(slba, zone_idx, all);
-        status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone);
+        status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone, req);
         break;
 
     case NVME_ZONE_ACTION_RESET:
+        resets = (uintptr_t *)&req->opaque;
+
         if (all) {
             proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES |
                 NVME_PROC_FULL_ZONES;
         }
         trace_pci_nvme_reset_zone(slba, zone_idx, all);
-        status = nvme_do_zone_op(ns, zone, proc_mask, nvme_reset_zone);
-        break;
+
+        *resets = 1;
+
+        status = nvme_do_zone_op(ns, zone, proc_mask, nvme_reset_zone, req);
+
+        (*resets)--;
+
+        return *resets ? NVME_NO_COMPLETE : req->status;
 
     case NVME_ZONE_ACTION_OFFLINE:
         if (all) {
             proc_mask = NVME_PROC_READ_ONLY_ZONES;
         }
         trace_pci_nvme_offline_zone(slba, zone_idx, all);
-        status = nvme_do_zone_op(ns, zone, proc_mask, nvme_offline_zone);
+        status = nvme_do_zone_op(ns, zone, proc_mask, nvme_offline_zone, req);
         break;
 
     case NVME_ZONE_ACTION_SET_ZD_EXT:
diff --git a/hw/block/trace-events b/hw/block/trace-events
index a455979e5911..6d1686e6dc9d 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -49,6 +49,7 @@ pci_nvme_dsm_deallocate(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb
 pci_nvme_compare(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba 0x%"PRIx64" nlb %"PRIu32""
 pci_nvme_compare_cb(uint16_t cid) "cid %"PRIu16""
 pci_nvme_aio_discard_cb(uint16_t cid) "cid %"PRIu16""
+pci_nvme_aio_zone_reset_cb(uint16_t cid, uint64_t zslba) "cid %"PRIu16" zslba 0x%"PRIx64""
 pci_nvme_create_sq(uint64_t addr, uint16_t sqid, uint16_t cqid, uint16_t qsize, uint16_t qflags) "create submission queue, addr=0x%"PRIx64", sqid=%"PRIu16", cqid=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16""
 pci_nvme_create_cq(uint64_t addr, uint16_t cqid, uint16_t vector, uint16_t size, uint16_t qflags, int ien) "create completion queue, addr=0x%"PRIx64", cqid=%"PRIu16", vector=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16", ien=%d"
 pci_nvme_del_sq(uint16_t qid) "deleting submission queue sqid=%"PRIu16""
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 26/56] hw/block/nvme: add missing string representations for commands
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (24 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 25/56] hw/block/nvme: zero out zones on reset Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 27/56] hw/block/nvme: remove unnecessary check for append Klaus Jensen
                   ` (30 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Dmitry Fomichev, Klaus Jensen,
	Max Reitz, Klaus Jensen, Stefan Hajnoczi, Keith Busch
From: Klaus Jensen <k.jensen@samsung.com>
Add missing string representations for a couple of new commands.
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
Tested-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Reviewed-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
---
 hw/block/nvme.h | 4 ++++
 1 file changed, 4 insertions(+)
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index b7fbcca39d9f..65540b650e1d 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -64,8 +64,12 @@ static inline const char *nvme_io_opc_str(uint8_t opc)
     case NVME_CMD_FLUSH:            return "NVME_NVM_CMD_FLUSH";
     case NVME_CMD_WRITE:            return "NVME_NVM_CMD_WRITE";
     case NVME_CMD_READ:             return "NVME_NVM_CMD_READ";
+    case NVME_CMD_COMPARE:          return "NVME_NVM_CMD_COMPARE";
     case NVME_CMD_WRITE_ZEROES:     return "NVME_NVM_CMD_WRITE_ZEROES";
     case NVME_CMD_DSM:              return "NVME_NVM_CMD_DSM";
+    case NVME_CMD_ZONE_MGMT_SEND:   return "NVME_ZONED_CMD_MGMT_SEND";
+    case NVME_CMD_ZONE_MGMT_RECV:   return "NVME_ZONED_CMD_MGMT_RECV";
+    case NVME_CMD_ZONE_APPEND:      return "NVME_ZONED_CMD_ZONE_APPEND";
     default:                        return "NVME_NVM_CMD_UNKNOWN";
     }
 }
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 27/56] hw/block/nvme: remove unnecessary check for append
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (25 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 26/56] hw/block/nvme: add missing string representations for commands Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 28/56] hw/block/nvme: Correct error status for unaligned ZA Klaus Jensen
                   ` (29 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Dmitry Fomichev, Klaus Jensen,
	Max Reitz, Klaus Jensen, Stefan Hajnoczi, Keith Busch
From: Klaus Jensen <k.jensen@samsung.com>
nvme_io_cmd already checks if the namespace supports the Zone Append
command, so the removed check is dead code.
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
Tested-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Reviewed-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
---
 hw/block/nvme.c | 4 ----
 1 file changed, 4 deletions(-)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 7222eff755ee..c73afdf8054f 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1707,10 +1707,6 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
         }
 
         res->slba = nvme_advance_zone_wp(ns, zone, nlb);
-    } else if (append) {
-        trace_pci_nvme_err_invalid_opc(rw->opcode);
-        status = NVME_INVALID_OPCODE;
-        goto invalid;
     }
 
     data_offset = nvme_l2b(ns, slba);
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 28/56] hw/block/nvme: Correct error status for unaligned ZA
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (26 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 27/56] hw/block/nvme: remove unnecessary check for append Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 29/56] hw/block/nvme: remove unused argument in nvme_ns_init_zoned Klaus Jensen
                   ` (28 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Dmitry Fomichev, Klaus Jensen,
	Max Reitz, Klaus Jensen, Stefan Hajnoczi, Keith Busch
From: Dmitry Fomichev <dmitry.fomichev@wdc.com>
TP 4053 says (in section 2.3.1.1) -
... if a Zone Append command specifies a ZSLBA that is not the lowest
logical block address in that zone, then the controller shall abort
that command with a status code of Invalid Field In Command.
In the code, Zone Invalid Write is returned instead, fix this.
Signed-off-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index c73afdf8054f..35f39ecd9559 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1150,7 +1150,7 @@ static uint16_t nvme_check_zone_write(NvmeCtrl *n, NvmeNamespace *ns,
         if (append) {
             if (unlikely(slba != zone->d.zslba)) {
                 trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
-                status = NVME_ZONE_INVALID_WRITE;
+                status = NVME_INVALID_FIELD;
             }
             if (nvme_l2b(ns, nlb) > (n->page_size << n->zasl)) {
                 trace_pci_nvme_err_append_too_large(slba, nlb, n->zasl);
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 29/56] hw/block/nvme: remove unused argument in nvme_ns_init_zoned
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (27 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 28/56] hw/block/nvme: Correct error status for unaligned ZA Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 30/56] hw/block/nvme: open code for volatile write cache Klaus Jensen
                   ` (27 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Klaus Jensen, Max Reitz,
	Klaus Jensen, Minwoo Im, Stefan Hajnoczi, Keith Busch
From: Minwoo Im <minwoo.im.dev@gmail.com>
nvme_ns_init_zoned() has no use for given NvmeCtrl object.
Signed-off-by: Minwoo Im <minwoo.im.dev@gmail.com>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme-ns.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 9be170abb78d..d35c2925ecb8 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -205,7 +205,7 @@ static void nvme_ns_zoned_init_state(NvmeNamespace *ns)
     }
 }
 
-static void nvme_ns_init_zoned(NvmeCtrl *n, NvmeNamespace *ns, int lba_index)
+static void nvme_ns_init_zoned(NvmeNamespace *ns, int lba_index)
 {
     NvmeIdNsZoned *id_ns_z;
 
@@ -322,7 +322,7 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
         if (nvme_ns_zoned_check_calc_geometry(ns, errp) != 0) {
             return -1;
         }
-        nvme_ns_init_zoned(n, ns, 0);
+        nvme_ns_init_zoned(ns, 0);
     }
 
     if (nvme_register_namespace(n, ns, errp)) {
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 30/56] hw/block/nvme: open code for volatile write cache
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (28 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 29/56] hw/block/nvme: remove unused argument in nvme_ns_init_zoned Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 31/56] hw/block/nvme: remove unused argument in nvme_ns_init_blk Klaus Jensen
                   ` (26 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Klaus Jensen, Max Reitz,
	Klaus Jensen, Minwoo Im, Stefan Hajnoczi, Keith Busch
From: Minwoo Im <minwoo.im.dev@gmail.com>
Volatile Write Cache(VWC) feature is set in nvme_ns_setup() in the
initial time.  This feature is related to block device backed,  but this
feature is controlled in controller level via Set/Get Features command.
This patch removed dependency between nvme and nvme-ns to manage the VWC
flag value.  Also, it open coded the Get Features for VWC to check all
namespaces attached to the controller, and if false detected, return
directly false.
Signed-off-by: Minwoo Im <minwoo.im.dev@gmail.com>
[k.jensen: report write cache preset if present on ANY namespace]
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme.h    |  1 -
 hw/block/nvme-ns.c |  4 ----
 hw/block/nvme.c    | 15 ++++++++++++---
 3 files changed, 12 insertions(+), 8 deletions(-)
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index 65540b650e1d..347c149e7905 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -121,7 +121,6 @@ typedef struct NvmeFeatureVal {
         uint16_t temp_thresh_low;
     };
     uint32_t    async_config;
-    uint32_t    vwc;
 } NvmeFeatureVal;
 
 typedef struct NvmeCtrl {
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index d35c2925ecb8..7a5a77983798 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -90,10 +90,6 @@ static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
         return -1;
     }
 
-    if (blk_enable_write_cache(ns->blkconf.blk)) {
-        n->features.vwc = 0x1;
-    }
-
     return 0;
 }
 
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 35f39ecd9559..0b002cb2beab 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -3097,6 +3097,7 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
     NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10);
     uint16_t iv;
     NvmeNamespace *ns;
+    int i;
 
     static const uint32_t nvme_feature_default[NVME_FID_MAX] = {
         [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT,
@@ -3172,7 +3173,17 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
         result = ns->features.err_rec;
         goto out;
     case NVME_VOLATILE_WRITE_CACHE:
-        result = n->features.vwc;
+        for (i = 1; i <= n->num_namespaces; i++) {
+            ns = nvme_ns(n, i);
+            if (!ns) {
+                continue;
+            }
+
+            result = blk_enable_write_cache(ns->blkconf.blk);
+            if (result) {
+                break;
+            }
+        }
         trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
         goto out;
     case NVME_ASYNCHRONOUS_EVENT_CONF:
@@ -3335,8 +3346,6 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
         ns->features.err_rec = dw11;
         break;
     case NVME_VOLATILE_WRITE_CACHE:
-        n->features.vwc = dw11 & 0x1;
-
         for (i = 1; i <= n->num_namespaces; i++) {
             ns = nvme_ns(n, i);
             if (!ns) {
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 31/56] hw/block/nvme: remove unused argument in nvme_ns_init_blk
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (29 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 30/56] hw/block/nvme: open code for volatile write cache Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 32/56] hw/block/nvme: split setup and register for namespace Klaus Jensen
                   ` (25 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Klaus Jensen, Max Reitz,
	Klaus Jensen, Minwoo Im, Stefan Hajnoczi, Keith Busch
From: Minwoo Im <minwoo.im.dev@gmail.com>
Removed no longer used aregument NvmeCtrl object in nvme_ns_init_blk().
Signed-off-by: Minwoo Im <minwoo.im.dev@gmail.com>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme-ns.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 7a5a77983798..17e876e6bc44 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -66,7 +66,7 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
     return 0;
 }
 
-static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
+static int nvme_ns_init_blk(NvmeNamespace *ns, Error **errp)
 {
     bool read_only;
 
@@ -307,7 +307,7 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
         return -1;
     }
 
-    if (nvme_ns_init_blk(n, ns, errp)) {
+    if (nvme_ns_init_blk(ns, errp)) {
         return -1;
     }
 
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 32/56] hw/block/nvme: split setup and register for namespace
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (30 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 31/56] hw/block/nvme: remove unused argument in nvme_ns_init_blk Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-11  9:53   ` Alexander Graf
  2021-02-09  7:30 ` [PULL 33/56] hw/block/nvme: remove unused argument in nvme_ns_setup Klaus Jensen
                   ` (24 subsequent siblings)
  56 siblings, 1 reply; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Klaus Jensen, Max Reitz,
	Klaus Jensen, Minwoo Im, Stefan Hajnoczi, Keith Busch
From: Minwoo Im <minwoo.im.dev@gmail.com>
In NVMe, namespace is being attached to process I/O.  We register NVMe
namespace to a controller via nvme_register_namespace() during
nvme_ns_setup().  This is main reason of receiving NvmeCtrl object
instance to this function to map the namespace to a controller.
To make namespace instance more independent, it should be split into two
parts: setup and register.  This patch split them into two differnt
parts, and finally nvme_ns_setup() does not have nothing to do with
NvmeCtrl instance at all.
This patch is a former patch to introduce NVMe subsystem scheme to the
existing design especially for multi-path.  In that case, it should be
split into two to make namespace independent from a controller.
Signed-off-by: Minwoo Im <minwoo.im.dev@gmail.com>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme-ns.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 17e876e6bc44..ce79ad4a5319 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -321,10 +321,6 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
         nvme_ns_init_zoned(ns, 0);
     }
 
-    if (nvme_register_namespace(n, ns, errp)) {
-        return -1;
-    }
-
     return 0;
 }
 
@@ -362,6 +358,13 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp)
                                 "could not setup namespace: ");
         return;
     }
+
+    if (nvme_register_namespace(n, ns, errp)) {
+        error_propagate_prepend(errp, local_err,
+                                "could not register namespace: ");
+        return;
+    }
+
 }
 
 static Property nvme_ns_props[] = {
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * Re: [PULL 32/56] hw/block/nvme: split setup and register for namespace
  2021-02-09  7:30 ` [PULL 32/56] hw/block/nvme: split setup and register for namespace Klaus Jensen
@ 2021-02-11  9:53   ` Alexander Graf
  2021-02-11 10:41     ` Klaus Jensen
  2021-02-11 11:40     ` Philippe Mathieu-Daudé
  0 siblings, 2 replies; 62+ messages in thread
From: Alexander Graf @ 2021-02-11  9:53 UTC (permalink / raw)
  To: Klaus Jensen, qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Klaus Jensen, Max Reitz,
	Minwoo Im, Stefan Hajnoczi, Keith Busch
[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset=UTF-8; format=flowed, Size: 2216 bytes --]
Hi Klaus,
On 09.02.21 08:30, Klaus Jensen wrote:
> From: Minwoo Im <minwoo.im.dev@gmail.com>
>
> In NVMe, namespace is being attached to process I/O.  We register NVMe
> namespace to a controller via nvme_register_namespace() during
> nvme_ns_setup().  This is main reason of receiving NvmeCtrl object
> instance to this function to map the namespace to a controller.
>
> To make namespace instance more independent, it should be split into two
> parts: setup and register.  This patch split them into two differnt
> parts, and finally nvme_ns_setup() does not have nothing to do with
> NvmeCtrl instance at all.
>
> This patch is a former patch to introduce NVMe subsystem scheme to the
> existing design especially for multi-path.  In that case, it should be
> split into two to make namespace independent from a controller.
>
> Signed-off-by: Minwoo Im <minwoo.im.dev@gmail.com>
> Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
In latest master, I can no longer access NVMe devices from edk2. Git 
bisect pointed me to this commit.
To reproduce:
  $ ./build/qemu-system-x86_64 -enable-kvm -pflash 
build/pc-bios/edk2-x86_64-code.fd -drive 
file=image.raw,if=none,id=d,snapshot=on -device nvme,serial=1234,drive=d 
-nographic -net none
You will see that before this commit, edk2 is able to enumerate the 
block device, while after this commit it does not find it.
good:
Mapping table
      FS0: Alias(s):HD1b:;BLK2:
         
PciRoot(0x0)/Pci(0x3,0x0)/NVMe(0x1,00-00-00-00-00-00-00-00)/HD(1,GPT,7A866FF6-0A12-4911-A4ED-D0565BEB41A2,0x80,0x64000)
     BLK0: Alias(s):
          PciRoot(0x0)/Pci(0x1,0x1)/Ata(0x0)
     BLK1: Alias(s):
         
PciRoot(0x0)/Pci(0x3,0x0)/NVMe(0x1,00-00-00-00-00-00-00-00)
     BLK3: Alias(s):
         
PciRoot(0x0)/Pci(0x3,0x0)/NVMe(0x1,00-00-00-00-00-00-00-00)/HD(2,GPT,F070566B-C58D-4F13-9B92-64F1794385E7,0x64080,0x40000)
     BLK4: Alias(s):
         
PciRoot(0x0)/Pci(0x3,0x0)/NVMe(0x1,00-00-00-00-00-00-00-00)/HD(3,GPT,EDE86BE3-C64F-4ACB-B4AA-5E6C0135D335,0xA4080,0x315BF1B)
bad:
Mapping table
     BLK0: Alias(s):
          PciRoot(0x0)/Pci(0x1,0x1)/Ata(0x0)
Thanks,
Alex
^ permalink raw reply	[flat|nested] 62+ messages in thread 
- * Re: [PULL 32/56] hw/block/nvme: split setup and register for namespace
  2021-02-11  9:53   ` Alexander Graf
@ 2021-02-11 10:41     ` Klaus Jensen
  2021-02-11 11:40     ` Philippe Mathieu-Daudé
  1 sibling, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-11 10:41 UTC (permalink / raw)
  To: Alexander Graf
  Cc: Fam Zheng, Peter Maydell, qemu-block, Klaus Jensen, qemu-devel,
	Max Reitz, Minwoo Im, Stefan Hajnoczi, Keith Busch, Kevin Wolf
[-- Attachment #1: Type: text/plain, Size: 1783 bytes --]
On Feb 11 10:53, Alexander Graf wrote:
> Hi Klaus,
> 
> On 09.02.21 08:30, Klaus Jensen wrote:
> > From: Minwoo Im <minwoo.im.dev@gmail.com>
> > 
> > In NVMe, namespace is being attached to process I/O.  We register NVMe
> > namespace to a controller via nvme_register_namespace() during
> > nvme_ns_setup().  This is main reason of receiving NvmeCtrl object
> > instance to this function to map the namespace to a controller.
> > 
> > To make namespace instance more independent, it should be split into two
> > parts: setup and register.  This patch split them into two differnt
> > parts, and finally nvme_ns_setup() does not have nothing to do with
> > NvmeCtrl instance at all.
> > 
> > This patch is a former patch to introduce NVMe subsystem scheme to the
> > existing design especially for multi-path.  In that case, it should be
> > split into two to make namespace independent from a controller.
> > 
> > Signed-off-by: Minwoo Im <minwoo.im.dev@gmail.com>
> > Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
> 
> 
> In latest master, I can no longer access NVMe devices from edk2. Git bisect
> pointed me to this commit.
> 
Hi Alexander,
Thanks for reporting this. This patch should fix it, I'll queue it up.
diff --git i/hw/block/nvme.c w/hw/block/nvme.c
index 5ce21b7100b3..d36e14fe13e2 100644
--- i/hw/block/nvme.c
+++ w/hw/block/nvme.c
@@ -4507,6 +4507,12 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
         if (nvme_ns_setup(ns, errp)) {
             return;
         }
+
+        if (nvme_register_namespace(n, ns, errp)) {
+            error_propagate_prepend(errp, local_err,
+                                    "could not register namespace: ");
+            return;
+        }
     }
 }
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * Re: [PULL 32/56] hw/block/nvme: split setup and register for namespace
  2021-02-11  9:53   ` Alexander Graf
  2021-02-11 10:41     ` Klaus Jensen
@ 2021-02-11 11:40     ` Philippe Mathieu-Daudé
  2021-02-11 11:46       ` Klaus Jensen
  1 sibling, 1 reply; 62+ messages in thread
From: Philippe Mathieu-Daudé @ 2021-02-11 11:40 UTC (permalink / raw)
  To: Alexander Graf, Klaus Jensen, qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, Willian Rampazzo, qemu-block, Klaus Jensen,
	Max Reitz, Minwoo Im, Stefan Hajnoczi, Keith Busch
On 2/11/21 10:53 AM, Alexander Graf wrote:
> Hi Klaus,
> 
> On 09.02.21 08:30, Klaus Jensen wrote:
>> From: Minwoo Im <minwoo.im.dev@gmail.com>
>>
>> In NVMe, namespace is being attached to process I/O.  We register NVMe
>> namespace to a controller via nvme_register_namespace() during
>> nvme_ns_setup().  This is main reason of receiving NvmeCtrl object
>> instance to this function to map the namespace to a controller.
>>
>> To make namespace instance more independent, it should be split into two
>> parts: setup and register.  This patch split them into two differnt
>> parts, and finally nvme_ns_setup() does not have nothing to do with
>> NvmeCtrl instance at all.
>>
>> This patch is a former patch to introduce NVMe subsystem scheme to the
>> existing design especially for multi-path.  In that case, it should be
>> split into two to make namespace independent from a controller.
>>
>> Signed-off-by: Minwoo Im <minwoo.im.dev@gmail.com>
>> Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
> 
> 
> In latest master, I can no longer access NVMe devices from edk2. Git
> bisect pointed me to this commit.
> 
> To reproduce:
> 
>   $ ./build/qemu-system-x86_64 -enable-kvm -pflash
> build/pc-bios/edk2-x86_64-code.fd -drive
> file=image.raw,if=none,id=d,snapshot=on -device nvme,serial=1234,drive=d
> -nographic -net none
> 
> You will see that before this commit, edk2 is able to enumerate the
> block device, while after this commit it does not find it.
> 
> 
> good:
> 
> Mapping table
>       FS0: Alias(s):HD1b:;BLK2:
>         �
> PciRoot(0x0)/Pci(0x3,0x0)/NVMe(0x1,00-00-00-00-00-00-00-00)/HD(1,GPT,7A866FF6-0A12-4911-A4ED-D0565BEB41A2,0x80,0x64000)
> 
>      BLK0: Alias(s):
>           PciRoot(0x0)/Pci(0x1,0x1)/Ata(0x0)
>      BLK1: Alias(s):
>         � PciRoot(0x0)/Pci(0x3,0x0)/NVMe(0x1,00-00-00-00-00-00-00-00)
>      BLK3: Alias(s):
>         �
> PciRoot(0x0)/Pci(0x3,0x0)/NVMe(0x1,00-00-00-00-00-00-00-00)/HD(2,GPT,F070566B-C58D-4F13-9B92-64F1794385E7,0x64080,0x40000)
> 
>      BLK4: Alias(s):
>         �
> PciRoot(0x0)/Pci(0x3,0x0)/NVMe(0x1,00-00-00-00-00-00-00-00)/HD(3,GPT,EDE86BE3-C64F-4ACB-B4AA-5E6C0135D335,0xA4080,0x315BF1B)
Good integration test for the emulated NVMe subsystem!
tests/acceptance/boot_linux_console.py should provide trivial template.
> 
> 
> 
> bad:
> 
> Mapping table
>      BLK0: Alias(s):
>           PciRoot(0x0)/Pci(0x1,0x1)/Ata(0x0)
> 
> 
> 
> Thanks,
> 
> Alex
> 
> 
> 
^ permalink raw reply	[flat|nested] 62+ messages in thread 
- * Re: [PULL 32/56] hw/block/nvme: split setup and register for namespace
  2021-02-11 11:40     ` Philippe Mathieu-Daudé
@ 2021-02-11 11:46       ` Klaus Jensen
  0 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-11 11:46 UTC (permalink / raw)
  To: Philippe Mathieu-Daudé
  Cc: Fam Zheng, Peter Maydell, Willian Rampazzo, qemu-block,
	Klaus Jensen, qemu-devel, Max Reitz, Alexander Graf, Minwoo Im,
	Stefan Hajnoczi, Keith Busch, Kevin Wolf
[-- Attachment #1: Type: text/plain, Size: 2755 bytes --]
On Feb 11 12:40, Philippe Mathieu-Daudé wrote:
> On 2/11/21 10:53 AM, Alexander Graf wrote:
> > Hi Klaus,
> > 
> > On 09.02.21 08:30, Klaus Jensen wrote:
> >> From: Minwoo Im <minwoo.im.dev@gmail.com>
> >>
> >> In NVMe, namespace is being attached to process I/O.  We register NVMe
> >> namespace to a controller via nvme_register_namespace() during
> >> nvme_ns_setup().  This is main reason of receiving NvmeCtrl object
> >> instance to this function to map the namespace to a controller.
> >>
> >> To make namespace instance more independent, it should be split into two
> >> parts: setup and register.  This patch split them into two differnt
> >> parts, and finally nvme_ns_setup() does not have nothing to do with
> >> NvmeCtrl instance at all.
> >>
> >> This patch is a former patch to introduce NVMe subsystem scheme to the
> >> existing design especially for multi-path.  In that case, it should be
> >> split into two to make namespace independent from a controller.
> >>
> >> Signed-off-by: Minwoo Im <minwoo.im.dev@gmail.com>
> >> Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
> > 
> > 
> > In latest master, I can no longer access NVMe devices from edk2. Git
> > bisect pointed me to this commit.
> > 
> > To reproduce:
> > 
> >   $ ./build/qemu-system-x86_64 -enable-kvm -pflash
> > build/pc-bios/edk2-x86_64-code.fd -drive
> > file=image.raw,if=none,id=d,snapshot=on -device nvme,serial=1234,drive=d
> > -nographic -net none
> > 
> > You will see that before this commit, edk2 is able to enumerate the
> > block device, while after this commit it does not find it.
> > 
> > 
> > good:
> > 
> > Mapping table
> >       FS0: Alias(s):HD1b:;BLK2:
> >         �
> > PciRoot(0x0)/Pci(0x3,0x0)/NVMe(0x1,00-00-00-00-00-00-00-00)/HD(1,GPT,7A866FF6-0A12-4911-A4ED-D0565BEB41A2,0x80,0x64000)
> > 
> >      BLK0: Alias(s):
> >           PciRoot(0x0)/Pci(0x1,0x1)/Ata(0x0)
> >      BLK1: Alias(s):
> >         � PciRoot(0x0)/Pci(0x3,0x0)/NVMe(0x1,00-00-00-00-00-00-00-00)
> >      BLK3: Alias(s):
> >         �
> > PciRoot(0x0)/Pci(0x3,0x0)/NVMe(0x1,00-00-00-00-00-00-00-00)/HD(2,GPT,F070566B-C58D-4F13-9B92-64F1794385E7,0x64080,0x40000)
> > 
> >      BLK4: Alias(s):
> >         �
> > PciRoot(0x0)/Pci(0x3,0x0)/NVMe(0x1,00-00-00-00-00-00-00-00)/HD(3,GPT,EDE86BE3-C64F-4ACB-B4AA-5E6C0135D335,0xA4080,0x315BF1B)
> 
> Good integration test for the emulated NVMe subsystem!
> 
> tests/acceptance/boot_linux_console.py should provide trivial template.
> 
Yes. This is something we really need. I will priorities this together
with documentation :)
Thanks for pointing me in the direction of this Philippe.
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]
^ permalink raw reply	[flat|nested] 62+ messages in thread 
 
 
 
- * [PULL 33/56] hw/block/nvme: remove unused argument in nvme_ns_setup
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (31 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 32/56] hw/block/nvme: split setup and register for namespace Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 34/56] hw/block/nvme: fix zone write finalize Klaus Jensen
                   ` (23 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Klaus Jensen, Max Reitz,
	Klaus Jensen, Minwoo Im, Stefan Hajnoczi, Keith Busch
From: Minwoo Im <minwoo.im.dev@gmail.com>
nvme_ns_setup() finally does not have nothing to do with NvmeCtrl
instance.
Signed-off-by: Minwoo Im <minwoo.im.dev@gmail.com>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme-ns.h | 2 +-
 hw/block/nvme-ns.c | 4 ++--
 hw/block/nvme.c    | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index a0baa5f6d44c..293ac990e3f6 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -174,7 +174,7 @@ static inline void nvme_aor_dec_active(NvmeNamespace *ns)
     assert(ns->nr_active_zones >= 0);
 }
 
-int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp);
+int nvme_ns_setup(NvmeNamespace *ns, Error **errp);
 void nvme_ns_drain(NvmeNamespace *ns);
 void nvme_ns_shutdown(NvmeNamespace *ns);
 void nvme_ns_cleanup(NvmeNamespace *ns);
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index ce79ad4a5319..3f52acb89c95 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -301,7 +301,7 @@ static int nvme_ns_check_constraints(NvmeNamespace *ns, Error **errp)
     return 0;
 }
 
-int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
+int nvme_ns_setup(NvmeNamespace *ns, Error **errp)
 {
     if (nvme_ns_check_constraints(ns, errp)) {
         return -1;
@@ -353,7 +353,7 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp)
     NvmeCtrl *n = NVME(s->parent);
     Error *local_err = NULL;
 
-    if (nvme_ns_setup(n, ns, &local_err)) {
+    if (nvme_ns_setup(ns, &local_err)) {
         error_propagate_prepend(errp, local_err,
                                 "could not setup namespace: ");
         return;
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 0b002cb2beab..30bd70fd5b07 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -4377,7 +4377,7 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
         ns = &n->namespace;
         ns->params.nsid = 1;
 
-        if (nvme_ns_setup(n, ns, errp)) {
+        if (nvme_ns_setup(ns, errp)) {
             return;
         }
     }
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 34/56] hw/block/nvme: fix zone write finalize
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (32 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 33/56] hw/block/nvme: remove unused argument in nvme_ns_setup Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 35/56] nvme: introduce bit 5 for critical warning Klaus Jensen
                   ` (22 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Dmitry Fomichev, Klaus Jensen,
	Max Reitz, Klaus Jensen, Stefan Hajnoczi, Keith Busch
From: Klaus Jensen <k.jensen@samsung.com>
The zone write pointer is unconditionally advanced, even for write
faults. Make sure that the zone is always transitioned to Full if the
write pointer reaches zone capacity.
Cc: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 30bd70fd5b07..4d73398798f1 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1268,10 +1268,13 @@ static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req,
     nlb = le16_to_cpu(rw->nlb) + 1;
     zone = nvme_get_zone_by_slba(ns, slba);
 
+    zone->d.wp += nlb;
+
     if (failed) {
         res->slba = 0;
-        zone->d.wp += nlb;
-    } else if (zone->w_ptr == nvme_zone_wr_boundary(zone)) {
+    }
+
+    if (zone->d.wp == nvme_zone_wr_boundary(zone)) {
         switch (nvme_get_zone_state(zone)) {
         case NVME_ZONE_STATE_IMPLICITLY_OPEN:
         case NVME_ZONE_STATE_EXPLICITLY_OPEN:
@@ -1288,9 +1291,6 @@ static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req,
         default:
             assert(false);
         }
-        zone->d.wp = zone->w_ptr;
-    } else {
-        zone->d.wp += nlb;
     }
 }
 
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 35/56] nvme: introduce bit 5 for critical warning
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (33 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 34/56] hw/block/nvme: fix zone write finalize Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 36/56] hw/block/nvme: add smart_critical_warning property Klaus Jensen
                   ` (21 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Klaus Jensen, zhenwei pi,
	Max Reitz, Klaus Jensen, Stefan Hajnoczi, Keith Busch
From: zhenwei pi <pizhenwei@bytedance.com>
According to NVM Express v1.4, Section 5.14.1.2 ("SMART / Health
Information"), introduce bit 5 for "Persistent Memory Region has become
read-only or unreliable".
Signed-off-by: zhenwei pi <pizhenwei@bytedance.com>
[k.jensen: minor brush ups in commit message]
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 include/block/nvme.h | 1 +
 1 file changed, 1 insertion(+)
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 45b2678db1f0..41614c5e12af 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -789,6 +789,7 @@ enum NvmeSmartWarn {
     NVME_SMART_RELIABILITY            = 1 << 2,
     NVME_SMART_MEDIA_READ_ONLY        = 1 << 3,
     NVME_SMART_FAILED_VOLATILE_MEDIA  = 1 << 4,
+    NVME_SMART_PMR_UNRELIABLE         = 1 << 5,
 };
 
 typedef struct NvmeEffectsLog {
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 36/56] hw/block/nvme: add smart_critical_warning property
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (34 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 35/56] nvme: introduce bit 5 for critical warning Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 37/56] hw/block/nvme: trigger async event during injecting smart warning Klaus Jensen
                   ` (20 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Klaus Jensen, zhenwei pi,
	Max Reitz, Klaus Jensen, Stefan Hajnoczi, Keith Busch,
	Philippe Mathieu-Daudé
From: zhenwei pi <pizhenwei@bytedance.com>
There is a very low probability that hitting physical NVMe disk
hardware critical warning case, it's hard to write & test a monitor
agent service.
For debugging purposes, add a new 'smart_critical_warning' property
to emulate this situation.
The orignal version of this change is implemented by adding a fixed
property which could be initialized by QEMU command line. Suggested
by Philippe & Klaus, rework like current version.
Test with this patch:
1, change smart_critical_warning property for a running VM:
 #virsh qemu-monitor-command nvme-upstream '{ "execute": "qom-set",
  "arguments": { "path": "/machine/peripheral-anon/device[0]",
  "property": "smart_critical_warning", "value":16 } }'
2, run smartctl in guest
 #smartctl -H -l error /dev/nvme0n1
  === START OF SMART DATA SECTION ===
  SMART overall-health self-assessment test result: FAILED!
  - volatile memory backup device has failed
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Signed-off-by: zhenwei pi <pizhenwei@bytedance.com>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme.h      |  1 +
 include/block/nvme.h |  1 +
 hw/block/nvme.c      | 45 +++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 44 insertions(+), 3 deletions(-)
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index 347c149e7905..b0d5b6409d8e 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -148,6 +148,7 @@ typedef struct NvmeCtrl {
     uint64_t    timestamp_set_qemu_clock_ms;    /* QEMU clock time */
     uint64_t    starttime_ms;
     uint16_t    temperature;
+    uint8_t     smart_critical_warning;
 
     HostMemoryBackend *pmrdev;
 
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 41614c5e12af..88af3b42348c 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -60,6 +60,7 @@ enum NvmeCapMask {
 #define NVME_CAP_CSS(cap)   (((cap) >> CAP_CSS_SHIFT)    & CAP_CSS_MASK)
 #define NVME_CAP_MPSMIN(cap)(((cap) >> CAP_MPSMIN_SHIFT) & CAP_MPSMIN_MASK)
 #define NVME_CAP_MPSMAX(cap)(((cap) >> CAP_MPSMAX_SHIFT) & CAP_MPSMAX_MASK)
+#define NVME_CAP_PMR(cap)   (((cap) >> CAP_PMR_SHIFT)    & CAP_PMR_MASK)
 
 #define NVME_CAP_SET_MQES(cap, val)   (cap |= (uint64_t)(val & CAP_MQES_MASK)  \
                                                            << CAP_MQES_SHIFT)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 4d73398798f1..f0cb7acd7454 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -2490,6 +2490,7 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
     }
 
     trans_len = MIN(sizeof(smart) - off, buf_len);
+    smart.critical_warning = n->smart_critical_warning;
 
     smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_read,
                                                         1000));
@@ -4432,6 +4433,40 @@ static Property nvme_props[] = {
     DEFINE_PROP_END_OF_LIST(),
 };
 
+static void nvme_get_smart_warning(Object *obj, Visitor *v, const char *name,
+                                   void *opaque, Error **errp)
+{
+    NvmeCtrl *n = NVME(obj);
+    uint8_t value = n->smart_critical_warning;
+
+    visit_type_uint8(v, name, &value, errp);
+}
+
+static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name,
+                                   void *opaque, Error **errp)
+{
+    NvmeCtrl *n = NVME(obj);
+    uint8_t value, cap = 0;
+
+    if (!visit_type_uint8(v, name, &value, errp)) {
+        return;
+    }
+
+    cap = NVME_SMART_SPARE | NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY
+          | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA;
+    if (NVME_CAP_PMR(n->bar.cap)) {
+        cap |= NVME_SMART_PMR_UNRELIABLE;
+    }
+
+    if ((value & cap) != value) {
+        error_setg(errp, "unsupported smart critical warning bits: 0x%x",
+                   value & ~cap);
+        return;
+    }
+
+    n->smart_critical_warning = value;
+}
+
 static const VMStateDescription nvme_vmstate = {
     .name = "nvme",
     .unmigratable = 1,
@@ -4455,13 +4490,17 @@ static void nvme_class_init(ObjectClass *oc, void *data)
 
 static void nvme_instance_init(Object *obj)
 {
-    NvmeCtrl *s = NVME(obj);
+    NvmeCtrl *n = NVME(obj);
 
-    if (s->namespace.blkconf.blk) {
-        device_add_bootindex_property(obj, &s->namespace.blkconf.bootindex,
+    if (n->namespace.blkconf.blk) {
+        device_add_bootindex_property(obj, &n->namespace.blkconf.bootindex,
                                       "bootindex", "/namespace@1,0",
                                       DEVICE(obj));
     }
+
+    object_property_add(obj, "smart_critical_warning", "uint8",
+                        nvme_get_smart_warning,
+                        nvme_set_smart_warning, NULL, NULL);
 }
 
 static const TypeInfo nvme_info = {
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 37/56] hw/block/nvme: trigger async event during injecting smart warning
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (35 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 36/56] hw/block/nvme: add smart_critical_warning property Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 38/56] hw/block/nvme: add size to mmio read/write trace events Klaus Jensen
                   ` (19 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Klaus Jensen, zhenwei pi,
	Max Reitz, Klaus Jensen, Stefan Hajnoczi, Keith Busch
From: zhenwei pi <pizhenwei@bytedance.com>
During smart critical warning injection by setting property from QMP
command, also try to trigger asynchronous event.
Suggested by Keith, if a event has already been raised, there is no
need to enqueue the duplicate event any more.
Signed-off-by: zhenwei pi <pizhenwei@bytedance.com>
[k.jensen: fix typo in commit message]
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 include/block/nvme.h |  1 +
 hw/block/nvme.c      | 48 +++++++++++++++++++++++++++++++++++++-------
 2 files changed, 42 insertions(+), 7 deletions(-)
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 88af3b42348c..854fb2abb6f8 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -784,6 +784,7 @@ typedef struct QEMU_PACKED NvmeSmartLog {
     uint8_t     reserved2[320];
 } NvmeSmartLog;
 
+#define NVME_SMART_WARN_MAX     6
 enum NvmeSmartWarn {
     NVME_SMART_SPARE                  = 1 << 0,
     NVME_SMART_TEMPERATURE            = 1 << 1,
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index f0cb7acd7454..09eb1f06e8b1 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -980,6 +980,35 @@ static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type,
     nvme_process_aers(n);
 }
 
+static void nvme_smart_event(NvmeCtrl *n, uint8_t event)
+{
+    uint8_t aer_info;
+
+    /* Ref SPEC <Asynchronous Event Information 0x2013 SMART / Health Status> */
+    if (!(NVME_AEC_SMART(n->features.async_config) & event)) {
+        return;
+    }
+
+    switch (event) {
+    case NVME_SMART_SPARE:
+        aer_info = NVME_AER_INFO_SMART_SPARE_THRESH;
+        break;
+    case NVME_SMART_TEMPERATURE:
+        aer_info = NVME_AER_INFO_SMART_TEMP_THRESH;
+        break;
+    case NVME_SMART_RELIABILITY:
+    case NVME_SMART_MEDIA_READ_ONLY:
+    case NVME_SMART_FAILED_VOLATILE_MEDIA:
+    case NVME_SMART_PMR_UNRELIABLE:
+        aer_info = NVME_AER_INFO_SMART_RELIABILITY;
+        break;
+    default:
+        return;
+    }
+
+    nvme_enqueue_event(n, NVME_AER_TYPE_SMART, aer_info, NVME_LOG_SMART_INFO);
+}
+
 static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
 {
     n->aer_mask &= ~(1 << event_type);
@@ -3317,12 +3346,9 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
             return NVME_INVALID_FIELD | NVME_DNR;
         }
 
-        if (((n->temperature >= n->features.temp_thresh_hi) ||
-             (n->temperature <= n->features.temp_thresh_low)) &&
-            NVME_AEC_SMART(n->features.async_config) & NVME_SMART_TEMPERATURE) {
-            nvme_enqueue_event(n, NVME_AER_TYPE_SMART,
-                               NVME_AER_INFO_SMART_TEMP_THRESH,
-                               NVME_LOG_SMART_INFO);
+        if ((n->temperature >= n->features.temp_thresh_hi) ||
+            (n->temperature <= n->features.temp_thresh_low)) {
+            nvme_smart_event(n, NVME_AER_INFO_SMART_TEMP_THRESH);
         }
 
         break;
@@ -4446,7 +4472,7 @@ static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name,
                                    void *opaque, Error **errp)
 {
     NvmeCtrl *n = NVME(obj);
-    uint8_t value, cap = 0;
+    uint8_t value, old_value, cap = 0, index, event;
 
     if (!visit_type_uint8(v, name, &value, errp)) {
         return;
@@ -4464,7 +4490,15 @@ static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name,
         return;
     }
 
+    old_value = n->smart_critical_warning;
     n->smart_critical_warning = value;
+
+    /* only inject new bits of smart critical warning */
+    for (index = 0; index < NVME_SMART_WARN_MAX; index++) {
+        event = 1 << index;
+        if (value & ~old_value & event)
+            nvme_smart_event(n, event);
+    }
 }
 
 static const VMStateDescription nvme_vmstate = {
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 38/56] hw/block/nvme: add size to mmio read/write trace events
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (36 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 37/56] hw/block/nvme: trigger async event during injecting smart warning Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 39/56] hw/block/nvme: fix 64 bit register hi/lo split writes Klaus Jensen
                   ` (18 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Klaus Jensen, Max Reitz,
	Klaus Jensen, Minwoo Im, Stefan Hajnoczi, Keith Busch
From: Klaus Jensen <k.jensen@samsung.com>
Add the size of the mmio read/write to the trace event.
Reviewed-by: Minwoo Im <minwoo.im.dev@gmail.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme.c       | 4 ++--
 hw/block/trace-events | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 09eb1f06e8b1..2407b6578abc 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -3878,7 +3878,7 @@ static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
     uint8_t *ptr = (uint8_t *)&n->bar;
     uint64_t val = 0;
 
-    trace_pci_nvme_mmio_read(addr);
+    trace_pci_nvme_mmio_read(addr, size);
 
     if (unlikely(addr & (sizeof(uint32_t) - 1))) {
         NVME_GUEST_ERR(pci_nvme_ub_mmiord_misaligned32,
@@ -4042,7 +4042,7 @@ static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data,
 {
     NvmeCtrl *n = (NvmeCtrl *)opaque;
 
-    trace_pci_nvme_mmio_write(addr, data);
+    trace_pci_nvme_mmio_write(addr, data, size);
 
     if (addr < sizeof(n->bar)) {
         nvme_write_bar(n, addr, data, size);
diff --git a/hw/block/trace-events b/hw/block/trace-events
index 6d1686e6dc9d..3772502033af 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -80,8 +80,8 @@ pci_nvme_enqueue_event_noqueue(int queued) "queued %d"
 pci_nvme_enqueue_event_masked(uint8_t typ) "type 0x%"PRIx8""
 pci_nvme_no_outstanding_aers(void) "ignoring event; no outstanding AERs"
 pci_nvme_enqueue_req_completion(uint16_t cid, uint16_t cqid, uint16_t status) "cid %"PRIu16" cqid %"PRIu16" status 0x%"PRIx16""
-pci_nvme_mmio_read(uint64_t addr) "addr 0x%"PRIx64""
-pci_nvme_mmio_write(uint64_t addr, uint64_t data) "addr 0x%"PRIx64" data 0x%"PRIx64""
+pci_nvme_mmio_read(uint64_t addr, unsigned size) "addr 0x%"PRIx64" size %d"
+pci_nvme_mmio_write(uint64_t addr, uint64_t data, unsigned size) "addr 0x%"PRIx64" data 0x%"PRIx64" size %d"
 pci_nvme_mmio_doorbell_cq(uint16_t cqid, uint16_t new_head) "cqid %"PRIu16" new_head %"PRIu16""
 pci_nvme_mmio_doorbell_sq(uint16_t sqid, uint16_t new_tail) "sqid %"PRIu16" new_tail %"PRIu16""
 pci_nvme_mmio_intm_set(uint64_t data, uint64_t new_mask) "wrote MMIO, interrupt mask set, data=0x%"PRIx64", new_mask=0x%"PRIx64""
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 39/56] hw/block/nvme: fix 64 bit register hi/lo split writes
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (37 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 38/56] hw/block/nvme: add size to mmio read/write trace events Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 40/56] hw/block/nvme: indicate CMB support through controller capabilities register Klaus Jensen
                   ` (17 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Klaus Jensen, Max Reitz,
	Klaus Jensen, Stefan Hajnoczi, Keith Busch
From: Klaus Jensen <k.jensen@samsung.com>
64 bit registers like ASQ and ACQ should be writable by both a hi/lo 32
bit write combination as well as a plain 64 bit write. The spec does not
define ordering on the hi/lo split, but the code currently assumes that
the low order bits are written first. Additionally, the code does not
consider that another address might already have been written into the
register, causing the OR'ing to result in a bad address.
Fix this by explicitly overwriting only the low or high order bits for
32 bit writes.
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
---
 hw/block/nvme.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 2407b6578abc..2785127037db 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -3819,19 +3819,21 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
         trace_pci_nvme_mmio_aqattr(data & 0xffffffff);
         break;
     case 0x28:  /* ASQ */
-        n->bar.asq = data;
+        n->bar.asq = size == 8 ? data :
+            (n->bar.asq & ~0xffffffffULL) | (data & 0xffffffff);
         trace_pci_nvme_mmio_asqaddr(data);
         break;
     case 0x2c:  /* ASQ hi */
-        n->bar.asq |= data << 32;
+        n->bar.asq = (n->bar.asq & 0xffffffff) | (data << 32);
         trace_pci_nvme_mmio_asqaddr_hi(data, n->bar.asq);
         break;
     case 0x30:  /* ACQ */
         trace_pci_nvme_mmio_acqaddr(data);
-        n->bar.acq = data;
+        n->bar.acq = size == 8 ? data :
+            (n->bar.acq & ~0xffffffffULL) | (data & 0xffffffff);
         break;
     case 0x34:  /* ACQ hi */
-        n->bar.acq |= data << 32;
+        n->bar.acq = (n->bar.acq & 0xffffffff) | (data << 32);
         trace_pci_nvme_mmio_acqaddr_hi(data, n->bar.acq);
         break;
     case 0x38:  /* CMBLOC */
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 40/56] hw/block/nvme: indicate CMB support through controller capabilities register
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (38 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 39/56] hw/block/nvme: fix 64 bit register hi/lo split writes Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 41/56] hw/block/nvme: move msix table and pba to BAR 0 Klaus Jensen
                   ` (16 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Klaus Jensen, Max Reitz,
	Klaus Jensen, Andrzej Jakowski, Maxim Levitsky, Stefan Hajnoczi,
	Keith Busch, Minwoo Im
From: Andrzej Jakowski <andrzej.jakowski@linux.intel.com>
This patch sets CMBS bit in controller capabilities register when user
configures NVMe driver with CMB support, so capabilites are correctly
reported to guest OS.
Signed-off-by: Andrzej Jakowski <andrzej.jakowski@linux.intel.com>
Reviewed-by: Maxim Levitsky <mlevitsky@gmail.com>
Reviewed-by: Minwoo Im <minwoo.im.dev@gmail.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 include/block/nvme.h | 10 +++++++---
 hw/block/nvme.c      |  1 +
 2 files changed, 8 insertions(+), 3 deletions(-)
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 854fb2abb6f8..151921da21f9 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -36,6 +36,7 @@ enum NvmeCapShift {
     CAP_MPSMIN_SHIFT   = 48,
     CAP_MPSMAX_SHIFT   = 52,
     CAP_PMR_SHIFT      = 56,
+    CAP_CMB_SHIFT      = 57,
 };
 
 enum NvmeCapMask {
@@ -49,6 +50,7 @@ enum NvmeCapMask {
     CAP_MPSMIN_MASK    = 0xf,
     CAP_MPSMAX_MASK    = 0xf,
     CAP_PMR_MASK       = 0x1,
+    CAP_CMB_MASK       = 0x1,
 };
 
 #define NVME_CAP_MQES(cap)  (((cap) >> CAP_MQES_SHIFT)   & CAP_MQES_MASK)
@@ -79,9 +81,11 @@ enum NvmeCapMask {
 #define NVME_CAP_SET_MPSMIN(cap, val) (cap |= (uint64_t)(val & CAP_MPSMIN_MASK)\
                                                            << CAP_MPSMIN_SHIFT)
 #define NVME_CAP_SET_MPSMAX(cap, val) (cap |= (uint64_t)(val & CAP_MPSMAX_MASK)\
-                                                            << CAP_MPSMAX_SHIFT)
-#define NVME_CAP_SET_PMRS(cap, val) (cap |= (uint64_t)(val & CAP_PMR_MASK)\
-                                                            << CAP_PMR_SHIFT)
+                                                           << CAP_MPSMAX_SHIFT)
+#define NVME_CAP_SET_PMRS(cap, val)   (cap |= (uint64_t)(val & CAP_PMR_MASK)   \
+                                                           << CAP_PMR_SHIFT)
+#define NVME_CAP_SET_CMBS(cap, val)   (cap |= (uint64_t)(val & CAP_CMB_MASK)   \
+                                                           << CAP_CMB_SHIFT)
 
 enum NvmeCapCss {
     NVME_CAP_CSS_NVM        = 1 << 0,
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 2785127037db..5f12ac1200ec 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -4374,6 +4374,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
     NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_CSI_SUPP);
     NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_ADMIN_ONLY);
     NVME_CAP_SET_MPSMAX(n->bar.cap, 4);
+    NVME_CAP_SET_CMBS(n->bar.cap, n->params.cmb_size_mb ? 1 : 0);
 
     n->bar.vs = NVME_SPEC_VER;
     n->bar.intmc = n->bar.intms = 0;
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 41/56] hw/block/nvme: move msix table and pba to BAR 0
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (39 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 40/56] hw/block/nvme: indicate CMB support through controller capabilities register Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 42/56] hw/block/nvme: allow cmb and pmr to coexist Klaus Jensen
                   ` (15 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Klaus Jensen, Max Reitz,
	Klaus Jensen, Minwoo Im, Stefan Hajnoczi, Keith Busch
From: Klaus Jensen <k.jensen@samsung.com>
In the interest of supporting both CMB and PMR to be enabled on the same
device, move the MSI-X table and pending bit array out of BAR 4 and into
BAR 0.
This is a simplified version of the patch contributed by Andrzej
Jakowski (see [1]). Leaving the CMB at offset 0 removes the need for
changes to CMB address mapping code.
  [1]: https://lore.kernel.org/qemu-devel/20200729220107.37758-3-andrzej.jakowski@linux.intel.com/
Reviewed-by: Minwoo Im <minwoo.im.dev@gmail.com>
Tested-by: Minwoo Im <minwoo.im.dev@gmail.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme.h |  1 +
 hw/block/nvme.c | 23 +++++++++++++++++++++--
 2 files changed, 22 insertions(+), 2 deletions(-)
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index b0d5b6409d8e..1cdb360bc549 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -125,6 +125,7 @@ typedef struct NvmeFeatureVal {
 
 typedef struct NvmeCtrl {
     PCIDevice    parent_obj;
+    MemoryRegion bar0;
     MemoryRegion iomem;
     MemoryRegion ctrl_mem;
     NvmeBar      bar;
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 5f12ac1200ec..85d3c43c4f74 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -4268,6 +4268,8 @@ static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev)
 static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
 {
     uint8_t *pci_conf = pci_dev->config;
+    uint64_t bar_size, msix_table_size, msix_pba_size;
+    unsigned msix_table_offset, msix_pba_offset;
     int ret;
 
     Error *err = NULL;
@@ -4286,11 +4288,28 @@ static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
     pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
     pcie_endpoint_cap_init(pci_dev, 0x80);
 
+    bar_size = QEMU_ALIGN_UP(n->reg_size, 4 * KiB);
+    msix_table_offset = bar_size;
+    msix_table_size = PCI_MSIX_ENTRY_SIZE * n->params.msix_qsize;
+
+    bar_size += msix_table_size;
+    bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
+    msix_pba_offset = bar_size;
+    msix_pba_size = QEMU_ALIGN_UP(n->params.msix_qsize, 64) / 8;
+
+    bar_size += msix_pba_size;
+    bar_size = pow2ceil(bar_size);
+
+    memory_region_init(&n->bar0, OBJECT(n), "nvme-bar0", bar_size);
     memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
                           n->reg_size);
+    memory_region_add_subregion(&n->bar0, 0, &n->iomem);
+
     pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
-                     PCI_BASE_ADDRESS_MEM_TYPE_64, &n->iomem);
-    ret = msix_init_exclusive_bar(pci_dev, n->params.msix_qsize, 4, &err);
+                     PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0);
+    ret = msix_init(pci_dev, n->params.msix_qsize,
+                    &n->bar0, 0, msix_table_offset,
+                    &n->bar0, 0, msix_pba_offset, 0, &err);
     if (ret < 0) {
         if (ret == -ENOTSUP) {
             warn_report_err(err);
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 42/56] hw/block/nvme: allow cmb and pmr to coexist
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (40 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 41/56] hw/block/nvme: move msix table and pba to BAR 0 Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 43/56] hw/block/nvme: rename PMR/CMB shift/mask fields Klaus Jensen
                   ` (14 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Klaus Jensen, Max Reitz,
	Klaus Jensen, Minwoo Im, Stefan Hajnoczi, Keith Busch
From: Klaus Jensen <k.jensen@samsung.com>
With BAR 4 now free to use, allow PMR and CMB to be enabled
simultaneously.
Reviewed-by: Minwoo Im <minwoo.im.dev@gmail.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 85d3c43c4f74..4ce75642f1a4 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -29,14 +29,13 @@
  * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
  * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
  *
- * cmb_size_mb= and pmrdev= options are mutually exclusive due to limitation
- * in available BAR's. cmb_size_mb= will take precedence over pmrdev= when
- * both provided.
  * Enabling pmr emulation can be achieved by pointing to memory-backend-file.
  * For example:
  * -object memory-backend-file,id=<mem_id>,share=on,mem-path=<file_path>, \
  *  size=<size> .... -device nvme,...,pmrdev=<mem_id>
  *
+ * The PMR will use BAR 4/5 exclusively.
+ *
  *
  * nvme device parameters
  * ~~~~~~~~~~~~~~~~~~~~~~
@@ -109,7 +108,7 @@
 #define NVME_DB_SIZE  4
 #define NVME_SPEC_VER 0x00010300
 #define NVME_CMB_BIR 2
-#define NVME_PMR_BIR 2
+#define NVME_PMR_BIR 4
 #define NVME_TEMPERATURE 0x143
 #define NVME_TEMPERATURE_WARNING 0x157
 #define NVME_TEMPERATURE_CRITICAL 0x175
@@ -4121,7 +4120,7 @@ static void nvme_check_constraints(NvmeCtrl *n, Error **errp)
         return;
     }
 
-    if (!n->params.cmb_size_mb && n->pmrdev) {
+    if (n->pmrdev) {
         if (host_memory_backend_is_mapped(n->pmrdev)) {
             error_setg(errp, "can't use already busy memdev: %s",
                        object_get_canonical_path_component(OBJECT(n->pmrdev)));
@@ -4218,9 +4217,6 @@ static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
 
 static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev)
 {
-    /* Controller Capabilities register */
-    NVME_CAP_SET_PMRS(n->bar.cap, 1);
-
     /* PMR Capabities register */
     n->bar.pmrcap = 0;
     NVME_PMRCAP_SET_RDS(n->bar.pmrcap, 0);
@@ -4321,7 +4317,9 @@ static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
 
     if (n->params.cmb_size_mb) {
         nvme_init_cmb(n, pci_dev);
-    } else if (n->pmrdev) {
+    }
+
+    if (n->pmrdev) {
         nvme_init_pmr(n, pci_dev);
     }
 
@@ -4394,6 +4392,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
     NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_ADMIN_ONLY);
     NVME_CAP_SET_MPSMAX(n->bar.cap, 4);
     NVME_CAP_SET_CMBS(n->bar.cap, n->params.cmb_size_mb ? 1 : 0);
+    NVME_CAP_SET_PMRS(n->bar.cap, n->pmrdev ? 1 : 0);
 
     n->bar.vs = NVME_SPEC_VER;
     n->bar.intmc = n->bar.intms = 0;
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 43/56] hw/block/nvme: rename PMR/CMB shift/mask fields
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (41 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 42/56] hw/block/nvme: allow cmb and pmr to coexist Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 44/56] hw/block/nvme: remove redundant zeroing of PMR registers Klaus Jensen
                   ` (13 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Klaus Jensen, Max Reitz,
	Klaus Jensen, Minwoo Im, Stefan Hajnoczi, Keith Busch
From: Klaus Jensen <k.jensen@samsung.com>
Use the correct field names.
Reviewed-by: Minwoo Im <minwoo.im.dev@gmail.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 include/block/nvme.h | 18 +++++++++---------
 hw/block/nvme.c      |  2 +-
 2 files changed, 10 insertions(+), 10 deletions(-)
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 151921da21f9..008108bd1af8 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -35,8 +35,8 @@ enum NvmeCapShift {
     CAP_CSS_SHIFT      = 37,
     CAP_MPSMIN_SHIFT   = 48,
     CAP_MPSMAX_SHIFT   = 52,
-    CAP_PMR_SHIFT      = 56,
-    CAP_CMB_SHIFT      = 57,
+    CAP_PMRS_SHIFT     = 56,
+    CAP_CMBS_SHIFT     = 57,
 };
 
 enum NvmeCapMask {
@@ -49,8 +49,8 @@ enum NvmeCapMask {
     CAP_CSS_MASK       = 0xff,
     CAP_MPSMIN_MASK    = 0xf,
     CAP_MPSMAX_MASK    = 0xf,
-    CAP_PMR_MASK       = 0x1,
-    CAP_CMB_MASK       = 0x1,
+    CAP_PMRS_MASK      = 0x1,
+    CAP_CMBS_MASK      = 0x1,
 };
 
 #define NVME_CAP_MQES(cap)  (((cap) >> CAP_MQES_SHIFT)   & CAP_MQES_MASK)
@@ -62,7 +62,7 @@ enum NvmeCapMask {
 #define NVME_CAP_CSS(cap)   (((cap) >> CAP_CSS_SHIFT)    & CAP_CSS_MASK)
 #define NVME_CAP_MPSMIN(cap)(((cap) >> CAP_MPSMIN_SHIFT) & CAP_MPSMIN_MASK)
 #define NVME_CAP_MPSMAX(cap)(((cap) >> CAP_MPSMAX_SHIFT) & CAP_MPSMAX_MASK)
-#define NVME_CAP_PMR(cap)   (((cap) >> CAP_PMR_SHIFT)    & CAP_PMR_MASK)
+#define NVME_CAP_PMRS(cap)  (((cap) >> CAP_PMRS_SHIFT)   & CAP_PMRS_MASK)
 
 #define NVME_CAP_SET_MQES(cap, val)   (cap |= (uint64_t)(val & CAP_MQES_MASK)  \
                                                            << CAP_MQES_SHIFT)
@@ -82,10 +82,10 @@ enum NvmeCapMask {
                                                            << CAP_MPSMIN_SHIFT)
 #define NVME_CAP_SET_MPSMAX(cap, val) (cap |= (uint64_t)(val & CAP_MPSMAX_MASK)\
                                                            << CAP_MPSMAX_SHIFT)
-#define NVME_CAP_SET_PMRS(cap, val)   (cap |= (uint64_t)(val & CAP_PMR_MASK)   \
-                                                           << CAP_PMR_SHIFT)
-#define NVME_CAP_SET_CMBS(cap, val)   (cap |= (uint64_t)(val & CAP_CMB_MASK)   \
-                                                           << CAP_CMB_SHIFT)
+#define NVME_CAP_SET_PMRS(cap, val)   (cap |= (uint64_t)(val & CAP_PMRS_MASK)  \
+                                                           << CAP_PMRS_SHIFT)
+#define NVME_CAP_SET_CMBS(cap, val)   (cap |= (uint64_t)(val & CAP_CMBS_MASK)  \
+                                                           << CAP_CMBS_SHIFT)
 
 enum NvmeCapCss {
     NVME_CAP_CSS_NVM        = 1 << 0,
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 4ce75642f1a4..0057a02402b7 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -4501,7 +4501,7 @@ static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name,
 
     cap = NVME_SMART_SPARE | NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY
           | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA;
-    if (NVME_CAP_PMR(n->bar.cap)) {
+    if (NVME_CAP_PMRS(n->bar.cap)) {
         cap |= NVME_SMART_PMR_UNRELIABLE;
     }
 
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 44/56] hw/block/nvme: remove redundant zeroing of PMR registers
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (42 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 43/56] hw/block/nvme: rename PMR/CMB shift/mask fields Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 45/56] hw/block/nvme: disable PMR at boot up Klaus Jensen
                   ` (12 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Klaus Jensen, Max Reitz,
	Klaus Jensen, Minwoo Im, Stefan Hajnoczi, Keith Busch
From: Klaus Jensen <k.jensen@samsung.com>
The controller registers are initially zero. Remove the redundant
zeroing.
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Minwoo Im <minwoo.im.dev@gmail.com>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme.c | 34 ----------------------------------
 1 file changed, 34 deletions(-)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 0057a02402b7..f8dd771925f9 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -4217,43 +4217,9 @@ static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
 
 static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev)
 {
-    /* PMR Capabities register */
-    n->bar.pmrcap = 0;
-    NVME_PMRCAP_SET_RDS(n->bar.pmrcap, 0);
-    NVME_PMRCAP_SET_WDS(n->bar.pmrcap, 0);
     NVME_PMRCAP_SET_BIR(n->bar.pmrcap, NVME_PMR_BIR);
-    NVME_PMRCAP_SET_PMRTU(n->bar.pmrcap, 0);
     /* Turn on bit 1 support */
     NVME_PMRCAP_SET_PMRWBM(n->bar.pmrcap, 0x02);
-    NVME_PMRCAP_SET_PMRTO(n->bar.pmrcap, 0);
-    NVME_PMRCAP_SET_CMSS(n->bar.pmrcap, 0);
-
-    /* PMR Control register */
-    n->bar.pmrctl = 0;
-    NVME_PMRCTL_SET_EN(n->bar.pmrctl, 0);
-
-    /* PMR Status register */
-    n->bar.pmrsts = 0;
-    NVME_PMRSTS_SET_ERR(n->bar.pmrsts, 0);
-    NVME_PMRSTS_SET_NRDY(n->bar.pmrsts, 0);
-    NVME_PMRSTS_SET_HSTS(n->bar.pmrsts, 0);
-    NVME_PMRSTS_SET_CBAI(n->bar.pmrsts, 0);
-
-    /* PMR Elasticity Buffer Size register */
-    n->bar.pmrebs = 0;
-    NVME_PMREBS_SET_PMRSZU(n->bar.pmrebs, 0);
-    NVME_PMREBS_SET_RBB(n->bar.pmrebs, 0);
-    NVME_PMREBS_SET_PMRWBZ(n->bar.pmrebs, 0);
-
-    /* PMR Sustained Write Throughput register */
-    n->bar.pmrswtp = 0;
-    NVME_PMRSWTP_SET_PMRSWTU(n->bar.pmrswtp, 0);
-    NVME_PMRSWTP_SET_PMRSWTV(n->bar.pmrswtp, 0);
-
-    /* PMR Memory Space Control register */
-    n->bar.pmrmsc = 0;
-    NVME_PMRMSC_SET_CMSE(n->bar.pmrmsc, 0);
-    NVME_PMRMSC_SET_CBA(n->bar.pmrmsc, 0);
 
     pci_register_bar(pci_dev, NVME_PMRCAP_BIR(n->bar.pmrcap),
                      PCI_BASE_ADDRESS_SPACE_MEMORY |
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 45/56] hw/block/nvme: disable PMR at boot up
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (43 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 44/56] hw/block/nvme: remove redundant zeroing of PMR registers Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 46/56] hw/block/nvme: add PMR RDS/WDS support Klaus Jensen
                   ` (11 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Klaus Jensen, Max Reitz,
	Klaus Jensen, Stefan Hajnoczi, Keith Busch
From: Klaus Jensen <k.jensen@samsung.com>
The PMR should not be enabled at boot up. Disable the PMR MemoryRegion
initially and implement MMIO for PMRCTL, allowing the host to enable the
PMR explicitly.
Reviewed-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index f8dd771925f9..d773796051d6 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -3848,8 +3848,16 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrcap_readonly,
                        "invalid write to PMRCAP register, ignored");
         return;
-    case 0xE04: /* TODO PMRCTL */
-        break;
+    case 0xE04: /* PMRCTL */
+        n->bar.pmrctl = data;
+        if (NVME_PMRCTL_EN(data)) {
+            memory_region_set_enabled(&n->pmrdev->mr, true);
+            n->bar.pmrsts = 0;
+        } else {
+            memory_region_set_enabled(&n->pmrdev->mr, false);
+            NVME_PMRSTS_SET_NRDY(n->bar.pmrsts, 1);
+        }
+        return;
     case 0xE08: /* PMRSTS */
         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrsts_readonly,
                        "invalid write to PMRSTS register, ignored");
@@ -4225,6 +4233,8 @@ static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev)
                      PCI_BASE_ADDRESS_SPACE_MEMORY |
                      PCI_BASE_ADDRESS_MEM_TYPE_64 |
                      PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmrdev->mr);
+
+    memory_region_set_enabled(&n->pmrdev->mr, false);
 }
 
 static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 46/56] hw/block/nvme: add PMR RDS/WDS support
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (44 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 45/56] hw/block/nvme: disable PMR at boot up Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 47/56] hw/block/nvme: move cmb logic to v1.4 Klaus Jensen
                   ` (10 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Klaus Jensen, Naveen Nagar,
	Max Reitz, Klaus Jensen, Stefan Hajnoczi, Keith Busch
From: Naveen Nagar <naveen.n1@samsung.com>
Add support for the PMRMSCL and PMRMSCU MMIO registers. This allows
adding RDS/WDS support for PMR as well.
Reviewed-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Naveen Nagar <naveen.n1@samsung.com>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme.h |   6 ++-
 hw/block/nvme.c | 122 +++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 106 insertions(+), 22 deletions(-)
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index 1cdb360bc549..b7702e937e56 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -151,7 +151,11 @@ typedef struct NvmeCtrl {
     uint16_t    temperature;
     uint8_t     smart_critical_warning;
 
-    HostMemoryBackend *pmrdev;
+    struct {
+        HostMemoryBackend *dev;
+        bool              cmse;
+        hwaddr            cba;
+    } pmr;
 
     uint8_t     aer_mask;
     NvmeRequest **aer_reqs;
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index d773796051d6..7f1c8dd7751c 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -273,6 +273,24 @@ static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
     return &n->cmbuf[addr - n->ctrl_mem.addr];
 }
 
+static bool nvme_addr_is_pmr(NvmeCtrl *n, hwaddr addr)
+{
+    hwaddr hi;
+
+    if (!n->pmr.cmse) {
+        return false;
+    }
+
+    hi = n->pmr.cba + int128_get64(n->pmr.dev->mr.size);
+
+    return addr >= n->pmr.cba && addr < hi;
+}
+
+static inline void *nvme_addr_to_pmr(NvmeCtrl *n, hwaddr addr)
+{
+    return memory_region_get_ram_ptr(&n->pmr.dev->mr) + (addr - n->pmr.cba);
+}
+
 static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
 {
     hwaddr hi = addr + size - 1;
@@ -285,6 +303,11 @@ static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
         return 0;
     }
 
+    if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
+        memcpy(buf, nvme_addr_to_pmr(n, addr), size);
+        return 0;
+    }
+
     return pci_dma_read(&n->parent_obj, addr, buf, size);
 }
 
@@ -406,9 +429,27 @@ static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
     return NVME_SUCCESS;
 }
 
+static uint16_t nvme_map_addr_pmr(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
+    size_t len)
+{
+    if (!len) {
+        return NVME_SUCCESS;
+    }
+
+    if (!nvme_addr_is_pmr(n, addr) || !nvme_addr_is_pmr(n, addr + len - 1)) {
+        return NVME_DATA_TRAS_ERROR;
+    }
+
+    qemu_iovec_add(iov, nvme_addr_to_pmr(n, addr), len);
+
+    return NVME_SUCCESS;
+}
+
 static uint16_t nvme_map_addr(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
                               hwaddr addr, size_t len)
 {
+    bool cmb = false, pmr = false;
+
     if (!len) {
         return NVME_SUCCESS;
     }
@@ -416,6 +457,12 @@ static uint16_t nvme_map_addr(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
     trace_pci_nvme_map_addr(addr, len);
 
     if (nvme_addr_is_cmb(n, addr)) {
+        cmb = true;
+    } else if (nvme_addr_is_pmr(n, addr)) {
+        pmr = true;
+    }
+
+    if (cmb || pmr) {
         if (qsg && qsg->sg) {
             return NVME_INVALID_USE_OF_CMB | NVME_DNR;
         }
@@ -426,7 +473,11 @@ static uint16_t nvme_map_addr(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
             qemu_iovec_init(iov, 1);
         }
 
-        return nvme_map_addr_cmb(n, iov, addr, len);
+        if (cmb) {
+            return nvme_map_addr_cmb(n, iov, addr, len);
+        } else {
+            return nvme_map_addr_pmr(n, iov, addr, len);
+        }
     }
 
     if (iov && iov->iov) {
@@ -459,7 +510,7 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2,
 
     trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps);
 
-    if (nvme_addr_is_cmb(n, prp1)) {
+    if (nvme_addr_is_cmb(n, prp1) || (nvme_addr_is_pmr(n, prp1))) {
         qemu_iovec_init(iov, num_prps);
     } else {
         pci_dma_sglist_init(qsg, &n->parent_obj, num_prps);
@@ -3561,8 +3612,8 @@ static void nvme_ctrl_shutdown(NvmeCtrl *n)
     NvmeNamespace *ns;
     int i;
 
-    if (n->pmrdev) {
-        memory_region_msync(&n->pmrdev->mr, 0, n->pmrdev->size);
+    if (n->pmr.dev) {
+        memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
     }
 
     for (i = 1; i <= n->num_namespaces; i++) {
@@ -3851,11 +3902,12 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
     case 0xE04: /* PMRCTL */
         n->bar.pmrctl = data;
         if (NVME_PMRCTL_EN(data)) {
-            memory_region_set_enabled(&n->pmrdev->mr, true);
+            memory_region_set_enabled(&n->pmr.dev->mr, true);
             n->bar.pmrsts = 0;
         } else {
-            memory_region_set_enabled(&n->pmrdev->mr, false);
+            memory_region_set_enabled(&n->pmr.dev->mr, false);
             NVME_PMRSTS_SET_NRDY(n->bar.pmrsts, 1);
+            n->pmr.cmse = false;
         }
         return;
     case 0xE08: /* PMRSTS */
@@ -3870,8 +3922,33 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrswtp_readonly,
                        "invalid write to PMRSWTP register, ignored");
         return;
-    case 0xE14: /* TODO PMRMSC */
-        break;
+    case 0xE14: /* PMRMSCL */
+        if (!NVME_CAP_PMRS(n->bar.cap)) {
+            return;
+        }
+
+        n->bar.pmrmsc = (n->bar.pmrmsc & ~0xffffffff) | (data & 0xffffffff);
+        n->pmr.cmse = false;
+
+        if (NVME_PMRMSC_CMSE(n->bar.pmrmsc)) {
+            hwaddr cba = NVME_PMRMSC_CBA(n->bar.pmrmsc) << PMRMSC_CBA_SHIFT;
+            if (cba + int128_get64(n->pmr.dev->mr.size) < cba) {
+                NVME_PMRSTS_SET_CBAI(n->bar.pmrsts, 1);
+                return;
+            }
+
+            n->pmr.cmse = true;
+            n->pmr.cba = cba;
+        }
+
+        return;
+    case 0xE18: /* PMRMSCU */
+        if (!NVME_CAP_PMRS(n->bar.cap)) {
+            return;
+        }
+
+        n->bar.pmrmsc = (n->bar.pmrmsc & 0xffffffff) | (data << 32);
+        return;
     default:
         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_invalid,
                        "invalid MMIO write,"
@@ -3909,7 +3986,7 @@ static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
          */
         if (addr == 0xE08 &&
             (NVME_PMRCAP_PMRWBM(n->bar.pmrcap) & 0x02)) {
-            memory_region_msync(&n->pmrdev->mr, 0, n->pmrdev->size);
+            memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
         }
         memcpy(&val, ptr + addr, size);
     } else {
@@ -4128,19 +4205,19 @@ static void nvme_check_constraints(NvmeCtrl *n, Error **errp)
         return;
     }
 
-    if (n->pmrdev) {
-        if (host_memory_backend_is_mapped(n->pmrdev)) {
+    if (n->pmr.dev) {
+        if (host_memory_backend_is_mapped(n->pmr.dev)) {
             error_setg(errp, "can't use already busy memdev: %s",
-                       object_get_canonical_path_component(OBJECT(n->pmrdev)));
+                       object_get_canonical_path_component(OBJECT(n->pmr.dev)));
             return;
         }
 
-        if (!is_power_of_2(n->pmrdev->size)) {
+        if (!is_power_of_2(n->pmr.dev->size)) {
             error_setg(errp, "pmr backend size needs to be power of 2 in size");
             return;
         }
 
-        host_memory_backend_set_mapped(n->pmrdev, true);
+        host_memory_backend_set_mapped(n->pmr.dev, true);
     }
 
     if (n->params.zasl_bs) {
@@ -4225,16 +4302,19 @@ static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
 
 static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev)
 {
+    NVME_PMRCAP_SET_RDS(n->bar.pmrcap, 1);
+    NVME_PMRCAP_SET_WDS(n->bar.pmrcap, 1);
     NVME_PMRCAP_SET_BIR(n->bar.pmrcap, NVME_PMR_BIR);
     /* Turn on bit 1 support */
     NVME_PMRCAP_SET_PMRWBM(n->bar.pmrcap, 0x02);
+    NVME_PMRCAP_SET_CMSS(n->bar.pmrcap, 1);
 
     pci_register_bar(pci_dev, NVME_PMRCAP_BIR(n->bar.pmrcap),
                      PCI_BASE_ADDRESS_SPACE_MEMORY |
                      PCI_BASE_ADDRESS_MEM_TYPE_64 |
-                     PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmrdev->mr);
+                     PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmr.dev->mr);
 
-    memory_region_set_enabled(&n->pmrdev->mr, false);
+    memory_region_set_enabled(&n->pmr.dev->mr, false);
 }
 
 static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
@@ -4295,7 +4375,7 @@ static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
         nvme_init_cmb(n, pci_dev);
     }
 
-    if (n->pmrdev) {
+    if (n->pmr.dev) {
         nvme_init_pmr(n, pci_dev);
     }
 
@@ -4368,7 +4448,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
     NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_ADMIN_ONLY);
     NVME_CAP_SET_MPSMAX(n->bar.cap, 4);
     NVME_CAP_SET_CMBS(n->bar.cap, n->params.cmb_size_mb ? 1 : 0);
-    NVME_CAP_SET_PMRS(n->bar.cap, n->pmrdev ? 1 : 0);
+    NVME_CAP_SET_PMRS(n->bar.cap, n->pmr.dev ? 1 : 0);
 
     n->bar.vs = NVME_SPEC_VER;
     n->bar.intmc = n->bar.intms = 0;
@@ -4432,15 +4512,15 @@ static void nvme_exit(PCIDevice *pci_dev)
         g_free(n->cmbuf);
     }
 
-    if (n->pmrdev) {
-        host_memory_backend_set_mapped(n->pmrdev, false);
+    if (n->pmr.dev) {
+        host_memory_backend_set_mapped(n->pmr.dev, false);
     }
     msix_uninit_exclusive_bar(pci_dev);
 }
 
 static Property nvme_props[] = {
     DEFINE_BLOCK_PROPERTIES(NvmeCtrl, namespace.blkconf),
-    DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmrdev, TYPE_MEMORY_BACKEND,
+    DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmr.dev, TYPE_MEMORY_BACKEND,
                      HostMemoryBackend *),
     DEFINE_PROP_STRING("serial", NvmeCtrl, params.serial),
     DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, params.cmb_size_mb, 0),
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 47/56] hw/block/nvme: move cmb logic to v1.4
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (45 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 46/56] hw/block/nvme: add PMR RDS/WDS support Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 48/56] hw/block/nvme: bump " Klaus Jensen
                   ` (9 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, Padmakar Kalghatgi, qemu-block,
	Klaus Jensen, Max Reitz, Klaus Jensen, Minwoo Im, Stefan Hajnoczi,
	Keith Busch
From: Padmakar Kalghatgi <p.kalghatgi@samsung.com>
Implement v1.4 logic for configuring the Controller Memory Buffer. By
default, the v1.4 scheme will be used (CMB must be explicitly enabled by
the host), so drivers that only support v1.3 will not be able to use the
CMB anymore.
To retain the v1.3 behavior, set the boolean 'legacy-cmb' nvme device
parameter.
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Minwoo Im <minwoo.im.dev@gmail.com>
Signed-off-by: Padmakar Kalghatgi <p.kalghatgi@samsung.com>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme.h       |  10 +++-
 include/block/nvme.h  | 107 +++++++++++++++++++++++++++++++++++++-----
 hw/block/nvme.c       | 101 +++++++++++++++++++++++++++++----------
 hw/block/trace-events |   2 +
 4 files changed, 182 insertions(+), 38 deletions(-)
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index b7702e937e56..dee6092bd45f 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -20,6 +20,7 @@ typedef struct NvmeParams {
     uint8_t  mdts;
     bool     use_intel_id;
     uint32_t zasl_bs;
+    bool     legacy_cmb;
 } NvmeParams;
 
 typedef struct NvmeAsyncEvent {
@@ -127,7 +128,6 @@ typedef struct NvmeCtrl {
     PCIDevice    parent_obj;
     MemoryRegion bar0;
     MemoryRegion iomem;
-    MemoryRegion ctrl_mem;
     NvmeBar      bar;
     NvmeParams   params;
     NvmeBus      bus;
@@ -143,7 +143,6 @@ typedef struct NvmeCtrl {
     uint32_t    num_namespaces;
     uint32_t    max_q_ents;
     uint8_t     outstanding_aers;
-    uint8_t     *cmbuf;
     uint32_t    irq_status;
     uint64_t    host_timestamp;                 /* Timestamp sent by the host */
     uint64_t    timestamp_set_qemu_clock_ms;    /* QEMU clock time */
@@ -151,6 +150,13 @@ typedef struct NvmeCtrl {
     uint16_t    temperature;
     uint8_t     smart_critical_warning;
 
+    struct {
+        MemoryRegion mem;
+        uint8_t      *buf;
+        bool         cmse;
+        hwaddr       cba;
+    } cmb;
+
     struct {
         HostMemoryBackend *dev;
         bool              cmse;
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 008108bd1af8..2e85b97a6c4e 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -15,14 +15,19 @@ typedef struct QEMU_PACKED NvmeBar {
     uint64_t    acq;
     uint32_t    cmbloc;
     uint32_t    cmbsz;
-    uint8_t     padding[3520]; /* not used by QEMU */
+    uint32_t    bpinfo;
+    uint32_t    bprsel;
+    uint64_t    bpmbl;
+    uint64_t    cmbmsc;
+    uint32_t    cmbsts;
+    uint8_t     rsvd92[3492];
     uint32_t    pmrcap;
     uint32_t    pmrctl;
     uint32_t    pmrsts;
     uint32_t    pmrebs;
     uint32_t    pmrswtp;
     uint64_t    pmrmsc;
-    uint8_t     reserved[484];
+    uint8_t     css[484];
 } NvmeBar;
 
 enum NvmeCapShift {
@@ -63,6 +68,7 @@ enum NvmeCapMask {
 #define NVME_CAP_MPSMIN(cap)(((cap) >> CAP_MPSMIN_SHIFT) & CAP_MPSMIN_MASK)
 #define NVME_CAP_MPSMAX(cap)(((cap) >> CAP_MPSMAX_SHIFT) & CAP_MPSMAX_MASK)
 #define NVME_CAP_PMRS(cap)  (((cap) >> CAP_PMRS_SHIFT)   & CAP_PMRS_MASK)
+#define NVME_CAP_CMBS(cap)  (((cap) >> CAP_CMBS_SHIFT)   & CAP_CMBS_MASK)
 
 #define NVME_CAP_SET_MQES(cap, val)   (cap |= (uint64_t)(val & CAP_MQES_MASK)  \
                                                            << CAP_MQES_SHIFT)
@@ -184,25 +190,64 @@ enum NvmeAqaMask {
 #define NVME_AQA_ACQS(aqa) ((aqa >> AQA_ACQS_SHIFT) & AQA_ACQS_MASK)
 
 enum NvmeCmblocShift {
-    CMBLOC_BIR_SHIFT  = 0,
-    CMBLOC_OFST_SHIFT = 12,
+    CMBLOC_BIR_SHIFT     = 0,
+    CMBLOC_CQMMS_SHIFT   = 3,
+    CMBLOC_CQPDS_SHIFT   = 4,
+    CMBLOC_CDPMLS_SHIFT  = 5,
+    CMBLOC_CDPCILS_SHIFT = 6,
+    CMBLOC_CDMMMS_SHIFT  = 7,
+    CMBLOC_CQDA_SHIFT    = 8,
+    CMBLOC_OFST_SHIFT    = 12,
 };
 
 enum NvmeCmblocMask {
-    CMBLOC_BIR_MASK  = 0x7,
-    CMBLOC_OFST_MASK = 0xfffff,
+    CMBLOC_BIR_MASK     = 0x7,
+    CMBLOC_CQMMS_MASK   = 0x1,
+    CMBLOC_CQPDS_MASK   = 0x1,
+    CMBLOC_CDPMLS_MASK  = 0x1,
+    CMBLOC_CDPCILS_MASK = 0x1,
+    CMBLOC_CDMMMS_MASK  = 0x1,
+    CMBLOC_CQDA_MASK    = 0x1,
+    CMBLOC_OFST_MASK    = 0xfffff,
 };
 
-#define NVME_CMBLOC_BIR(cmbloc) ((cmbloc >> CMBLOC_BIR_SHIFT)  & \
-                                 CMBLOC_BIR_MASK)
-#define NVME_CMBLOC_OFST(cmbloc)((cmbloc >> CMBLOC_OFST_SHIFT) & \
-                                 CMBLOC_OFST_MASK)
+#define NVME_CMBLOC_BIR(cmbloc) \
+    ((cmbloc >> CMBLOC_BIR_SHIFT) & CMBLOC_BIR_MASK)
+#define NVME_CMBLOC_CQMMS(cmbloc) \
+    ((cmbloc >> CMBLOC_CQMMS_SHIFT) & CMBLOC_CQMMS_MASK)
+#define NVME_CMBLOC_CQPDS(cmbloc) \
+    ((cmbloc >> CMBLOC_CQPDS_SHIFT) & CMBLOC_CQPDS_MASK)
+#define NVME_CMBLOC_CDPMLS(cmbloc) \
+    ((cmbloc >> CMBLOC_CDPMLS_SHIFT) & CMBLOC_CDPMLS_MASK)
+#define NVME_CMBLOC_CDPCILS(cmbloc) \
+    ((cmbloc >> CMBLOC_CDPCILS_SHIFT) & CMBLOC_CDPCILS_MASK)
+#define NVME_CMBLOC_CDMMMS(cmbloc) \
+    ((cmbloc >> CMBLOC_CDMMMS_SHIFT) & CMBLOC_CDMMMS_MASK)
+#define NVME_CMBLOC_CQDA(cmbloc) \
+    ((cmbloc >> CMBLOC_CQDA_SHIFT) & CMBLOC_CQDA_MASK)
+#define NVME_CMBLOC_OFST(cmbloc) \
+    ((cmbloc >> CMBLOC_OFST_SHIFT) & CMBLOC_OFST_MASK)
 
-#define NVME_CMBLOC_SET_BIR(cmbloc, val)  \
+#define NVME_CMBLOC_SET_BIR(cmbloc, val) \
     (cmbloc |= (uint64_t)(val & CMBLOC_BIR_MASK) << CMBLOC_BIR_SHIFT)
+#define NVME_CMBLOC_SET_CQMMS(cmbloc, val) \
+    (cmbloc |= (uint64_t)(val & CMBLOC_CQMMS_MASK) << CMBLOC_CQMMS_SHIFT)
+#define NVME_CMBLOC_SET_CQPDS(cmbloc, val) \
+    (cmbloc |= (uint64_t)(val & CMBLOC_CQPDS_MASK) << CMBLOC_CQPDS_SHIFT)
+#define NVME_CMBLOC_SET_CDPMLS(cmbloc, val) \
+    (cmbloc |= (uint64_t)(val & CMBLOC_CDPMLS_MASK) << CMBLOC_CDPMLS_SHIFT)
+#define NVME_CMBLOC_SET_CDPCILS(cmbloc, val) \
+    (cmbloc |= (uint64_t)(val & CMBLOC_CDPCILS_MASK) << CMBLOC_CDPCILS_SHIFT)
+#define NVME_CMBLOC_SET_CDMMMS(cmbloc, val) \
+    (cmbloc |= (uint64_t)(val & CMBLOC_CDMMMS_MASK) << CMBLOC_CDMMMS_SHIFT)
+#define NVME_CMBLOC_SET_CQDA(cmbloc, val) \
+    (cmbloc |= (uint64_t)(val & CMBLOC_CQDA_MASK) << CMBLOC_CQDA_SHIFT)
 #define NVME_CMBLOC_SET_OFST(cmbloc, val) \
     (cmbloc |= (uint64_t)(val & CMBLOC_OFST_MASK) << CMBLOC_OFST_SHIFT)
 
+#define NVME_CMBMSMC_SET_CRE (cmbmsc, val) \
+    (cmbmsc |= (uint64_t)(val & CMBLOC_OFST_MASK) << CMBMSC_CRE_SHIFT)
+
 enum NvmeCmbszShift {
     CMBSZ_SQS_SHIFT   = 0,
     CMBSZ_CQS_SHIFT   = 1,
@@ -249,6 +294,46 @@ enum NvmeCmbszMask {
 #define NVME_CMBSZ_GETSIZE(cmbsz) \
     (NVME_CMBSZ_SZ(cmbsz) * (1 << (12 + 4 * NVME_CMBSZ_SZU(cmbsz))))
 
+enum NvmeCmbmscShift {
+    CMBMSC_CRE_SHIFT  = 0,
+    CMBMSC_CMSE_SHIFT = 1,
+    CMBMSC_CBA_SHIFT  = 12,
+};
+
+enum NvmeCmbmscMask {
+    CMBMSC_CRE_MASK  = 0x1,
+    CMBMSC_CMSE_MASK = 0x1,
+    CMBMSC_CBA_MASK  = ((1ULL << 52) - 1),
+};
+
+#define NVME_CMBMSC_CRE(cmbmsc) \
+    ((cmbmsc >> CMBMSC_CRE_SHIFT)  & CMBMSC_CRE_MASK)
+#define NVME_CMBMSC_CMSE(cmbmsc) \
+    ((cmbmsc >> CMBMSC_CMSE_SHIFT) & CMBMSC_CMSE_MASK)
+#define NVME_CMBMSC_CBA(cmbmsc) \
+    ((cmbmsc >> CMBMSC_CBA_SHIFT) & CMBMSC_CBA_MASK)
+
+
+#define NVME_CMBMSC_SET_CRE(cmbmsc, val)  \
+    (cmbmsc |= (uint64_t)(val & CMBMSC_CRE_MASK) << CMBMSC_CRE_SHIFT)
+#define NVME_CMBMSC_SET_CMSE(cmbmsc, val) \
+    (cmbmsc |= (uint64_t)(val & CMBMSC_CMSE_MASK) << CMBMSC_CMSE_SHIFT)
+#define NVME_CMBMSC_SET_CBA(cmbmsc, val) \
+    (cmbmsc |= (uint64_t)(val & CMBMSC_CBA_MASK) << CMBMSC_CBA_SHIFT)
+
+enum NvmeCmbstsShift {
+    CMBSTS_CBAI_SHIFT = 0,
+};
+enum NvmeCmbstsMask {
+    CMBSTS_CBAI_MASK = 0x1,
+};
+
+#define NVME_CMBSTS_CBAI(cmbsts) \
+    ((cmbsts >> CMBSTS_CBAI_SHIFT) & CMBSTS_CBAI_MASK)
+
+#define NVME_CMBSTS_SET_CBAI(cmbsts, val)  \
+    (cmbsts |= (uint64_t)(val & CMBSTS_CBAI_MASK) << CMBSTS_CBAI_SHIFT)
+
 enum NvmePmrcapShift {
     PMRCAP_RDS_SHIFT      = 3,
     PMRCAP_WDS_SHIFT      = 4,
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 7f1c8dd7751c..1e13d25b0887 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -27,7 +27,9 @@
  *              zoned=<true|false[optional]>
  *
  * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
- * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
+ * offset 0 in BAR2 and supports only WDS, RDS and SQS for now. By default, the
+ * device will use the "v1.4 CMB scheme" - use the `legacy-cmb` parameter to
+ * always enable the CMBLOC and CMBSZ registers (v1.3 behavior).
  *
  * Enabling pmr emulation can be achieved by pointing to memory-backend-file.
  * For example:
@@ -260,17 +262,22 @@ static int nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn)
 
 static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
 {
-    hwaddr low = n->ctrl_mem.addr;
-    hwaddr hi  = n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size);
+    hwaddr hi, lo;
 
-    return addr >= low && addr < hi;
+    if (!n->cmb.cmse) {
+        return false;
+    }
+
+    lo = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
+    hi = lo + int128_get64(n->cmb.mem.size);
+
+    return addr >= lo && addr < hi;
 }
 
 static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
 {
-    assert(nvme_addr_is_cmb(n, addr));
-
-    return &n->cmbuf[addr - n->ctrl_mem.addr];
+    hwaddr base = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
+    return &n->cmb.buf[addr - base];
 }
 
 static bool nvme_addr_is_pmr(NvmeCtrl *n, hwaddr addr)
@@ -3768,6 +3775,19 @@ static int nvme_start_ctrl(NvmeCtrl *n)
     return 0;
 }
 
+static void nvme_cmb_enable_regs(NvmeCtrl *n)
+{
+    NVME_CMBLOC_SET_BIR(n->bar.cmbloc, NVME_CMB_BIR);
+
+    NVME_CMBSZ_SET_SQS(n->bar.cmbsz, 1);
+    NVME_CMBSZ_SET_CQS(n->bar.cmbsz, 0);
+    NVME_CMBSZ_SET_LISTS(n->bar.cmbsz, 1);
+    NVME_CMBSZ_SET_RDS(n->bar.cmbsz, 1);
+    NVME_CMBSZ_SET_WDS(n->bar.cmbsz, 1);
+    NVME_CMBSZ_SET_SZU(n->bar.cmbsz, 2); /* MBs */
+    NVME_CMBSZ_SET_SZ(n->bar.cmbsz, n->params.cmb_size_mb);
+}
+
 static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
                            unsigned size)
 {
@@ -3895,6 +3915,38 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbsz_readonly,
                        "invalid write to read only CMBSZ, ignored");
         return;
+    case 0x50:  /* CMBMSC */
+        if (!NVME_CAP_CMBS(n->bar.cap)) {
+            return;
+        }
+
+        n->bar.cmbmsc = size == 8 ? data :
+            (n->bar.cmbmsc & ~0xffffffff) | (data & 0xffffffff);
+        n->cmb.cmse = false;
+
+        if (NVME_CMBMSC_CRE(data)) {
+            nvme_cmb_enable_regs(n);
+
+            if (NVME_CMBMSC_CMSE(data)) {
+                hwaddr cba = NVME_CMBMSC_CBA(data) << CMBMSC_CBA_SHIFT;
+                if (cba + int128_get64(n->cmb.mem.size) < cba) {
+                    NVME_CMBSTS_SET_CBAI(n->bar.cmbsts, 1);
+                    return;
+                }
+
+                n->cmb.cba = cba;
+                n->cmb.cmse = true;
+            }
+        } else {
+            n->bar.cmbsz = 0;
+            n->bar.cmbloc = 0;
+        }
+
+        return;
+    case 0x54:  /* CMBMSC hi */
+        n->bar.cmbmsc = (n->bar.cmbmsc & 0xffffffff) | (data << 32);
+        return;
+
     case 0xE00: /* PMRCAP */
         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrcap_readonly,
                        "invalid write to PMRCAP register, ignored");
@@ -4151,13 +4203,13 @@ static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data,
                            unsigned size)
 {
     NvmeCtrl *n = (NvmeCtrl *)opaque;
-    stn_le_p(&n->cmbuf[addr], size, data);
+    stn_le_p(&n->cmb.buf[addr], size, data);
 }
 
 static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, unsigned size)
 {
     NvmeCtrl *n = (NvmeCtrl *)opaque;
-    return ldn_le_p(&n->cmbuf[addr], size);
+    return ldn_le_p(&n->cmb.buf[addr], size);
 }
 
 static const MemoryRegionOps nvme_cmb_ops = {
@@ -4280,24 +4332,22 @@ int nvme_register_namespace(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
 
 static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
 {
-    NVME_CMBLOC_SET_BIR(n->bar.cmbloc, NVME_CMB_BIR);
-    NVME_CMBLOC_SET_OFST(n->bar.cmbloc, 0);
+    uint64_t cmb_size = n->params.cmb_size_mb * MiB;
 
-    NVME_CMBSZ_SET_SQS(n->bar.cmbsz, 1);
-    NVME_CMBSZ_SET_CQS(n->bar.cmbsz, 0);
-    NVME_CMBSZ_SET_LISTS(n->bar.cmbsz, 1);
-    NVME_CMBSZ_SET_RDS(n->bar.cmbsz, 1);
-    NVME_CMBSZ_SET_WDS(n->bar.cmbsz, 1);
-    NVME_CMBSZ_SET_SZU(n->bar.cmbsz, 2); /* MBs */
-    NVME_CMBSZ_SET_SZ(n->bar.cmbsz, n->params.cmb_size_mb);
-
-    n->cmbuf = g_malloc0(NVME_CMBSZ_GETSIZE(n->bar.cmbsz));
-    memory_region_init_io(&n->ctrl_mem, OBJECT(n), &nvme_cmb_ops, n,
-                          "nvme-cmb", NVME_CMBSZ_GETSIZE(n->bar.cmbsz));
-    pci_register_bar(pci_dev, NVME_CMBLOC_BIR(n->bar.cmbloc),
+    n->cmb.buf = g_malloc0(cmb_size);
+    memory_region_init_io(&n->cmb.mem, OBJECT(n), &nvme_cmb_ops, n,
+                          "nvme-cmb", cmb_size);
+    pci_register_bar(pci_dev, NVME_CMB_BIR,
                      PCI_BASE_ADDRESS_SPACE_MEMORY |
                      PCI_BASE_ADDRESS_MEM_TYPE_64 |
-                     PCI_BASE_ADDRESS_MEM_PREFETCH, &n->ctrl_mem);
+                     PCI_BASE_ADDRESS_MEM_PREFETCH, &n->cmb.mem);
+
+    NVME_CAP_SET_CMBS(n->bar.cap, 1);
+
+    if (n->params.legacy_cmb) {
+        nvme_cmb_enable_regs(n);
+        n->cmb.cmse = true;
+    }
 }
 
 static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev)
@@ -4509,7 +4559,7 @@ static void nvme_exit(PCIDevice *pci_dev)
     g_free(n->aer_reqs);
 
     if (n->params.cmb_size_mb) {
-        g_free(n->cmbuf);
+        g_free(n->cmb.buf);
     }
 
     if (n->pmr.dev) {
@@ -4531,6 +4581,7 @@ static Property nvme_props[] = {
     DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
     DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7),
     DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
+    DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false),
     DEFINE_PROP_SIZE32("zoned.append_size_limit", NvmeCtrl, params.zasl_bs,
                        NVME_DEFAULT_MAX_ZA_SIZE),
     DEFINE_PROP_END_OF_LIST(),
diff --git a/hw/block/trace-events b/hw/block/trace-events
index 3772502033af..87ab6c509045 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -123,6 +123,8 @@ pci_nvme_err_invalid_opc(uint8_t opc) "invalid opcode 0x%"PRIx8""
 pci_nvme_err_invalid_admin_opc(uint8_t opc) "invalid admin opcode 0x%"PRIx8""
 pci_nvme_err_invalid_lba_range(uint64_t start, uint64_t len, uint64_t limit) "Invalid LBA start=%"PRIu64" len=%"PRIu64" limit=%"PRIu64""
 pci_nvme_err_invalid_log_page_offset(uint64_t ofs, uint64_t size) "must be <= %"PRIu64", got %"PRIu64""
+pci_nvme_err_cmb_invalid_cba(uint64_t cmbmsc) "cmbmsc 0x%"PRIx64""
+pci_nvme_err_cmb_not_enabled(uint64_t cmbmsc) "cmbmsc 0x%"PRIx64""
 pci_nvme_err_unaligned_zone_cmd(uint8_t action, uint64_t slba, uint64_t zslba) "unaligned zone op 0x%"PRIx32", got slba=%"PRIu64", zslba=%"PRIu64""
 pci_nvme_err_invalid_zone_state_transition(uint8_t action, uint64_t slba, uint8_t attrs) "action=0x%"PRIx8", slba=%"PRIu64", attrs=0x%"PRIx32""
 pci_nvme_err_write_not_at_wp(uint64_t slba, uint64_t zone, uint64_t wp) "writing at slba=%"PRIu64", zone=%"PRIu64", but wp=%"PRIu64""
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 48/56] hw/block/nvme: bump to v1.4
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (46 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 47/56] hw/block/nvme: move cmb logic to v1.4 Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 49/56] hw/block/nvme: lift cmb restrictions Klaus Jensen
                   ` (8 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Klaus Jensen, Max Reitz,
	Klaus Jensen, Stefan Hajnoczi, Keith Busch
From: Klaus Jensen <k.jensen@samsung.com>
With the new CMB logic in place, bump the implemented specification
version to v1.4 by default.
This requires adding the setting the CNTRLTYPE field and modifying the
VWC field since 0x00 is no longer a valid value for bits 2:1.
Reviewed-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 include/block/nvme.h | 3 ++-
 hw/block/nvme.c      | 5 +++--
 2 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 2e85b97a6c4e..07cfc929368b 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -951,7 +951,8 @@ typedef struct QEMU_PACKED NvmeIdCtrl {
     uint32_t    rtd3e;
     uint32_t    oaes;
     uint32_t    ctratt;
-    uint8_t     rsvd100[12];
+    uint8_t     rsvd100[11];
+    uint8_t     cntrltype;
     uint8_t     fguid[16];
     uint8_t     rsvd128[128];
     uint16_t    oacs;
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 1e13d25b0887..c4c968f5951e 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -108,7 +108,7 @@
 
 #define NVME_MAX_IOQPAIRS 0xffff
 #define NVME_DB_SIZE  4
-#define NVME_SPEC_VER 0x00010300
+#define NVME_SPEC_VER 0x00010400
 #define NVME_CMB_BIR 2
 #define NVME_PMR_BIR 4
 #define NVME_TEMPERATURE 0x143
@@ -4450,6 +4450,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
     id->mdts = n->params.mdts;
     id->ver = cpu_to_le32(NVME_SPEC_VER);
     id->oacs = cpu_to_le16(0);
+    id->cntrltype = 0x1;
 
     /*
      * Because the controller always completes the Abort command immediately,
@@ -4478,7 +4479,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
                            NVME_ONCS_FEATURES | NVME_ONCS_DSM |
                            NVME_ONCS_COMPARE);
 
-    id->vwc = 0x1;
+    id->vwc = (0x2 << 1) | 0x1;
     id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
                            NVME_CTRL_SGLS_BITBUCKET);
 
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 49/56] hw/block/nvme: lift cmb restrictions
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (47 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 48/56] hw/block/nvme: bump " Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 50/56] hw/block/nvme: error if drive less than a zone size Klaus Jensen
                   ` (7 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Klaus Jensen, Max Reitz,
	Klaus Jensen, Minwoo Im, Stefan Hajnoczi, Keith Busch
From: Klaus Jensen <k.jensen@samsung.com>
The controller now implements v1.4 and we can lift the restrictions on
CMB Data Pointer and Command Independent Locations Support (CDPCILS) and
CMB Data Pointer Mixed Locations Support (CDPMLS) since the device
really does not care about mixed host/cmb pointers in those cases.
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Minwoo Im <minwoo.im.dev@gmail.com>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme.c | 33 ++-------------------------------
 1 file changed, 2 insertions(+), 31 deletions(-)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index c4c968f5951e..40784bd908fb 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -509,7 +509,6 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2,
     trans_len = MIN(len, trans_len);
     int num_prps = (len >> n->page_bits) + 1;
     uint16_t status;
-    bool prp_list_in_cmb = false;
     int ret;
 
     QEMUSGList *qsg = &req->qsg;
@@ -535,10 +534,6 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2,
             uint32_t nents, prp_trans;
             int i = 0;
 
-            if (nvme_addr_is_cmb(n, prp2)) {
-                prp_list_in_cmb = true;
-            }
-
             nents = (len + n->page_size - 1) >> n->page_bits;
             prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
             ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
@@ -555,10 +550,6 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2,
                         return NVME_INVALID_PRP_OFFSET | NVME_DNR;
                     }
 
-                    if (prp_list_in_cmb != nvme_addr_is_cmb(n, prp_ent)) {
-                        return NVME_INVALID_USE_OF_CMB | NVME_DNR;
-                    }
-
                     i = 0;
                     nents = (len + n->page_size - 1) >> n->page_bits;
                     prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
@@ -692,7 +683,6 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
     uint64_t nsgld;
     uint32_t seg_len;
     uint16_t status;
-    bool sgl_in_cmb = false;
     hwaddr addr;
     int ret;
 
@@ -714,18 +704,6 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
         goto out;
     }
 
-    /*
-     * If the segment is located in the CMB, the submission queue of the
-     * request must also reside there.
-     */
-    if (nvme_addr_is_cmb(n, addr)) {
-        if (!nvme_addr_is_cmb(n, req->sq->dma_addr)) {
-            return NVME_INVALID_USE_OF_CMB | NVME_DNR;
-        }
-
-        sgl_in_cmb = true;
-    }
-
     for (;;) {
         switch (NVME_SGL_TYPE(sgld->type)) {
         case NVME_SGL_DESCR_TYPE_SEGMENT:
@@ -814,15 +792,6 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
         if (status) {
             goto unmap;
         }
-
-        /*
-         * If the next segment is in the CMB, make sure that the sgl was
-         * already located there.
-         */
-        if (sgl_in_cmb != nvme_addr_is_cmb(n, addr)) {
-            status = NVME_INVALID_USE_OF_CMB | NVME_DNR;
-            goto unmap;
-        }
     }
 
 out:
@@ -3777,6 +3746,8 @@ static int nvme_start_ctrl(NvmeCtrl *n)
 
 static void nvme_cmb_enable_regs(NvmeCtrl *n)
 {
+    NVME_CMBLOC_SET_CDPCILS(n->bar.cmbloc, 1);
+    NVME_CMBLOC_SET_CDPMLS(n->bar.cmbloc, 1);
     NVME_CMBLOC_SET_BIR(n->bar.cmbloc, NVME_CMB_BIR);
 
     NVME_CMBSZ_SET_SQS(n->bar.cmbsz, 1);
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 50/56] hw/block/nvme: error if drive less than a zone size
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (48 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 49/56] hw/block/nvme: lift cmb restrictions Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 51/56] hw/block/nvme: fix set feature for error recovery Klaus Jensen
                   ` (6 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Klaus Jensen, Max Reitz,
	Klaus Jensen, Minwoo Im, Stefan Hajnoczi, Keith Busch
From: Minwoo Im <minwoo.im.dev@gmail.com>
If a user assigns a backing device with less capacity than the size of a
single zone, the namespace capacity will be reported as zero and the
kernel will silently fail to allocate the namespace.
This patch errors out in case that the backing device cannot accomodate
at least a single zone.
Signed-off-by: Minwoo Im <minwoo.im.dev@gmail.com>
[k.jensen: small fixup in the error and commit message]
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme-ns.c | 7 +++++++
 1 file changed, 7 insertions(+)
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 3f52acb89c95..dfed71a950fa 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -134,6 +134,13 @@ static int nvme_ns_zoned_check_calc_geometry(NvmeNamespace *ns, Error **errp)
     ns->num_zones = ns->size / lbasz / ns->zone_size;
 
     /* Do a few more sanity checks of ZNS properties */
+    if (!ns->num_zones) {
+        error_setg(errp,
+                   "insufficient drive capacity, must be at least the size "
+                   "of one zone (%"PRIu64"B)", zone_size);
+        return -1;
+    }
+
     if (ns->params.max_open_zones > ns->num_zones) {
         error_setg(errp,
                    "max_open_zones value %u exceeds the number of zones %u",
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 51/56] hw/block/nvme: fix set feature for error recovery
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (49 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 50/56] hw/block/nvme: error if drive less than a zone size Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 52/56] hw/block/nvme: fix set feature save field check Klaus Jensen
                   ` (5 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Klaus Jensen,
	Gollu Appalanaidu, Max Reitz, Klaus Jensen, Stefan Hajnoczi,
	Keith Busch
From: Gollu Appalanaidu <anaidu.gollu@samsung.com>
Only enable DULBE if the namespace supports it.
Signed-off-by: Gollu Appalanaidu <anaidu.gollu@samsung.com>
Reviewed-by: Klaus Jensen <k.jensen@samsung.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 40784bd908fb..b3d072c8b2bb 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -3396,7 +3396,9 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
         }
 
         assert(ns);
-        ns->features.err_rec = dw11;
+        if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat))  {
+            ns->features.err_rec = dw11;
+        }
         break;
     case NVME_VOLATILE_WRITE_CACHE:
         for (i = 1; i <= n->num_namespaces; i++) {
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 52/56] hw/block/nvme: fix set feature save field check
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (50 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 51/56] hw/block/nvme: fix set feature for error recovery Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 53/56] hw/block/nvme: align with existing style Klaus Jensen
                   ` (4 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Klaus Jensen,
	Gollu Appalanaidu, Max Reitz, Klaus Jensen, Stefan Hajnoczi,
	Keith Busch
From: Gollu Appalanaidu <anaidu.gollu@samsung.com>
Currently, no features are saveable, so the current check is not wrong,
but add a check against the feature capabilities to make sure this will
not regress if saveable features are added later.
Signed-off-by: Gollu Appalanaidu <anaidu.gollu@samsung.com>
Reviewed-by: Klaus Jensen <k.jensen@samsung.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index b3d072c8b2bb..c99a3fbf3461 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -3324,7 +3324,7 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
 
     trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11);
 
-    if (save) {
+    if (save && !(nvme_feature_cap[fid] & NVME_FEAT_CAP_SAVE)) {
         return NVME_FID_NOT_SAVEABLE | NVME_DNR;
     }
 
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 53/56] hw/block/nvme: align with existing style
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (51 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 52/56] hw/block/nvme: fix set feature save field check Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:30 ` [PULL 54/56] hw/block/nvme: fix wrong parameter name 'cross_read' Klaus Jensen
                   ` (3 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Dmitry Fomichev, Klaus Jensen,
	Gollu Appalanaidu, Max Reitz, Klaus Jensen, Stefan Hajnoczi,
	Keith Busch
From: Gollu Appalanaidu <anaidu.gollu@samsung.com>
Change status checks to align with the existing style and remove the
explicit check against NVME_SUCCESS.
Cc: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Signed-off-by: Gollu Appalanaidu <anaidu.gollu@samsung.com>
Reviewed-by: Klaus Jensen <k.jensen@samsung.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index c99a3fbf3461..2335739bdb17 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1198,7 +1198,7 @@ static uint16_t nvme_check_zone_write(NvmeCtrl *n, NvmeNamespace *ns,
         status = nvme_check_zone_state_for_write(zone);
     }
 
-    if (status != NVME_SUCCESS) {
+    if (status) {
         trace_pci_nvme_err_zone_write_not_ok(slba, nlb, status);
     } else {
         assert(nvme_wp_is_valid(zone));
@@ -1253,7 +1253,7 @@ static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba,
     uint16_t status;
 
     status = nvme_check_zone_state_for_read(zone);
-    if (status != NVME_SUCCESS) {
+    if (status) {
         ;
     } else if (unlikely(end > bndry)) {
         if (!ns->params.cross_zone_read) {
@@ -1266,7 +1266,7 @@ static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba,
             do {
                 zone++;
                 status = nvme_check_zone_state_for_read(zone);
-                if (status != NVME_SUCCESS) {
+                if (status) {
                     break;
                 }
             } while (end > nvme_zone_rd_boundary(ns, zone));
@@ -1677,7 +1677,7 @@ static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
 
     if (ns->params.zoned) {
         status = nvme_check_zone_read(ns, slba, nlb);
-        if (status != NVME_SUCCESS) {
+        if (status) {
             trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status);
             goto invalid;
         }
@@ -1748,12 +1748,12 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
         zone = nvme_get_zone_by_slba(ns, slba);
 
         status = nvme_check_zone_write(n, ns, zone, slba, nlb, append);
-        if (status != NVME_SUCCESS) {
+        if (status) {
             goto invalid;
         }
 
         status = nvme_auto_open_zone(ns, zone);
-        if (status != NVME_SUCCESS) {
+        if (status) {
             goto invalid;
         }
 
@@ -1852,14 +1852,14 @@ static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
     switch (state) {
     case NVME_ZONE_STATE_EMPTY:
         status = nvme_aor_check(ns, 1, 0);
-        if (status != NVME_SUCCESS) {
+        if (status) {
             return status;
         }
         nvme_aor_inc_active(ns);
         /* fall through */
     case NVME_ZONE_STATE_CLOSED:
         status = nvme_aor_check(ns, 0, 1);
-        if (status != NVME_SUCCESS) {
+        if (status) {
             if (state == NVME_ZONE_STATE_EMPTY) {
                 nvme_aor_dec_active(ns);
             }
@@ -1972,7 +1972,7 @@ static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone)
 
     if (state == NVME_ZONE_STATE_EMPTY) {
         status = nvme_aor_check(ns, 1, 0);
-        if (status != NVME_SUCCESS) {
+        if (status) {
             return status;
         }
         nvme_aor_inc_active(ns);
@@ -3301,7 +3301,7 @@ static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
 
     ret = nvme_dma(n, (uint8_t *)×tamp, sizeof(timestamp),
                    DMA_DIRECTION_TO_DEVICE, req);
-    if (ret != NVME_SUCCESS) {
+    if (ret) {
         return ret;
     }
 
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 54/56] hw/block/nvme: fix wrong parameter name 'cross_read'
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (52 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 53/56] hw/block/nvme: align with existing style Klaus Jensen
@ 2021-02-09  7:30 ` Klaus Jensen
  2021-02-09  7:31 ` [PULL 55/56] hw/block/nvme: fix zone boundary check for append Klaus Jensen
                   ` (2 subsequent siblings)
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:30 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Klaus Jensen, Max Reitz,
	Klaus Jensen, Minwoo Im, Stefan Hajnoczi, Keith Busch
From: Minwoo Im <minwoo.im.dev@gmail.com>
The actual parameter name is 'cross_read' rather than 'cross_zone_read'.
Signed-off-by: Minwoo Im <minwoo.im.dev@gmail.com>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 2335739bdb17..e562d7467b3b 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -81,7 +81,7 @@
  *         The default value means there is no limit to the number of
  *         concurrently open zones.
  *
- *     zoned.cross_zone_read=<enable RAZB, default: false>
+ *     zoned.cross_read=<enable RAZB, default: false>
  *         Setting this property to true enables Read Across Zone Boundaries.
  */
 
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 55/56] hw/block/nvme: fix zone boundary check for append
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (53 preceding siblings ...)
  2021-02-09  7:30 ` [PULL 54/56] hw/block/nvme: fix wrong parameter name 'cross_read' Klaus Jensen
@ 2021-02-09  7:31 ` Klaus Jensen
  2021-02-09  7:31 ` [PULL 56/56] hw/block/nvme: refactor the logic for zone write checks Klaus Jensen
  2021-02-09 14:52 ` [PULL 00/56] emulated nvme patches Peter Maydell
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:31 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Niklas Cassel, Dmitry Fomichev,
	Klaus Jensen, Max Reitz, Klaus Jensen, Stefan Hajnoczi,
	Keith Busch
From: Klaus Jensen <k.jensen@samsung.com>
When a zone append is processed the controller checks that validity of
the write before assigning the LBA to the append command. This causes
the boundary check to be wrong.
Fix this by checking the write *after* assigning the LBA. Remove the
append special case from the nvme_check_zone_write and open code it in
nvme_do_write, assigning the slba when basic sanity checks have been
performed. Then check the validity of the resulting write like any other
write command.
In the process, also fix a missing endianness conversion for the zone
append ALBA.
Reported-by: Niklas Cassel <Niklas.Cassel@wdc.com>
Cc: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Tested-by: Niklas Cassel <niklas.cassel@wdc.com>
Tested-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Reviewed-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme.c | 46 ++++++++++++++++++++++++----------------------
 1 file changed, 24 insertions(+), 22 deletions(-)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index e562d7467b3b..cedb4ad9ffd3 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1188,7 +1188,7 @@ static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone)
 
 static uint16_t nvme_check_zone_write(NvmeCtrl *n, NvmeNamespace *ns,
                                       NvmeZone *zone, uint64_t slba,
-                                      uint32_t nlb, bool append)
+                                      uint32_t nlb)
 {
     uint16_t status;
 
@@ -1202,16 +1202,8 @@ static uint16_t nvme_check_zone_write(NvmeCtrl *n, NvmeNamespace *ns,
         trace_pci_nvme_err_zone_write_not_ok(slba, nlb, status);
     } else {
         assert(nvme_wp_is_valid(zone));
-        if (append) {
-            if (unlikely(slba != zone->d.zslba)) {
-                trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
-                status = NVME_INVALID_FIELD;
-            }
-            if (nvme_l2b(ns, nlb) > (n->page_size << n->zasl)) {
-                trace_pci_nvme_err_append_too_large(slba, nlb, n->zasl);
-                status = NVME_INVALID_FIELD;
-            }
-        } else if (unlikely(slba != zone->w_ptr)) {
+
+        if (unlikely(slba != zone->w_ptr)) {
             trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba,
                                                zone->w_ptr);
             status = NVME_ZONE_INVALID_WRITE;
@@ -1349,10 +1341,9 @@ static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req,
     }
 }
 
-static uint64_t nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
-                                     uint32_t nlb)
+static void nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
+                                 uint32_t nlb)
 {
-    uint64_t result = zone->w_ptr;
     uint8_t zs;
 
     zone->w_ptr += nlb;
@@ -1368,8 +1359,6 @@ static uint64_t nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
             nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN);
         }
     }
-
-    return result;
 }
 
 static inline bool nvme_is_write(NvmeRequest *req)
@@ -1747,7 +1736,24 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
     if (ns->params.zoned) {
         zone = nvme_get_zone_by_slba(ns, slba);
 
-        status = nvme_check_zone_write(n, ns, zone, slba, nlb, append);
+        if (append) {
+            if (unlikely(slba != zone->d.zslba)) {
+                trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
+                status = NVME_INVALID_FIELD;
+                goto invalid;
+            }
+
+            if (nvme_l2b(ns, nlb) > (n->page_size << n->zasl)) {
+                trace_pci_nvme_err_append_too_large(slba, nlb, n->zasl);
+                status = NVME_INVALID_FIELD;
+                goto invalid;
+            }
+
+            slba = zone->w_ptr;
+            res->slba = cpu_to_le64(slba);
+        }
+
+        status = nvme_check_zone_write(n, ns, zone, slba, nlb);
         if (status) {
             goto invalid;
         }
@@ -1757,11 +1763,7 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
             goto invalid;
         }
 
-        if (append) {
-            slba = zone->w_ptr;
-        }
-
-        res->slba = nvme_advance_zone_wp(ns, zone, nlb);
+        nvme_advance_zone_wp(ns, zone, nlb);
     }
 
     data_offset = nvme_l2b(ns, slba);
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * [PULL 56/56] hw/block/nvme: refactor the logic for zone write checks
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (54 preceding siblings ...)
  2021-02-09  7:31 ` [PULL 55/56] hw/block/nvme: fix zone boundary check for append Klaus Jensen
@ 2021-02-09  7:31 ` Klaus Jensen
  2021-02-09 14:52 ` [PULL 00/56] emulated nvme patches Peter Maydell
  56 siblings, 0 replies; 62+ messages in thread
From: Klaus Jensen @ 2021-02-09  7:31 UTC (permalink / raw)
  To: qemu-devel, Peter Maydell
  Cc: Fam Zheng, Kevin Wolf, qemu-block, Niklas Cassel, Dmitry Fomichev,
	Klaus Jensen, Max Reitz, Klaus Jensen, Stefan Hajnoczi,
	Keith Busch
From: Klaus Jensen <k.jensen@samsung.com>
Refactor the zone write check logic such that the most "meaningful"
error is returned first. That is, first, if the zone is not writable,
return an appropriate status code for that. Then, make sure we are
actually writing at the write pointer and finally check that we do not
cross the zone write boundary. This aligns with the "priority" of status
codes for zone read checks.
Also add a couple of additional descriptive trace events and remove an
always true assert.
Cc: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Tested-by: Niklas Cassel <niklas.cassel@wdc.com>
Tested-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Reviewed-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme.c       | 49 ++++++++++++++++++++-----------------------
 hw/block/trace-events |  5 +++++
 2 files changed, 28 insertions(+), 26 deletions(-)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index cedb4ad9ffd3..5ce21b7100b3 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1161,56 +1161,53 @@ static inline NvmeZone *nvme_get_zone_by_slba(NvmeNamespace *ns, uint64_t slba)
 
 static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone)
 {
-    uint16_t status;
+    uint64_t zslba = zone->d.zslba;
 
     switch (nvme_get_zone_state(zone)) {
     case NVME_ZONE_STATE_EMPTY:
     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
     case NVME_ZONE_STATE_CLOSED:
-        status = NVME_SUCCESS;
-        break;
+        return NVME_SUCCESS;
     case NVME_ZONE_STATE_FULL:
-        status = NVME_ZONE_FULL;
-        break;
+        trace_pci_nvme_err_zone_is_full(zslba);
+        return NVME_ZONE_FULL;
     case NVME_ZONE_STATE_OFFLINE:
-        status = NVME_ZONE_OFFLINE;
-        break;
+        trace_pci_nvme_err_zone_is_offline(zslba);
+        return NVME_ZONE_OFFLINE;
     case NVME_ZONE_STATE_READ_ONLY:
-        status = NVME_ZONE_READ_ONLY;
-        break;
+        trace_pci_nvme_err_zone_is_read_only(zslba);
+        return NVME_ZONE_READ_ONLY;
     default:
         assert(false);
     }
 
-    return status;
+    return NVME_INTERNAL_DEV_ERROR;
 }
 
 static uint16_t nvme_check_zone_write(NvmeCtrl *n, NvmeNamespace *ns,
                                       NvmeZone *zone, uint64_t slba,
                                       uint32_t nlb)
 {
+    uint64_t zcap = nvme_zone_wr_boundary(zone);
     uint16_t status;
 
-    if (unlikely((slba + nlb) > nvme_zone_wr_boundary(zone))) {
-        status = NVME_ZONE_BOUNDARY_ERROR;
-    } else {
-        status = nvme_check_zone_state_for_write(zone);
-    }
-
+    status = nvme_check_zone_state_for_write(zone);
     if (status) {
-        trace_pci_nvme_err_zone_write_not_ok(slba, nlb, status);
-    } else {
-        assert(nvme_wp_is_valid(zone));
-
-        if (unlikely(slba != zone->w_ptr)) {
-            trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba,
-                                               zone->w_ptr);
-            status = NVME_ZONE_INVALID_WRITE;
-        }
+        return status;
     }
 
-    return status;
+    if (unlikely(slba != zone->w_ptr)) {
+        trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba, zone->w_ptr);
+        return NVME_ZONE_INVALID_WRITE;
+    }
+
+    if (unlikely((slba + nlb) > zcap)) {
+        trace_pci_nvme_err_zone_boundary(slba, nlb, zcap);
+        return NVME_ZONE_BOUNDARY_ERROR;
+    }
+
+    return NVME_SUCCESS;
 }
 
 static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone)
diff --git a/hw/block/trace-events b/hw/block/trace-events
index 87ab6c509045..d32475c3989e 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -129,6 +129,11 @@ pci_nvme_err_unaligned_zone_cmd(uint8_t action, uint64_t slba, uint64_t zslba) "
 pci_nvme_err_invalid_zone_state_transition(uint8_t action, uint64_t slba, uint8_t attrs) "action=0x%"PRIx8", slba=%"PRIu64", attrs=0x%"PRIx32""
 pci_nvme_err_write_not_at_wp(uint64_t slba, uint64_t zone, uint64_t wp) "writing at slba=%"PRIu64", zone=%"PRIu64", but wp=%"PRIu64""
 pci_nvme_err_append_not_at_start(uint64_t slba, uint64_t zone) "appending at slba=%"PRIu64", but zone=%"PRIu64""
+pci_nvme_err_zone_is_full(uint64_t zslba) "zslba 0x%"PRIx64""
+pci_nvme_err_zone_is_read_only(uint64_t zslba) "zslba 0x%"PRIx64""
+pci_nvme_err_zone_is_offline(uint64_t zslba) "zslba 0x%"PRIx64""
+pci_nvme_err_zone_boundary(uint64_t slba, uint32_t nlb, uint64_t zcap) "lba 0x%"PRIx64" nlb %"PRIu32" zcap 0x%"PRIx64""
+pci_nvme_err_zone_invalid_write(uint64_t slba, uint64_t wp) "lba 0x%"PRIx64" wp 0x%"PRIx64""
 pci_nvme_err_zone_write_not_ok(uint64_t slba, uint32_t nlb, uint16_t status) "slba=%"PRIu64", nlb=%"PRIu32", status=0x%"PRIx16""
 pci_nvme_err_zone_read_not_ok(uint64_t slba, uint32_t nlb, uint16_t status) "slba=%"PRIu64", nlb=%"PRIu32", status=0x%"PRIx16""
 pci_nvme_err_append_too_large(uint64_t slba, uint32_t nlb, uint8_t zasl) "slba=%"PRIu64", nlb=%"PRIu32", zasl=%"PRIu8""
-- 
2.30.0
^ permalink raw reply related	[flat|nested] 62+ messages in thread
- * Re: [PULL 00/56] emulated nvme patches
  2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
                   ` (55 preceding siblings ...)
  2021-02-09  7:31 ` [PULL 56/56] hw/block/nvme: refactor the logic for zone write checks Klaus Jensen
@ 2021-02-09 14:52 ` Peter Maydell
  56 siblings, 0 replies; 62+ messages in thread
From: Peter Maydell @ 2021-02-09 14:52 UTC (permalink / raw)
  To: Klaus Jensen
  Cc: Fam Zheng, Kevin Wolf, Qemu-block, Klaus Jensen, QEMU Developers,
	Max Reitz, Stefan Hajnoczi, Keith Busch
On Tue, 9 Feb 2021 at 07:31, Klaus Jensen <its@irrelevant.dk> wrote:
>
> From: Klaus Jensen <k.jensen@samsung.com>
>
> The following changes since commit 4f799257b323e1238a900fd0c71c2057863e0308:
>
>   Merge remote-tracking branch 'remotes/armbru/tags/pull-qapi-2021-02-08' into staging (2021-02-08 16:12:21 +0000)
>
> are available in the Git repository at:
>
>   git://git.infradead.org/qemu-nvme.git tags/nvme-next-pull-request
>
> for you to fetch changes up to 3e22762edc74be3e1ecafc361351a9640d114978:
>
>   hw/block/nvme: refactor the logic for zone write checks (2021-02-08 21:15:54 +0100)
>
> ----------------------------------------------------------------
> Emulated NVMe device updates
>
>   * deallocate or unwritten logical block error feature (me)
>   * dataset management command (me)
>   * compare command (Gollu Appalanaidu)
>   * namespace types (Niklas Cassel)
>   * zoned namespaces (Dmitry Fomichev)
>   * smart critical warning toggle (Zhenwei Pi)
>   * allow cmb and pmr to coexist (Andrzej Jakowski, me)
>   * pmr rds/wds support (Naveen Nagar)
>   * cmb v1.4 logic (Padmakar Kalghatgi)
>
> And a lot of smaller fixes from Gollu Appalanaidu, Minwoo Im and me.
>
> ----------------------------------------------------------------
Applied, thanks.
Please update the changelog at https://wiki.qemu.org/ChangeLog/6.0
for any user-visible changes.
-- PMM
^ permalink raw reply	[flat|nested] 62+ messages in thread