* [RFC 4/5] hw/nvme: refactor zone append writes using block layer APIs
@ 2023-08-16 7:08 Sam Li
2023-08-16 7:08 ` [RFC 5/5] hw/nvme: make ZDED persistent Sam Li
0 siblings, 1 reply; 2+ messages in thread
From: Sam Li @ 2023-08-16 7:08 UTC (permalink / raw)
To: qemu-devel
Cc: qemu-block, Klaus Jensen, Markus Armbruster, Hanna Reitz,
Peter Xu, David Hildenbrand, dlemoal, Keith Busch,
Philippe Mathieu-Daudé, Eric Blake, hare, Kevin Wolf,
stefanha, Paolo Bonzini, dmitry.fomichev, Sam Li
Signed-off-by: Sam Li <faithilikerun@gmail.com>
---
block/block-backend.c | 8 ++
block/qcow2.c | 7 +-
hw/nvme/ctrl.c | 195 ++++++++++++++++++++++--------
include/sysemu/block-backend-io.h | 1 +
include/sysemu/dma.h | 3 +
softmmu/dma-helpers.c | 17 +++
6 files changed, 181 insertions(+), 50 deletions(-)
diff --git a/block/block-backend.c b/block/block-backend.c
index 9c95ae0267..2aafb4cee3 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -2426,6 +2426,14 @@ uint32_t blk_get_nr_zones(BlockBackend *blk)
return bs ? bs->bl.nr_zones : 0;
}
+uint32_t blk_get_write_granularity(BlockBackend *blk)
+{
+ BlockDriverState *bs = blk_bs(blk);
+ IO_CODE();
+
+ return bs ? bs->bl.write_granularity : 0;
+}
+
uint8_t *blk_get_zone_extension(BlockBackend *blk) {
BlockDriverState * bs = blk_bs(blk);
IO_CODE();
diff --git a/block/qcow2.c b/block/qcow2.c
index 41549dd68b..5a038792f1 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -2198,7 +2198,7 @@ static void qcow2_refresh_limits(BlockDriverState *bs, Error **errp)
bs->bl.max_active_zones = s->zoned_header.max_active_zones;
bs->bl.max_open_zones = s->zoned_header.max_open_zones;
bs->bl.zone_size = s->zoned_header.zone_size;
- bs->bl.write_granularity = BDRV_SECTOR_SIZE;
+ bs->bl.write_granularity = 4096; /* physical block size */
}
static int qcow2_reopen_prepare(BDRVReopenState *state,
@@ -4915,6 +4915,11 @@ qcow2_co_zone_append(BlockDriverState *bs, int64_t *offset, QEMUIOVector *qiov,
qemu_co_mutex_lock(&s->wps->colock);
uint64_t wp = s->wps->wp[index];
uint64_t wp_i = qcow2_get_wp(wp);
+ printf("qcow2 offset 0x%lx\n", *offset);
+ printf("checking wp[%ld]: 0b%lb\n", *offset / bs->bl.zone_size, wp);
+ for (int i = 0; i < bs->bl.nr_zones; i++) {
+ printf("Listing wp[%d]: 0b%lb\n", i, s->wps->wp[i]);
+ }
ret = qcow2_co_pwritev_part(bs, wp_i, len, qiov, 0, 0);
if (ret == 0) {
*offset = wp_i;
diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 8d4c08dc4c..3932b516ed 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -1740,6 +1740,95 @@ static void nvme_misc_cb(void *opaque, int ret)
nvme_enqueue_req_completion(nvme_cq(req), req);
}
+typedef struct NvmeZoneCmdAIOCB {
+ NvmeRequest *req;
+ NvmeCmd *cmd;
+ NvmeCtrl *n;
+
+ union {
+ struct {
+ uint32_t partial;
+ unsigned int nr_zones;
+ BlockZoneDescriptor *zones;
+ } zone_report_data;
+ struct {
+ int64_t offset;
+ } zone_append_data;
+ };
+} NvmeZoneCmdAIOCB;
+
+static void nvme_blk_zone_append_complete_cb(void *opaque, int ret)
+{
+ NvmeZoneCmdAIOCB *cb = opaque;
+ NvmeRequest *req = cb->req;
+ int64_t *offset = (int64_t *)&req->cqe;
+
+ if (ret) {
+ nvme_aio_err(req, ret);
+ }
+
+ *offset = nvme_b2l(req->ns, cb->zone_append_data.offset);
+ nvme_enqueue_req_completion(nvme_cq(req), req);
+ g_free(cb);
+}
+
+static inline void nvme_blk_zone_append(BlockBackend *blk, int64_t *offset,
+ uint32_t align,
+ BlockCompletionFunc *cb,
+ NvmeZoneCmdAIOCB *aiocb)
+{
+ NvmeRequest *req = aiocb->req;
+ assert(req->sg.flags & NVME_SG_ALLOC);
+
+ if (req->sg.flags & NVME_SG_DMA) {
+ req->aiocb = dma_blk_zone_append(blk, &req->sg.qsg, (int64_t)offset,
+ align, cb, aiocb);
+ } else {
+ req->aiocb = blk_aio_zone_append(blk, offset, &req->sg.iov, 0,
+ cb, aiocb);
+ }
+}
+
+static void nvme_zone_append_cb(void *opaque, int ret)
+{
+ NvmeZoneCmdAIOCB *aiocb = opaque;
+ NvmeRequest *req = aiocb->req;
+ NvmeNamespace *ns = req->ns;
+
+ BlockBackend *blk = ns->blkconf.blk;
+
+ trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));
+
+ if (ret) {
+ goto out;
+ }
+
+ if (ns->lbaf.ms) {
+ NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
+ uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
+ int64_t offset = aiocb->zone_append_data.offset;
+
+ if (nvme_ns_ext(ns) || req->cmd.mptr) {
+ uint16_t status;
+
+ nvme_sg_unmap(&req->sg);
+ status = nvme_map_mdata(nvme_ctrl(req), nlb, req);
+ if (status) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ return nvme_blk_zone_append(blk, &offset, 1,
+ nvme_blk_zone_append_complete_cb,
+ aiocb);
+ }
+ }
+
+out:
+ nvme_blk_zone_append_complete_cb(aiocb, ret);
+}
+
+
void nvme_rw_complete_cb(void *opaque, int ret)
{
NvmeRequest *req = opaque;
@@ -3067,6 +3156,9 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
uint64_t mapped_size = data_size;
uint64_t data_offset;
BlockBackend *blk = ns->blkconf.blk;
+ BlockZoneWps *wps = blk_get_zone_wps(blk);
+ uint32_t zone_size = blk_get_zone_size(blk);
+ uint32_t zone_idx;
uint16_t status;
if (nvme_ns_ext(ns)) {
@@ -3097,42 +3189,47 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
}
if (blk_get_zone_model(blk)) {
- uint32_t zone_size = blk_get_zone_size(blk);
- uint32_t zone_idx = slba / zone_size;
- int64_t zone_start = zone_idx * zone_size;
+ assert(wps);
+ if (zone_size) {
+ zone_idx = slba / zone_size;
+ int64_t zone_start = zone_idx * zone_size;
+
+ if (append) {
+ bool piremap = !!(ctrl & NVME_RW_PIREMAP);
+
+ if (n->params.zasl &&
+ data_size > (uint64_t)
+ n->page_size << n->params.zasl) {
+ trace_pci_nvme_err_zasl(data_size);
+ return NVME_INVALID_FIELD | NVME_DNR;
+ }
- if (append) {
- bool piremap = !!(ctrl & NVME_RW_PIREMAP);
+ rw->slba = cpu_to_le64(slba);
- if (n->params.zasl &&
- data_size > (uint64_t)n->page_size << n->params.zasl) {
- trace_pci_nvme_err_zasl(data_size);
- return NVME_INVALID_FIELD | NVME_DNR;
- }
+ switch (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
+ case NVME_ID_NS_DPS_TYPE_1:
+ if (!piremap) {
+ return NVME_INVALID_PROT_INFO | NVME_DNR;
+ }
- rw->slba = cpu_to_le64(slba);
- switch (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
- case NVME_ID_NS_DPS_TYPE_1:
- if (!piremap) {
- return NVME_INVALID_PROT_INFO | NVME_DNR;
- }
+ /* fallthrough */
- /* fallthrough */
+ case NVME_ID_NS_DPS_TYPE_2:
+ if (piremap) {
+ uint32_t reftag = le32_to_cpu(rw->reftag);
+ rw->reftag =
+ cpu_to_le32(reftag + (slba - zone_start));
+ }
- case NVME_ID_NS_DPS_TYPE_2:
- if (piremap) {
- uint32_t reftag = le32_to_cpu(rw->reftag);
- rw->reftag = cpu_to_le32(reftag + (slba - zone_start));
- }
+ break;
- break;
+ case NVME_ID_NS_DPS_TYPE_3:
+ if (piremap) {
+ return NVME_INVALID_PROT_INFO | NVME_DNR;
+ }
- case NVME_ID_NS_DPS_TYPE_3:
- if (piremap) {
- return NVME_INVALID_PROT_INFO | NVME_DNR;
+ break;
}
-
- break;
}
}
@@ -3152,9 +3249,21 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
goto invalid;
}
- block_acct_start(blk_get_stats(blk), &req->acct, data_size,
- BLOCK_ACCT_WRITE);
- nvme_blk_write(blk, data_offset, BDRV_SECTOR_SIZE, nvme_rw_cb, req);
+ if (append) {
+ NvmeZoneCmdAIOCB *cb = g_malloc(sizeof(NvmeZoneCmdAIOCB));
+ cb->req = req;
+ cb->zone_append_data.offset = data_offset;
+
+ block_acct_start(blk_get_stats(blk), &req->acct, data_size,
+ BLOCK_ACCT_ZONE_APPEND);
+ nvme_blk_zone_append(blk, &cb->zone_append_data.offset,
+ blk_get_write_granularity(blk),
+ nvme_zone_append_cb, cb);
+ } else {
+ block_acct_start(blk_get_stats(blk), &req->acct, data_size,
+ BLOCK_ACCT_WRITE);
+ nvme_blk_write(blk, data_offset, BDRV_SECTOR_SIZE, nvme_rw_cb, req);
+ }
} else {
req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size,
BDRV_REQ_MAY_UNMAP, nvme_rw_cb,
@@ -3178,24 +3287,7 @@ static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
return nvme_do_write(n, req, false, true);
}
-typedef struct NvmeZoneCmdAIOCB {
- NvmeRequest *req;
- NvmeCmd *cmd;
- NvmeCtrl *n;
-
- union {
- struct {
- uint32_t partial;
- unsigned int nr_zones;
- BlockZoneDescriptor *zones;
- } zone_report_data;
- struct {
- int64_t offset;
- } zone_append_data;
- };
-} NvmeZoneCmdAIOCB;
-
-static inline uint16_t nvme_zone_append(NvmeCtrl *n, NvmeRequest *req)
+static uint16_t nvme_zone_append(NvmeCtrl *n, NvmeRequest *req)
{
return nvme_do_write(n, req, true, false);
}
@@ -3333,6 +3425,11 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
NvmeNamespace *ns = req->ns;
NvmeZoneMgmtAIOCB *iocb;
uint64_t slba = 0;
+ uint64_t offset;
+ BlockBackend *blk = ns->blkconf.blk;
+ uint32_t zone_size = blk_get_zone_size(blk);
+ uint64_t size = zone_size * blk_get_nr_zones(blk);
+ int64_t len;
uint32_t zone_idx = 0;
uint16_t status;
uint8_t action = cmd->zsa;
diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h
index f69aa1094a..fcbdd93dea 100644
--- a/include/sysemu/block-backend-io.h
+++ b/include/sysemu/block-backend-io.h
@@ -109,6 +109,7 @@ uint32_t blk_get_max_append_sectors(BlockBackend *blk);
uint32_t blk_get_nr_zones(BlockBackend *blk);
uint8_t *blk_get_zone_extension(BlockBackend *blk);
uint32_t blk_get_zd_ext_size(BlockBackend *blk);
+uint32_t blk_get_write_granularity(BlockBackend *blk);
BlockZoneWps *blk_get_zone_wps(BlockBackend *blk);
void blk_io_plug(void);
diff --git a/include/sysemu/dma.h b/include/sysemu/dma.h
index a1ac5bc1b5..680e0b5477 100644
--- a/include/sysemu/dma.h
+++ b/include/sysemu/dma.h
@@ -301,6 +301,9 @@ BlockAIOCB *dma_blk_read(BlockBackend *blk,
BlockAIOCB *dma_blk_write(BlockBackend *blk,
QEMUSGList *sg, uint64_t offset, uint32_t align,
BlockCompletionFunc *cb, void *opaque);
+BlockAIOCB *dma_blk_zone_append(BlockBackend *blk,
+ QEMUSGList *sg, int64_t offset, uint32_t align,
+ void (*cb)(void *opaque, int ret), void *opaque);
MemTxResult dma_buf_read(void *ptr, dma_addr_t len, dma_addr_t *residual,
QEMUSGList *sg, MemTxAttrs attrs);
MemTxResult dma_buf_write(void *ptr, dma_addr_t len, dma_addr_t *residual,
diff --git a/softmmu/dma-helpers.c b/softmmu/dma-helpers.c
index 2463964805..88bc13264b 100644
--- a/softmmu/dma-helpers.c
+++ b/softmmu/dma-helpers.c
@@ -282,6 +282,23 @@ BlockAIOCB *dma_blk_write(BlockBackend *blk,
DMA_DIRECTION_TO_DEVICE);
}
+static
+BlockAIOCB *dma_blk_zone_append_io_func(int64_t offset, QEMUIOVector *iov,
+ BlockCompletionFunc *cb, void *cb_opaque,
+ void *opaque)
+{
+ BlockBackend *blk = opaque;
+ return blk_aio_zone_append(blk, (int64_t *)offset, iov, 0, cb, cb_opaque);
+}
+
+BlockAIOCB *dma_blk_zone_append(BlockBackend *blk,
+ QEMUSGList *sg, int64_t offset, uint32_t align,
+ void (*cb)(void *opaque, int ret), void *opaque)
+{
+ return dma_blk_io(blk_get_aio_context(blk), sg, offset, align,
+ dma_blk_zone_append_io_func, blk, cb, opaque,
+ DMA_DIRECTION_TO_DEVICE);
+}
static MemTxResult dma_buf_rw(void *buf, dma_addr_t len, dma_addr_t *residual,
QEMUSGList *sg, DMADirection dir,
--
2.40.1
^ permalink raw reply related [flat|nested] 2+ messages in thread
* [RFC 5/5] hw/nvme: make ZDED persistent
2023-08-16 7:08 [RFC 4/5] hw/nvme: refactor zone append writes using block layer APIs Sam Li
@ 2023-08-16 7:08 ` Sam Li
0 siblings, 0 replies; 2+ messages in thread
From: Sam Li @ 2023-08-16 7:08 UTC (permalink / raw)
To: qemu-devel
Cc: qemu-block, Klaus Jensen, Markus Armbruster, Hanna Reitz,
Peter Xu, David Hildenbrand, dlemoal, Keith Busch,
Philippe Mathieu-Daudé, Eric Blake, hare, Kevin Wolf,
stefanha, Paolo Bonzini, dmitry.fomichev, Sam Li
Zone descriptor extension data (ZDED) is not persistent across QEMU
restarts. The zone descriptor extension valid (ZDEV) bit is part of the
zone attributes and is set to one when a ZDED is associated with
the zone.
With a qcow2 ZNS file as the backing file, the NVMe ZNS device stores
the zone attributes in the eight bits immediately following the zoned
bit of each zone's write pointer entry. The ZDED is thus stored as part
of the zoned metadata, alongside the write pointers.
Signed-off-by: Sam Li <faithilikerun@gmail.com>
---
block/qcow2.c | 44 +++++++++++++++++++++++++++++++++++-
hw/nvme/ctrl.c | 6 +----
include/block/block-common.h | 1 +
3 files changed, 45 insertions(+), 6 deletions(-)
diff --git a/block/qcow2.c b/block/qcow2.c
index 5a038792f1..ac5ecef559 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -25,6 +25,7 @@
#include "qemu/osdep.h"
#include "block/qdict.h"
+#include "block/nvme.h"
#include "sysemu/block-backend.h"
#include "qemu/main-loop.h"
#include "qemu/module.h"
@@ -214,6 +215,17 @@ static inline void qcow2_set_wp(uint64_t *wp, BlockZoneState zs)
*wp = addr;
}
+static inline void qcow2_set_za(uint64_t *wp, uint8_t za)
+{
+ /*
+ * The zone attribute takes up one byte. Store it after the zoned
+ * bit.
+ */
+ uint64_t addr = *wp;
+ addr |= ((uint64_t)za << 51);
+ *wp = addr;
+}
+
/*
* File wp tracking: reset zone, finish zone and append zone can
* change the value of write pointer. All zone operations will change
@@ -308,7 +320,7 @@ static int qcow2_check_open(BlockDriverState *bs)
/*
* The zoned device has limited zone resources of open, closed, active
- * zones.
+ * zones. Check if we can manage a zone without exceeding those limits.
*/
static int qcow2_check_zone_resources(BlockDriverState *bs,
BlockZoneState zs)
@@ -4801,6 +4813,33 @@ unlock:
return ret;
}
+static int qcow2_zns_set_zded(BlockDriverState *bs, uint32_t index)
+{
+ BDRVQcow2State *s = bs->opaque;
+ int ret;
+
+ qemu_co_mutex_lock(&s->wps->colock);
+ uint64_t *wp = &s->wps->wp[index];
+ BlockZoneState zs = qcow2_get_zs(*wp);
+ if (zs == BLK_ZS_EMPTY) {
+ ret = qcow2_check_zone_resources(bs, zs);
+ if (ret < 0) {
+ return ret;
+ }
+
+ qcow2_set_za(wp, NVME_ZA_ZD_EXT_VALID);
+ ret = qcow2_write_wp_at(bs, wp, index, BLK_ZO_CLOSE);
+ if (ret < 0) {
+ error_report("Failed to set zone extension at 0x%" PRIx64 "", *wp);
+ return ret;
+ }
+ s->nr_zones_closed++;
+ return ret;
+ }
+
+ return NVME_ZONE_INVAL_TRANSITION;
+}
+
static int coroutine_fn qcow2_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
int64_t offset, int64_t len)
{
@@ -4857,6 +4896,9 @@ static int coroutine_fn qcow2_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
case BLK_ZO_OFFLINE:
ret = qcow2_write_wp_at(bs, &wps->wp[index], index, BLK_ZO_OFFLINE);
break;
+ case BLK_ZO_SET_ZDED:
+ ret = qcow2_zns_set_zded(bs, index);
+ break;
default:
error_report("Unsupported zone op: 0x%x", op);
ret = -ENOTSUP;
diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 3932b516ed..fcd774e3f7 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -3425,11 +3425,6 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
NvmeNamespace *ns = req->ns;
NvmeZoneMgmtAIOCB *iocb;
uint64_t slba = 0;
- uint64_t offset;
- BlockBackend *blk = ns->blkconf.blk;
- uint32_t zone_size = blk_get_zone_size(blk);
- uint64_t size = zone_size * blk_get_nr_zones(blk);
- int64_t len;
uint32_t zone_idx = 0;
uint16_t status;
uint8_t action = cmd->zsa;
@@ -3485,6 +3480,7 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
break;
case NVME_ZONE_ACTION_SET_ZD_EXT:
+ op = BLK_ZO_SET_ZDED;
int zd_ext_size = blk_get_zd_ext_size(blk);
trace_pci_nvme_set_descriptor_extension(slba, zone_idx);
if (all || !zd_ext_size) {
diff --git a/include/block/block-common.h b/include/block/block-common.h
index 0cbed607a8..b369e77607 100644
--- a/include/block/block-common.h
+++ b/include/block/block-common.h
@@ -84,6 +84,7 @@ typedef enum BlockZoneOp {
BLK_ZO_FINISH,
BLK_ZO_RESET,
BLK_ZO_OFFLINE,
+ BLK_ZO_SET_ZDED,
} BlockZoneOp;
typedef enum BlockZoneModel {
--
2.40.1
^ permalink raw reply related [flat|nested] 2+ messages in thread
end of thread, other threads:[~2023-08-16 7:09 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-08-16 7:08 [RFC 4/5] hw/nvme: refactor zone append writes using block layer APIs Sam Li
2023-08-16 7:08 ` [RFC 5/5] hw/nvme: make ZDED persistent Sam Li
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).