On Sun, May 10, 2026 at 07:50:58PM +0200, Sam Li wrote: > By adding zone operations and zoned metadata, the zoned emulation > capability enables full emulation support of zoned device using > a qcow2 file. The zoned device metadata includes zone type, > zoned device state and write pointer of each zone, which is stored > to an array of unsigned integers. > > Each zone of a zoned device makes state transitions following > the zone state machine. The zone state machine mainly describes > five states, IMPLICIT OPEN, EXPLICIT OPEN, FULL, EMPTY and CLOSED. > READ ONLY and OFFLINE states will generally be affected by device > internal events. The operations on zones cause corresponding state > changing. > > Zoned devices have limits on zone resources, which put constraints on > write operations on zones. It is managed by active zone queues > following LRU policy. > > Signed-off-by: Sam Li > --- > block/qcow2.c | 851 +++++++++++++++++++++++++++++++++++++++++- > block/qcow2.h | 13 +- > block/trace-events | 2 + > hw/block/virtio-blk.c | 22 +- > include/qemu/queue.h | 1 + > include/qemu/range.h | 4 + > 6 files changed, 884 insertions(+), 9 deletions(-) Here is the feedback I have so far: > > diff --git a/block/qcow2.c b/block/qcow2.c > index b543bcf3e3..c84b67976e 100644 > --- a/block/qcow2.c > +++ b/block/qcow2.c > @@ -195,6 +195,293 @@ qcow2_extract_crypto_opts(QemuOpts *opts, const char *fmt, Error **errp) > return cryptoopts_qdict; > } > > +#define QCOW2_ZT_IS_CONV(wp) (wp & 1ULL << 59) Patch 1 does not document in the file format specification that write pointer bit 59 indicates a conventional zone. > +#define QCOW2_GET_WP(wp) ((wp << 5) >> 5) > + > +/* > + * To emulate a real zoned device, closed, empty and full states are > + * preserved after a power cycle. The open states are in-memory and will > + * be lost after closing the device. Read-only and offline states are > + * device-internal events, which are not considered for simplicity. 
> + */ > +static inline BlockZoneState qcow2_get_zone_state(BlockDriverState *bs, > + uint32_t index) > +{ > + BDRVQcow2State *s = bs->opaque; > + Qcow2ZoneListEntry *zone_entry = &s->zone_list_entries[index]; > + uint64_t zone_wp = bs->wps->wp[index]; > + uint64_t zone_start; > + > + if (QCOW2_ZT_IS_CONV(zone_wp)) { > + return BLK_ZS_NOT_WP; > + } > + > + if (QTAILQ_IN_USE(zone_entry, exp_open_zone_entry)) { > + return BLK_ZS_EOPEN; > + } > + if (QTAILQ_IN_USE(zone_entry, imp_open_zone_entry)) { > + return BLK_ZS_IOPEN; > + } > + > + zone_start = index * bs->bl.zone_size; > + if (zone_wp == zone_start) { > + return BLK_ZS_EMPTY; > + } > + if (zone_wp >= zone_start + bs->bl.zone_capacity) { > + return BLK_ZS_FULL; > + } > + if (zone_wp > zone_start) { > + if (!QTAILQ_IN_USE(zone_entry, closed_zone_entry)) { > + /* > + * The number of closed zones is not always updated in time when > + * the device is closed. However, it only matters when doing > + * zone report. Refresh the count and list of closed zones to > + * provide correct zone states for zone report. > + */ > + QTAILQ_INSERT_HEAD(&s->closed_zones, zone_entry, closed_zone_entry); > + s->nr_zones_closed++; > + } > + return BLK_ZS_CLOSED; > + } > + return BLK_ZS_NOT_WP; > +} > + > +static void qcow2_rm_exp_open_zone(BDRVQcow2State *s, > + uint32_t index) > +{ > + Qcow2ZoneListEntry *zone_entry = &s->zone_list_entries[index]; > + > + QTAILQ_REMOVE(&s->exp_open_zones, zone_entry, exp_open_zone_entry); > + s->nr_zones_exp_open--; > +} > + > +static void qcow2_rm_imp_open_zone(BDRVQcow2State *s, > + int32_t index) > +{ > + Qcow2ZoneListEntry *zone_entry; > + if (index < 0) { > + /* Apply LRU when the index is not specified. 
*/ > + zone_entry = QTAILQ_LAST(&s->imp_open_zones); > + } else { > + zone_entry = &s->zone_list_entries[index]; > + } > + > + QTAILQ_REMOVE(&s->imp_open_zones, zone_entry, imp_open_zone_entry); > + s->nr_zones_imp_open--; > +} > + > +static void qcow2_rm_open_zone(BDRVQcow2State *s, > + uint32_t index) > +{ > + Qcow2ZoneListEntry *zone_entry = &s->zone_list_entries[index]; > + > + if (QTAILQ_IN_USE(zone_entry, exp_open_zone_entry)) { > + qcow2_rm_exp_open_zone(s, index); > + } else if (QTAILQ_IN_USE(zone_entry, imp_open_zone_entry)) { > + qcow2_rm_imp_open_zone(s, index); > + } > +} > + > +static void qcow2_rm_closed_zone(BDRVQcow2State *s, > + uint32_t index) > +{ > + Qcow2ZoneListEntry *zone_entry = &s->zone_list_entries[index]; > + > + QTAILQ_REMOVE(&s->closed_zones, zone_entry, closed_zone_entry); > + s->nr_zones_closed--; > +} > + > +static void qcow2_do_imp_open_zone(BDRVQcow2State *s, > + uint32_t index, > + BlockZoneState zs) > +{ > + Qcow2ZoneListEntry *zone_entry = &s->zone_list_entries[index]; > + > + switch (zs) { > + case BLK_ZS_EMPTY: > + break; > + case BLK_ZS_CLOSED: > + qcow2_rm_closed_zone(s, index); > + break; > + case BLK_ZS_IOPEN: > + /* > + * The LRU policy: update the zone that is most recently > + * used to the head of the zone list > + */ > + if (zone_entry == QTAILQ_FIRST(&s->imp_open_zones)) { > + return; > + } > + QTAILQ_REMOVE(&s->imp_open_zones, zone_entry, imp_open_zone_entry); > + s->nr_zones_imp_open--; > + break; > + default: > + return; > + } > + > + QTAILQ_INSERT_HEAD(&s->imp_open_zones, zone_entry, imp_open_zone_entry); > + s->nr_zones_imp_open++; > +} > + > +static void qcow2_do_exp_open_zone(BDRVQcow2State *s, > + uint32_t index) > +{ > + Qcow2ZoneListEntry *zone_entry = &s->zone_list_entries[index]; > + > + QTAILQ_INSERT_HEAD(&s->exp_open_zones, zone_entry, exp_open_zone_entry); > + s->nr_zones_exp_open++; > +} > + > +/* > + * The list of zones is managed using an LRU policy: the last > + * zone of the list is always the one 
that was least recently used > + * for writing and is chosen as the zone to close to be able to > + * implicitly open another zone. > + * > + * We can only close the open zones. The index is not specified > + * when it is less than 0. > + */ > +static void qcow2_do_close_zone(BlockDriverState *bs, > + int32_t index, > + BlockZoneState zs) > +{ > + BDRVQcow2State *s = bs->opaque; > + Qcow2ZoneListEntry *zone_entry; > + > + if (index >= 0) { > + zone_entry = &s->zone_list_entries[index]; > + } else { > + /* before removal of the last implicitly open zone */ > + zone_entry = QTAILQ_LAST(&s->imp_open_zones); > + } > + > + if (zs == BLK_ZS_IOPEN) { > + qcow2_rm_imp_open_zone(s, index); > + goto close_zone; > + } > + > + if (index >= 0 && zs == BLK_ZS_EOPEN) { > + qcow2_rm_exp_open_zone(s, index); > + /* > + * The zone state changes when the zone is removed from the list of > + * open zones (explicitly open -> empty). The closed zone list is > + * refreshed during get_zone_state(). > + */ > + qcow2_get_zone_state(bs, index); > + } > + return; > + > +close_zone: > + QTAILQ_INSERT_HEAD(&s->closed_zones, zone_entry, closed_zone_entry); > + s->nr_zones_closed++; > +} > + > +/* > + * Read/Write the new wp value to the dedicated location of the image file. 
> + */ > +static int coroutine_fn GRAPH_RDLOCK > +qcow2_rw_wp_at(BlockDriverState *bs, uint64_t *wp, > + int32_t index, bool is_write) { > + BDRVQcow2State *s = bs->opaque; > + g_autofree uint64_t *temp = NULL; > + uint64_t wpv = *wp; > + int ret; > + > + if (is_write) { > + ret = bdrv_pwrite(bs->file, s->zoned_header.zonedmeta_offset > + + sizeof(uint64_t) * index, sizeof(uint64_t), wp, 0); > + if (ret < 0) { > + *wp = wpv; > + goto exit; > + } > + } else { > + temp = g_new(uint64_t, 1); > + ret = bdrv_pread(bs->file, s->zoned_header.zonedmeta_offset > + + sizeof(uint64_t) * index, sizeof(uint64_t), temp, 0); > + if (ret < 0) { > + goto exit; > + } > + > + *wp = *temp; > + } > + > + trace_qcow2_wp_tracking(index, *wp >> BDRV_SECTOR_BITS); > + return ret; > + > +exit: > + error_report("Failed to %s metadata to file", is_write ? "write" : "read"); > + return ret; > +} > + > +static bool qcow2_can_activate_zone(BlockDriverState *bs) > +{ > + BDRVQcow2State *s = bs->opaque; > + > + /* When the max active zone is zero, there is no limit on active zones */ > + if (!s->zoned_header.max_active_zones) { > + return true; > + } > + > + /* Active zones are zones that are open or closed */ > + return s->nr_zones_exp_open + s->nr_zones_imp_open + s->nr_zones_closed > + < s->zoned_header.max_active_zones; > +} > + > +/* > + * This function manages open zones under active zones limit. It checks > + * if a zone can transition to open state while maintaining max open and > + * active zone limits. > + */ > +static bool qcow2_can_open_zone(BlockDriverState *bs) > +{ > + BDRVQcow2State *s = bs->opaque; > + > + /* When the max open zone is zero, there is no limit on open zones */ > + if (!s->zoned_header.max_open_zones) { > + return true; > + } > + > + /* > + * The open zones are zones with the states of explicitly and > + * implicitly open. 
> + */ > + if (s->nr_zones_imp_open + s->nr_zones_exp_open < > + s->zoned_header.max_open_zones) { > + return true; > + } > + > + /* > + * Zones are managed one at a time. Thus, the number of implicitly open > + * zone can never be over the open zone limit. When the active zone limit > + * is not reached, close only one implicitly open zone. > + */ > + if (qcow2_can_activate_zone(bs)) { > + qcow2_do_close_zone(bs, -1, BLK_ZS_IOPEN); > + trace_qcow2_imp_open_zones(0x23, s->nr_zones_imp_open); > + return true; > + } > + return false; > +} > + > +static inline int coroutine_fn GRAPH_RDLOCK > +qcow2_refresh_zonedmeta(BlockDriverState *bs) > +{ > + int ret; > + BDRVQcow2State *s = bs->opaque; > + uint64_t wps_size = s->zoned_header.zonedmeta_size; > + g_autofree uint64_t *temp = NULL; > + > + temp = g_new(uint64_t, s->zoned_header.nr_zones); > + ret = bdrv_pread(bs->file, s->zoned_header.zonedmeta_offset, > + wps_size, temp, 0); > + if (ret < 0) { > + error_report("Cannot read metadata"); > + return ret; > + } > + > + memcpy(bs->wps->wp, temp, wps_size); > + return 0; > +} > + > /* > * Passing by the zoned device configurations by a zoned_header struct, check > * if the zone device options are under constraints. 
Return false when some > @@ -527,7 +814,23 @@ qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, > be32_to_cpu(zoned_ext.max_active_zones); > zoned_ext.max_append_bytes = > be32_to_cpu(zoned_ext.max_append_bytes); > + zoned_ext.zonedmeta_offset = > + be64_to_cpu(zoned_ext.zonedmeta_offset); > + zoned_ext.zonedmeta_size = be64_to_cpu(zoned_ext.zonedmeta_size); > s->zoned_header = zoned_ext; > + bs->wps = g_malloc(sizeof(BlockZoneWps) > + + s->zoned_header.zonedmeta_size); > + ret = qcow2_refresh_zonedmeta(bs); > + if (ret < 0) { > + return ret; > + } > + > + s->zone_list_entries = g_new0(Qcow2ZoneListEntry, > + zoned_ext.nr_zones); > + QTAILQ_INIT(&s->exp_open_zones); > + QTAILQ_INIT(&s->imp_open_zones); > + QTAILQ_INIT(&s->closed_zones); > + qemu_co_mutex_init(&bs->wps->colock); > > /* refuse to open broken images */ > if (zoned_ext.nr_zones != DIV_ROUND_UP(bs->total_sectors * > @@ -2883,21 +3186,119 @@ static coroutine_fn GRAPH_RDLOCK int qcow2_co_pwritev_task_entry(AioTask *task) > t->l2meta); > } > > +/* > + * If it is an append write request, the offset pointer needs to be updated to > + * the wp value of that zone after the IO completion. The unique pointer is > + * passed on to this function to prevent the value being changed in condition of > + * multiple concurrent writes. 
> + */ > static int coroutine_fn GRAPH_RDLOCK > -qcow2_co_pwritev_part(BlockDriverState *bs, int64_t offset, int64_t bytes, > - QEMUIOVector *qiov, size_t qiov_offset, > - BdrvRequestFlags flags) > +qcow2_co_pwv_part(BlockDriverState *bs, int64_t *offset_ptr, int64_t bytes, > + QEMUIOVector *qiov, size_t qiov_offset, bool is_append, > + BdrvRequestFlags flags) > { > BDRVQcow2State *s = bs->opaque; > int offset_in_cluster; > int ret; > unsigned int cur_bytes; /* number of sectors in current iteration */ > uint64_t host_offset; > + int64_t offset = *offset_ptr; > QCowL2Meta *l2meta = NULL; > AioTaskPool *aio = NULL; > + int64_t start_offset, start_bytes; > + BlockZoneState zs; > + int64_t end_zone, end_offset; > + uint64_t *wp; > + int64_t zone_size = bs->bl.zone_size; > + int64_t zone_capacity = bs->bl.zone_capacity; > + int index; > > trace_qcow2_writev_start_req(qemu_coroutine_self(), offset, bytes); > > + start_offset = offset; > + start_bytes = bytes; > + if (bs->bl.zoned == BLK_Z_HM) { > + index = start_offset / zone_size; > + wp = &bs->wps->wp[index]; > + if (!QCOW2_ZT_IS_CONV(*wp)) { > + if (offset != *wp && !is_append) { > + /* The write offset must be equal to the zone write pointer */ > + error_report("Offset 0x%" PRIx64 " of regular writes must be " > + "equal to the zone write pointer 0x%" PRIx64 "", > + offset, *wp); > + return -EINVAL; > + } > + > + if (is_append) { > + /* > + * The offset of append write is the write pointer value of > + * that zone. > + */ > + start_offset = *wp; > + } > + > + end_offset = start_offset + start_bytes; > + > + /* Only allow writes when there are zone resources left */ > + zs = qcow2_get_zone_state(bs, index); > + if (zs == BLK_ZS_CLOSED || zs == BLK_ZS_EMPTY) { > + if (!qcow2_can_open_zone(bs)) { > + error_report("no more open zones available"); > + return -EINVAL; > + } > + } > + > + /* > + * Align up (start_offset, zone_size), the start offset is not > + * necessarily power of two. 
> + */ > + end_zone = index * zone_size + zone_capacity; > + /* Write cannot exceed the zone capacity. */ > + if (end_offset > end_zone) { > + error_report("write exceeds zone capacity with end_offset:" > + "0x%lx, end_zone: 0x%lx", > + end_offset / 512, end_zone / 512); > + return -EINVAL; > + } > + > + /* > + * Real drives change states before it can write to the zone. If > + * the write fails, the zone state may have changed. > + * > + * The zone state transitions to implicit open when the original > + * state is empty or closed. When the wp reaches the end, the > + * open states (explicit open, implicit open) become full. > + */ > + zs = qcow2_get_zone_state(bs, index); > + if (!(end_offset & (zone_capacity - 1))) { > + /* Being aligned to zone capacity implies full state */ > + qcow2_rm_open_zone(s, index); > + trace_qcow2_imp_open_zones(0x24, > + s->nr_zones_imp_open); > + } else { > + qcow2_do_imp_open_zone(s, index, zs); > + trace_qcow2_imp_open_zones(0x24, > + s->nr_zones_imp_open); > + } > + > + /* > + * The write pointer is update before IO completion, with the > + * assumption that the write IO will succeed. 
> + */ > + qemu_co_mutex_lock(&bs->wps->colock); > + if (is_append) { > + *offset_ptr = *wp; > + } > + *wp = end_offset; > + ret = qcow2_rw_wp_at(bs, wp, index, true); > + qemu_co_mutex_unlock(&bs->wps->colock); > + if (ret < 0) { > + error_report("failed to update write pointer"); > + return -EINVAL; > + } > + } > + } > + > while (bytes != 0 && aio_task_pool_status(aio) == 0) { > > l2meta = NULL; > @@ -2943,6 +3344,7 @@ qcow2_co_pwritev_part(BlockDriverState *bs, int64_t offset, int64_t bytes, > qiov_offset += cur_bytes; > trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_bytes); > } > + > ret = 0; > > qemu_co_mutex_lock(&s->lock); > @@ -2961,11 +3363,32 @@ fail_nometa: > g_free(aio); > } > > + if (ret < 0 && bs->bl.zoned == BLK_Z_HM) { > + /* update the wp when write IO failed */ > + qemu_co_mutex_lock(&bs->wps->colock); > + index = start_offset / zone_size; > + wp = &bs->wps->wp[index]; > + if (!QCOW2_ZT_IS_CONV(*wp)) { > + ret = qcow2_rw_wp_at(bs, wp, index, false); > + } > + qemu_co_mutex_unlock(&bs->wps->colock); > + } > + > trace_qcow2_writev_done_req(qemu_coroutine_self(), ret); > > return ret; > } > > +static int coroutine_fn GRAPH_RDLOCK > +qcow2_co_pwritev_part(BlockDriverState *bs, int64_t offset, int64_t bytes, > + QEMUIOVector *qiov, size_t qiov_offset, > + BdrvRequestFlags flags) > +{ > + return qcow2_co_pwv_part(bs, &offset, bytes, qiov, qiov_offset, false, > + flags); > +} > + > + > static int GRAPH_RDLOCK qcow2_inactivate(BlockDriverState *bs) > { > BDRVQcow2State *s = bs->opaque; > @@ -3001,6 +3424,25 @@ static int GRAPH_RDLOCK qcow2_inactivate(BlockDriverState *bs) > return result; > } > > +static void qcow2_do_close_all_zone(BDRVQcow2State *s) > +{ > + Qcow2ZoneListEntry *zone_entry, *next; > + > + QTAILQ_FOREACH_SAFE(zone_entry, &s->imp_open_zones, imp_open_zone_entry, > + next) { > + QTAILQ_REMOVE(&s->imp_open_zones, zone_entry, imp_open_zone_entry); > + s->nr_zones_imp_open--; > + } > + > + QTAILQ_FOREACH_SAFE(zone_entry, 
&s->exp_open_zones, exp_open_zone_entry, > + next) { > + QTAILQ_REMOVE(&s->exp_open_zones, zone_entry, exp_open_zone_entry); > + s->nr_zones_exp_open--; > + } > + > + assert(s->nr_zones_imp_open + s->nr_zones_exp_open == 0); > +} > + > static void coroutine_mixed_fn GRAPH_RDLOCK > qcow2_do_close(BlockDriverState *bs, bool close_data_file) > { > @@ -3040,6 +3482,8 @@ qcow2_do_close(BlockDriverState *bs, bool close_data_file) > > qcow2_refcount_close(bs); > qcow2_free_snapshots(bs); > + qcow2_do_close_all_zone(s); > + g_free(bs->wps); > } > > static void GRAPH_UNLOCKED qcow2_close(BlockDriverState *bs) > @@ -3357,7 +3801,10 @@ int qcow2_update_header(BlockDriverState *bs) > .max_active_zones = > cpu_to_be32(s->zoned_header.max_active_zones), > .max_append_bytes = > - cpu_to_be32(s->zoned_header.max_append_bytes) > + cpu_to_be32(s->zoned_header.max_append_bytes), > + .zonedmeta_offset = > + cpu_to_be64(s->zoned_header.zonedmeta_offset), > + .zonedmeta_size = cpu_to_be64(s->zoned_header.zonedmeta_size), > }; > ret = header_ext_add(buf, QCOW2_EXT_MAGIC_ZONED_FORMAT, > &zoned_header, sizeof(zoned_header), > @@ -3766,7 +4213,8 @@ qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp) > int version; > int refcount_order; > uint64_t *refcount_table; > - int ret; > + uint64_t zoned_meta_size, zoned_clusterlen; > + int ret, offset, i; > uint8_t compression_type = QCOW2_COMPRESSION_TYPE_ZLIB; > > assert(create_options->driver == BLOCKDEV_DRIVER_QCOW2); > @@ -4113,6 +4561,42 @@ qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp) > ret = -EINVAL; > goto unlock; > } > + > + uint32_t nrz = s->zoned_header.nr_zones; > + zoned_meta_size = sizeof(uint64_t) * nrz; > + g_autofree uint64_t *meta = NULL; > + meta = g_new0(uint64_t, nrz); > + > + for (i = 0; i < s->zoned_header.conventional_zones; ++i) { > + meta[i] = i * s->zoned_header.zone_size; > + meta[i] |= 1ULL << 59; > + } > + > + for (; i < nrz; ++i) { > + meta[i] = i * s->zoned_header.zone_size; > 
+ } > + > + offset = qcow2_alloc_clusters(blk_bs(blk), zoned_meta_size); > + if (offset < 0) { > + ret = offset; > + error_setg_errno(errp, -ret, "Could not allocate clusters " > + "for zoned metadata size"); > + goto unlock; > + } > + s->zoned_header.zonedmeta_offset = offset; > + s->zoned_header.zonedmeta_size = zoned_meta_size; > + > + zoned_clusterlen = size_to_clusters(s, zoned_meta_size) > + * s->cluster_size; > + ret = qcow2_pre_write_overlap_check(blk_bs(blk), 0, offset, > + zoned_clusterlen, false); > + assert(ret == 0); > + ret = bdrv_pwrite(blk_bs(blk)->file, offset, zoned_meta_size, meta, 0); > + if (ret < 0) { > + error_setg_errno(errp, -ret, "Could not write zoned metadata " > + "to disk"); > + goto unlock; > + } > } else { > s->zoned_header.zoned = BLK_Z_NONE; > } > @@ -4512,6 +4996,359 @@ qcow2_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes) > return ret; > } > > +static int coroutine_fn > +qcow2_co_zone_report(BlockDriverState *bs, int64_t offset, > + unsigned int *nr_zones, BlockZoneDescriptor *zones) > +{ > + BDRVQcow2State *s = bs->opaque; > + uint64_t zone_size = s->zoned_header.zone_size; > + int64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS; > + int64_t size = bs->bl.nr_zones * zone_size; > + unsigned int nrz; > + int i = 0; > + int si; > + > + if (offset >= capacity) { > + error_report("offset %" PRId64 " is equal to or greater than the " > + "device capacity %" PRId64 "", offset, capacity); > + return -EINVAL; > + } > + > + nrz = ((*nr_zones) < bs->bl.nr_zones) ? 
(*nr_zones) : bs->bl.nr_zones; > + si = offset / zone_size; /* Zone size cannot be 0 for zoned device */ > + qemu_co_mutex_lock(&bs->wps->colock); > + for (; i < nrz; ++i) { > + if (i + si >= bs->bl.nr_zones) { > + break; > + } > + > + zones[i].start = (si + i) * zone_size; > + > + /* The last zone can be smaller than the zone size */ > + if ((si + i + 1) == bs->bl.nr_zones && size > capacity) { > + uint32_t l = zone_size - (size - capacity); > + zones[i].length = l; > + zones[i].cap = l; > + } else { > + zones[i].length = zone_size; > + zones[i].cap = zone_size; > + } > + > + uint64_t wp = bs->wps->wp[si + i]; > + if (QCOW2_ZT_IS_CONV(wp)) { > + zones[i].type = BLK_ZT_CONV; > + zones[i].state = BLK_ZS_NOT_WP; > + /* Clear masking bits */ > + wp = QCOW2_GET_WP(wp); > + } else { > + zones[i].type = BLK_ZT_SWR; > + zones[i].state = qcow2_get_zone_state(bs, si + i); > + } > + zones[i].wp = wp; > + } > + qemu_co_mutex_unlock(&bs->wps->colock); > + *nr_zones = i; > + return 0; > +} > + > +static int coroutine_fn GRAPH_RDLOCK > +qcow2_open_zone(BlockDriverState *bs, uint32_t index) { > + BDRVQcow2State *s = bs->opaque; > + int ret; > + > + qemu_co_mutex_lock(&bs->wps->colock); > + BlockZoneState zs = qcow2_get_zone_state(bs, index); > + trace_qcow2_imp_open_zones(BLK_ZO_OPEN, s->nr_zones_imp_open); > + > + switch (zs) { > + case BLK_ZS_EMPTY: > + if (!qcow2_can_activate_zone(bs)) { > + ret = -EBUSY; > + goto unlock; > + } > + break; > + case BLK_ZS_IOPEN: > + qcow2_rm_imp_open_zone(s, index); > + break; > + case BLK_ZS_EOPEN: > + return 0; > + case BLK_ZS_CLOSED: > + if (!qcow2_can_open_zone(bs)) { > + ret = -EINVAL; > + goto unlock; > + } > + qcow2_rm_closed_zone(s, index); > + break; > + case BLK_ZS_FULL: > + break; > + default: > + ret = -EINVAL; > + goto unlock; > + } > + > + qcow2_do_exp_open_zone(s, index); > + ret = 0; > + > +unlock: > + qemu_co_mutex_unlock(&bs->wps->colock); > + return ret; > +} > + > +static int qcow2_close_zone(BlockDriverState *bs, uint32_t 
index) > +{ > + int ret; > + > + qemu_co_mutex_lock(&bs->wps->colock); > + BlockZoneState zs = qcow2_get_zone_state(bs, index); > + > + switch (zs) { > + case BLK_ZS_EMPTY: > + break; > + case BLK_ZS_IOPEN: > + break; > + case BLK_ZS_EOPEN: > + break; > + case BLK_ZS_CLOSED: > + /* Closing a closed zone is not an error */ > + ret = 0; > + goto unlock; > + case BLK_ZS_FULL: > + break; > + default: > + ret = -EINVAL; > + goto unlock; > + } > + qcow2_do_close_zone(bs, index, zs); > + ret = 0; > + > +unlock: > + qemu_co_mutex_unlock(&bs->wps->colock); > + return ret; > +} > + > +static int coroutine_fn GRAPH_RDLOCK > +qcow2_finish_zone(BlockDriverState *bs, uint32_t index) { > + BDRVQcow2State *s = bs->opaque; > + int ret; > + > + qemu_co_mutex_lock(&bs->wps->colock); > + uint64_t *wp = &bs->wps->wp[index]; > + BlockZoneState zs = qcow2_get_zone_state(bs, index); > + > + switch (zs) { > + case BLK_ZS_EMPTY: > + if (!qcow2_can_activate_zone(bs)) { > + ret = -EBUSY; > + goto unlock; > + } > + break; > + case BLK_ZS_IOPEN: > + qcow2_rm_imp_open_zone(s, index); > + trace_qcow2_imp_open_zones(BLK_ZO_FINISH, s->nr_zones_imp_open); > + break; > + case BLK_ZS_EOPEN: > + qcow2_rm_exp_open_zone(s, index); > + break; > + case BLK_ZS_CLOSED: > + if (!qcow2_can_open_zone(bs)) { > + ret = -EINVAL; > + goto unlock; > + } > + qcow2_rm_closed_zone(s, index); > + break; > + case BLK_ZS_FULL: > + ret = 0; > + goto unlock; > + default: > + ret = -EINVAL; > + goto unlock; > + } > + > + *wp = ((uint64_t)index + 1) * s->zoned_header.zone_size; > + ret = qcow2_rw_wp_at(bs, wp, index, true); > + > +unlock: > + qemu_co_mutex_unlock(&bs->wps->colock); > + return ret; > +} > + > +static int coroutine_fn GRAPH_RDLOCK > +qcow2_reset_zone(BlockDriverState *bs, uint32_t index, > + int64_t len) { > + BDRVQcow2State *s = bs->opaque; > + int nrz = bs->bl.nr_zones; > + int zone_size = bs->bl.zone_size; > + int n, ret = 0; > + > + qemu_co_mutex_lock(&bs->wps->colock); > + uint64_t *wp = 
&bs->wps->wp[index]; > + if (len == bs->total_sectors << BDRV_SECTOR_BITS) { > + n = nrz; > + index = 0; > + } else { > + n = len / zone_size; > + } > + > + for (int i = 0; i < n; ++i) { > + uint64_t *wp_i = (uint64_t *)(wp + i); > + uint64_t wpi_v = *wp_i; > + if (QCOW2_ZT_IS_CONV(wpi_v)) { > + continue; > + } > + > + BlockZoneState zs = qcow2_get_zone_state(bs, index + i); > + switch (zs) { > + case BLK_ZS_EMPTY: > + break; > + case BLK_ZS_IOPEN: > + qcow2_rm_imp_open_zone(s, index + i); > + trace_qcow2_imp_open_zones(BLK_ZO_RESET, s->nr_zones_imp_open); > + break; > + case BLK_ZS_EOPEN: > + qcow2_rm_exp_open_zone(s, index + i); > + break; > + case BLK_ZS_CLOSED: > + qcow2_rm_closed_zone(s, index + i); > + break; > + case BLK_ZS_FULL: > + break; > + default: > + ret = -EINVAL; > + goto unlock; > + } > + > + if (zs == BLK_ZS_EMPTY) { > + continue; > + } > + > + *wp_i = ((uint64_t)index + i) * zone_size; > + ret = qcow2_rw_wp_at(bs, wp_i, index + i, true); > + if (ret < 0) { > + goto unlock; > + } > + /* clear data */ > + ret = qcow2_co_pwrite_zeroes(bs, *wp_i, zone_size, 0); > + if (ret < 0) { > + error_report("Failed to reset zone at 0x%" PRIx64 "", *wp_i); > + } > + } > + > +unlock: > + qemu_co_mutex_unlock(&bs->wps->colock); > + return ret; > +} > + > +static int coroutine_fn GRAPH_RDLOCK > +qcow2_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, > + int64_t offset, int64_t len) > +{ > + BDRVQcow2State *s = bs->opaque; > + int ret = 0; > + int64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS; > + int64_t zone_size = s->zoned_header.zone_size; > + int64_t zone_size_mask = zone_size - 1; > + uint32_t index = offset / zone_size; > + BlockZoneWps *wps = bs->wps; > + > + if (offset >= capacity) { > + error_report("offset %" PRId64 " is equal to or greater than the" > + "device capacity %" PRId64 "", offset, capacity); > + return -EINVAL; > + } > + > + if (offset & zone_size_mask) { > + error_report("sector offset %" PRId64 " is not aligned to zone size" > + " 
%" PRId64 "", offset / 512, zone_size / 512); > + return -EINVAL; > + } > + > + if (((offset + len) < capacity && len & zone_size_mask) || > + offset + len > capacity) { > + error_report("number of sectors %" PRId64 " is not aligned to zone" > + " size %" PRId64 "", len / 512, zone_size / 512); > + return -EINVAL; > + } > + > + qemu_co_mutex_lock(&wps->colock); > + uint64_t wpv = wps->wp[index]; Do we need to check that index < nr_zones? > + if (QCOW2_ZT_IS_CONV(wpv) && len != capacity) { Does the len != capacity check cover all cases or is it just for BLK_ZO_RESET? For example, when BLK_ZO_OPEN is called with len == capacity it seems possible to perform an open operation on the zone? > + error_report("zone mgmt operations are not allowed for " > + "conventional zones"); > + ret = -EIO; > + goto unlock; > + } > + qemu_co_mutex_unlock(&wps->colock); > + > + switch (op) { > + case BLK_ZO_OPEN: > + ret = qcow2_open_zone(bs, index); > + break; > + case BLK_ZO_CLOSE: > + ret = qcow2_close_zone(bs, index); > + break; > + case BLK_ZO_FINISH: > + ret = qcow2_finish_zone(bs, index); > + break; > + case BLK_ZO_RESET: > + ret = qcow2_reset_zone(bs, index, len); > + break; > + default: > + error_report("Unsupported zone op: 0x%x", op); > + ret = -ENOTSUP; > + break; > + } > + return ret; > + > +unlock: > + qemu_co_mutex_unlock(&wps->colock); > + return ret; > +} > + > +static int coroutine_fn GRAPH_RDLOCK > +qcow2_co_zone_append(BlockDriverState *bs, int64_t *offset, QEMUIOVector *qiov, > + BdrvRequestFlags flags) > +{ > + assert(flags == 0); > + int64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS; > + int64_t zone_size_mask = bs->bl.zone_size - 1; > + int64_t iov_len = 0; > + int64_t len = 0; > + > + if (*offset >= capacity) { > + error_report("*offset %" PRId64 " is equal to or greater than the" > + "device capacity %" PRId64 "", *offset, capacity); > + return -EINVAL; > + } > + > + /* offset + len should not pass the end of that zone starting from offset */ > + if 
(*offset & zone_size_mask) { > + error_report("sector offset %" PRId64 " is not aligned to zone size " > + "%" PRId64 "", *offset / 512, bs->bl.zone_size / 512); > + return -EINVAL; > + } > + > + int64_t wg = bs->bl.write_granularity; > + int64_t wg_mask = wg - 1; > + for (int i = 0; i < qiov->niov; i++) { > + iov_len = qiov->iov[i].iov_len; > + if (iov_len & wg_mask) { > + error_report("len of IOVector[%d] 0x%" PRIx64 " is not aligned to " > + "block size 0x%" PRIx64 "", i, iov_len, wg); > + return -EINVAL; > + } > + } > + len = qiov->size; > + > + if ((len >> BDRV_SECTOR_BITS) > bs->bl.max_append_sectors) { > + error_report("len 0x%" PRIx64 " in sectors is greater than " > + "max_append_sectors 0x%" PRIx32 "", > + len >> BDRV_SECTOR_BITS, bs->bl.max_append_sectors); > + return -EINVAL; > + } > + > + return qcow2_co_pwv_part(bs, offset, len, qiov, 0, true, 0); > +} > + > static int coroutine_fn GRAPH_RDLOCK > qcow2_co_copy_range_from(BlockDriverState *bs, > BdrvChild *src, int64_t src_offset, > @@ -6578,6 +7415,10 @@ BlockDriver bdrv_qcow2 = { > .bdrv_co_pwritev_compressed_part = qcow2_co_pwritev_compressed_part, > .bdrv_make_empty = qcow2_make_empty, > > + .bdrv_co_zone_report = qcow2_co_zone_report, > + .bdrv_co_zone_mgmt = qcow2_co_zone_mgmt, This struct seems to align the equals signs so it's nicer to look at. Please align this equals sign for consistency. 
> + .bdrv_co_zone_append = qcow2_co_zone_append, > + > .bdrv_snapshot_create = qcow2_snapshot_create, > .bdrv_snapshot_goto = qcow2_snapshot_goto, > .bdrv_snapshot_delete = qcow2_snapshot_delete, > diff --git a/block/qcow2.h b/block/qcow2.h > index 743f2e3e79..4f72b8d62a 100644 > --- a/block/qcow2.h > +++ b/block/qcow2.h > @@ -256,9 +256,9 @@ typedef struct Qcow2ZonedHeaderExtension { > } QEMU_PACKED Qcow2ZonedHeaderExtension; > > typedef struct Qcow2ZoneListEntry { > - QLIST_ENTRY(Qcow2ZoneListEntry) exp_open_zone_entry; > - QLIST_ENTRY(Qcow2ZoneListEntry) imp_open_zone_entry; > - QLIST_ENTRY(Qcow2ZoneListEntry) closed_zone_entry; > + QTAILQ_ENTRY(Qcow2ZoneListEntry) exp_open_zone_entry; > + QTAILQ_ENTRY(Qcow2ZoneListEntry) imp_open_zone_entry; > + QTAILQ_ENTRY(Qcow2ZoneListEntry) closed_zone_entry; > } Qcow2ZoneListEntry; Please squash this into the patch that introduced Qcow2ZoneListEntry or, better yet, remove it from that patch and defer the definition of this struct until the current patch. 
> > typedef struct Qcow2UnknownHeaderExtension { > @@ -456,6 +456,13 @@ typedef struct BDRVQcow2State { > > /* States of zoned device */ > Qcow2ZonedHeaderExtension zoned_header; > + QTAILQ_HEAD(, Qcow2ZoneListEntry) exp_open_zones; > + QTAILQ_HEAD(, Qcow2ZoneListEntry) imp_open_zones; > + QTAILQ_HEAD(, Qcow2ZoneListEntry) closed_zones; > + Qcow2ZoneListEntry *zone_list_entries; > + uint32_t nr_zones_exp_open; > + uint32_t nr_zones_imp_open; > + uint32_t nr_zones_closed; > } BDRVQcow2State; > > typedef struct Qcow2COWRegion { > diff --git a/block/trace-events b/block/trace-events > index 950c82d4b8..30a3e303ca 100644 > --- a/block/trace-events > +++ b/block/trace-events > @@ -76,6 +76,8 @@ qcow2_writev_data(void *co, uint64_t offset) "co %p offset 0x%" PRIx64 > qcow2_pwrite_zeroes_start_req(void *co, int64_t offset, int64_t bytes) "co %p offset 0x%" PRIx64 " bytes %" PRId64 > qcow2_pwrite_zeroes(void *co, int64_t offset, int64_t bytes) "co %p offset 0x%" PRIx64 " bytes %" PRId64 > qcow2_skip_cow(void *co, uint64_t offset, int nb_clusters) "co %p offset 0x%" PRIx64 " nb_clusters %d" > +qcow2_wp_tracking(int index, uint64_t wp) "wps[%d]: 0x%" PRIx64 > +qcow2_imp_open_zones(uint8_t op, int nrz) "nr_imp_open_zones after op 0x%x: %d" > > # qcow2-cluster.c > qcow2_alloc_clusters_offset(void *co, uint64_t offset, int bytes) "co %p offset 0x%" PRIx64 " bytes %d" > diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c > index 9cb9f1fb2b..285db19ac7 100644 > --- a/hw/block/virtio-blk.c > +++ b/hw/block/virtio-blk.c Please move the virtio-blk changes into a separate patch. They don't seem to be directly related to qcow2 support. 
> @@ -288,6 +288,9 @@ static void virtio_blk_submit_multireq(VirtIOBlock *s, MultiReqBuffer *mrb) > int i = 0, start = 0, num_reqs = 0, niov = 0, nb_sectors = 0; > uint32_t max_transfer; > int64_t sector_num = 0; > + BlockDriverState *bs = blk_bs(s->blk); > + bool zone_cross; > + int64_t zone_sector, end_sector; > > if (mrb->num_reqs == 1) { > submit_requests(s, mrb, 0, 1, -1); > @@ -303,17 +306,34 @@ static void virtio_blk_submit_multireq(VirtIOBlock *s, MultiReqBuffer *mrb) > for (i = 0; i < mrb->num_reqs; i++) { > VirtIOBlockReq *req = mrb->reqs[i]; > if (num_reqs > 0) { > + zone_cross = false; > + > + /* > + * On zoned backends, a single backend write must not span a zone > + * boundary. Bail out of merging if combining req into the current > + * batch would straddle a zone. > + */ > + if (bs && bs->bl.zone_size > 0) { > + zone_sector = bs->bl.zone_size / BDRV_SECTOR_SIZE; > + end_sector = req->sector_num > + + req->qiov.size / BDRV_SECTOR_SIZE - 1; > + zone_cross = (sector_num / zone_sector) != > + (end_sector / zone_sector); > + } > + > /* > * NOTE: We cannot merge the requests in below situations: > * 1. requests are not sequential > * 2. merge would exceed maximum number of IOVs > * 3. merge would exceed maximum transfer length of backend device > + * 4. 
merge would cross a zone boundary on a zoned backend > */ > if (sector_num + nb_sectors != req->sector_num || > niov > blk_get_max_iov(s->blk) - req->qiov.niov || > req->qiov.size > max_transfer || > nb_sectors > (max_transfer - > - req->qiov.size) / BDRV_SECTOR_SIZE) { > + req->qiov.size) / BDRV_SECTOR_SIZE || > + zone_cross) { > submit_requests(s, mrb, start, num_reqs, niov); > num_reqs = 0; > } > diff --git a/include/qemu/queue.h b/include/qemu/queue.h > index e029e7bf66..3f0a48740e 100644 > --- a/include/qemu/queue.h > +++ b/include/qemu/queue.h > @@ -179,6 +179,7 @@ struct { \ > #define QLIST_EMPTY(head) ((head)->lh_first == NULL) > #define QLIST_FIRST(head) ((head)->lh_first) > #define QLIST_NEXT(elm, field) ((elm)->field.le_next) > +#define QLIST_LAST(head, field) (*(head)->lh_first->field.le_prev) If QLIST_LAST() is supposed to fetch the last element in the list, then it won't work: #define QLIST_INSERT_HEAD(head, elm, field) do { \ if (((elm)->field.le_next = (head)->lh_first) != NULL) \ (head)->lh_first->field.le_prev = &(elm)->field.le_next;\ (head)->lh_first = (elm); \ (elm)->field.le_prev = &(head)->lh_first; \ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The first node's le_prev points to the head's lh_first field, not to the last node. > > > /* > diff --git a/include/qemu/range.h b/include/qemu/range.h > index d446ad885d..d39ba68407 100644 > --- a/include/qemu/range.h > +++ b/include/qemu/range.h > @@ -213,6 +213,10 @@ static inline int range_covers_byte(uint64_t offset, uint64_t len, > static inline bool ranges_overlap(uint64_t first1, uint64_t len1, > uint64_t first2, uint64_t len2) > { > + if (first1 + len1 == 0 || first2 + len2 == 0) { > + return false; > + } What is this for? The commit description does not mention a change to ranges_overlap(); please explain why this special case is needed. > + > uint64_t last1 = range_get_last(first1, len1); > uint64_t last2 = range_get_last(first2, len2); > > -- > 2.43.0 >