qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v4 0/3] Add zone append write for zoned device
@ 2022-10-16 14:56 Sam Li
  2022-10-16 14:56 ` [PATCH v4 1/3] file-posix: add the tracking of the zones write pointers Sam Li
                   ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: Sam Li @ 2022-10-16 14:56 UTC (permalink / raw)
  To: qemu-devel
  Cc: stefanha, dmitry.fomichev, Hanna Reitz, Kevin Wolf, Fam Zheng,
	damien.lemoal, hare, qemu-block, Sam Li

v4:
- fix lock related issues[Damien]
- drop all field in zone_mgmt op [Damien]
- fix state checks in zong_mgmt command [Damien]
- return start sector of wp when issuing zap req [Damien]

v3:
- only read wps when it is locked [Damien]
- allow last smaller zone case [Damien]
- add zone type and state checks in zone_mgmt command [Damien]
- fix RESET_ALL related problems

v2:
- split patch to two patches for better reviewing
- change BlockZoneWps's structure to an array of integers
- use only mutex lock on locking conditions of zone wps
- coding styles and clean-ups

v1:
- introduce zone append write

Sam Li (3):
  file-posix: add the tracking of the zones write pointers
  block: introduce zone append write for zoned devices
  qemu-iotests: test zone append operation

 block/block-backend.c              |  65 ++++++++
 block/file-posix.c                 | 229 ++++++++++++++++++++++++++++-
 block/io.c                         |  21 +++
 block/raw-format.c                 |   8 +
 include/block/block-common.h       |  14 ++
 include/block/block-io.h           |   3 +
 include/block/block_int-common.h   |   8 +
 include/block/raw-aio.h            |   4 +-
 include/sysemu/block-backend-io.h  |   9 ++
 qemu-io-cmds.c                     |  63 ++++++++
 tests/qemu-iotests/tests/zoned.out |   7 +
 tests/qemu-iotests/tests/zoned.sh  |   9 ++
 12 files changed, 436 insertions(+), 4 deletions(-)

-- 
2.37.3



^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH v4 1/3] file-posix: add the tracking of the zones write pointers
  2022-10-16 14:56 [PATCH v4 0/3] Add zone append write for zoned device Sam Li
@ 2022-10-16 14:56 ` Sam Li
  2022-10-17  0:57   ` Dmitry Fomichev
  2022-10-17  5:11   ` Damien Le Moal
  2022-10-16 14:56 ` [PATCH v4 2/3] block: introduce zone append write for zoned devices Sam Li
  2022-10-16 14:56 ` [PATCH v4 3/3] qemu-iotests: test zone append operation Sam Li
  2 siblings, 2 replies; 9+ messages in thread
From: Sam Li @ 2022-10-16 14:56 UTC (permalink / raw)
  To: qemu-devel
  Cc: stefanha, dmitry.fomichev, Hanna Reitz, Kevin Wolf, Fam Zheng,
	damien.lemoal, hare, qemu-block, Sam Li

Since Linux doesn't have a user API to issue zone append operations to
zoned devices from user space, the file-posix driver is modified to add
zone append emulation using regular writes. To do this, the file-posix
driver tracks the wp location of all zones of the device. It uses an
array of uint64_t. The most significant bit of each wp location indicates
if the zone type is conventional zones.

The zones wp can be changed due to the following operations issued:
- zone reset: change the wp to the start offset of that zone
- zone finish: change to the end location of that zone
- write to a zone
- zone append

Signed-off-by: Sam Li <faithilikerun@gmail.com>
---
 block/file-posix.c               | 144 +++++++++++++++++++++++++++++++
 include/block/block-common.h     |  14 +++
 include/block/block_int-common.h |   3 +
 3 files changed, 161 insertions(+)

diff --git a/block/file-posix.c b/block/file-posix.c
index 7c5a330fc1..5ff5500301 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -1324,6 +1324,66 @@ static int hdev_get_max_segments(int fd, struct stat *st)
 #endif
 }
 
+#if defined(CONFIG_BLKZONED)
+static int get_zones_wp(int fd, BlockZoneWps *wps, int64_t offset,
+                        unsigned int nrz) {
+    struct blk_zone *blkz;
+    int64_t rep_size;
+    int64_t sector = offset >> BDRV_SECTOR_BITS;
+    int ret, n = 0, i = 0;
+    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
+    g_autofree struct blk_zone_report *rep = NULL;
+
+    rep = g_malloc(rep_size);
+    blkz = (struct blk_zone *)(rep + 1);
+    while (n < nrz) {
+        memset(rep, 0, rep_size);
+        rep->sector = sector;
+        rep->nr_zones = nrz - n;
+
+        do {
+            ret = ioctl(fd, BLKREPORTZONE, rep);
+        } while (ret != 0 && errno == EINTR);
+        if (ret != 0) {
+            error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
+                    fd, offset, errno);
+            return -errno;
+        }
+
+        if (!rep->nr_zones) {
+            break;
+        }
+
+        for (i = 0; i < rep->nr_zones; i++, n++) {
+            /*
+             * The wp tracking cares only about sequential writes required and
+             * sequential write preferred zones so that the wp can advance to
+             * the right location.
+             * Use the most significant bit of the wp location to indicate the
+             * zone type: 0 for SWR/SWP zones and 1 for conventional zones.
+             */
+            if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) {
+                wps->wp[i] = 1ULL << 63;
+            } else {
+                wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS;
+            }
+        }
+        sector = blkz[i - 1].start + blkz[i - 1].len;
+    }
+
+    return 0;
+}
+
+static void update_zones_wp(int fd, BlockZoneWps *wps, int64_t offset,
+                            unsigned int nrz) {
+    qemu_mutex_lock(&wps->lock);
+    if (get_zones_wp(fd, wps, offset, nrz) < 0) {
+        error_report("update zone wp failed");
+    }
+    qemu_mutex_unlock(&wps->lock);
+}
+#endif
+
 static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
 {
     BDRVRawState *s = bs->opaque;
@@ -1414,6 +1474,14 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
         if (ret >= 0) {
             bs->bl.max_active_zones = ret;
         }
+
+        bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret);
+        if (get_zones_wp(s->fd, bs->bl.wps, 0, ret) < 0) {
+            error_report("report wps failed");
+            g_free(bs->bl.wps);
+            return;
+        }
+        qemu_mutex_init(&bs->bl.wps->lock);
     }
 }
 
@@ -1725,6 +1793,25 @@ static int handle_aiocb_rw(void *opaque)
 
 out:
     if (nbytes == aiocb->aio_nbytes) {
+#if defined(CONFIG_BLKZONED)
+        if (aiocb->aio_type & QEMU_AIO_WRITE) {
+            BlockZoneWps *wps = aiocb->bs->bl.wps;
+            int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
+            if (wps) {
+                qemu_mutex_lock(&wps->lock);
+                if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
+                    uint64_t wend_offset =
+                            aiocb->aio_offset + aiocb->aio_nbytes;
+
+                    /* Advance the wp if needed */
+                    if (wend_offset > wps->wp[index]) {
+                        wps->wp[index] = wend_offset;
+                    }
+                }
+                qemu_mutex_unlock(&wps->lock);
+            }
+        }
+#endif
         return 0;
     } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) {
         if (aiocb->aio_type & QEMU_AIO_WRITE) {
@@ -1736,6 +1823,11 @@ out:
         }
     } else {
         assert(nbytes < 0);
+#if defined(CONFIG_BLKZONED)
+        if (aiocb->aio_type & QEMU_AIO_WRITE) {
+            update_zones_wp(aiocb->aio_fildes, aiocb->bs->bl.wps, 0, 1);
+        }
+#endif
         return nbytes;
     }
 }
@@ -2022,14 +2114,29 @@ static int handle_aiocb_zone_report(void *opaque)
 #endif
 
 #if defined(CONFIG_BLKZONED)
+static bool zone_is_empty(BlockDriverState *bs, int64_t index)
+{
+    return bs->bl.wps->wp[index] == index * bs->bl.zone_size;
+}
+
 static int handle_aiocb_zone_mgmt(void *opaque)
 {
     RawPosixAIOData *aiocb = opaque;
+    BlockDriverState *bs = aiocb->bs;
     int fd = aiocb->aio_fildes;
     int64_t sector = aiocb->aio_offset / 512;
     int64_t nr_sectors = aiocb->aio_nbytes / 512;
     struct blk_zone_range range;
     int ret;
+    uint64_t wend_offset;
+    BlockZoneWps *wps = bs->bl.wps;
+    int index = aiocb->aio_offset / bs->bl.zone_size;
+
+    if (BDRV_ZT_IS_CONV(wps->wp[index]) &&
+        aiocb->aio_nbytes != bs->bl.capacity) {
+        error_report("zone mgmt operations not allowed for conventional zones");
+        return -EIO;
+    }
 
     /* Execute the operation */
     range.sector = sector;
@@ -2038,11 +2145,42 @@ static int handle_aiocb_zone_mgmt(void *opaque)
         ret = ioctl(fd, aiocb->zone_mgmt.zone_op, &range);
     } while (ret != 0 && errno == EINTR);
     if (ret != 0) {
+        update_zones_wp(aiocb->aio_fildes, aiocb->bs->bl.wps,
+                        aiocb->aio_offset,
+                        aiocb->aio_nbytes / bs->bl.zone_size);
         ret = -errno;
         error_report("ioctl %s failed %d", aiocb->zone_mgmt.zone_op_name,
                      ret);
         return ret;
     }
+    qemu_mutex_lock(&wps->lock);
+    if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
+        /*
+         * The zoned device allows the last zone smaller that the zone size.
+         */
+        if (aiocb->aio_nbytes < bs->bl.zone_size) {
+            wend_offset = aiocb->aio_offset + aiocb->aio_nbytes;
+        } else {
+            wend_offset = aiocb->aio_offset + bs->bl.zone_size;
+        }
+
+        if (!zone_is_empty(bs, index) &&
+            aiocb->zone_mgmt.zone_op == BLKRESETZONE) {
+            if (aiocb->aio_nbytes == bs->bl.capacity) {
+                for (int i = 0; i < bs->bl.nr_zones; ++i) {
+                    if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
+                        wps->wp[i] = i * bs->bl.zone_size;
+                    }
+                }
+            } else {
+                wps->wp[index] = aiocb->aio_offset;
+            }
+        } else if (aiocb->zone_mgmt.zone_op == BLKFINISHZONE) {
+            wps->wp[index] = wend_offset;
+        }
+    }
+    qemu_mutex_unlock(&wps->lock);
+
     return 0;
 }
 #endif
@@ -2483,6 +2621,12 @@ static void raw_close(BlockDriverState *bs)
     BDRVRawState *s = bs->opaque;
 
     if (s->fd >= 0) {
+#if defined(CONFIG_BLKZONED)
+        if (bs->bl.wps) {
+            qemu_mutex_destroy(&bs->bl.wps->lock);
+            g_free(bs->bl.wps);
+        }
+#endif
         qemu_close(s->fd);
         s->fd = -1;
     }
diff --git a/include/block/block-common.h b/include/block/block-common.h
index 36bd0e480e..52372f8252 100644
--- a/include/block/block-common.h
+++ b/include/block/block-common.h
@@ -92,6 +92,14 @@ typedef struct BlockZoneDescriptor {
     BlockZoneCondition cond;
 } BlockZoneDescriptor;
 
+/*
+ * Track write pointers of a zone in bytes.
+ */
+typedef struct BlockZoneWps {
+    QemuMutex lock;
+    uint64_t wp[];
+} BlockZoneWps;
+
 typedef struct BlockDriverInfo {
     /* in bytes, 0 if irrelevant */
     int cluster_size;
@@ -205,6 +213,12 @@ typedef enum {
 #define BDRV_SECTOR_BITS   9
 #define BDRV_SECTOR_SIZE   (1ULL << BDRV_SECTOR_BITS)
 
+/*
+ * Get the first most significant bit of wp. If it is zero, then
+ * the zone type is SWR.
+ */
+#define BDRV_ZT_IS_CONV(wp)    (wp & (1ULL << 63))
+
 #define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \
                                            INT_MAX >> BDRV_SECTOR_BITS)
 #define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS)
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index 37dddc603c..e3136146aa 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -857,6 +857,9 @@ typedef struct BlockLimits {
 
     /* device capacity expressed in bytes */
     int64_t capacity;
+
+    /* array of write pointers' location of each zone in the zoned device. */
+    BlockZoneWps *wps;
 } BlockLimits;
 
 typedef struct BdrvOpBlocker BdrvOpBlocker;
-- 
2.37.3



^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH v4 2/3] block: introduce zone append write for zoned devices
  2022-10-16 14:56 [PATCH v4 0/3] Add zone append write for zoned device Sam Li
  2022-10-16 14:56 ` [PATCH v4 1/3] file-posix: add the tracking of the zones write pointers Sam Li
@ 2022-10-16 14:56 ` Sam Li
  2022-10-17  0:58   ` Dmitry Fomichev
  2022-10-17  5:22   ` Damien Le Moal
  2022-10-16 14:56 ` [PATCH v4 3/3] qemu-iotests: test zone append operation Sam Li
  2 siblings, 2 replies; 9+ messages in thread
From: Sam Li @ 2022-10-16 14:56 UTC (permalink / raw)
  To: qemu-devel
  Cc: stefanha, dmitry.fomichev, Hanna Reitz, Kevin Wolf, Fam Zheng,
	damien.lemoal, hare, qemu-block, Sam Li

A zone append command is a write operation that specifies the first
logical block of a zone as the write position. When writing to a zoned
block device using zone append, the byte offset of writes is pointing
to the write pointer of that zone. Upon completion the device will
respond with the position the data has been written in the zone.

Signed-off-by: Sam Li <faithilikerun@gmail.com>
---
 block/block-backend.c             | 65 ++++++++++++++++++++++
 block/file-posix.c                | 89 +++++++++++++++++++++++++++++--
 block/io.c                        | 21 ++++++++
 block/raw-format.c                |  8 +++
 include/block/block-io.h          |  3 ++
 include/block/block_int-common.h  |  5 ++
 include/block/raw-aio.h           |  4 +-
 include/sysemu/block-backend-io.h |  9 ++++
 8 files changed, 198 insertions(+), 6 deletions(-)

diff --git a/block/block-backend.c b/block/block-backend.c
index 1c618e9c68..06931ddd24 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1439,6 +1439,9 @@ typedef struct BlkRwCo {
         struct {
             unsigned long op;
         } zone_mgmt;
+        struct {
+            int64_t *append_sector;
+        } zone_append;
     };
 } BlkRwCo;
 
@@ -1871,6 +1874,47 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
     return &acb->common;
 }
 
+static void coroutine_fn blk_aio_zone_append_entry(void *opaque)
+{
+    BlkAioEmAIOCB *acb = opaque;
+    BlkRwCo *rwco = &acb->rwco;
+
+    rwco->ret = blk_co_zone_append(rwco->blk, rwco->zone_append.append_sector,
+                                   rwco->iobuf, rwco->flags);
+    blk_aio_complete(acb);
+}
+
+BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
+                                QEMUIOVector *qiov, BdrvRequestFlags flags,
+                                BlockCompletionFunc *cb, void *opaque) {
+    BlkAioEmAIOCB *acb;
+    Coroutine *co;
+    IO_CODE();
+
+    blk_inc_in_flight(blk);
+    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
+    acb->rwco = (BlkRwCo) {
+            .blk    = blk,
+            .ret    = NOT_DONE,
+            .flags  = flags,
+            .iobuf  = qiov,
+            .zone_append = {
+                    .append_sector = offset,
+            },
+    };
+    acb->has_returned = false;
+
+    co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
+    bdrv_coroutine_enter(blk_bs(blk), co);
+    acb->has_returned = true;
+    if (acb->rwco.ret != NOT_DONE) {
+        replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
+                                         blk_aio_complete_bh, acb);
+    }
+
+    return &acb->common;
+}
+
 /*
  * Send a zone_report command.
  * offset is a byte offset from the start of the device. No alignment
@@ -1923,6 +1967,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
     return ret;
 }
 
+/*
+ * Send a zone_append command.
+ */
+int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
+        QEMUIOVector *qiov, BdrvRequestFlags flags)
+{
+    int ret;
+    IO_CODE();
+
+    blk_inc_in_flight(blk);
+    blk_wait_while_drained(blk);
+    if (!blk_is_available(blk)) {
+        blk_dec_in_flight(blk);
+        return -ENOMEDIUM;
+    }
+
+    ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
+    blk_dec_in_flight(blk);
+    return ret;
+}
+
 void blk_drain(BlockBackend *blk)
 {
     BlockDriverState *bs = blk_bs(blk);
diff --git a/block/file-posix.c b/block/file-posix.c
index 5ff5500301..3d0cc33d02 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -205,6 +205,7 @@ typedef struct RawPosixAIOData {
         struct {
             struct iovec *iov;
             int niov;
+            int64_t *offset;
         } io;
         struct {
             uint64_t cmd;
@@ -1475,6 +1476,11 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
             bs->bl.max_active_zones = ret;
         }
 
+        ret = get_sysfs_long_val(&st, "physical_block_size");
+        if (ret >= 0) {
+            bs->bl.write_granularity = ret;
+        }
+
         bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret);
         if (get_zones_wp(s->fd, bs->bl.wps, 0, ret) < 0) {
             error_report("report wps failed");
@@ -1647,9 +1653,18 @@ qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
 static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
 {
     ssize_t len;
+    BlockZoneWps *wps = aiocb->bs->bl.wps;
+    int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
+
+    if (wps) {
+        qemu_mutex_lock(&wps->lock);
+        if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) {
+            aiocb->aio_offset = wps->wp[index];
+        }
+    }
 
     do {
-        if (aiocb->aio_type & QEMU_AIO_WRITE)
+        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))
             len = qemu_pwritev(aiocb->aio_fildes,
                                aiocb->io.iov,
                                aiocb->io.niov,
@@ -1660,6 +1675,9 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
                               aiocb->io.niov,
                               aiocb->aio_offset);
     } while (len == -1 && errno == EINTR);
+    if (wps) {
+        qemu_mutex_unlock(&wps->lock);
+    }
 
     if (len == -1) {
         return -errno;
@@ -1677,9 +1695,17 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
 {
     ssize_t offset = 0;
     ssize_t len;
+    BlockZoneWps *wps = aiocb->bs->bl.wps;
+    int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
 
+    if (wps) {
+        qemu_mutex_lock(&wps->lock);
+        if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) {
+            aiocb->aio_offset = wps->wp[index];
+        }
+    }
     while (offset < aiocb->aio_nbytes) {
-        if (aiocb->aio_type & QEMU_AIO_WRITE) {
+        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
             len = pwrite(aiocb->aio_fildes,
                          (const char *)buf + offset,
                          aiocb->aio_nbytes - offset,
@@ -1709,6 +1735,9 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
         }
         offset += len;
     }
+    if (wps) {
+        qemu_mutex_unlock(&wps->lock);
+    }
 
     return offset;
 }
@@ -1772,7 +1801,7 @@ static int handle_aiocb_rw(void *opaque)
     }
 
     nbytes = handle_aiocb_rw_linear(aiocb, buf);
-    if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
+    if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) {
         char *p = buf;
         size_t count = aiocb->aio_nbytes, copy;
         int i;
@@ -1794,7 +1823,7 @@ static int handle_aiocb_rw(void *opaque)
 out:
     if (nbytes == aiocb->aio_nbytes) {
 #if defined(CONFIG_BLKZONED)
-        if (aiocb->aio_type & QEMU_AIO_WRITE) {
+        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
             BlockZoneWps *wps = aiocb->bs->bl.wps;
             int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
             if (wps) {
@@ -1802,6 +1831,9 @@ out:
                 if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
                     uint64_t wend_offset =
                             aiocb->aio_offset + aiocb->aio_nbytes;
+                    if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) {
+                        *aiocb->io.offset = wps->wp[index];
+                    }
 
                     /* Advance the wp if needed */
                     if (wend_offset > wps->wp[index]) {
@@ -1824,7 +1856,7 @@ out:
     } else {
         assert(nbytes < 0);
 #if defined(CONFIG_BLKZONED)
-        if (aiocb->aio_type & QEMU_AIO_WRITE) {
+        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
             update_zones_wp(aiocb->aio_fildes, aiocb->bs->bl.wps, 0, 1);
         }
 #endif
@@ -3478,6 +3510,52 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
 }
 #endif
 
+#if defined(CONFIG_BLKZONED)
+static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
+                                           int64_t *offset,
+                                           QEMUIOVector *qiov,
+                                           BdrvRequestFlags flags) {
+    BDRVRawState *s = bs->opaque;
+    int64_t zone_size_mask = bs->bl.zone_size - 1;
+    int64_t iov_len = 0;
+    int64_t len = 0;
+    RawPosixAIOData acb;
+
+    if (*offset & zone_size_mask) {
+        error_report("sector offset %" PRId64 " is not aligned to zone size "
+                     "%" PRId32 "", *offset / 512, bs->bl.zone_size / 512);
+        return -EINVAL;
+    }
+
+    int64_t wg = bs->bl.write_granularity;
+    int64_t wg_mask = wg - 1;
+    for (int i = 0; i < qiov->niov; i++) {
+        iov_len = qiov->iov[i].iov_len;
+        if (iov_len & wg_mask) {
+            error_report("len of IOVector[%d] %" PRId64 " is not aligned to "
+                         "block size %" PRId64 "", i, iov_len, wg);
+            return -EINVAL;
+        }
+        len += iov_len;
+    }
+
+    acb = (RawPosixAIOData) {
+        .bs = bs,
+        .aio_fildes = s->fd,
+        .aio_type = QEMU_AIO_ZONE_APPEND,
+        .aio_offset = *offset,
+        .aio_nbytes = len,
+        .io = {
+                .iov = qiov->iov,
+                .niov = qiov->niov,
+                .offset = offset,
+        },
+    };
+
+    return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb);
+}
+#endif
+
 static coroutine_fn int
 raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
                 bool blkdev)
@@ -4253,6 +4331,7 @@ static BlockDriver bdrv_zoned_host_device = {
     /* zone management operations */
     .bdrv_co_zone_report = raw_co_zone_report,
     .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
+    .bdrv_co_zone_append = raw_co_zone_append,
 };
 #endif
 
diff --git a/block/io.c b/block/io.c
index 88f707ea4d..03e1109056 100644
--- a/block/io.c
+++ b/block/io.c
@@ -3230,6 +3230,27 @@ out:
     return co.ret;
 }
 
+int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset,
+                        QEMUIOVector *qiov,
+                        BdrvRequestFlags flags)
+{
+    BlockDriver *drv = bs->drv;
+    CoroutineIOCompletion co = {
+            .coroutine = qemu_coroutine_self(),
+    };
+    IO_CODE();
+
+    bdrv_inc_in_flight(bs);
+    if (!drv || !drv->bdrv_co_zone_append) {
+        co.ret = -ENOTSUP;
+        goto out;
+    }
+    co.ret = drv->bdrv_co_zone_append(bs, offset, qiov, flags);
+out:
+    bdrv_dec_in_flight(bs);
+    return co.ret;
+}
+
 void *qemu_blockalign(BlockDriverState *bs, size_t size)
 {
     IO_CODE();
diff --git a/block/raw-format.c b/block/raw-format.c
index 18dc52a150..33bff8516e 100644
--- a/block/raw-format.c
+++ b/block/raw-format.c
@@ -325,6 +325,13 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
     return bdrv_co_zone_mgmt(bs->file->bs, op, offset, len);
 }
 
+static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
+                                           int64_t *offset,
+                                           QEMUIOVector *qiov,
+                                           BdrvRequestFlags flags) {
+    return bdrv_co_zone_append(bs->file->bs, offset, qiov, flags);
+}
+
 static int64_t raw_getlength(BlockDriverState *bs)
 {
     int64_t len;
@@ -629,6 +636,7 @@ BlockDriver bdrv_raw = {
     .bdrv_co_pdiscard     = &raw_co_pdiscard,
     .bdrv_co_zone_report  = &raw_co_zone_report,
     .bdrv_co_zone_mgmt  = &raw_co_zone_mgmt,
+    .bdrv_co_zone_append = &raw_co_zone_append,
     .bdrv_co_block_status = &raw_co_block_status,
     .bdrv_co_copy_range_from = &raw_co_copy_range_from,
     .bdrv_co_copy_range_to  = &raw_co_copy_range_to,
diff --git a/include/block/block-io.h b/include/block/block-io.h
index f0cdf67d33..6a54453578 100644
--- a/include/block/block-io.h
+++ b/include/block/block-io.h
@@ -94,6 +94,9 @@ int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
                                      BlockZoneDescriptor *zones);
 int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
                                    int64_t offset, int64_t len);
+int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset,
+                                     QEMUIOVector *qiov,
+                                     BdrvRequestFlags flags);
 
 int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes);
 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs);
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index e3136146aa..a7e7db5646 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -701,6 +701,9 @@ struct BlockDriver {
             BlockZoneDescriptor *zones);
     int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, BlockZoneOp op,
             int64_t offset, int64_t len);
+    int coroutine_fn (*bdrv_co_zone_append)(BlockDriverState *bs,
+            int64_t *offset, QEMUIOVector *qiov,
+            BdrvRequestFlags flags);
 
     /* removable device specific */
     bool (*bdrv_is_inserted)(BlockDriverState *bs);
@@ -860,6 +863,8 @@ typedef struct BlockLimits {
 
     /* array of write pointers' location of each zone in the zoned device. */
     BlockZoneWps *wps;
+
+    int64_t write_granularity;
 } BlockLimits;
 
 typedef struct BdrvOpBlocker BdrvOpBlocker;
diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
index 877b2240b3..53033a5ca7 100644
--- a/include/block/raw-aio.h
+++ b/include/block/raw-aio.h
@@ -31,6 +31,7 @@
 #define QEMU_AIO_TRUNCATE     0x0080
 #define QEMU_AIO_ZONE_REPORT  0x0100
 #define QEMU_AIO_ZONE_MGMT    0x0200
+#define QEMU_AIO_ZONE_APPEND  0x0400
 #define QEMU_AIO_TYPE_MASK \
         (QEMU_AIO_READ | \
          QEMU_AIO_WRITE | \
@@ -41,7 +42,8 @@
          QEMU_AIO_COPY_RANGE | \
          QEMU_AIO_TRUNCATE | \
          QEMU_AIO_ZONE_REPORT | \
-         QEMU_AIO_ZONE_MGMT)
+         QEMU_AIO_ZONE_MGMT | \
+         QEMU_AIO_ZONE_APPEND)
 
 /* AIO flags */
 #define QEMU_AIO_MISALIGNED   0x1000
diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h
index 1b5fc7db6b..ff9f777f52 100644
--- a/include/sysemu/block-backend-io.h
+++ b/include/sysemu/block-backend-io.h
@@ -52,6 +52,9 @@ BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
 BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
                               int64_t offset, int64_t len,
                               BlockCompletionFunc *cb, void *opaque);
+BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
+                                QEMUIOVector *qiov, BdrvRequestFlags flags,
+                                BlockCompletionFunc *cb, void *opaque);
 BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes,
                              BlockCompletionFunc *cb, void *opaque);
 void blk_aio_cancel_async(BlockAIOCB *acb);
@@ -173,6 +176,12 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
                                   int64_t offset, int64_t len);
 int generated_co_wrapper blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
                                        int64_t offset, int64_t len);
+int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
+                                    QEMUIOVector *qiov,
+                                    BdrvRequestFlags flags);
+int generated_co_wrapper blk_zone_append(BlockBackend *blk, int64_t *offset,
+                                         QEMUIOVector *qiov,
+                                         BdrvRequestFlags flags);
 
 int generated_co_wrapper blk_pdiscard(BlockBackend *blk, int64_t offset,
                                       int64_t bytes);
-- 
2.37.3



^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH v4 3/3] qemu-iotests: test zone append operation
  2022-10-16 14:56 [PATCH v4 0/3] Add zone append write for zoned device Sam Li
  2022-10-16 14:56 ` [PATCH v4 1/3] file-posix: add the tracking of the zones write pointers Sam Li
  2022-10-16 14:56 ` [PATCH v4 2/3] block: introduce zone append write for zoned devices Sam Li
@ 2022-10-16 14:56 ` Sam Li
  2 siblings, 0 replies; 9+ messages in thread
From: Sam Li @ 2022-10-16 14:56 UTC (permalink / raw)
  To: qemu-devel
  Cc: stefanha, dmitry.fomichev, Hanna Reitz, Kevin Wolf, Fam Zheng,
	damien.lemoal, hare, qemu-block, Sam Li

This tests is mainly a helper to indicate append writes in block layer
behaves as expected.

Signed-off-by: Sam Li <faithilikerun@gmail.com>
---
 qemu-io-cmds.c                     | 63 ++++++++++++++++++++++++++++++
 tests/qemu-iotests/tests/zoned.out |  7 ++++
 tests/qemu-iotests/tests/zoned.sh  |  9 +++++
 3 files changed, 79 insertions(+)

diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index c1b28ea108..ca92291a44 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -1856,6 +1856,68 @@ static const cmdinfo_t zone_reset_cmd = {
     .oneline = "reset a zone write pointer in zone block device",
 };
 
+static int do_aio_zone_append(BlockBackend *blk, QEMUIOVector *qiov,
+                              int64_t *offset, int flags, int *total)
+{
+    int async_ret = NOT_DONE;
+
+    blk_aio_zone_append(blk, offset, qiov, flags, aio_rw_done, &async_ret);
+    while (async_ret == NOT_DONE) {
+        main_loop_wait(false);
+    }
+
+    *total = qiov->size;
+    return async_ret < 0 ? async_ret : 1;
+}
+
+static int zone_append_f(BlockBackend *blk, int argc, char **argv)
+{
+    int ret;
+    int flags = 0;
+    int total = 0;
+    int64_t offset;
+    char *buf;
+    int nr_iov;
+    int pattern = 0xcd;
+    QEMUIOVector qiov;
+
+    if (optind > argc - 2) {
+        return -EINVAL;
+    }
+    optind++;
+    offset = cvtnum(argv[optind]);
+    if (offset < 0) {
+        print_cvtnum_err(offset, argv[optind]);
+        return offset;
+    }
+    optind++;
+    nr_iov = argc - optind;
+    buf = create_iovec(blk, &qiov, &argv[optind], nr_iov, pattern);
+    if (buf == NULL) {
+        return -EINVAL;
+    }
+    ret = do_aio_zone_append(blk, &qiov, &offset, flags, &total);
+    if (ret < 0) {
+        printf("zone append failed: %s\n", strerror(-ret));
+        goto out;
+    }
+
+    out:
+    qemu_iovec_destroy(&qiov);
+    qemu_io_free(buf);
+    return ret;
+}
+
+static const cmdinfo_t zone_append_cmd = {
+    .name = "zone_append",
+    .altname = "zap",
+    .cfunc = zone_append_f,
+    .argmin = 3,
+    .argmax = 3,
+    .args = "offset len [len..]",
+    .oneline = "append write a number of bytes at a specified offset",
+};
+
 static int truncate_f(BlockBackend *blk, int argc, char **argv);
 static const cmdinfo_t truncate_cmd = {
     .name       = "truncate",
@@ -2653,6 +2715,7 @@ static void __attribute((constructor)) init_qemuio_commands(void)
     qemuio_add_command(&zone_close_cmd);
     qemuio_add_command(&zone_finish_cmd);
     qemuio_add_command(&zone_reset_cmd);
+    qemuio_add_command(&zone_append_cmd);
     qemuio_add_command(&truncate_cmd);
     qemuio_add_command(&length_cmd);
     qemuio_add_command(&info_cmd);
diff --git a/tests/qemu-iotests/tests/zoned.out b/tests/qemu-iotests/tests/zoned.out
index 0c8f96deb9..b3b139b4ec 100644
--- a/tests/qemu-iotests/tests/zoned.out
+++ b/tests/qemu-iotests/tests/zoned.out
@@ -50,4 +50,11 @@ start: 0x80000, len 0x80000, cap 0x80000, wptr 0x100000, zcond:14, [type: 2]
 (5) resetting the second zone
 After resetting a zone:
 start: 0x80000, len 0x80000, cap 0x80000, wptr 0x80000, zcond:1, [type: 2]
+
+
+(6) append write
+After appending the first zone:
+start: 0x0, len 0x80000, cap 0x80000, wptr 0x18, zcond:2, [type: 2]
+After appending the second zone:
+start: 0x80000, len 0x80000, cap 0x80000, wptr 0x80018, zcond:2, [type: 2]
 *** done
diff --git a/tests/qemu-iotests/tests/zoned.sh b/tests/qemu-iotests/tests/zoned.sh
index fced0194c5..888711eef2 100755
--- a/tests/qemu-iotests/tests/zoned.sh
+++ b/tests/qemu-iotests/tests/zoned.sh
@@ -79,6 +79,15 @@ echo "(5) resetting the second zone"
 sudo $QEMU_IO $IMG -c "zrs 268435456 268435456"
 echo "After resetting a zone:"
 sudo $QEMU_IO $IMG -c "zrp 268435456 1"
+echo
+echo
+echo "(6) append write" # physical block size of the device is 4096
+sudo $QEMU_IO $IMG -c "zap 0 0x1000 0x2000"
+echo "After appending the first zone:"
+sudo $QEMU_IO $IMG -c "zrp 0 1"
+sudo $QEMU_IO $IMG -c "zap 268435456 0x1000 0x2000"
+echo "After appending the second zone:"
+sudo $QEMU_IO $IMG -c "zrp 268435456 1"
 
 # success, all done
 echo "*** done"
-- 
2.37.3



^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH v4 1/3] file-posix: add the tracking of the zones write pointers
  2022-10-16 14:56 ` [PATCH v4 1/3] file-posix: add the tracking of the zones write pointers Sam Li
@ 2022-10-17  0:57   ` Dmitry Fomichev
  2022-10-17  5:11   ` Damien Le Moal
  1 sibling, 0 replies; 9+ messages in thread
From: Dmitry Fomichev @ 2022-10-17  0:57 UTC (permalink / raw)
  To: faithilikerun@gmail.com, qemu-devel@nongnu.org
  Cc: stefanha@redhat.com, hreitz@redhat.com, fam@euphon.net,
	kwolf@redhat.com, hare@suse.de, damien.lemoal@opensource.wdc.com,
	qemu-block@nongnu.org

On Sun, 2022-10-16 at 22:56 +0800, Sam Li wrote:
> Since Linux doesn't have a user API to issue zone append operations to
> zoned devices from user space, the file-posix driver is modified to add
> zone append emulation using regular writes. To do this, the file-posix
> driver tracks the wp location of all zones of the device. It uses an
> array of uint64_t. The most significant bit of each wp location indicates
> if the zone type is conventional zones.
> 
> The zones wp can be changed due to the following operations issued:
> - zone reset: change the wp to the start offset of that zone
> - zone finish: change to the end location of that zone
> - write to a zone
> - zone append
> 
> Signed-off-by: Sam Li <faithilikerun@gmail.com>
> ---
>  block/file-posix.c               | 144 +++++++++++++++++++++++++++++++
>  include/block/block-common.h     |  14 +++
>  include/block/block_int-common.h |   3 +
>  3 files changed, 161 insertions(+)
> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 7c5a330fc1..5ff5500301 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -1324,6 +1324,66 @@ static int hdev_get_max_segments(int fd, struct stat
> *st)
>  #endif
>  }
>  
> +#if defined(CONFIG_BLKZONED)
> +static int get_zones_wp(int fd, BlockZoneWps *wps, int64_t offset,
> +                        unsigned int nrz) {
> +    struct blk_zone *blkz;
> +    int64_t rep_size;

size_t

> +    int64_t sector = offset >> BDRV_SECTOR_BITS;

uint64_t

> +    int ret, n = 0, i = 0;
> +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
> +    g_autofree struct blk_zone_report *rep = NULL;
> +
> +    rep = g_malloc(rep_size);
> +    blkz = (struct blk_zone *)(rep + 1);
> +    while (n < nrz) {
> +        memset(rep, 0, rep_size);
> +        rep->sector = sector;
> +        rep->nr_zones = nrz - n;
> +
> +        do {
> +            ret = ioctl(fd, BLKREPORTZONE, rep);
> +        } while (ret != 0 && errno == EINTR);
> +        if (ret != 0) {
> +            error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
> +                    fd, offset, errno);
> +            return -errno;
> +        }
> +
> +        if (!rep->nr_zones) {
> +            break;
> +        }
> +
> +        for (i = 0; i < rep->nr_zones; i++, n++) {
> +            /*
> +             * The wp tracking cares only about sequential writes required and
> +             * sequential write preferred zones so that the wp can advance to
> +             * the right location.
> +             * Use the most significant bit of the wp location to indicate the
> +             * zone type: 0 for SWR/SWP zones and 1 for conventional zones.
> +             */
> +            if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) {
> +                wps->wp[i] = 1ULL << 63;
> +            } else {
> +                wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS;
> +            }
> +        }
> +        sector = blkz[i - 1].start + blkz[i - 1].len;
> +    }
> +
> +    return 0;
> +}
> +
> +static void update_zones_wp(int fd, BlockZoneWps *wps, int64_t offset,
> +                            unsigned int nrz) {
> +    qemu_mutex_lock(&wps->lock);
> +    if (get_zones_wp(fd, wps, offset, nrz) < 0) {
> +        error_report("update zone wp failed");
> +    }
> +    qemu_mutex_unlock(&wps->lock);
> +}
> +#endif
> +
>  static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>  {
>      BDRVRawState *s = bs->opaque;
> @@ -1414,6 +1474,14 @@ static void raw_refresh_limits(BlockDriverState *bs,
> Error **errp)
>          if (ret >= 0) {
>              bs->bl.max_active_zones = ret;
>          }
> +
> +        bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret);
> +        if (get_zones_wp(s->fd, bs->bl.wps, 0, ret) < 0) {
> +            error_report("report wps failed");
> +            g_free(bs->bl.wps);
> +            return;
> +        }
> +        qemu_mutex_init(&bs->bl.wps->lock);
>      }
>  }
>  
> @@ -1725,6 +1793,25 @@ static int handle_aiocb_rw(void *opaque)
>  
>  out:
>      if (nbytes == aiocb->aio_nbytes) {
> +#if defined(CONFIG_BLKZONED)
> +        if (aiocb->aio_type & QEMU_AIO_WRITE) {
> +            BlockZoneWps *wps = aiocb->bs->bl.wps;
> +            int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
> +            if (wps) {

In my testing, I get a divide by zero exception in the "index"
calculation above. Changing this part as follows

-            int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
-            if (wps) {
+            if (wps && aiocb->bs->bl.zone_size) {
+                int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
+

fixes the crash.

> +                qemu_mutex_lock(&wps->lock);
> +                if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
> +                    uint64_t wend_offset =
> +                            aiocb->aio_offset + aiocb->aio_nbytes;
> +
> +                    /* Advance the wp if needed */
> +                    if (wend_offset > wps->wp[index]) {
> +                        wps->wp[index] = wend_offset;
> +                    }
> +                }
> +                qemu_mutex_unlock(&wps->lock);
> +            }
> +        }
> +#endif
>          return 0;
>      } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) {
>          if (aiocb->aio_type & QEMU_AIO_WRITE) {
> @@ -1736,6 +1823,11 @@ out:
>          }
>      } else {
>          assert(nbytes < 0);
> +#if defined(CONFIG_BLKZONED)
> +        if (aiocb->aio_type & QEMU_AIO_WRITE) {
> +            update_zones_wp(aiocb->aio_fildes, aiocb->bs->bl.wps, 0, 1);
> +        }
> +#endif
>          return nbytes;
>      }
>  }
> @@ -2022,14 +2114,29 @@ static int handle_aiocb_zone_report(void *opaque)
>  #endif
>  
>  #if defined(CONFIG_BLKZONED)
> +static bool zone_is_empty(BlockDriverState *bs, int64_t index)
> +{
> +    return bs->bl.wps->wp[index] == index * bs->bl.zone_size;
> +}
> +
>  static int handle_aiocb_zone_mgmt(void *opaque)
>  {
>      RawPosixAIOData *aiocb = opaque;
> +    BlockDriverState *bs = aiocb->bs;
>      int fd = aiocb->aio_fildes;
>      int64_t sector = aiocb->aio_offset / 512;
>      int64_t nr_sectors = aiocb->aio_nbytes / 512;
>      struct blk_zone_range range;
>      int ret;
> +    uint64_t wend_offset;
> +    BlockZoneWps *wps = bs->bl.wps;
> +    int index = aiocb->aio_offset / bs->bl.zone_size;

uint32_t

> +
> +    if (BDRV_ZT_IS_CONV(wps->wp[index]) &&
> +        aiocb->aio_nbytes != bs->bl.capacity) {
> +        error_report("zone mgmt operations not allowed for conventional
> zones");
> +        return -EIO;
> +    }
>  
>      /* Execute the operation */
>      range.sector = sector;
> @@ -2038,11 +2145,42 @@ static int handle_aiocb_zone_mgmt(void *opaque)
>          ret = ioctl(fd, aiocb->zone_mgmt.zone_op, &range);
>      } while (ret != 0 && errno == EINTR);
>      if (ret != 0) {
> +        update_zones_wp(aiocb->aio_fildes, aiocb->bs->bl.wps,
> +                        aiocb->aio_offset,
> +                        aiocb->aio_nbytes / bs->bl.zone_size);
>          ret = -errno;
>          error_report("ioctl %s failed %d", aiocb->zone_mgmt.zone_op_name,
>                       ret);
>          return ret;
>      }
> +    qemu_mutex_lock(&wps->lock);
> +    if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
> +        /*
> +         * The zoned device allows the last zone smaller that the zone size.
> +         */
> +        if (aiocb->aio_nbytes < bs->bl.zone_size) {
> +            wend_offset = aiocb->aio_offset + aiocb->aio_nbytes;
> +        } else {
> +            wend_offset = aiocb->aio_offset + bs->bl.zone_size;
> +        }
> +
> +        if (!zone_is_empty(bs, index) &&
> +            aiocb->zone_mgmt.zone_op == BLKRESETZONE) {
> +            if (aiocb->aio_nbytes == bs->bl.capacity) {
> +                for (int i = 0; i < bs->bl.nr_zones; ++i) {
> +                    if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
> +                        wps->wp[i] = i * bs->bl.zone_size;
> +                    }
> +                }
> +            } else {
> +                wps->wp[index] = aiocb->aio_offset;
> +            }
> +        } else if (aiocb->zone_mgmt.zone_op == BLKFINISHZONE) {
> +            wps->wp[index] = wend_offset;
> +        }
> +    }
> +    qemu_mutex_unlock(&wps->lock);
> +
>      return 0;
>  }
>  #endif
> @@ -2483,6 +2621,12 @@ static void raw_close(BlockDriverState *bs)
>      BDRVRawState *s = bs->opaque;
>  
>      if (s->fd >= 0) {
> +#if defined(CONFIG_BLKZONED)
> +        if (bs->bl.wps) {
> +            qemu_mutex_destroy(&bs->bl.wps->lock);
> +            g_free(bs->bl.wps);
> +        }
> +#endif
>          qemu_close(s->fd);
>          s->fd = -1;
>      }
> diff --git a/include/block/block-common.h b/include/block/block-common.h
> index 36bd0e480e..52372f8252 100644
> --- a/include/block/block-common.h
> +++ b/include/block/block-common.h
> @@ -92,6 +92,14 @@ typedef struct BlockZoneDescriptor {
>      BlockZoneCondition cond;
>  } BlockZoneDescriptor;
>  
> +/*
> + * Track write pointers of a zone in bytes.
> + */
> +typedef struct BlockZoneWps {
> +    QemuMutex lock;
> +    uint64_t wp[];
> +} BlockZoneWps;
> +
>  typedef struct BlockDriverInfo {
>      /* in bytes, 0 if irrelevant */
>      int cluster_size;
> @@ -205,6 +213,12 @@ typedef enum {
>  #define BDRV_SECTOR_BITS   9
>  #define BDRV_SECTOR_SIZE   (1ULL << BDRV_SECTOR_BITS)
>  
> +/*
> + * Get the first most significant bit of wp. If it is zero, then
> + * the zone type is SWR.
> + */
> +#define BDRV_ZT_IS_CONV(wp)    (wp & (1ULL << 63))
> +
>  #define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \
>                                             INT_MAX >> BDRV_SECTOR_BITS)
>  #define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS)
> diff --git a/include/block/block_int-common.h b/include/block/block_int-
> common.h
> index 37dddc603c..e3136146aa 100644
> --- a/include/block/block_int-common.h
> +++ b/include/block/block_int-common.h
> @@ -857,6 +857,9 @@ typedef struct BlockLimits {
>  
>      /* device capacity expressed in bytes */
>      int64_t capacity;
> +
> +    /* array of write pointers' location of each zone in the zoned device. */
> +    BlockZoneWps *wps;
>  } BlockLimits;
>  
>  typedef struct BdrvOpBlocker BdrvOpBlocker;


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v4 2/3] block: introduce zone append write for zoned devices
  2022-10-16 14:56 ` [PATCH v4 2/3] block: introduce zone append write for zoned devices Sam Li
@ 2022-10-17  0:58   ` Dmitry Fomichev
  2022-10-17  5:22   ` Damien Le Moal
  1 sibling, 0 replies; 9+ messages in thread
From: Dmitry Fomichev @ 2022-10-17  0:58 UTC (permalink / raw)
  To: faithilikerun@gmail.com, qemu-devel@nongnu.org
  Cc: stefanha@redhat.com, hreitz@redhat.com, fam@euphon.net,
	kwolf@redhat.com, hare@suse.de, damien.lemoal@opensource.wdc.com,
	qemu-block@nongnu.org

On Sun, 2022-10-16 at 22:56 +0800, Sam Li wrote:
> A zone append command is a write operation that specifies the first
> logical block of a zone as the write position. When writing to a zoned
> block device using zone append, the byte offset of writes is pointing
> to the write pointer of that zone. Upon completion the device will
> respond with the position the data has been written in the zone.
> 
> Signed-off-by: Sam Li <faithilikerun@gmail.com>
> ---
>  block/block-backend.c             | 65 ++++++++++++++++++++++
>  block/file-posix.c                | 89 +++++++++++++++++++++++++++++--
>  block/io.c                        | 21 ++++++++
>  block/raw-format.c                |  8 +++
>  include/block/block-io.h          |  3 ++
>  include/block/block_int-common.h  |  5 ++
>  include/block/raw-aio.h           |  4 +-
>  include/sysemu/block-backend-io.h |  9 ++++
>  8 files changed, 198 insertions(+), 6 deletions(-)
> 
> diff --git a/block/block-backend.c b/block/block-backend.c
> index 1c618e9c68..06931ddd24 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -1439,6 +1439,9 @@ typedef struct BlkRwCo {
>          struct {
>              unsigned long op;
>          } zone_mgmt;
> +        struct {
> +            int64_t *append_sector;
> +        } zone_append;
>      };
>  } BlkRwCo;
>  
> @@ -1871,6 +1874,47 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk,
> BlockZoneOp op,
>      return &acb->common;
>  }
>  
> +static void coroutine_fn blk_aio_zone_append_entry(void *opaque)
> +{
> +    BlkAioEmAIOCB *acb = opaque;
> +    BlkRwCo *rwco = &acb->rwco;
> +
> +    rwco->ret = blk_co_zone_append(rwco->blk, rwco->zone_append.append_sector,
> +                                   rwco->iobuf, rwco->flags);
> +    blk_aio_complete(acb);
> +}
> +
> +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
> +                                QEMUIOVector *qiov, BdrvRequestFlags flags,
> +                                BlockCompletionFunc *cb, void *opaque) {
> +    BlkAioEmAIOCB *acb;
> +    Coroutine *co;
> +    IO_CODE();
> +
> +    blk_inc_in_flight(blk);
> +    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
> +    acb->rwco = (BlkRwCo) {
> +            .blk    = blk,
> +            .ret    = NOT_DONE,
> +            .flags  = flags,
> +            .iobuf  = qiov,
> +            .zone_append = {
> +                    .append_sector = offset,
> +            },
> +    };
> +    acb->has_returned = false;
> +
> +    co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
> +    bdrv_coroutine_enter(blk_bs(blk), co);
> +    acb->has_returned = true;
> +    if (acb->rwco.ret != NOT_DONE) {
> +        replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
> +                                         blk_aio_complete_bh, acb);
> +    }
> +
> +    return &acb->common;
> +}
> +
>  /*
>   * Send a zone_report command.
>   * offset is a byte offset from the start of the device. No alignment
> @@ -1923,6 +1967,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk,
> BlockZoneOp op,
>      return ret;
>  }
>  
> +/*
> + * Send a zone_append command.
> + */
> +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
> +        QEMUIOVector *qiov, BdrvRequestFlags flags)
> +{
> +    int ret;
> +    IO_CODE();
> +
> +    blk_inc_in_flight(blk);
> +    blk_wait_while_drained(blk);
> +    if (!blk_is_available(blk)) {
> +        blk_dec_in_flight(blk);
> +        return -ENOMEDIUM;
> +    }
> +
> +    ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
> +    blk_dec_in_flight(blk);
> +    return ret;
> +}
> +
>  void blk_drain(BlockBackend *blk)
>  {
>      BlockDriverState *bs = blk_bs(blk);
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 5ff5500301..3d0cc33d02 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -205,6 +205,7 @@ typedef struct RawPosixAIOData {
>          struct {
>              struct iovec *iov;
>              int niov;
> +            int64_t *offset;
>          } io;
>          struct {
>              uint64_t cmd;
> @@ -1475,6 +1476,11 @@ static void raw_refresh_limits(BlockDriverState *bs,
> Error **errp)
>              bs->bl.max_active_zones = ret;
>          }
>  
> +        ret = get_sysfs_long_val(&st, "physical_block_size");
> +        if (ret >= 0) {
> +            bs->bl.write_granularity = ret;
> +        }
> +
>          bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret);
>          if (get_zones_wp(s->fd, bs->bl.wps, 0, ret) < 0) {
>              error_report("report wps failed");
> @@ -1647,9 +1653,18 @@ qemu_pwritev(int fd, const struct iovec *iov, int
> nr_iov, off_t offset)
>  static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
>  {
>      ssize_t len;
> +    BlockZoneWps *wps = aiocb->bs->bl.wps;
> +    int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;

Can this code ever be called for a non-zoned device with 0 zone size?
If yes, you need to avoid division by zero here...

> +
> +    if (wps) {
> +        qemu_mutex_lock(&wps->lock);
> +        if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) {
> +            aiocb->aio_offset = wps->wp[index];
> +        }
> +    }
>  
>      do {
> -        if (aiocb->aio_type & QEMU_AIO_WRITE)
> +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))
>              len = qemu_pwritev(aiocb->aio_fildes,
>                                 aiocb->io.iov,
>                                 aiocb->io.niov,
> @@ -1660,6 +1675,9 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData
> *aiocb)
>                                aiocb->io.niov,
>                                aiocb->aio_offset);
>      } while (len == -1 && errno == EINTR);
> +    if (wps) {
> +        qemu_mutex_unlock(&wps->lock);
> +    }
>  
>      if (len == -1) {
>          return -errno;
> @@ -1677,9 +1695,17 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData
> *aiocb, char *buf)
>  {
>      ssize_t offset = 0;
>      ssize_t len;
> +    BlockZoneWps *wps = aiocb->bs->bl.wps;
> +    int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;

same here

>  
> +    if (wps) {
> +        qemu_mutex_lock(&wps->lock);
> +        if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) {
> +            aiocb->aio_offset = wps->wp[index];
> +        }
> +    }
>      while (offset < aiocb->aio_nbytes) {
> -        if (aiocb->aio_type & QEMU_AIO_WRITE) {
> +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
>              len = pwrite(aiocb->aio_fildes,
>                           (const char *)buf + offset,
>                           aiocb->aio_nbytes - offset,
> @@ -1709,6 +1735,9 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData
> *aiocb, char *buf)
>          }
>          offset += len;
>      }
> +    if (wps) {
> +        qemu_mutex_unlock(&wps->lock);
> +    }
>  
>      return offset;
>  }
> @@ -1772,7 +1801,7 @@ static int handle_aiocb_rw(void *opaque)
>      }
>  
>      nbytes = handle_aiocb_rw_linear(aiocb, buf);
> -    if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
> +    if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) {
>          char *p = buf;
>          size_t count = aiocb->aio_nbytes, copy;
>          int i;
> @@ -1794,7 +1823,7 @@ static int handle_aiocb_rw(void *opaque)
>  out:
>      if (nbytes == aiocb->aio_nbytes) {
>  #if defined(CONFIG_BLKZONED)
> -        if (aiocb->aio_type & QEMU_AIO_WRITE) {
> +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
>              BlockZoneWps *wps = aiocb->bs->bl.wps;
>              int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;

and here

>              if (wps) {
> @@ -1802,6 +1831,9 @@ out:
>                  if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
>                      uint64_t wend_offset =
>                              aiocb->aio_offset + aiocb->aio_nbytes;
> +                    if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) {
> +                        *aiocb->io.offset = wps->wp[index];
> +                    }
>  
>                      /* Advance the wp if needed */
>                      if (wend_offset > wps->wp[index]) {
> @@ -1824,7 +1856,7 @@ out:
>      } else {
>          assert(nbytes < 0);
>  #if defined(CONFIG_BLKZONED)
> -        if (aiocb->aio_type & QEMU_AIO_WRITE) {
> +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
>              update_zones_wp(aiocb->aio_fildes, aiocb->bs->bl.wps, 0, 1);
>          }
>  #endif
> @@ -3478,6 +3510,52 @@ static int coroutine_fn
> raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>  }
>  #endif
>  
> +#if defined(CONFIG_BLKZONED)
> +static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
> +                                           int64_t *offset,
> +                                           QEMUIOVector *qiov,
> +                                           BdrvRequestFlags flags) {
> +    BDRVRawState *s = bs->opaque;
> +    int64_t zone_size_mask = bs->bl.zone_size - 1;
> +    int64_t iov_len = 0;
> +    int64_t len = 0;
> +    RawPosixAIOData acb;
> +
> +    if (*offset & zone_size_mask) {
> +        error_report("sector offset %" PRId64 " is not aligned to zone size "
> +                     "%" PRId32 "", *offset / 512, bs->bl.zone_size / 512);
> +        return -EINVAL;
> +    }
> +
> +    int64_t wg = bs->bl.write_granularity;
> +    int64_t wg_mask = wg - 1;
> +    for (int i = 0; i < qiov->niov; i++) {
> +        iov_len = qiov->iov[i].iov_len;
> +        if (iov_len & wg_mask) {
> +            error_report("len of IOVector[%d] %" PRId64 " is not aligned to "
> +                         "block size %" PRId64 "", i, iov_len, wg);
> +            return -EINVAL;
> +        }
> +        len += iov_len;
> +    }
> +
> +    acb = (RawPosixAIOData) {
> +        .bs = bs,
> +        .aio_fildes = s->fd,
> +        .aio_type = QEMU_AIO_ZONE_APPEND,
> +        .aio_offset = *offset,
> +        .aio_nbytes = len,
> +        .io = {
> +                .iov = qiov->iov,
> +                .niov = qiov->niov,
> +                .offset = offset,
> +        },
> +    };
> +
> +    return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb);
> +}
> +#endif
> +
>  static coroutine_fn int
>  raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
>                  bool blkdev)
> @@ -4253,6 +4331,7 @@ static BlockDriver bdrv_zoned_host_device = {
>      /* zone management operations */
>      .bdrv_co_zone_report = raw_co_zone_report,
>      .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
> +    .bdrv_co_zone_append = raw_co_zone_append,
>  };
>  #endif
>  
> diff --git a/block/io.c b/block/io.c
> index 88f707ea4d..03e1109056 100644
> --- a/block/io.c
> +++ b/block/io.c
> @@ -3230,6 +3230,27 @@ out:
>      return co.ret;
>  }
>  
> +int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset,
> +                        QEMUIOVector *qiov,
> +                        BdrvRequestFlags flags)
> +{
> +    BlockDriver *drv = bs->drv;
> +    CoroutineIOCompletion co = {
> +            .coroutine = qemu_coroutine_self(),
> +    };
> +    IO_CODE();
> +
> +    bdrv_inc_in_flight(bs);
> +    if (!drv || !drv->bdrv_co_zone_append) {
> +        co.ret = -ENOTSUP;
> +        goto out;
> +    }
> +    co.ret = drv->bdrv_co_zone_append(bs, offset, qiov, flags);
> +out:
> +    bdrv_dec_in_flight(bs);
> +    return co.ret;
> +}
> +
>  void *qemu_blockalign(BlockDriverState *bs, size_t size)
>  {
>      IO_CODE();
> diff --git a/block/raw-format.c b/block/raw-format.c
> index 18dc52a150..33bff8516e 100644
> --- a/block/raw-format.c
> +++ b/block/raw-format.c
> @@ -325,6 +325,13 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState
> *bs, BlockZoneOp op,
>      return bdrv_co_zone_mgmt(bs->file->bs, op, offset, len);
>  }
>  
> +static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
> +                                           int64_t *offset,
> +                                           QEMUIOVector *qiov,
> +                                           BdrvRequestFlags flags) {
> +    return bdrv_co_zone_append(bs->file->bs, offset, qiov, flags);
> +}
> +
>  static int64_t raw_getlength(BlockDriverState *bs)
>  {
>      int64_t len;
> @@ -629,6 +636,7 @@ BlockDriver bdrv_raw = {
>      .bdrv_co_pdiscard     = &raw_co_pdiscard,
>      .bdrv_co_zone_report  = &raw_co_zone_report,
>      .bdrv_co_zone_mgmt  = &raw_co_zone_mgmt,
> +    .bdrv_co_zone_append = &raw_co_zone_append,
>      .bdrv_co_block_status = &raw_co_block_status,
>      .bdrv_co_copy_range_from = &raw_co_copy_range_from,
>      .bdrv_co_copy_range_to  = &raw_co_copy_range_to,
> diff --git a/include/block/block-io.h b/include/block/block-io.h
> index f0cdf67d33..6a54453578 100644
> --- a/include/block/block-io.h
> +++ b/include/block/block-io.h
> @@ -94,6 +94,9 @@ int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs,
> int64_t offset,
>                                       BlockZoneDescriptor *zones);
>  int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>                                     int64_t offset, int64_t len);
> +int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset,
> +                                     QEMUIOVector *qiov,
> +                                     BdrvRequestFlags flags);
>  
>  int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes);
>  bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs);
> diff --git a/include/block/block_int-common.h b/include/block/block_int-
> common.h
> index e3136146aa..a7e7db5646 100644
> --- a/include/block/block_int-common.h
> +++ b/include/block/block_int-common.h
> @@ -701,6 +701,9 @@ struct BlockDriver {
>              BlockZoneDescriptor *zones);
>      int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, BlockZoneOp
> op,
>              int64_t offset, int64_t len);
> +    int coroutine_fn (*bdrv_co_zone_append)(BlockDriverState *bs,
> +            int64_t *offset, QEMUIOVector *qiov,
> +            BdrvRequestFlags flags);
>  
>      /* removable device specific */
>      bool (*bdrv_is_inserted)(BlockDriverState *bs);
> @@ -860,6 +863,8 @@ typedef struct BlockLimits {
>  
>      /* array of write pointers' location of each zone in the zoned device. */
>      BlockZoneWps *wps;
> +
> +    int64_t write_granularity;
>  } BlockLimits;
>  
>  typedef struct BdrvOpBlocker BdrvOpBlocker;
> diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
> index 877b2240b3..53033a5ca7 100644
> --- a/include/block/raw-aio.h
> +++ b/include/block/raw-aio.h
> @@ -31,6 +31,7 @@
>  #define QEMU_AIO_TRUNCATE     0x0080
>  #define QEMU_AIO_ZONE_REPORT  0x0100
>  #define QEMU_AIO_ZONE_MGMT    0x0200
> +#define QEMU_AIO_ZONE_APPEND  0x0400
>  #define QEMU_AIO_TYPE_MASK \
>          (QEMU_AIO_READ | \
>           QEMU_AIO_WRITE | \
> @@ -41,7 +42,8 @@
>           QEMU_AIO_COPY_RANGE | \
>           QEMU_AIO_TRUNCATE | \
>           QEMU_AIO_ZONE_REPORT | \
> -         QEMU_AIO_ZONE_MGMT)
> +         QEMU_AIO_ZONE_MGMT | \
> +         QEMU_AIO_ZONE_APPEND)
>  
>  /* AIO flags */
>  #define QEMU_AIO_MISALIGNED   0x1000
> diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-
> io.h
> index 1b5fc7db6b..ff9f777f52 100644
> --- a/include/sysemu/block-backend-io.h
> +++ b/include/sysemu/block-backend-io.h
> @@ -52,6 +52,9 @@ BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t
> offset,
>  BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
>                                int64_t offset, int64_t len,
>                                BlockCompletionFunc *cb, void *opaque);
> +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
> +                                QEMUIOVector *qiov, BdrvRequestFlags flags,
> +                                BlockCompletionFunc *cb, void *opaque);
>  BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes,
>                               BlockCompletionFunc *cb, void *opaque);
>  void blk_aio_cancel_async(BlockAIOCB *acb);
> @@ -173,6 +176,12 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk,
> BlockZoneOp op,
>                                    int64_t offset, int64_t len);
>  int generated_co_wrapper blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
>                                         int64_t offset, int64_t len);
> +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
> +                                    QEMUIOVector *qiov,
> +                                    BdrvRequestFlags flags);
> +int generated_co_wrapper blk_zone_append(BlockBackend *blk, int64_t *offset,
> +                                         QEMUIOVector *qiov,
> +                                         BdrvRequestFlags flags);
>  
>  int generated_co_wrapper blk_pdiscard(BlockBackend *blk, int64_t offset,
>                                        int64_t bytes);


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v4 1/3] file-posix: add the tracking of the zones write pointers
  2022-10-16 14:56 ` [PATCH v4 1/3] file-posix: add the tracking of the zones write pointers Sam Li
  2022-10-17  0:57   ` Dmitry Fomichev
@ 2022-10-17  5:11   ` Damien Le Moal
  1 sibling, 0 replies; 9+ messages in thread
From: Damien Le Moal @ 2022-10-17  5:11 UTC (permalink / raw)
  To: Sam Li, qemu-devel
  Cc: stefanha, dmitry.fomichev, Hanna Reitz, Kevin Wolf, Fam Zheng,
	hare, qemu-block

On 10/16/22 23:56, Sam Li wrote:
> Since Linux doesn't have a user API to issue zone append operations to
> zoned devices from user space, the file-posix driver is modified to add
> zone append emulation using regular writes. To do this, the file-posix
> driver tracks the wp location of all zones of the device. It uses an
> array of uint64_t. The most significant bit of each wp location indicates
> if the zone type is conventional zones.
> 
> The zones wp can be changed due to the following operations issued:
> - zone reset: change the wp to the start offset of that zone
> - zone finish: change to the end location of that zone
> - write to a zone
> - zone append
> 
> Signed-off-by: Sam Li <faithilikerun@gmail.com>
> ---
>  block/file-posix.c               | 144 +++++++++++++++++++++++++++++++
>  include/block/block-common.h     |  14 +++
>  include/block/block_int-common.h |   3 +
>  3 files changed, 161 insertions(+)
> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 7c5a330fc1..5ff5500301 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -1324,6 +1324,66 @@ static int hdev_get_max_segments(int fd, struct stat *st)
>  #endif
>  }
>  
> +#if defined(CONFIG_BLKZONED)
> +static int get_zones_wp(int fd, BlockZoneWps *wps, int64_t offset,
> +                        unsigned int nrz) {
> +    struct blk_zone *blkz;
> +    int64_t rep_size;
> +    int64_t sector = offset >> BDRV_SECTOR_BITS;
> +    int ret, n = 0, i = 0;
> +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
> +    g_autofree struct blk_zone_report *rep = NULL;
> +
> +    rep = g_malloc(rep_size);
> +    blkz = (struct blk_zone *)(rep + 1);
> +    while (n < nrz) {
> +        memset(rep, 0, rep_size);
> +        rep->sector = sector;
> +        rep->nr_zones = nrz - n;
> +
> +        do {
> +            ret = ioctl(fd, BLKREPORTZONE, rep);
> +        } while (ret != 0 && errno == EINTR);
> +        if (ret != 0) {
> +            error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
> +                    fd, offset, errno);
> +            return -errno;
> +        }
> +
> +        if (!rep->nr_zones) {
> +            break;
> +        }
> +
> +        for (i = 0; i < rep->nr_zones; i++, n++) {
> +            /*
> +             * The wp tracking cares only about sequential writes required and
> +             * sequential write preferred zones so that the wp can advance to
> +             * the right location.
> +             * Use the most significant bit of the wp location to indicate the
> +             * zone type: 0 for SWR/SWP zones and 1 for conventional zones.
> +             */
> +            if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) {
> +                wps->wp[i] = 1ULL << 63;
> +            } else {
> +                wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS;

Nit: For full, read-only and offline zones, the wp of a zone is undefined,
that is, its value may be total garbage and should not be used. The kernel
will normally report a wp set to zone start + zone len for these cases,
but better do the same here too. So this single line should be something
like this:

switch (blkz[i].cond) {
case BLK_ZONE_COND_FULL:
case BLK_ZONE_COND_READONLY:
    /* Zone not writeable */
    wps->wp[i] = (blkz[i].start + blkz[i].len) << BDRV_SECTOR_BITS;
    break;
case BLK_ZONE_COND_OFFLINE:
    /* Zone not writable nor readable */
    wps->wp[i] = blkz[i].start << BDRV_SECTOR_BITS;
    break;
default:
    wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS;
    break;
}

> +            }
> +        }
> +        sector = blkz[i - 1].start + blkz[i - 1].len;
> +    }
> +
> +    return 0;
> +}
> +
> +static void update_zones_wp(int fd, BlockZoneWps *wps, int64_t offset,
> +                            unsigned int nrz) {
> +    qemu_mutex_lock(&wps->lock);
> +    if (get_zones_wp(fd, wps, offset, nrz) < 0) {
> +        error_report("update zone wp failed");
> +    }
> +    qemu_mutex_unlock(&wps->lock);
> +}
> +#endif
> +
>  static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>  {
>      BDRVRawState *s = bs->opaque;
> @@ -1414,6 +1474,14 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>          if (ret >= 0) {
>              bs->bl.max_active_zones = ret;
>          }
> +
> +        bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret);
> +        if (get_zones_wp(s->fd, bs->bl.wps, 0, ret) < 0) {
> +            error_report("report wps failed");
> +            g_free(bs->bl.wps);
> +            return;
> +        }
> +        qemu_mutex_init(&bs->bl.wps->lock);
>      }
>  }
>  
> @@ -1725,6 +1793,25 @@ static int handle_aiocb_rw(void *opaque)
>  
>  out:
>      if (nbytes == aiocb->aio_nbytes) {
> +#if defined(CONFIG_BLKZONED)
> +        if (aiocb->aio_type & QEMU_AIO_WRITE) {
> +            BlockZoneWps *wps = aiocb->bs->bl.wps;
> +            int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
> +            if (wps) {
> +                qemu_mutex_lock(&wps->lock);
> +                if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
> +                    uint64_t wend_offset =
> +                            aiocb->aio_offset + aiocb->aio_nbytes;
> +
> +                    /* Advance the wp if needed */
> +                    if (wend_offset > wps->wp[index]) {
> +                        wps->wp[index] = wend_offset;
> +                    }
> +                }
> +                qemu_mutex_unlock(&wps->lock);
> +            }
> +        }
> +#endif
>          return 0;
>      } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) {
>          if (aiocb->aio_type & QEMU_AIO_WRITE) {
> @@ -1736,6 +1823,11 @@ out:
>          }
>      } else {
>          assert(nbytes < 0);
> +#if defined(CONFIG_BLKZONED)
> +        if (aiocb->aio_type & QEMU_AIO_WRITE) {
> +            update_zones_wp(aiocb->aio_fildes, aiocb->bs->bl.wps, 0, 1);
> +        }
> +#endif
>          return nbytes;
>      }
>  }
> @@ -2022,14 +2114,29 @@ static int handle_aiocb_zone_report(void *opaque)
>  #endif
>  
>  #if defined(CONFIG_BLKZONED)
> +static bool zone_is_empty(BlockDriverState *bs, int64_t index)
> +{
> +    return bs->bl.wps->wp[index] == index * bs->bl.zone_size;
> +}
> +
>  static int handle_aiocb_zone_mgmt(void *opaque)
>  {
>      RawPosixAIOData *aiocb = opaque;
> +    BlockDriverState *bs = aiocb->bs;
>      int fd = aiocb->aio_fildes;
>      int64_t sector = aiocb->aio_offset / 512;
>      int64_t nr_sectors = aiocb->aio_nbytes / 512;
>      struct blk_zone_range range;
>      int ret;
> +    uint64_t wend_offset;
> +    BlockZoneWps *wps = bs->bl.wps;
> +    int index = aiocb->aio_offset / bs->bl.zone_size;
> +
> +    if (BDRV_ZT_IS_CONV(wps->wp[index]) &&
> +        aiocb->aio_nbytes != bs->bl.capacity) {
> +        error_report("zone mgmt operations not allowed for conventional zones");
> +        return -EIO;
> +    }
>  
>      /* Execute the operation */
>      range.sector = sector;
> @@ -2038,11 +2145,42 @@ static int handle_aiocb_zone_mgmt(void *opaque)
>          ret = ioctl(fd, aiocb->zone_mgmt.zone_op, &range);
>      } while (ret != 0 && errno == EINTR);
>      if (ret != 0) {
> +        update_zones_wp(aiocb->aio_fildes, aiocb->bs->bl.wps,
> +                        aiocb->aio_offset,
> +                        aiocb->aio_nbytes / bs->bl.zone_size);
>          ret = -errno;
>          error_report("ioctl %s failed %d", aiocb->zone_mgmt.zone_op_name,
>                       ret);
>          return ret;
>      }
> +    qemu_mutex_lock(&wps->lock);

This lock MUST be done before the ioctl() operation. Otherwise, you can
get operation and wp update reversal resulting in a state inconsistent
with the device state.

Note that for the write operation you are not taking the lock before the
write operation execution, which is OK if you consider only write
requeuests thanks to the "if (wend_offset > wps->wp[index]) {" code.
But if you consider write requests and zone management operations
simultaneously executed by different qemu workers, you get to an
inconsistent state too. So you must take the lock before starting a write
operation too.

operation + wp update must be serialized against simultaneous operations
for the same zone.

> +    if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
> +        /*
> +         * The zoned device allows the last zone smaller that the zone size.
> +         */
> +        if (aiocb->aio_nbytes < bs->bl.zone_size) {
> +            wend_offset = aiocb->aio_offset + aiocb->aio_nbytes;
> +        } else {
> +            wend_offset = aiocb->aio_offset + bs->bl.zone_size;
> +        }
> +
> +        if (!zone_is_empty(bs, index) &&
> +            aiocb->zone_mgmt.zone_op == BLKRESETZONE) {
> +            if (aiocb->aio_nbytes == bs->bl.capacity) {
> +                for (int i = 0; i < bs->bl.nr_zones; ++i) {
> +                    if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
> +                        wps->wp[i] = i * bs->bl.zone_size;
> +                    }
> +                }
> +            } else {
> +                wps->wp[index] = aiocb->aio_offset;
> +            }
> +        } else if (aiocb->zone_mgmt.zone_op == BLKFINISHZONE) {
> +            wps->wp[index] = wend_offset;
> +        }
> +    }
> +    qemu_mutex_unlock(&wps->lock);
> +
>      return 0;
>  }
>  #endif
> @@ -2483,6 +2621,12 @@ static void raw_close(BlockDriverState *bs)
>      BDRVRawState *s = bs->opaque;
>  
>      if (s->fd >= 0) {
> +#if defined(CONFIG_BLKZONED)
> +        if (bs->bl.wps) {
> +            qemu_mutex_destroy(&bs->bl.wps->lock);
> +            g_free(bs->bl.wps);
> +        }
> +#endif
>          qemu_close(s->fd);
>          s->fd = -1;
>      }
> diff --git a/include/block/block-common.h b/include/block/block-common.h
> index 36bd0e480e..52372f8252 100644
> --- a/include/block/block-common.h
> +++ b/include/block/block-common.h
> @@ -92,6 +92,14 @@ typedef struct BlockZoneDescriptor {
>      BlockZoneCondition cond;
>  } BlockZoneDescriptor;
>  
> +/*
> + * Track write pointers of a zone in bytes.
> + */
> +typedef struct BlockZoneWps {
> +    QemuMutex lock;
> +    uint64_t wp[];
> +} BlockZoneWps;
> +
>  typedef struct BlockDriverInfo {
>      /* in bytes, 0 if irrelevant */
>      int cluster_size;
> @@ -205,6 +213,12 @@ typedef enum {
>  #define BDRV_SECTOR_BITS   9
>  #define BDRV_SECTOR_SIZE   (1ULL << BDRV_SECTOR_BITS)
>  
> +/*
> + * Get the first most significant bit of wp. If it is zero, then
> + * the zone type is SWR.
> + */
> +#define BDRV_ZT_IS_CONV(wp)    (wp & (1ULL << 63))
> +
>  #define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \
>                                             INT_MAX >> BDRV_SECTOR_BITS)
>  #define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS)
> diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
> index 37dddc603c..e3136146aa 100644
> --- a/include/block/block_int-common.h
> +++ b/include/block/block_int-common.h
> @@ -857,6 +857,9 @@ typedef struct BlockLimits {
>  
>      /* device capacity expressed in bytes */
>      int64_t capacity;
> +
> +    /* array of write pointers' location of each zone in the zoned device. */
> +    BlockZoneWps *wps;
>  } BlockLimits;
>  
>  typedef struct BdrvOpBlocker BdrvOpBlocker;

-- 
Damien Le Moal
Western Digital Research



^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v4 2/3] block: introduce zone append write for zoned devices
  2022-10-16 14:56 ` [PATCH v4 2/3] block: introduce zone append write for zoned devices Sam Li
  2022-10-17  0:58   ` Dmitry Fomichev
@ 2022-10-17  5:22   ` Damien Le Moal
  2022-10-18  8:48     ` Sam Li
  1 sibling, 1 reply; 9+ messages in thread
From: Damien Le Moal @ 2022-10-17  5:22 UTC (permalink / raw)
  To: Sam Li, qemu-devel
  Cc: stefanha, dmitry.fomichev, Hanna Reitz, Kevin Wolf, Fam Zheng,
	hare, qemu-block

On 10/16/22 23:56, Sam Li wrote:
> A zone append command is a write operation that specifies the first
> logical block of a zone as the write position. When writing to a zoned
> block device using zone append, the byte offset of writes is pointing
> to the write pointer of that zone. Upon completion the device will
> respond with the position the data has been written in the zone.
> 
> Signed-off-by: Sam Li <faithilikerun@gmail.com>
> ---
>  block/block-backend.c             | 65 ++++++++++++++++++++++
>  block/file-posix.c                | 89 +++++++++++++++++++++++++++++--
>  block/io.c                        | 21 ++++++++
>  block/raw-format.c                |  8 +++
>  include/block/block-io.h          |  3 ++
>  include/block/block_int-common.h  |  5 ++
>  include/block/raw-aio.h           |  4 +-
>  include/sysemu/block-backend-io.h |  9 ++++
>  8 files changed, 198 insertions(+), 6 deletions(-)
> 
> diff --git a/block/block-backend.c b/block/block-backend.c
> index 1c618e9c68..06931ddd24 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -1439,6 +1439,9 @@ typedef struct BlkRwCo {
>          struct {
>              unsigned long op;
>          } zone_mgmt;
> +        struct {
> +            int64_t *append_sector;

As mentioned previosuly, call this sector. "append" is already in the
zone_append struct member name....

> +        } zone_append;
>      };
>  } BlkRwCo;
>  
> @@ -1871,6 +1874,47 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
>      return &acb->common;
>  }
>  
> +static void coroutine_fn blk_aio_zone_append_entry(void *opaque)
> +{
> +    BlkAioEmAIOCB *acb = opaque;
> +    BlkRwCo *rwco = &acb->rwco;
> +
> +    rwco->ret = blk_co_zone_append(rwco->blk, rwco->zone_append.append_sector,

...so you avoid awkward repetitions of "append" like here. You'll have:
rwco->zone_append.sector, which is shorter and more natural.

> +                                   rwco->iobuf, rwco->flags);
> +    blk_aio_complete(acb);
> +}
> +
> +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
> +                                QEMUIOVector *qiov, BdrvRequestFlags flags,
> +                                BlockCompletionFunc *cb, void *opaque) {
> +    BlkAioEmAIOCB *acb;
> +    Coroutine *co;
> +    IO_CODE();
> +
> +    blk_inc_in_flight(blk);
> +    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
> +    acb->rwco = (BlkRwCo) {
> +            .blk    = blk,
> +            .ret    = NOT_DONE,
> +            .flags  = flags,
> +            .iobuf  = qiov,
> +            .zone_append = {
> +                    .append_sector = offset,
> +            },
> +    };
> +    acb->has_returned = false;
> +
> +    co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
> +    bdrv_coroutine_enter(blk_bs(blk), co);
> +    acb->has_returned = true;
> +    if (acb->rwco.ret != NOT_DONE) {
> +        replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
> +                                         blk_aio_complete_bh, acb);
> +    }
> +
> +    return &acb->common;
> +}
> +
>  /*
>   * Send a zone_report command.
>   * offset is a byte offset from the start of the device. No alignment
> @@ -1923,6 +1967,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
>      return ret;
>  }
>  
> +/*
> + * Send a zone_append command.
> + */
> +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
> +        QEMUIOVector *qiov, BdrvRequestFlags flags)
> +{
> +    int ret;
> +    IO_CODE();
> +
> +    blk_inc_in_flight(blk);
> +    blk_wait_while_drained(blk);
> +    if (!blk_is_available(blk)) {
> +        blk_dec_in_flight(blk);
> +        return -ENOMEDIUM;
> +    }
> +
> +    ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
> +    blk_dec_in_flight(blk);
> +    return ret;
> +}
> +
>  void blk_drain(BlockBackend *blk)
>  {
>      BlockDriverState *bs = blk_bs(blk);
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 5ff5500301..3d0cc33d02 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -205,6 +205,7 @@ typedef struct RawPosixAIOData {
>          struct {
>              struct iovec *iov;
>              int niov;
> +            int64_t *offset;
>          } io;
>          struct {
>              uint64_t cmd;
> @@ -1475,6 +1476,11 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>              bs->bl.max_active_zones = ret;
>          }
>  
> +        ret = get_sysfs_long_val(&st, "physical_block_size");
> +        if (ret >= 0) {
> +            bs->bl.write_granularity = ret;
> +        }
> +
>          bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret);
>          if (get_zones_wp(s->fd, bs->bl.wps, 0, ret) < 0) {
>              error_report("report wps failed");
> @@ -1647,9 +1653,18 @@ qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
>  static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
>  {
>      ssize_t len;
> +    BlockZoneWps *wps = aiocb->bs->bl.wps;
> +    int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
> +
> +    if (wps) {
> +        qemu_mutex_lock(&wps->lock);
> +        if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) {
> +            aiocb->aio_offset = wps->wp[index];
> +        }
> +    }
>  
>      do {
> -        if (aiocb->aio_type & QEMU_AIO_WRITE)
> +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))
>              len = qemu_pwritev(aiocb->aio_fildes,
>                                 aiocb->io.iov,
>                                 aiocb->io.niov,
> @@ -1660,6 +1675,9 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
>                                aiocb->io.niov,
>                                aiocb->aio_offset);
>      } while (len == -1 && errno == EINTR);
> +    if (wps) {
> +        qemu_mutex_unlock(&wps->lock);

As commented in the previous patch, you cannot release the lock until you
update the wp array entry. So this means that you should be taking the wp
lock at the beginning of handle_aiocb_rw() and release it only after the
wp array is updated. That will also simplify the code and avoid spreading
lock/unlock of that array to many places.

> +    }
>  
>      if (len == -1) {
>          return -errno;
> @@ -1677,9 +1695,17 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
>  {
>      ssize_t offset = 0;
>      ssize_t len;
> +    BlockZoneWps *wps = aiocb->bs->bl.wps;
> +    int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
>  
> +    if (wps) {
> +        qemu_mutex_lock(&wps->lock);
> +        if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) {
> +            aiocb->aio_offset = wps->wp[index];
> +        }
> +    }
>      while (offset < aiocb->aio_nbytes) {
> -        if (aiocb->aio_type & QEMU_AIO_WRITE) {
> +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
>              len = pwrite(aiocb->aio_fildes,
>                           (const char *)buf + offset,
>                           aiocb->aio_nbytes - offset,
> @@ -1709,6 +1735,9 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
>          }
>          offset += len;
>      }
> +    if (wps) {
> +        qemu_mutex_unlock(&wps->lock);

Same comment as above.

> +    }
>  
>      return offset;
>  }
> @@ -1772,7 +1801,7 @@ static int handle_aiocb_rw(void *opaque)
>      }
>  
>      nbytes = handle_aiocb_rw_linear(aiocb, buf);
> -    if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
> +    if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) {
>          char *p = buf;
>          size_t count = aiocb->aio_nbytes, copy;
>          int i;
> @@ -1794,7 +1823,7 @@ static int handle_aiocb_rw(void *opaque)
>  out:
>      if (nbytes == aiocb->aio_nbytes) {
>  #if defined(CONFIG_BLKZONED)
> -        if (aiocb->aio_type & QEMU_AIO_WRITE) {
> +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
>              BlockZoneWps *wps = aiocb->bs->bl.wps;
>              int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
>              if (wps) {
> @@ -1802,6 +1831,9 @@ out:
>                  if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
>                      uint64_t wend_offset =
>                              aiocb->aio_offset + aiocb->aio_nbytes;
> +                    if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) {
> +                        *aiocb->io.offset = wps->wp[index];
> +                    }

The comment above will address the problem here, which is that you are
accessing the array without it being locked. You cannot do that.

>  
>                      /* Advance the wp if needed */
>                      if (wend_offset > wps->wp[index]) {
> @@ -1824,7 +1856,7 @@ out:
>      } else {
>          assert(nbytes < 0);
>  #if defined(CONFIG_BLKZONED)
> -        if (aiocb->aio_type & QEMU_AIO_WRITE) {
> +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
>              update_zones_wp(aiocb->aio_fildes, aiocb->bs->bl.wps, 0, 1);
>          }
>  #endif
> @@ -3478,6 +3510,52 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>  }
>  #endif
>  
> +#if defined(CONFIG_BLKZONED)
> +static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
> +                                           int64_t *offset,
> +                                           QEMUIOVector *qiov,
> +                                           BdrvRequestFlags flags) {
> +    BDRVRawState *s = bs->opaque;
> +    int64_t zone_size_mask = bs->bl.zone_size - 1;
> +    int64_t iov_len = 0;
> +    int64_t len = 0;
> +    RawPosixAIOData acb;
> +
> +    if (*offset & zone_size_mask) {
> +        error_report("sector offset %" PRId64 " is not aligned to zone size "
> +                     "%" PRId32 "", *offset / 512, bs->bl.zone_size / 512);
> +        return -EINVAL;
> +    }
> +
> +    int64_t wg = bs->bl.write_granularity;
> +    int64_t wg_mask = wg - 1;
> +    for (int i = 0; i < qiov->niov; i++) {
> +        iov_len = qiov->iov[i].iov_len;
> +        if (iov_len & wg_mask) {
> +            error_report("len of IOVector[%d] %" PRId64 " is not aligned to "
> +                         "block size %" PRId64 "", i, iov_len, wg);
> +            return -EINVAL;
> +        }
> +        len += iov_len;
> +    }
> +
> +    acb = (RawPosixAIOData) {
> +        .bs = bs,
> +        .aio_fildes = s->fd,
> +        .aio_type = QEMU_AIO_ZONE_APPEND,
> +        .aio_offset = *offset,
> +        .aio_nbytes = len,
> +        .io = {
> +                .iov = qiov->iov,
> +                .niov = qiov->niov,
> +                .offset = offset,
> +        },
> +    };
> +
> +    return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb);
> +}
> +#endif
> +
>  static coroutine_fn int
>  raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
>                  bool blkdev)
> @@ -4253,6 +4331,7 @@ static BlockDriver bdrv_zoned_host_device = {
>      /* zone management operations */
>      .bdrv_co_zone_report = raw_co_zone_report,
>      .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
> +    .bdrv_co_zone_append = raw_co_zone_append,
>  };
>  #endif
>  
> diff --git a/block/io.c b/block/io.c
> index 88f707ea4d..03e1109056 100644
> --- a/block/io.c
> +++ b/block/io.c
> @@ -3230,6 +3230,27 @@ out:
>      return co.ret;
>  }
>  
> +int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset,
> +                        QEMUIOVector *qiov,
> +                        BdrvRequestFlags flags)
> +{
> +    BlockDriver *drv = bs->drv;
> +    CoroutineIOCompletion co = {
> +            .coroutine = qemu_coroutine_self(),
> +    };
> +    IO_CODE();
> +
> +    bdrv_inc_in_flight(bs);
> +    if (!drv || !drv->bdrv_co_zone_append) {
> +        co.ret = -ENOTSUP;
> +        goto out;
> +    }
> +    co.ret = drv->bdrv_co_zone_append(bs, offset, qiov, flags);
> +out:
> +    bdrv_dec_in_flight(bs);
> +    return co.ret;
> +}
> +
>  void *qemu_blockalign(BlockDriverState *bs, size_t size)
>  {
>      IO_CODE();
> diff --git a/block/raw-format.c b/block/raw-format.c
> index 18dc52a150..33bff8516e 100644
> --- a/block/raw-format.c
> +++ b/block/raw-format.c
> @@ -325,6 +325,13 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>      return bdrv_co_zone_mgmt(bs->file->bs, op, offset, len);
>  }
>  
> +static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
> +                                           int64_t *offset,
> +                                           QEMUIOVector *qiov,
> +                                           BdrvRequestFlags flags) {
> +    return bdrv_co_zone_append(bs->file->bs, offset, qiov, flags);
> +}
> +
>  static int64_t raw_getlength(BlockDriverState *bs)
>  {
>      int64_t len;
> @@ -629,6 +636,7 @@ BlockDriver bdrv_raw = {
>      .bdrv_co_pdiscard     = &raw_co_pdiscard,
>      .bdrv_co_zone_report  = &raw_co_zone_report,
>      .bdrv_co_zone_mgmt  = &raw_co_zone_mgmt,
> +    .bdrv_co_zone_append = &raw_co_zone_append,
>      .bdrv_co_block_status = &raw_co_block_status,
>      .bdrv_co_copy_range_from = &raw_co_copy_range_from,
>      .bdrv_co_copy_range_to  = &raw_co_copy_range_to,
> diff --git a/include/block/block-io.h b/include/block/block-io.h
> index f0cdf67d33..6a54453578 100644
> --- a/include/block/block-io.h
> +++ b/include/block/block-io.h
> @@ -94,6 +94,9 @@ int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
>                                       BlockZoneDescriptor *zones);
>  int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>                                     int64_t offset, int64_t len);
> +int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset,
> +                                     QEMUIOVector *qiov,
> +                                     BdrvRequestFlags flags);
>  
>  int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes);
>  bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs);
> diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
> index e3136146aa..a7e7db5646 100644
> --- a/include/block/block_int-common.h
> +++ b/include/block/block_int-common.h
> @@ -701,6 +701,9 @@ struct BlockDriver {
>              BlockZoneDescriptor *zones);
>      int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, BlockZoneOp op,
>              int64_t offset, int64_t len);
> +    int coroutine_fn (*bdrv_co_zone_append)(BlockDriverState *bs,
> +            int64_t *offset, QEMUIOVector *qiov,
> +            BdrvRequestFlags flags);
>  
>      /* removable device specific */
>      bool (*bdrv_is_inserted)(BlockDriverState *bs);
> @@ -860,6 +863,8 @@ typedef struct BlockLimits {
>  
>      /* array of write pointers' location of each zone in the zoned device. */
>      BlockZoneWps *wps;
> +
> +    int64_t write_granularity;
>  } BlockLimits;
>  
>  typedef struct BdrvOpBlocker BdrvOpBlocker;
> diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
> index 877b2240b3..53033a5ca7 100644
> --- a/include/block/raw-aio.h
> +++ b/include/block/raw-aio.h
> @@ -31,6 +31,7 @@
>  #define QEMU_AIO_TRUNCATE     0x0080
>  #define QEMU_AIO_ZONE_REPORT  0x0100
>  #define QEMU_AIO_ZONE_MGMT    0x0200
> +#define QEMU_AIO_ZONE_APPEND  0x0400
>  #define QEMU_AIO_TYPE_MASK \
>          (QEMU_AIO_READ | \
>           QEMU_AIO_WRITE | \
> @@ -41,7 +42,8 @@
>           QEMU_AIO_COPY_RANGE | \
>           QEMU_AIO_TRUNCATE | \
>           QEMU_AIO_ZONE_REPORT | \
> -         QEMU_AIO_ZONE_MGMT)
> +         QEMU_AIO_ZONE_MGMT | \
> +         QEMU_AIO_ZONE_APPEND)
>  
>  /* AIO flags */
>  #define QEMU_AIO_MISALIGNED   0x1000
> diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h
> index 1b5fc7db6b..ff9f777f52 100644
> --- a/include/sysemu/block-backend-io.h
> +++ b/include/sysemu/block-backend-io.h
> @@ -52,6 +52,9 @@ BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
>  BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
>                                int64_t offset, int64_t len,
>                                BlockCompletionFunc *cb, void *opaque);
> +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
> +                                QEMUIOVector *qiov, BdrvRequestFlags flags,
> +                                BlockCompletionFunc *cb, void *opaque);
>  BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes,
>                               BlockCompletionFunc *cb, void *opaque);
>  void blk_aio_cancel_async(BlockAIOCB *acb);
> @@ -173,6 +176,12 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
>                                    int64_t offset, int64_t len);
>  int generated_co_wrapper blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
>                                         int64_t offset, int64_t len);
> +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
> +                                    QEMUIOVector *qiov,
> +                                    BdrvRequestFlags flags);
> +int generated_co_wrapper blk_zone_append(BlockBackend *blk, int64_t *offset,
> +                                         QEMUIOVector *qiov,
> +                                         BdrvRequestFlags flags);
>  
>  int generated_co_wrapper blk_pdiscard(BlockBackend *blk, int64_t offset,
>                                        int64_t bytes);

-- 
Damien Le Moal
Western Digital Research



^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v4 2/3] block: introduce zone append write for zoned devices
  2022-10-17  5:22   ` Damien Le Moal
@ 2022-10-18  8:48     ` Sam Li
  0 siblings, 0 replies; 9+ messages in thread
From: Sam Li @ 2022-10-18  8:48 UTC (permalink / raw)
  To: Damien Le Moal
  Cc: qemu-devel, stefanha, dmitry.fomichev, Hanna Reitz, Kevin Wolf,
	Fam Zheng, hare, qemu-block

Damien Le Moal <damien.lemoal@opensource.wdc.com> 于2022年10月17日周一 13:22写道:
>
> On 10/16/22 23:56, Sam Li wrote:
> > A zone append command is a write operation that specifies the first
> > logical block of a zone as the write position. When writing to a zoned
> > block device using zone append, the byte offset of writes is pointing
> > to the write pointer of that zone. Upon completion the device will
> > respond with the position the data has been written in the zone.
> >
> > Signed-off-by: Sam Li <faithilikerun@gmail.com>
> > ---
> >  block/block-backend.c             | 65 ++++++++++++++++++++++
> >  block/file-posix.c                | 89 +++++++++++++++++++++++++++++--
> >  block/io.c                        | 21 ++++++++
> >  block/raw-format.c                |  8 +++
> >  include/block/block-io.h          |  3 ++
> >  include/block/block_int-common.h  |  5 ++
> >  include/block/raw-aio.h           |  4 +-
> >  include/sysemu/block-backend-io.h |  9 ++++
> >  8 files changed, 198 insertions(+), 6 deletions(-)
> >
> > diff --git a/block/block-backend.c b/block/block-backend.c
> > index 1c618e9c68..06931ddd24 100644
> > --- a/block/block-backend.c
> > +++ b/block/block-backend.c
> > @@ -1439,6 +1439,9 @@ typedef struct BlkRwCo {
> >          struct {
> >              unsigned long op;
> >          } zone_mgmt;
> > +        struct {
> > +            int64_t *append_sector;
>
> As mentioned previosuly, call this sector. "append" is already in the
> zone_append struct member name....

Right, missed it here. I'll use "offset" then since it's in block layer.

>
> > +        } zone_append;
> >      };
> >  } BlkRwCo;
> >
> > @@ -1871,6 +1874,47 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
> >      return &acb->common;
> >  }
> >
> > +static void coroutine_fn blk_aio_zone_append_entry(void *opaque)
> > +{
> > +    BlkAioEmAIOCB *acb = opaque;
> > +    BlkRwCo *rwco = &acb->rwco;
> > +
> > +    rwco->ret = blk_co_zone_append(rwco->blk, rwco->zone_append.append_sector,
>
> ...so you avoid awkward repetitions of "append" like here. You'll have:
> rwco->zone_append.sector, which is shorter and more natural.
>
> > +                                   rwco->iobuf, rwco->flags);
> > +    blk_aio_complete(acb);
> > +}
> > +
> > +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
> > +                                QEMUIOVector *qiov, BdrvRequestFlags flags,
> > +                                BlockCompletionFunc *cb, void *opaque) {
> > +    BlkAioEmAIOCB *acb;
> > +    Coroutine *co;
> > +    IO_CODE();
> > +
> > +    blk_inc_in_flight(blk);
> > +    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
> > +    acb->rwco = (BlkRwCo) {
> > +            .blk    = blk,
> > +            .ret    = NOT_DONE,
> > +            .flags  = flags,
> > +            .iobuf  = qiov,
> > +            .zone_append = {
> > +                    .append_sector = offset,
> > +            },
> > +    };
> > +    acb->has_returned = false;
> > +
> > +    co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
> > +    bdrv_coroutine_enter(blk_bs(blk), co);
> > +    acb->has_returned = true;
> > +    if (acb->rwco.ret != NOT_DONE) {
> > +        replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
> > +                                         blk_aio_complete_bh, acb);
> > +    }
> > +
> > +    return &acb->common;
> > +}
> > +
> >  /*
> >   * Send a zone_report command.
> >   * offset is a byte offset from the start of the device. No alignment
> > @@ -1923,6 +1967,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
> >      return ret;
> >  }
> >
> > +/*
> > + * Send a zone_append command.
> > + */
> > +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
> > +        QEMUIOVector *qiov, BdrvRequestFlags flags)
> > +{
> > +    int ret;
> > +    IO_CODE();
> > +
> > +    blk_inc_in_flight(blk);
> > +    blk_wait_while_drained(blk);
> > +    if (!blk_is_available(blk)) {
> > +        blk_dec_in_flight(blk);
> > +        return -ENOMEDIUM;
> > +    }
> > +
> > +    ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
> > +    blk_dec_in_flight(blk);
> > +    return ret;
> > +}
> > +
> >  void blk_drain(BlockBackend *blk)
> >  {
> >      BlockDriverState *bs = blk_bs(blk);
> > diff --git a/block/file-posix.c b/block/file-posix.c
> > index 5ff5500301..3d0cc33d02 100644
> > --- a/block/file-posix.c
> > +++ b/block/file-posix.c
> > @@ -205,6 +205,7 @@ typedef struct RawPosixAIOData {
> >          struct {
> >              struct iovec *iov;
> >              int niov;
> > +            int64_t *offset;
> >          } io;
> >          struct {
> >              uint64_t cmd;
> > @@ -1475,6 +1476,11 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
> >              bs->bl.max_active_zones = ret;
> >          }
> >
> > +        ret = get_sysfs_long_val(&st, "physical_block_size");
> > +        if (ret >= 0) {
> > +            bs->bl.write_granularity = ret;
> > +        }
> > +
> >          bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret);
> >          if (get_zones_wp(s->fd, bs->bl.wps, 0, ret) < 0) {
> >              error_report("report wps failed");
> > @@ -1647,9 +1653,18 @@ qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
> >  static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
> >  {
> >      ssize_t len;
> > +    BlockZoneWps *wps = aiocb->bs->bl.wps;
> > +    int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
> > +
> > +    if (wps) {
> > +        qemu_mutex_lock(&wps->lock);
> > +        if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) {
> > +            aiocb->aio_offset = wps->wp[index];
> > +        }
> > +    }
> >
> >      do {
> > -        if (aiocb->aio_type & QEMU_AIO_WRITE)
> > +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))
> >              len = qemu_pwritev(aiocb->aio_fildes,
> >                                 aiocb->io.iov,
> >                                 aiocb->io.niov,
> > @@ -1660,6 +1675,9 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
> >                                aiocb->io.niov,
> >                                aiocb->aio_offset);
> >      } while (len == -1 && errno == EINTR);
> > +    if (wps) {
> > +        qemu_mutex_unlock(&wps->lock);
>
> As commented in the previous patch, you cannot release the lock until you
> update the wp array entry. So this means that you should be taking the wp
> lock at the beginning of handle_aiocb_rw() and release it only after the
> wp array is updated. That will also simplify the code and avoid spreading
> lock/unlock of that array to many places.
>
> > +    }
> >
> >      if (len == -1) {
> >          return -errno;
> > @@ -1677,9 +1695,17 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
> >  {
> >      ssize_t offset = 0;
> >      ssize_t len;
> > +    BlockZoneWps *wps = aiocb->bs->bl.wps;
> > +    int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
> >
> > +    if (wps) {
> > +        qemu_mutex_lock(&wps->lock);
> > +        if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) {
> > +            aiocb->aio_offset = wps->wp[index];
> > +        }
> > +    }
> >      while (offset < aiocb->aio_nbytes) {
> > -        if (aiocb->aio_type & QEMU_AIO_WRITE) {
> > +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
> >              len = pwrite(aiocb->aio_fildes,
> >                           (const char *)buf + offset,
> >                           aiocb->aio_nbytes - offset,
> > @@ -1709,6 +1735,9 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
> >          }
> >          offset += len;
> >      }
> > +    if (wps) {
> > +        qemu_mutex_unlock(&wps->lock);
>
> Same comment as above.
>
> > +    }
> >
> >      return offset;
> >  }
> > @@ -1772,7 +1801,7 @@ static int handle_aiocb_rw(void *opaque)
> >      }
> >
> >      nbytes = handle_aiocb_rw_linear(aiocb, buf);
> > -    if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
> > +    if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) {
> >          char *p = buf;
> >          size_t count = aiocb->aio_nbytes, copy;
> >          int i;
> > @@ -1794,7 +1823,7 @@ static int handle_aiocb_rw(void *opaque)
> >  out:
> >      if (nbytes == aiocb->aio_nbytes) {
> >  #if defined(CONFIG_BLKZONED)
> > -        if (aiocb->aio_type & QEMU_AIO_WRITE) {
> > +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
> >              BlockZoneWps *wps = aiocb->bs->bl.wps;
> >              int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
> >              if (wps) {
> > @@ -1802,6 +1831,9 @@ out:
> >                  if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
> >                      uint64_t wend_offset =
> >                              aiocb->aio_offset + aiocb->aio_nbytes;
> > +                    if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) {
> > +                        *aiocb->io.offset = wps->wp[index];
> > +                    }
>
> The comment above will address the problem here, which is that you are
> accessing the array without it being locked. You cannot do that.
>
> >
> >                      /* Advance the wp if needed */
> >                      if (wend_offset > wps->wp[index]) {
> > @@ -1824,7 +1856,7 @@ out:
> >      } else {
> >          assert(nbytes < 0);
> >  #if defined(CONFIG_BLKZONED)
> > -        if (aiocb->aio_type & QEMU_AIO_WRITE) {
> > +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
> >              update_zones_wp(aiocb->aio_fildes, aiocb->bs->bl.wps, 0, 1);
> >          }
> >  #endif
> > @@ -3478,6 +3510,52 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
> >  }
> >  #endif
> >
> > +#if defined(CONFIG_BLKZONED)
> > +static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
> > +                                           int64_t *offset,
> > +                                           QEMUIOVector *qiov,
> > +                                           BdrvRequestFlags flags) {
> > +    BDRVRawState *s = bs->opaque;
> > +    int64_t zone_size_mask = bs->bl.zone_size - 1;
> > +    int64_t iov_len = 0;
> > +    int64_t len = 0;
> > +    RawPosixAIOData acb;
> > +
> > +    if (*offset & zone_size_mask) {
> > +        error_report("sector offset %" PRId64 " is not aligned to zone size "
> > +                     "%" PRId32 "", *offset / 512, bs->bl.zone_size / 512);
> > +        return -EINVAL;
> > +    }
> > +
> > +    int64_t wg = bs->bl.write_granularity;
> > +    int64_t wg_mask = wg - 1;
> > +    for (int i = 0; i < qiov->niov; i++) {
> > +        iov_len = qiov->iov[i].iov_len;
> > +        if (iov_len & wg_mask) {
> > +            error_report("len of IOVector[%d] %" PRId64 " is not aligned to "
> > +                         "block size %" PRId64 "", i, iov_len, wg);
> > +            return -EINVAL;
> > +        }
> > +        len += iov_len;
> > +    }
> > +
> > +    acb = (RawPosixAIOData) {
> > +        .bs = bs,
> > +        .aio_fildes = s->fd,
> > +        .aio_type = QEMU_AIO_ZONE_APPEND,
> > +        .aio_offset = *offset,
> > +        .aio_nbytes = len,
> > +        .io = {
> > +                .iov = qiov->iov,
> > +                .niov = qiov->niov,
> > +                .offset = offset,
> > +        },
> > +    };
> > +
> > +    return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb);
> > +}
> > +#endif
> > +
> >  static coroutine_fn int
> >  raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
> >                  bool blkdev)
> > @@ -4253,6 +4331,7 @@ static BlockDriver bdrv_zoned_host_device = {
> >      /* zone management operations */
> >      .bdrv_co_zone_report = raw_co_zone_report,
> >      .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
> > +    .bdrv_co_zone_append = raw_co_zone_append,
> >  };
> >  #endif
> >
> > diff --git a/block/io.c b/block/io.c
> > index 88f707ea4d..03e1109056 100644
> > --- a/block/io.c
> > +++ b/block/io.c
> > @@ -3230,6 +3230,27 @@ out:
> >      return co.ret;
> >  }
> >
> > +int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset,
> > +                        QEMUIOVector *qiov,
> > +                        BdrvRequestFlags flags)
> > +{
> > +    BlockDriver *drv = bs->drv;
> > +    CoroutineIOCompletion co = {
> > +            .coroutine = qemu_coroutine_self(),
> > +    };
> > +    IO_CODE();
> > +
> > +    bdrv_inc_in_flight(bs);
> > +    if (!drv || !drv->bdrv_co_zone_append) {
> > +        co.ret = -ENOTSUP;
> > +        goto out;
> > +    }
> > +    co.ret = drv->bdrv_co_zone_append(bs, offset, qiov, flags);
> > +out:
> > +    bdrv_dec_in_flight(bs);
> > +    return co.ret;
> > +}
> > +
> >  void *qemu_blockalign(BlockDriverState *bs, size_t size)
> >  {
> >      IO_CODE();
> > diff --git a/block/raw-format.c b/block/raw-format.c
> > index 18dc52a150..33bff8516e 100644
> > --- a/block/raw-format.c
> > +++ b/block/raw-format.c
> > @@ -325,6 +325,13 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
> >      return bdrv_co_zone_mgmt(bs->file->bs, op, offset, len);
> >  }
> >
> > +static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
> > +                                           int64_t *offset,
> > +                                           QEMUIOVector *qiov,
> > +                                           BdrvRequestFlags flags) {
> > +    return bdrv_co_zone_append(bs->file->bs, offset, qiov, flags);
> > +}
> > +
> >  static int64_t raw_getlength(BlockDriverState *bs)
> >  {
> >      int64_t len;
> > @@ -629,6 +636,7 @@ BlockDriver bdrv_raw = {
> >      .bdrv_co_pdiscard     = &raw_co_pdiscard,
> >      .bdrv_co_zone_report  = &raw_co_zone_report,
> >      .bdrv_co_zone_mgmt  = &raw_co_zone_mgmt,
> > +    .bdrv_co_zone_append = &raw_co_zone_append,
> >      .bdrv_co_block_status = &raw_co_block_status,
> >      .bdrv_co_copy_range_from = &raw_co_copy_range_from,
> >      .bdrv_co_copy_range_to  = &raw_co_copy_range_to,
> > diff --git a/include/block/block-io.h b/include/block/block-io.h
> > index f0cdf67d33..6a54453578 100644
> > --- a/include/block/block-io.h
> > +++ b/include/block/block-io.h
> > @@ -94,6 +94,9 @@ int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
> >                                       BlockZoneDescriptor *zones);
> >  int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
> >                                     int64_t offset, int64_t len);
> > +int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset,
> > +                                     QEMUIOVector *qiov,
> > +                                     BdrvRequestFlags flags);
> >
> >  int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes);
> >  bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs);
> > diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
> > index e3136146aa..a7e7db5646 100644
> > --- a/include/block/block_int-common.h
> > +++ b/include/block/block_int-common.h
> > @@ -701,6 +701,9 @@ struct BlockDriver {
> >              BlockZoneDescriptor *zones);
> >      int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, BlockZoneOp op,
> >              int64_t offset, int64_t len);
> > +    int coroutine_fn (*bdrv_co_zone_append)(BlockDriverState *bs,
> > +            int64_t *offset, QEMUIOVector *qiov,
> > +            BdrvRequestFlags flags);
> >
> >      /* removable device specific */
> >      bool (*bdrv_is_inserted)(BlockDriverState *bs);
> > @@ -860,6 +863,8 @@ typedef struct BlockLimits {
> >
> >      /* array of write pointers' location of each zone in the zoned device. */
> >      BlockZoneWps *wps;
> > +
> > +    int64_t write_granularity;
> >  } BlockLimits;
> >
> >  typedef struct BdrvOpBlocker BdrvOpBlocker;
> > diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
> > index 877b2240b3..53033a5ca7 100644
> > --- a/include/block/raw-aio.h
> > +++ b/include/block/raw-aio.h
> > @@ -31,6 +31,7 @@
> >  #define QEMU_AIO_TRUNCATE     0x0080
> >  #define QEMU_AIO_ZONE_REPORT  0x0100
> >  #define QEMU_AIO_ZONE_MGMT    0x0200
> > +#define QEMU_AIO_ZONE_APPEND  0x0400
> >  #define QEMU_AIO_TYPE_MASK \
> >          (QEMU_AIO_READ | \
> >           QEMU_AIO_WRITE | \
> > @@ -41,7 +42,8 @@
> >           QEMU_AIO_COPY_RANGE | \
> >           QEMU_AIO_TRUNCATE | \
> >           QEMU_AIO_ZONE_REPORT | \
> > -         QEMU_AIO_ZONE_MGMT)
> > +         QEMU_AIO_ZONE_MGMT | \
> > +         QEMU_AIO_ZONE_APPEND)
> >
> >  /* AIO flags */
> >  #define QEMU_AIO_MISALIGNED   0x1000
> > diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h
> > index 1b5fc7db6b..ff9f777f52 100644
> > --- a/include/sysemu/block-backend-io.h
> > +++ b/include/sysemu/block-backend-io.h
> > @@ -52,6 +52,9 @@ BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
> >  BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
> >                                int64_t offset, int64_t len,
> >                                BlockCompletionFunc *cb, void *opaque);
> > +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
> > +                                QEMUIOVector *qiov, BdrvRequestFlags flags,
> > +                                BlockCompletionFunc *cb, void *opaque);
> >  BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes,
> >                               BlockCompletionFunc *cb, void *opaque);
> >  void blk_aio_cancel_async(BlockAIOCB *acb);
> > @@ -173,6 +176,12 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
> >                                    int64_t offset, int64_t len);
> >  int generated_co_wrapper blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
> >                                         int64_t offset, int64_t len);
> > +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
> > +                                    QEMUIOVector *qiov,
> > +                                    BdrvRequestFlags flags);
> > +int generated_co_wrapper blk_zone_append(BlockBackend *blk, int64_t *offset,
> > +                                         QEMUIOVector *qiov,
> > +                                         BdrvRequestFlags flags);
> >
> >  int generated_co_wrapper blk_pdiscard(BlockBackend *blk, int64_t offset,
> >                                        int64_t bytes);
>
> --
> Damien Le Moal
> Western Digital Research
>


^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2022-10-18  9:03 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2022-10-16 14:56 [PATCH v4 0/3] Add zone append write for zoned device Sam Li
2022-10-16 14:56 ` [PATCH v4 1/3] file-posix: add the tracking of the zones write pointers Sam Li
2022-10-17  0:57   ` Dmitry Fomichev
2022-10-17  5:11   ` Damien Le Moal
2022-10-16 14:56 ` [PATCH v4 2/3] block: introduce zone append write for zoned devices Sam Li
2022-10-17  0:58   ` Dmitry Fomichev
2022-10-17  5:22   ` Damien Le Moal
2022-10-18  8:48     ` Sam Li
2022-10-16 14:56 ` [PATCH v4 3/3] qemu-iotests: test zone append operation Sam Li

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).