qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
From: "Denis V. Lunev" via <qemu-devel@nongnu.org>
To: qemu-devel@nongnu.org
Cc: qemu-block@nongnu.org, "Denis V. Lunev" <den@openvz.org>,
	Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>,
	Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>,
	Kevin Wolf <kwolf@redhat.com>
Subject: [PATCH 2/2] block/preallocate: fix image truncation logic
Date: Wed,  9 Oct 2024 18:37:37 +0300	[thread overview]
Message-ID: <20241009153924.158721-3-den@openvz.org> (raw)
In-Reply-To: <20241009153924.158721-1-den@openvz.org>

Recent QEMU changes around preallocate_set_perm() mandate that it is no
longer possible to poll on the aio_context inside this function. Thus
the truncate operation has been moved into a bottom half. This bottom
half is scheduled from preallocate_set_perm() and that is all.

This approach has proven to be problematic in a lot of places once
additional operations are executed over the preallocate filter in
production. The code validates that permissions have really been changed
just after the call to the set operation.

All permission operations or block driver graph changes are performed
inside the quiescent state in terms of the block layer. This means that
there are no in-flight requests, which is guaranteed by passing through
a bdrv_drain() section.

The idea is that we should effectively disable the preallocate filter
inside bdrv_drain() and unblock permission changes. This section is
definitely not on the hot path and a single additional truncate
operation will not hurt.

Unfortunately, according to the documentation, the bdrv_drain_begin()
callback also disallows waiting inside. Thus the original approach with
the bottom half is not changed. bdrv_drain_begin() schedules the
operation and, in order to ensure that it has really been executed
before completion of the section, increments the number of in-flight
requests.

Signed-off-by: Denis V. Lunev <den@openvz.org>
CC: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>
CC: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
CC: Kevin Wolf <kwolf@redhat.com>
---
 block/preallocate.c    | 42 ++++++++++++++++++++++++++++++++++++++----
 tests/qemu-iotests/298 |  6 ++++--
 2 files changed, 42 insertions(+), 6 deletions(-)

diff --git a/block/preallocate.c b/block/preallocate.c
index 1016c511cb..16a92a2e0d 100644
--- a/block/preallocate.c
+++ b/block/preallocate.c
@@ -78,6 +78,7 @@ typedef struct BDRVPreallocateState {
 
     /* Gives up the resize permission on children when parents don't need it */
     QEMUBH *drop_resize_bh;
+    bool    drop_resize_armed;
 } BDRVPreallocateState;
 
 static int preallocate_drop_resize(BlockDriverState *bs, Error **errp);
@@ -151,6 +152,7 @@ static int preallocate_open(BlockDriverState *bs, QDict *options, int flags,
      */
     s->file_end = s->zero_start = s->data_end = -EINVAL;
     s->drop_resize_bh = qemu_bh_new(preallocate_drop_resize_bh, bs);
+    s->drop_resize_armed = false;
 
     ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
     if (ret < 0) {
@@ -208,7 +210,7 @@ static void preallocate_close(BlockDriverState *bs)
     GLOBAL_STATE_CODE();
     GRAPH_RDLOCK_GUARD_MAINLOOP();
 
-    qemu_bh_cancel(s->drop_resize_bh);
+    assert(!s->drop_resize_armed);
     qemu_bh_delete(s->drop_resize_bh);
 
     if (s->data_end >= 0) {
@@ -516,6 +518,8 @@ preallocate_drop_resize(BlockDriverState *bs, Error **errp)
     BDRVPreallocateState *s = bs->opaque;
     int ret;
 
+    s->drop_resize_armed = false;
+
     if (s->data_end < 0) {
         return 0;
     }
@@ -544,6 +548,12 @@ preallocate_drop_resize(BlockDriverState *bs, Error **errp)
 
 static void preallocate_drop_resize_bh(void *opaque)
 {
+    BlockDriverState *bs = opaque;
+
+     /*
+      * In case of errors, we'll simply keep the exclusive lock on the image
+      * indefinitely.
+      */
     GLOBAL_STATE_CODE();
     GRAPH_RDLOCK_GUARD_MAINLOOP();
 
@@ -551,7 +561,9 @@ static void preallocate_drop_resize_bh(void *opaque)
      * In case of errors, we'll simply keep the exclusive lock on the image
      * indefinitely.
      */
-    preallocate_drop_resize(opaque, NULL);
+    preallocate_drop_resize(bs, NULL);
+
+    bdrv_dec_in_flight(bs);
 }
 
 static void GRAPH_RDLOCK
@@ -560,13 +572,13 @@ preallocate_set_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared)
     BDRVPreallocateState *s = bs->opaque;
 
     if (can_write_resize(perm)) {
-        qemu_bh_cancel(s->drop_resize_bh);
         if (s->data_end < 0) {
             s->data_end = s->file_end = s->zero_start =
                 bs->file->bs->total_sectors * BDRV_SECTOR_SIZE;
         }
     } else {
-        qemu_bh_schedule(s->drop_resize_bh);
+        assert(!s->drop_resize_armed);
+        assert(s->data_end < 0);
     }
 }
 
@@ -605,6 +617,26 @@ static int preallocate_check_perm(BlockDriverState *bs, uint64_t perm,
     return 0;
 }
 
+static void preallocate_drain_begin(BlockDriverState *bs)
+{
+    BDRVPreallocateState *s = bs->opaque;
+
+    if (s->data_end < 0) {
+        return;
+    }
+    if (s->drop_resize_armed) {
+        return;
+    }
+    if (s->data_end == s->file_end) {
+        s->file_end = s->zero_start = s->data_end = -EINVAL;
+        return;
+    }
+
+    s->drop_resize_armed = true;
+    bdrv_inc_in_flight(bs);
+    qemu_bh_schedule(s->drop_resize_bh);
+}
+
 static BlockDriver bdrv_preallocate_filter = {
     .format_name = "preallocate",
     .instance_size = sizeof(BDRVPreallocateState),
@@ -613,6 +645,8 @@ static BlockDriver bdrv_preallocate_filter = {
     .bdrv_open            = preallocate_open,
     .bdrv_close           = preallocate_close,
 
+    .bdrv_drain_begin     = preallocate_drain_begin,
+
     .bdrv_reopen_prepare  = preallocate_reopen_prepare,
     .bdrv_reopen_commit   = preallocate_reopen_commit,
     .bdrv_reopen_abort    = preallocate_reopen_abort,
diff --git a/tests/qemu-iotests/298 b/tests/qemu-iotests/298
index 09c9290711..fe03d29802 100755
--- a/tests/qemu-iotests/298
+++ b/tests/qemu-iotests/298
@@ -92,8 +92,10 @@ class TestPreallocateFilter(TestPreallocateBase):
         self.vm.cmd('block-commit', device='overlay')
         self.complete_and_wait()
 
-        # commit of new megabyte should trigger preallocation
-        self.check_big()
+        # commit of new megabyte should trigger preallocation, but drain
+        # will make file smaller
+        self.check_small()
+
 
     def test_reopen_opts(self):
         self.vm.cmd('blockdev-reopen', options=[{
-- 
2.43.5



  parent reply	other threads:[~2024-10-09 15:40 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-10-09 15:37 [PATCH v2 0/2] block/preallocate: fix image truncation logic Denis V. Lunev via
2024-10-09 15:37 ` [PATCH 1/2] preallocate: do not allow to change BDS permission improperly Denis V. Lunev via
2024-10-09 15:37 ` Denis V. Lunev via [this message]
  -- strict thread matches above, loose matches on Subject: below --
2024-10-09 13:58 [PATCH 0/2] block/preallocate: fix image truncation logic Denis V. Lunev via
2024-10-09 13:58 ` [PATCH 2/2] " Denis V. Lunev via
2024-10-09 14:54   ` Andrey Drobyshev
2024-10-09 14:54     ` Denis V. Lunev

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20241009153924.158721-3-den@openvz.org \
    --to=qemu-devel@nongnu.org \
    --cc=andrey.drobyshev@virtuozzo.com \
    --cc=den@openvz.org \
    --cc=kwolf@redhat.com \
    --cc=qemu-block@nongnu.org \
    --cc=vsementsov@yandex-team.ru \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).