* [PATCH for-10.2] Revert "nvme: Fix coroutine waking"
From: Hanna Czenczek @ 2025-12-12 10:25 UTC
To: qemu-block
Cc: qemu-devel, Hanna Czenczek, Lukáš Doktor,
Stefan Hajnoczi, Kevin Wolf, Fam Zheng,
Philippe Mathieu-Daudé, qemu-stable
This reverts commit 0f142cbd919fcb6cea7aa176f7e4939925806dd9.
Lukáš Doktor reported a simple single-threaded nvme test case hanging
and bisected it to this commit. While we are still investigating, it is
best to revert the commit for now.
(This breaks multiqueue for nvme, but better to have single-queue
working than neither.)
Cc: qemu-stable@nongnu.org
Reported-by: Lukáš Doktor <ldoktor@redhat.com>
Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
---
block/nvme.c | 56 +++++++++++++++++++++++++---------------------------
1 file changed, 27 insertions(+), 29 deletions(-)
diff --git a/block/nvme.c b/block/nvme.c
index 919e14cef9..c3d3b99d1f 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -1200,36 +1200,26 @@ fail:
typedef struct {
Coroutine *co;
- bool skip_yield;
int ret;
+ AioContext *ctx;
} NVMeCoData;
+static void nvme_rw_cb_bh(void *opaque)
+{
+ NVMeCoData *data = opaque;
+ qemu_coroutine_enter(data->co);
+}
+
/* Put into NVMeRequest.cb, so runs in the BDS's main AioContext */
static void nvme_rw_cb(void *opaque, int ret)
{
NVMeCoData *data = opaque;
-
data->ret = ret;
-
- if (data->co == qemu_coroutine_self()) {
- /*
- * Fast path: We are inside of the request coroutine (through
- * nvme_submit_command, nvme_deferred_fn, nvme_process_completion).
- * We can set data->skip_yield here to keep the coroutine from
- * yielding, and then we don't need to schedule a BH to wake it.
- */
- data->skip_yield = true;
- } else {
- /*
- * Safe to call: The case where we run in the request coroutine is
- * handled above, so we must be independent of it; and without
- * skip_yield set, the coroutine will yield.
- * No need to release NVMeQueuePair.lock (we are called without it
- * held). (Note: If we enter the coroutine here, @data will
- * probably be dangling once aio_co_wake() returns.)
- */
- aio_co_wake(data->co);
+ if (!data->co) {
+ /* The rw coroutine hasn't yielded, don't try to enter. */
+ return;
}
+ replay_bh_schedule_oneshot_event(data->ctx, nvme_rw_cb_bh, data);
}
static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
@@ -1253,7 +1243,7 @@ static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
.cdw12 = cpu_to_le32(cdw12),
};
NVMeCoData data = {
- .co = qemu_coroutine_self(),
+ .ctx = bdrv_get_aio_context(bs),
.ret = -EINPROGRESS,
};
@@ -1270,7 +1260,9 @@ static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
return r;
}
nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
- if (!data.skip_yield) {
+
+ data.co = qemu_coroutine_self();
+ while (data.ret == -EINPROGRESS) {
qemu_coroutine_yield();
}
@@ -1366,7 +1358,7 @@ static coroutine_fn int nvme_co_flush(BlockDriverState *bs)
.nsid = cpu_to_le32(s->nsid),
};
NVMeCoData data = {
- .co = qemu_coroutine_self(),
+ .ctx = bdrv_get_aio_context(bs),
.ret = -EINPROGRESS,
};
@@ -1374,7 +1366,9 @@ static coroutine_fn int nvme_co_flush(BlockDriverState *bs)
req = nvme_get_free_req(ioq);
assert(req);
nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
- if (!data.skip_yield) {
+
+ data.co = qemu_coroutine_self();
+ if (data.ret == -EINPROGRESS) {
qemu_coroutine_yield();
}
@@ -1415,7 +1409,7 @@ static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
};
NVMeCoData data = {
- .co = qemu_coroutine_self(),
+ .ctx = bdrv_get_aio_context(bs),
.ret = -EINPROGRESS,
};
@@ -1435,7 +1429,9 @@ static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
assert(req);
nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
- if (!data.skip_yield) {
+
+ data.co = qemu_coroutine_self();
+ while (data.ret == -EINPROGRESS) {
qemu_coroutine_yield();
}
@@ -1463,7 +1459,7 @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
};
NVMeCoData data = {
- .co = qemu_coroutine_self(),
+ .ctx = bdrv_get_aio_context(bs),
.ret = -EINPROGRESS,
};
@@ -1508,7 +1504,9 @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
trace_nvme_dsm(s, offset, bytes);
nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
- if (!data.skip_yield) {
+
+ data.co = qemu_coroutine_self();
+ while (data.ret == -EINPROGRESS) {
qemu_coroutine_yield();
}
--
2.52.0
* Re: [PATCH for-10.2] Revert "nvme: Fix coroutine waking"
From: Hanna Czenczek @ 2025-12-12 12:27 UTC
To: qemu-block
Cc: qemu-devel, Lukáš Doktor, Stefan Hajnoczi, Kevin Wolf,
Fam Zheng, Philippe Mathieu-Daudé, qemu-stable
On 12.12.25 11:25, Hanna Czenczek wrote:
> This reverts commit 0f142cbd919fcb6cea7aa176f7e4939925806dd9.
>
> Lukáš Doktor reported a simple single-threaded nvme test case hanging
> and bisected it to this commit. While we are still investigating, it is
> best to revert the commit for now.
>
> (This breaks multiqueue for nvme, but better to have single-queue
> working than neither.)
I’ve tested this on Lukáš’s system, and there is no hang with this
revert patch applied.
(For what it’s worth, the hang only seems to appear when writing, which
is why I didn’t see it when testing myself. I can’t really overwrite
the only “testing” NVMe SSD I have, so I only tested reads.)
Hanna
* Re: [PATCH for-10.2] Revert "nvme: Fix coroutine waking"
From: Hanna Czenczek @ 2025-12-12 17:31 UTC
To: qemu-block
Cc: qemu-devel, Lukáš Doktor, Stefan Hajnoczi, Kevin Wolf,
Fam Zheng, Philippe Mathieu-Daudé, qemu-stable
On 12.12.25 11:25, Hanna Czenczek wrote:
> This reverts commit 0f142cbd919fcb6cea7aa176f7e4939925806dd9.
>
> Lukáš Doktor reported a simple single-threaded nvme test case hanging
> and bisected it to this commit. While we are still investigating, it is
> best to revert the commit for now.
>
> (This breaks multiqueue for nvme, but better to have single-queue
> working than neither.)
>
> Cc: qemu-stable@nongnu.org
> Reported-by: Lukáš Doktor <ldoktor@redhat.com>
> Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
> ---
> block/nvme.c | 56 +++++++++++++++++++++++++---------------------------
> 1 file changed, 27 insertions(+), 29 deletions(-)
>
> diff --git a/block/nvme.c b/block/nvme.c
> index 919e14cef9..c3d3b99d1f 100644
> --- a/block/nvme.c
> +++ b/block/nvme.c
[...]
> /* Put into NVMeRequest.cb, so runs in the BDS's main AioContext */
> static void nvme_rw_cb(void *opaque, int ret)
> {
[...]
> - aio_co_wake(data->co);
[...]
> + replay_bh_schedule_oneshot_event(data->ctx, nvme_rw_cb_bh, data);
> }
From testing, this bit seems to be the important one: the hang appears to
be caused by entering the coroutine directly instead of always going
through a BH.  Why that is, I haven’t found out yet; all I know is that
s/aio_co_wake()/aio_co_schedule()/ seems to make it work.
I’ll spend more time trying to find out why.
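To illustrate the difference, here is a minimal sketch (not the driver
code: MyCoData and the wake_*() helpers are made up for illustration;
aio_co_wake() and aio_co_schedule() are the real APIs from
include/block/aio.h):

/* Sketch only; requires "qemu/osdep.h" and "block/aio.h". */
typedef struct {
    Coroutine *co;    /* request coroutine waiting for completion */
    AioContext *ctx;  /* the BDS's main AioContext */
    int ret;
} MyCoData;

/* What the commit being reverted did: aio_co_wake() can enter the
 * coroutine immediately when the caller already runs in the
 * coroutine's home AioContext, instead of deferring to the event
 * loop. */
static void wake_direct(MyCoData *data)
{
    aio_co_wake(data->co);
}

/* What seems to avoid the hang: always defer the wake-up, so the
 * coroutine is only re-entered from the event loop's top level. */
static void wake_deferred(MyCoData *data)
{
    aio_co_schedule(data->ctx, data->co);
}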
(A related data point: iscsi similarly should not use aio_co_wake(),
and for that we do have a documented reason:
https://gitlab.com/qemu-project/qemu/-/commit/8b9dfe9098 – in light of
that, it probably makes sense not to use aio_co_wake() for NFS either,
which was the third case in the original series where I replaced a
one-shot BH schedule with aio_co_wake().)
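For reference, the deferred-wake round trip that this revert restores,
condensed from the diff above (a completion that fires while
nvme_submit_command() is still running sees data->co == NULL and
returns early; the submitter then skips the yield because data.ret is
no longer -EINPROGRESS):

static void nvme_rw_cb_bh(void *opaque)
{
    NVMeCoData *data = opaque;
    qemu_coroutine_enter(data->co);
}

static void nvme_rw_cb(void *opaque, int ret)
{
    NVMeCoData *data = opaque;
    data->ret = ret;
    if (!data->co) {
        /* The rw coroutine hasn't yielded, don't try to enter. */
        return;
    }
    replay_bh_schedule_oneshot_event(data->ctx, nvme_rw_cb_bh, data);
}

/* Submitting side, as in nvme_co_prw_aligned() and friends: */
    nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
    data.co = qemu_coroutine_self();
    while (data.ret == -EINPROGRESS) {
        qemu_coroutine_yield();
    }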
Hanna
* Re: [PATCH for-10.2] Revert "nvme: Fix coroutine waking"
From: Lukáš Doktor @ 2025-12-12 17:46 UTC
To: Hanna Czenczek, qemu-block
Cc: qemu-devel, Stefan Hajnoczi, Kevin Wolf, Fam Zheng,
Philippe Mathieu-Daudé, qemu-stable
Thanks, I tested this on the CI system, and with this patch applied it
seems to work well.
Regards,
Lukáš
Tested-by: Lukáš Doktor <ldoktor@redhat.com>
On 12.12.25 11:25, Hanna Czenczek wrote:
> This reverts commit 0f142cbd919fcb6cea7aa176f7e4939925806dd9.
>
> Lukáš Doktor reported a simple single-threaded nvme test case hanging
> and bisected it to this commit. While we are still investigating, it is
> best to revert the commit for now.
>
> (This breaks multiqueue for nvme, but better to have single-queue
> working than neither.)
>
> Cc: qemu-stable@nongnu.org
> Reported-by: Lukáš Doktor <ldoktor@redhat.com>
> Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
[...]