From: Stefan Hajnoczi <stefanha@redhat.com>
To: qemu-devel@nongnu.org, Peter Maydell <peter.maydell@linaro.org>
Cc: Kevin Wolf <kwolf@redhat.com>, Fam Zheng <fam@euphon.net>,
	Sergio Lopez <slp@redhat.com>,
	Eduardo Habkost <ehabkost@redhat.com>,
	qemu-block@nongnu.org, Max Reitz <mreitz@redhat.com>,
	Stefan Hajnoczi <stefanha@redhat.com>,
	Cleber Rosa <crosa@redhat.com>
Subject: [PULL 12/12] block/nvme: support nested aio_poll()
Date: Wed, 24 Jun 2020 11:02:10 +0100
Message-ID: <20200624100210.59975-13-stefanha@redhat.com>
In-Reply-To: <20200624100210.59975-1-stefanha@redhat.com>

QEMU block drivers are supposed to support aio_poll() from I/O
completion callback functions. This means completion processing must be
re-entrant.

The standard approach is to schedule a BH during completion processing
and cancel it at the end of processing. If aio_poll() is invoked by a
callback function, the BH runs and continues the suspended completion
processing.
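
In outline the pattern looks something like this (a minimal sketch,
not the actual block/nvme.c code; the Queue type and the
has_pending_completion()/run_completion_cb() helpers are invented
stand-ins for the real NVMeQueuePair logic):

    #include "qemu/osdep.h"
    #include "block/aio.h"

    typedef struct {
        QEMUBH *completion_bh;    /* created with aio_bh_new() */
    } Queue;

    static bool process_completions(Queue *q);
    static bool has_pending_completion(Queue *q);
    static void run_completion_cb(Queue *q);

    /* Runs when a cb() below enters a nested aio_poll() */
    static void completion_bh(void *opaque)
    {
        Queue *q = opaque;

        process_completions(q); /* resume the suspended processing */
    }

    static bool process_completions(Queue *q)
    {
        bool progress = false;

        /*
         * Schedule the BH up front.  If a cb() invokes aio_poll(),
         * the nested event loop executes completion_bh() and keeps
         * processing completions instead of deadlocking.
         */
        qemu_bh_schedule(q->completion_bh);

        while (has_pending_completion(q)) {
            run_completion_cb(q); /* may call aio_poll() re-entrantly */
            progress = true;
        }

        /* All done; the BH must not fire spuriously later */
        qemu_bh_cancel(q->completion_bh);

        return progress;
    }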

All of this means that request A's cb() can synchronously wait for
request B to complete. Previously the nvme block driver would hang
because it didn't process completions from nested aio_poll().
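
For example, a callback along these lines used to deadlock
(hypothetical code, continuing the sketch above; b_done would be set
by request B's own completion callback):

    static void request_a_cb(void *opaque, int ret)
    {
        bool *b_done = opaque;

        /*
         * Nested event loop.  The old q->busy flag made the driver
         * skip completion processing here, so request B's completion
         * was never delivered and this loop spun forever.
         */
        while (!*b_done) {
            aio_poll(qemu_get_current_aio_context(), true);
        }
    }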

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Sergio Lopez <slp@redhat.com>
Message-id: 20200617132201.1832152-8-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/nvme.c       | 67 ++++++++++++++++++++++++++++++++++++++++------
 block/trace-events |  2 +-
 2 files changed, 60 insertions(+), 9 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index 8dc68d3daa..374e268915 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -74,9 +74,11 @@ typedef struct {
     int         cq_phase;
     int         free_req_head;
     NVMeRequest reqs[NVME_NUM_REQS];
-    bool        busy;
     int         need_kick;
     int         inflight;
+
+    /* Thread-safe, no lock necessary */
+    QEMUBH      *completion_bh;
 } NVMeQueuePair;
 
 /* Memory mapped registers */
@@ -140,6 +142,8 @@ struct BDRVNVMeState {
 #define NVME_BLOCK_OPT_DEVICE "device"
 #define NVME_BLOCK_OPT_NAMESPACE "namespace"
 
+static void nvme_process_completion_bh(void *opaque);
+
 static QemuOptsList runtime_opts = {
     .name = "nvme",
     .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
@@ -181,6 +185,9 @@ static void nvme_init_queue(BlockDriverState *bs, NVMeQueue *q,
 
 static void nvme_free_queue_pair(NVMeQueuePair *q)
 {
+    if (q->completion_bh) {
+        qemu_bh_delete(q->completion_bh);
+    }
     qemu_vfree(q->prp_list_pages);
     qemu_vfree(q->sq.queue);
     qemu_vfree(q->cq.queue);
@@ -214,6 +221,8 @@ static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs,
     q->index = idx;
     qemu_co_queue_init(&q->free_req_queue);
     q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_NUM_REQS);
+    q->completion_bh = aio_bh_new(bdrv_get_aio_context(bs),
+                                  nvme_process_completion_bh, q);
     r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages,
                           s->page_size * NVME_NUM_REQS,
                           false, &prp_list_iova);
@@ -352,11 +361,21 @@ static bool nvme_process_completion(NVMeQueuePair *q)
     NvmeCqe *c;
 
     trace_nvme_process_completion(s, q->index, q->inflight);
-    if (q->busy || s->plugged) {
-        trace_nvme_process_completion_queue_busy(s, q->index);
+    if (s->plugged) {
+        trace_nvme_process_completion_queue_plugged(s, q->index);
         return false;
     }
-    q->busy = true;
+
+    /*
+     * Support re-entrancy when a request cb() function invokes aio_poll().
+     * Pending completions must be visible to aio_poll() so that a cb()
+     * function can wait for the completion of another request.
+     *
+     * The aio_poll() loop will execute our BH and we'll resume completion
+     * processing there.
+     */
+    qemu_bh_schedule(q->completion_bh);
+
     assert(q->inflight >= 0);
     while (q->inflight) {
         int ret;
@@ -384,10 +403,10 @@ static bool nvme_process_completion(NVMeQueuePair *q)
         assert(req.cb);
         nvme_put_free_req_locked(q, preq);
         preq->cb = preq->opaque = NULL;
-        qemu_mutex_unlock(&q->lock);
-        req.cb(req.opaque, ret);
-        qemu_mutex_lock(&q->lock);
         q->inflight--;
+        qemu_mutex_unlock(&q->lock);
+        req.cb(req.opaque, ret);
+        qemu_mutex_lock(&q->lock);
         progress = true;
     }
     if (progress) {
@@ -396,10 +415,28 @@ static bool nvme_process_completion(NVMeQueuePair *q)
         *q->cq.doorbell = cpu_to_le32(q->cq.head);
         nvme_wake_free_req_locked(q);
     }
-    q->busy = false;
+
+    qemu_bh_cancel(q->completion_bh);
+
     return progress;
 }
 
+static void nvme_process_completion_bh(void *opaque)
+{
+    NVMeQueuePair *q = opaque;
+
+    /*
+     * We're being invoked because a nvme_process_completion() cb() function
+     * called aio_poll(). The callback may be waiting for further completions
+     * so notify the device that it has space to fill in more completions now.
+     */
+    smp_mb_release();
+    *q->cq.doorbell = cpu_to_le32(q->cq.head);
+    nvme_wake_free_req_locked(q);
+
+    nvme_process_completion(q);
+}
+
 static void nvme_trace_command(const NvmeCmd *cmd)
 {
     int i;
@@ -1309,6 +1346,13 @@ static void nvme_detach_aio_context(BlockDriverState *bs)
 {
     BDRVNVMeState *s = bs->opaque;
 
+    for (int i = 0; i < s->nr_queues; i++) {
+        NVMeQueuePair *q = s->queues[i];
+
+        qemu_bh_delete(q->completion_bh);
+        q->completion_bh = NULL;
+    }
+
     aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier,
                            false, NULL, NULL);
 }
@@ -1321,6 +1365,13 @@ static void nvme_attach_aio_context(BlockDriverState *bs,
     s->aio_context = new_context;
     aio_set_event_notifier(new_context, &s->irq_notifier,
                            false, nvme_handle_event, nvme_poll_cb);
+
+    for (int i = 0; i < s->nr_queues; i++) {
+        NVMeQueuePair *q = s->queues[i];
+
+        q->completion_bh =
+            aio_bh_new(new_context, nvme_process_completion_bh, q);
+    }
 }
 
 static void nvme_aio_plug(BlockDriverState *bs)
diff --git a/block/trace-events b/block/trace-events
index 29dff8881c..dbe76a7613 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -158,7 +158,7 @@ nvme_kick(void *s, int queue) "s %p queue %d"
 nvme_dma_flush_queue_wait(void *s) "s %p"
 nvme_error(int cmd_specific, int sq_head, int sqid, int cid, int status) "cmd_specific %d sq_head %d sqid %d cid %d status 0x%x"
 nvme_process_completion(void *s, int index, int inflight) "s %p queue %d inflight %d"
-nvme_process_completion_queue_busy(void *s, int index) "s %p queue %d"
+nvme_process_completion_queue_plugged(void *s, int index) "s %p queue %d"
 nvme_complete_command(void *s, int index, int cid) "s %p queue %d cid %d"
 nvme_submit_command(void *s, int index, int cid) "s %p queue %d cid %d"
 nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6, int c7) "%02x %02x %02x %02x %02x %02x %02x %02x"
-- 
2.26.2

