qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
From: Kevin Wolf <kwolf@redhat.com>
To: anthony@codemonkey.ws
Cc: kwolf@redhat.com, qemu-devel@nongnu.org
Subject: [Qemu-devel] [PULL 01/42] rbd: switch from pipe to QEMUBH completion notification
Date: Wed, 15 Jan 2014 11:22:14 +0100	[thread overview]
Message-ID: <1389781375-11774-2-git-send-email-kwolf@redhat.com> (raw)
In-Reply-To: <1389781375-11774-1-git-send-email-kwolf@redhat.com>

From: Stefan Hajnoczi <stefanha@redhat.com>

rbd callbacks are called from non-QEMU threads.  Up until now a pipe was
used to signal completion back to the QEMU iothread.

The pipe writer code handles EAGAIN using select(2).  The select(2) API
is not scalable since fd_set size is static.  FD_SET() can write beyond
the end of fd_set if the file descriptor number is too high.  (QEMU's
main loop uses poll(2) to avoid this issue with select(2).)

Since the pipe itself is quite clumsy to use and QEMUBH is now
thread-safe, just schedule a BH from the rbd callback function.  This
way we can simplify I/O completion in addition to eliminating the
potential FD_SET() crash when file descriptor numbers become too high.

Crash scenario: QEMU already has 1024 file descriptors open.  Hotplug an
rbd drive and get the pipe writer to take the select(2) code path.

Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
Tested-by: Josh Durgin <josh.durgin@inktank.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/rbd.c | 130 ++++++++++--------------------------------------------------
 1 file changed, 22 insertions(+), 108 deletions(-)

diff --git a/block/rbd.c b/block/rbd.c
index f453f04..121fae2 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -95,18 +95,13 @@ typedef struct RADOSCB {
 #define RBD_FD_WRITE 1
 
 typedef struct BDRVRBDState {
-    int fds[2];
     rados_t cluster;
     rados_ioctx_t io_ctx;
     rbd_image_t image;
     char name[RBD_MAX_IMAGE_NAME_SIZE];
     char *snap;
-    int event_reader_pos;
-    RADOSCB *event_rcb;
 } BDRVRBDState;
 
-static void rbd_aio_bh_cb(void *opaque);
-
 static int qemu_rbd_next_tok(char *dst, int dst_len,
                              char *src, char delim,
                              const char *name,
@@ -369,9 +364,8 @@ static int qemu_rbd_create(const char *filename, QEMUOptionParameter *options,
 }
 
 /*
- * This aio completion is being called from qemu_rbd_aio_event_reader()
- * and runs in qemu context. It schedules a bh, but just in case the aio
- * was not cancelled before.
+ * This aio completion is being called from rbd_finish_bh() and runs in qemu
+ * BH context.
  */
 static void qemu_rbd_complete_aio(RADOSCB *rcb)
 {
@@ -401,36 +395,19 @@ static void qemu_rbd_complete_aio(RADOSCB *rcb)
             acb->ret = r;
         }
     }
-    /* Note that acb->bh can be NULL in case where the aio was cancelled */
-    acb->bh = qemu_bh_new(rbd_aio_bh_cb, acb);
-    qemu_bh_schedule(acb->bh);
-    g_free(rcb);
-}
 
-/*
- * aio fd read handler. It runs in the qemu context and calls the
- * completion handling of completed rados aio operations.
- */
-static void qemu_rbd_aio_event_reader(void *opaque)
-{
-    BDRVRBDState *s = opaque;
+    g_free(rcb);
 
-    ssize_t ret;
+    if (acb->cmd == RBD_AIO_READ) {
+        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
+    }
+    qemu_vfree(acb->bounce);
+    acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret));
+    acb->status = 0;
 
-    do {
-        char *p = (char *)&s->event_rcb;
-
-        /* now read the rcb pointer that was sent from a non qemu thread */
-        ret = read(s->fds[RBD_FD_READ], p + s->event_reader_pos,
-                   sizeof(s->event_rcb) - s->event_reader_pos);
-        if (ret > 0) {
-            s->event_reader_pos += ret;
-            if (s->event_reader_pos == sizeof(s->event_rcb)) {
-                s->event_reader_pos = 0;
-                qemu_rbd_complete_aio(s->event_rcb);
-            }
-        }
-    } while (ret < 0 && errno == EINTR);
+    if (!acb->cancelled) {
+        qemu_aio_release(acb);
+    }
 }
 
 /* TODO Convert to fine grained options */
@@ -538,23 +515,9 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
 
     bs->read_only = (s->snap != NULL);
 
-    s->event_reader_pos = 0;
-    r = qemu_pipe(s->fds);
-    if (r < 0) {
-        error_report("error opening eventfd");
-        goto failed;
-    }
-    fcntl(s->fds[0], F_SETFL, O_NONBLOCK);
-    fcntl(s->fds[1], F_SETFL, O_NONBLOCK);
-    qemu_aio_set_fd_handler(s->fds[RBD_FD_READ], qemu_rbd_aio_event_reader,
-                            NULL, s);
-
-
     qemu_opts_del(opts);
     return 0;
 
-failed:
-    rbd_close(s->image);
 failed_open:
     rados_ioctx_destroy(s->io_ctx);
 failed_shutdown:
@@ -569,10 +532,6 @@ static void qemu_rbd_close(BlockDriverState *bs)
 {
     BDRVRBDState *s = bs->opaque;
 
-    close(s->fds[0]);
-    close(s->fds[1]);
-    qemu_aio_set_fd_handler(s->fds[RBD_FD_READ], NULL, NULL, NULL);
-
     rbd_close(s->image);
     rados_ioctx_destroy(s->io_ctx);
     g_free(s->snap);
@@ -600,34 +559,11 @@ static const AIOCBInfo rbd_aiocb_info = {
     .cancel = qemu_rbd_aio_cancel,
 };
 
-static int qemu_rbd_send_pipe(BDRVRBDState *s, RADOSCB *rcb)
+static void rbd_finish_bh(void *opaque)
 {
-    int ret = 0;
-    while (1) {
-        fd_set wfd;
-        int fd = s->fds[RBD_FD_WRITE];
-
-        /* send the op pointer to the qemu thread that is responsible
-           for the aio/op completion. Must do it in a qemu thread context */
-        ret = write(fd, (void *)&rcb, sizeof(rcb));
-        if (ret >= 0) {
-            break;
-        }
-        if (errno == EINTR) {
-            continue;
-        }
-        if (errno != EAGAIN) {
-            break;
-        }
-
-        FD_ZERO(&wfd);
-        FD_SET(fd, &wfd);
-        do {
-            ret = select(fd + 1, NULL, &wfd, NULL, NULL);
-        } while (ret < 0 && errno == EINTR);
-    }
-
-    return ret;
+    RADOSCB *rcb = opaque;
+    qemu_bh_delete(rcb->acb->bh);
+    qemu_rbd_complete_aio(rcb);
 }
 
 /*
@@ -635,40 +571,18 @@ static int qemu_rbd_send_pipe(BDRVRBDState *s, RADOSCB *rcb)
  *
  * Note: this function is being called from a non qemu thread so
  * we need to be careful about what we do here. Generally we only
- * write to the block notification pipe, and do the rest of the
- * io completion handling from qemu_rbd_aio_event_reader() which
- * runs in a qemu context.
+ * schedule a BH, and do the rest of the io completion handling
+ * from rbd_finish_bh() which runs in a qemu context.
  */
 static void rbd_finish_aiocb(rbd_completion_t c, RADOSCB *rcb)
 {
-    int ret;
+    RBDAIOCB *acb = rcb->acb;
+
     rcb->ret = rbd_aio_get_return_value(c);
     rbd_aio_release(c);
-    ret = qemu_rbd_send_pipe(rcb->s, rcb);
-    if (ret < 0) {
-        error_report("failed writing to acb->s->fds");
-        g_free(rcb);
-    }
-}
-
-/* Callback when all queued rbd_aio requests are complete */
 
-static void rbd_aio_bh_cb(void *opaque)
-{
-    RBDAIOCB *acb = opaque;
-
-    if (acb->cmd == RBD_AIO_READ) {
-        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
-    }
-    qemu_vfree(acb->bounce);
-    acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret));
-    qemu_bh_delete(acb->bh);
-    acb->bh = NULL;
-    acb->status = 0;
-
-    if (!acb->cancelled) {
-        qemu_aio_release(acb);
-    }
+    acb->bh = qemu_bh_new(rbd_finish_bh, rcb);
+    qemu_bh_schedule(acb->bh);
 }
 
 static int rbd_aio_discard_wrapper(rbd_image_t image,
-- 
1.8.1.4

  reply	other threads:[~2014-01-15 10:23 UTC|newest]

Thread overview: 51+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-01-15 10:22 [Qemu-devel] [PULL 00/42] Block patches Kevin Wolf
2014-01-15 10:22 ` Kevin Wolf [this message]
2014-01-15 10:22 ` [Qemu-devel] [PULL 02/42] qemu-iotests: Introduce _unsupported_imgopts Kevin Wolf
2014-01-15 10:22 ` [Qemu-devel] [PULL 03/42] qemu-iotests: Add _unsupported_imgopts for vmdk subformats Kevin Wolf
2014-01-15 10:22 ` [Qemu-devel] [PULL 04/42] qemu-iotests: Clean up all extents for vmdk Kevin Wolf
2014-01-15 10:22 ` [Qemu-devel] [PULL 05/42] block/iscsi: return -ENOMEM if an async call fails immediately Kevin Wolf
2014-01-15 10:22 ` [Qemu-devel] [PULL 06/42] gluster: Convert aio routines into coroutines Kevin Wolf
2014-01-15 10:22 ` [Qemu-devel] [PULL 07/42] gluster: Implement .bdrv_co_write_zeroes for gluster Kevin Wolf
2014-01-15 10:22 ` [Qemu-devel] [PULL 08/42] gluster: Add support for creating zero-filled image Kevin Wolf
2014-01-15 10:22 ` [Qemu-devel] [PULL 09/42] sheepdog: fix clone operation by 'qemu-img create -b' Kevin Wolf
2014-01-15 10:22 ` [Qemu-devel] [PULL 10/42] qtest: Fix the bug about disable vnc causes "make check" fail Kevin Wolf
2014-01-17 15:06   ` Andreas Färber
2014-01-18 11:54     ` Kewei Yu
2014-01-26  0:06       ` Andreas Färber
2014-01-26  8:04         ` Kewei Yu
2014-01-15 10:22 ` [Qemu-devel] [PULL 11/42] docs: qcow2 compat=1.1 is now the default Kevin Wolf
2014-01-15 10:22 ` [Qemu-devel] [PULL 12/42] vmdk: Fix big flat extent IO Kevin Wolf
2014-01-15 10:22 ` [Qemu-devel] [PULL 13/42] readline: decouple readline from the monitor Kevin Wolf
2014-01-15 10:22 ` [Qemu-devel] [PULL 14/42] readline: move readline to a generic location Kevin Wolf
2014-01-15 10:22 ` [Qemu-devel] [PULL 15/42] osdep: add qemu_set_tty_echo() Kevin Wolf
2014-01-15 10:22 ` [Qemu-devel] [PULL 16/42] qemu-io: use readline.c Kevin Wolf
2014-01-15 10:22 ` [Qemu-devel] [PULL 17/42] qemu-io: add command completion Kevin Wolf
2014-01-15 10:22 ` [Qemu-devel] [PULL 18/42] blkdebug: Use errp for read_config() Kevin Wolf
2014-01-15 10:22 ` [Qemu-devel] [PULL 19/42] blkdebug: Don't require sophisticated filename Kevin Wolf
2014-01-15 10:22 ` [Qemu-devel] [PULL 20/42] qdict: Add qdict_array_split() Kevin Wolf
2014-01-15 10:22 ` [Qemu-devel] [PULL 21/42] qapi: extend qdict_flatten() for QLists Kevin Wolf
2014-01-15 10:22 ` [Qemu-devel] [PULL 22/42] qemu-option: Add qemu_config_parse_qdict() Kevin Wolf
2014-01-15 10:22 ` [Qemu-devel] [PULL 23/42] blkdebug: Always call read_config() Kevin Wolf
2014-01-15 10:22 ` [Qemu-devel] [PULL 24/42] blkdebug: Use command-line in read_config() Kevin Wolf
2014-01-15 10:22 ` [Qemu-devel] [PULL 25/42] block: Allow reference for bdrv_file_open() Kevin Wolf
2014-01-15 10:22 ` [Qemu-devel] [PULL 26/42] block: Pass reference to bdrv_file_open() Kevin Wolf
2014-01-15 10:22 ` [Qemu-devel] [PULL 27/42] block: Allow block devices without files Kevin Wolf
2014-01-15 10:22 ` [Qemu-devel] [PULL 28/42] block: Add bdrv_open_image() Kevin Wolf
2014-01-15 10:22 ` [Qemu-devel] [PULL 29/42] block: Use bdrv_open_image() in bdrv_open() Kevin Wolf
2014-01-15 10:22 ` [Qemu-devel] [PULL 30/42] block: Allow recursive "file"s Kevin Wolf
2014-01-15 10:22 ` [Qemu-devel] [PULL 31/42] blockdev: Move "file" to legacy_opts Kevin Wolf
2014-01-15 10:22 ` [Qemu-devel] [PULL 32/42] blkdebug: Allow command-line file configuration Kevin Wolf
2014-01-15 10:22 ` [Qemu-devel] [PULL 33/42] blkverify: Allow command-line configuration Kevin Wolf
2014-01-15 10:22 ` [Qemu-devel] [PULL 34/42] blkverify: Don't require protocol filename Kevin Wolf
2014-01-15 10:22 ` [Qemu-devel] [PULL 35/42] qapi: Add "errno" to the list of polluted words Kevin Wolf
2014-01-15 10:22 ` [Qemu-devel] [PULL 36/42] qapi: QMP interface for blkdebug and blkverify Kevin Wolf
2014-01-15 15:19   ` Eric Blake
2014-01-15 17:11     ` Paolo Bonzini
2014-01-16 10:03     ` Kevin Wolf
2014-01-17 15:01       ` Andreas Färber
2014-01-15 10:22 ` [Qemu-devel] [PULL 37/42] qemu-io: Make filename optional Kevin Wolf
2014-01-15 10:22 ` [Qemu-devel] [PULL 38/42] tests: Add test for qdict_array_split() Kevin Wolf
2014-01-15 10:22 ` [Qemu-devel] [PULL 39/42] tests: Add test for qdict_flatten() Kevin Wolf
2014-01-15 10:22 ` [Qemu-devel] [PULL 40/42] iotests: Test new blkdebug/blkverify interface Kevin Wolf
2014-01-15 10:22 ` [Qemu-devel] [PULL 41/42] iotests: Test file format nesting Kevin Wolf
2014-01-15 10:22 ` [Qemu-devel] [PULL 42/42] block: fix backing file segfault Kevin Wolf

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1389781375-11774-2-git-send-email-kwolf@redhat.com \
    --to=kwolf@redhat.com \
    --cc=anthony@codemonkey.ws \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox; see the mirroring instructions for how to
clone and mirror all data and code used for this inbox, as well as
URLs for NNTP newsgroup(s).