From: Fam Zheng <famz@redhat.com>
To: qemu-devel@nongnu.org
Cc: Kevin Wolf <kwolf@redhat.com>,
pbonzini@redhat.com, qemu-block@nongnu.org,
Stefan Hajnoczi <stefanha@redhat.com>
Subject: [Qemu-devel] [PATCH RFC 4/4] aio-posix: Use epoll in aio_poll
Date: Tue, 30 Jun 2015 21:19:45 +0800
Message-ID: <1435670385-625-5-git-send-email-famz@redhat.com>
In-Reply-To: <1435670385-625-1-git-send-email-famz@redhat.com>
This patch lets aio_poll use the epoll_wait(2) syscall instead of
qemu_poll_ns where possible. It improves the scalability of iothreads
(for example, virtio-scsi-dataplane).

The epollfd is kept in sync with the GSource and ctx->aio_handlers: an
epoll_event is created for each watched aio fd and added to the epollfd
with epoll_ctl(2).
The following table shows a fio benchmark comparison on a single guest
block device, with different numbers of disks attached to the same SCSI
bus (in MB/s):
=====================================================================
 # of scsi-disks  |       master        |        epoll
                  |  rd    wr   randrw  |  rd    wr   randrw
---------------------------------------------------------------------
        1         | 103    96     49    | 105    99     49
        4         |  92    96     48    | 103    98     49
        8         |  96    94     46    | 101    97     50
       16         |  91    91     45    | 101    95     48
       32         |  84    83     40    |  95    95     48
       64         |  75    73     35    |  91    90     44
      128         |  54    53     26    |  79    80     39
      256         |  41    39     19    |  63    62     30
=====================================================================

Signed-off-by: Fam Zheng <famz@redhat.com>
---
aio-posix.c | 118 ++++++++++++++++++++++++++++++++++++++++++++++++++--
include/block/aio.h | 3 ++
2 files changed, 117 insertions(+), 4 deletions(-)
diff --git a/aio-posix.c b/aio-posix.c
index 22406ce..111d7fb 100644
--- a/aio-posix.c
+++ b/aio-posix.c
@@ -17,6 +17,9 @@
#include "block/block.h"
#include "qemu/queue.h"
#include "qemu/sockets.h"
+#ifdef CONFIG_EPOLL
+#include <sys/epoll.h>
+#endif
struct AioHandler
{
@@ -44,6 +47,12 @@ static AioHandler *find_aio_handler(AioContext *ctx, int fd)
void aio_context_setup(AioContext *ctx, Error **errp)
{
+#ifdef CONFIG_EPOLL
+ ctx->epollfd = epoll_create1(EPOLL_CLOEXEC);
+ if (ctx->epollfd < 0) {
+ error_setg(errp, "Failed to create epoll fd: %s", strerror(errno));
+ }
+#endif
}
void aio_set_fd_handler_pri(AioContext *ctx,
@@ -54,6 +63,11 @@ void aio_set_fd_handler_pri(AioContext *ctx,
void *opaque)
{
AioHandler *node;
+#ifdef CONFIG_EPOLL
+ struct epoll_event event;
+ int r;
+ bool add = false;
+#endif
node = find_aio_handler(ctx, fd);
@@ -61,6 +75,10 @@ void aio_set_fd_handler_pri(AioContext *ctx,
if (!io_read && !io_write && !io_read_pri) {
if (node) {
g_source_remove_poll(&ctx->source, &node->pfd);
+#ifdef CONFIG_EPOLL
+ r = epoll_ctl(ctx->epollfd, EPOLL_CTL_DEL, fd, &event);
+ assert(!r);
+#endif
/* If the lock is held, just mark the node as deleted */
if (ctx->walking_handlers) {
@@ -83,6 +101,9 @@ void aio_set_fd_handler_pri(AioContext *ctx,
QLIST_INSERT_HEAD(&ctx->aio_handlers, node, node);
g_source_add_poll(&ctx->source, &node->pfd);
+#ifdef CONFIG_EPOLL
+ add = true;
+#endif
}
/* Update handler with latest information */
node->io_read = io_read;
@@ -93,6 +114,13 @@ void aio_set_fd_handler_pri(AioContext *ctx,
node->pfd.events = (io_read ? G_IO_IN | G_IO_HUP | G_IO_ERR : 0);
node->pfd.events |= (io_write ? G_IO_OUT | G_IO_ERR : 0);
node->pfd.events |= (io_read_pri ? G_IO_PRI | G_IO_HUP | G_IO_ERR : 0);
+#ifdef CONFIG_EPOLL
+ event.data.ptr = node;
+ event.events = node->pfd.events;
+ r = epoll_ctl(ctx->epollfd, add ? EPOLL_CTL_ADD : EPOLL_CTL_MOD,
+ fd, &event);
+ assert(!r);
+#endif
}
aio_notify(ctx);
@@ -198,7 +226,80 @@ bool aio_dispatch(AioContext *ctx)
return progress;
}
-/* These thread-local variables are used only in a small part of aio_poll
+#ifdef CONFIG_EPOLL
+QEMU_BUILD_BUG_ON((int)G_IO_IN != EPOLLIN);
+QEMU_BUILD_BUG_ON((int)G_IO_OUT != EPOLLOUT);
+QEMU_BUILD_BUG_ON((int)G_IO_PRI != EPOLLPRI);
+QEMU_BUILD_BUG_ON((int)G_IO_ERR != EPOLLERR);
+QEMU_BUILD_BUG_ON((int)G_IO_HUP != EPOLLHUP);
+
+#define EPOLL_BATCH 128
+static bool aio_poll_epoll(AioContext *ctx, bool blocking)
+{
+ AioHandler *node;
+ bool was_dispatching;
+ int i, ret;
+ bool progress;
+ int64_t timeout;
+ struct epoll_event events[EPOLL_BATCH];
+
+ aio_context_acquire(ctx);
+ was_dispatching = ctx->dispatching;
+ progress = false;
+
+ /* aio_notify can avoid the expensive event_notifier_set if
+ * everything (file descriptors, bottom halves, timers) will
+ * be re-evaluated before the next blocking poll(). This is
+ * already true when aio_poll is called with blocking == false;
+ * if blocking == true, it is only true after poll() returns.
+ *
+ * If we're in a nested event loop, ctx->dispatching might be true.
+ * In that case we can restore it just before returning, but we
+ * have to clear it now.
+ */
+ aio_set_dispatching(ctx, !blocking);
+
+ ctx->walking_handlers++;
+
+ timeout = blocking ? aio_compute_timeout(ctx) : 0;
+
+ if (timeout > 0) {
+ timeout = DIV_ROUND_UP(timeout, 1000000);
+ }
+
+ /* wait until next event */
+ if (timeout) {
+ aio_context_release(ctx);
+ }
+ ret = epoll_wait(ctx->epollfd, events, EPOLL_BATCH, timeout);
+ if (timeout) {
+ aio_context_acquire(ctx);
+ }
+
+ /* if we have any readable fds, dispatch event */
+ if (ret > 0) {
+ for (i = 0; i < ret; i++) {
+ node = events[i].data.ptr;
+ node->pfd.revents = events[i].events;
+ }
+ }
+
+ ctx->walking_handlers--;
+
+ /* Run dispatch even if there were no readable fds to run timers */
+ aio_set_dispatching(ctx, true);
+ if (aio_dispatch(ctx)) {
+ progress = true;
+ }
+
+ aio_set_dispatching(ctx, was_dispatching);
+ aio_context_release(ctx);
+
+ return progress;
+}
+#else
+
+/* These thread-local variables are used only in a small part of aio_poll_posix
* around the call to the poll() system call. In particular they are not
* used while aio_poll is performing callbacks, which makes it much easier
* to think about reentrancy!
@@ -212,7 +313,6 @@ bool aio_dispatch(AioContext *ctx)
static __thread GPollFD *pollfds;
static __thread AioHandler **nodes;
static __thread unsigned npfd, nalloc;
-static __thread Notifier pollfds_cleanup_notifier;
static void pollfds_cleanup(Notifier *n, void *unused)
{
@@ -221,7 +321,7 @@ static void pollfds_cleanup(Notifier *n, void *unused)
g_free(nodes);
nalloc = 0;
}
-
+static __thread Notifier pollfds_cleanup_notifier;
static void add_pollfd(AioHandler *node)
{
if (npfd == nalloc) {
@@ -244,7 +344,7 @@ static void add_pollfd(AioHandler *node)
npfd++;
}
-bool aio_poll(AioContext *ctx, bool blocking)
+bool aio_poll_posix(AioContext *ctx, bool blocking)
{
AioHandler *node;
bool was_dispatching;
@@ -311,3 +411,13 @@ bool aio_poll(AioContext *ctx, bool blocking)
return progress;
}
+#endif
+
+bool aio_poll(AioContext *ctx, bool blocking)
+{
+#ifdef CONFIG_EPOLL
+ return aio_poll_epoll(ctx, blocking);
+#else
+ return aio_poll_posix(ctx, blocking);
+#endif
+}
diff --git a/include/block/aio.h b/include/block/aio.h
index 5120583..9178ff2 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -87,6 +87,9 @@ struct AioContext {
/* TimerLists for calling timers - one per clock type */
QEMUTimerListGroup tlg;
+
+ /* epoll fd */
+ int epollfd;
};
/* Used internally to synchronize aio_poll against qemu_bh_schedule. */
--
2.4.3
Thread overview:
2015-06-30 13:19 [Qemu-devel] [PATCH RFC 0/4] aio: Use epoll_wait in aio_poll Fam Zheng
2015-06-30 13:19 ` [Qemu-devel] [PATCH RFC 1/4] aio: Introduce aio_set_fd_handler_pri Fam Zheng
2015-07-07 14:29 ` Stefan Hajnoczi
2015-07-08 1:07 ` Fam Zheng
2015-06-30 13:19 ` [Qemu-devel] [PATCH RFC 2/4] aio: Move aio_set_fd_handler to async.c Fam Zheng
2015-07-07 14:30 ` Stefan Hajnoczi
2015-06-30 13:19 ` [Qemu-devel] [PATCH RFC 3/4] aio: Introduce aio_context_setup Fam Zheng
2015-07-07 14:35 ` Stefan Hajnoczi
2015-07-08 1:15 ` Fam Zheng
2015-07-08 10:51 ` Stefan Hajnoczi
2015-06-30 13:19 ` Fam Zheng [this message]
2015-07-07 15:08 ` [Qemu-devel] [PATCH RFC 4/4] aio-posix: Use epoll in aio_poll Stefan Hajnoczi
2015-07-07 15:27 ` Paolo Bonzini
2015-07-08 1:01 ` Fam Zheng
2015-07-08 10:58 ` Stefan Hajnoczi
2015-07-10 0:46 ` Fam Zheng
2015-07-13 10:02 ` Stefan Hajnoczi
2015-07-07 14:54 ` [Qemu-devel] [PATCH RFC 0/4] aio: Use epoll_wait " Christian Borntraeger
2015-07-08 1:02 ` Fam Zheng
2015-07-08 7:59 ` Christian Borntraeger