* [Qemu-devel] [PATCH v2 1/2] main-loop: Pass AioContext into qemu_poll_ns
2014-09-30 3:35 [Qemu-devel] [PATCH v2 0/2] main-loop: Use epoll on Linux Fam Zheng
@ 2014-09-30 3:35 ` Fam Zheng
2014-10-02 15:26 ` Stefan Hajnoczi
2014-09-30 3:35 ` [Qemu-devel] [PATCH v2 2/2] main-loop: Use epoll on Linux Fam Zheng
1 sibling, 1 reply; 4+ messages in thread
From: Fam Zheng @ 2014-09-30 3:35 UTC (permalink / raw)
To: qemu-devel; +Cc: kwolf, Paolo Bonzini, Stefan Hajnoczi
qemu_poll_ns may ultilize the information in AioContext to achieve
better performance.
Signed-off-by: Fam Zheng <famz@redhat.com>
---
aio-posix.c | 2 +-
include/qemu/timer.h | 3 ++-
main-loop.c | 6 ++++--
qemu-timer.c | 2 +-
4 files changed, 8 insertions(+), 5 deletions(-)
diff --git a/aio-posix.c b/aio-posix.c
index d3ac06e..d7a3ce3 100644
--- a/aio-posix.c
+++ b/aio-posix.c
@@ -228,7 +228,7 @@ bool aio_poll(AioContext *ctx, bool blocking)
ctx->walking_handlers--;
/* wait until next event */
- ret = qemu_poll_ns((GPollFD *)ctx->pollfds->data,
+ ret = qemu_poll_ns(ctx, (GPollFD *)ctx->pollfds->data,
ctx->pollfds->len,
blocking ? aio_compute_timeout(ctx) : 0);
diff --git a/include/qemu/timer.h b/include/qemu/timer.h
index 5f5210d..66d20d0 100644
--- a/include/qemu/timer.h
+++ b/include/qemu/timer.h
@@ -647,6 +647,7 @@ int qemu_timeout_ns_to_ms(int64_t ns);
/**
* qemu_poll_ns:
+ * @ctx: Aio Context we are polling in
* @fds: Array of file descriptors
* @nfds: number of file descriptors
* @timeout: timeout in nanoseconds
@@ -656,7 +657,7 @@ int qemu_timeout_ns_to_ms(int64_t ns);
*
* Returns: number of fds ready
*/
-int qemu_poll_ns(GPollFD *fds, guint nfds, int64_t timeout);
+int qemu_poll_ns(AioContext *ctx, GPollFD *fds, guint nfds, int64_t timeout);
/**
* qemu_soonest_timeout:
diff --git a/main-loop.c b/main-loop.c
index d2e64f1..4641ef4 100644
--- a/main-loop.c
+++ b/main-loop.c
@@ -234,7 +234,8 @@ static int os_host_main_loop_wait(int64_t timeout)
spin_counter++;
}
- ret = qemu_poll_ns((GPollFD *)gpollfds->data, gpollfds->len, timeout);
+ ret = qemu_poll_ns(qemu_aio_context, (GPollFD *)gpollfds->data,
+ gpollfds->len, timeout);
if (timeout) {
qemu_mutex_lock_iothread();
@@ -439,7 +440,8 @@ static int os_host_main_loop_wait(int64_t timeout)
poll_timeout_ns = qemu_soonest_timeout(poll_timeout_ns, timeout);
qemu_mutex_unlock_iothread();
- g_poll_ret = qemu_poll_ns(poll_fds, n_poll_fds + w->num, poll_timeout_ns);
+ g_poll_ret = qemu_poll_ns(qemu_aio_context,
+ poll_fds, n_poll_fds + w->num, poll_timeout_ns);
qemu_mutex_lock_iothread();
if (g_poll_ret > 0) {
diff --git a/qemu-timer.c b/qemu-timer.c
index 00a5d35..7336b20 100644
--- a/qemu-timer.c
+++ b/qemu-timer.c
@@ -307,7 +307,7 @@ int qemu_timeout_ns_to_ms(int64_t ns)
/* qemu implementation of g_poll which uses a nanosecond timeout but is
* otherwise identical to g_poll
*/
-int qemu_poll_ns(GPollFD *fds, guint nfds, int64_t timeout)
+int qemu_poll_ns(AioContext *ctx, GPollFD *fds, guint nfds, int64_t timeout)
{
#ifdef CONFIG_PPOLL
if (timeout < 0) {
--
1.9.3
^ permalink raw reply related [flat|nested] 4+ messages in thread
* [Qemu-devel] [PATCH v2 2/2] main-loop: Use epoll on Linux
2014-09-30 3:35 [Qemu-devel] [PATCH v2 0/2] main-loop: Use epoll on Linux Fam Zheng
2014-09-30 3:35 ` [Qemu-devel] [PATCH v2 1/2] main-loop: Pass AioContext into qemu_poll_ns Fam Zheng
@ 2014-09-30 3:35 ` Fam Zheng
1 sibling, 0 replies; 4+ messages in thread
From: Fam Zheng @ 2014-09-30 3:35 UTC (permalink / raw)
To: qemu-devel; +Cc: kwolf, Paolo Bonzini, Stefan Hajnoczi
A new implementation for qemu_poll_ns based on epoll is introduced here
to address the slowness of g_poll and ppoll when the number of fds are
high.
On my laptop this would reduce the virtio-blk on top of null-aio
device's response time from 32 us to 29 us with few fds (~10), and 48 us
to 32 us with more fds (for example when virtio-serial is plugged and
~64 more io handlers are enabled).
Signed-off-by: Fam Zheng <famz@redhat.com>
---
Makefile.objs | 1 +
include/block/aio.h | 16 +++++
include/qemu/main-loop.h | 1 +
qemu-epoll.c | 163 +++++++++++++++++++++++++++++++++++++++++++++++
qemu-timer.c | 4 +-
tests/Makefile | 2 +-
6 files changed, 185 insertions(+), 2 deletions(-)
create mode 100644 qemu-epoll.c
diff --git a/Makefile.objs b/Makefile.objs
index 97db978..52ee086 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -9,6 +9,7 @@ util-obj-y = util/ qobject/ qapi/ qapi-types.o qapi-visit.o qapi-event.o
block-obj-y = async.o thread-pool.o
block-obj-y += nbd.o block.o blockjob.o
block-obj-y += main-loop.o iohandler.o qemu-timer.o
+block-obj-$(CONFIG_LINUX) += qemu-epoll.o
block-obj-$(CONFIG_POSIX) += aio-posix.o
block-obj-$(CONFIG_WIN32) += aio-win32.o
block-obj-y += block/
diff --git a/include/block/aio.h b/include/block/aio.h
index 1562721..b51494a 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -90,6 +90,22 @@ struct AioContext {
/* TimerLists for calling timers - one per clock type */
QEMUTimerListGroup tlg;
+
+#ifdef CONFIG_LINUX
+ struct EpollState {
+ /* A copy of last fd array, used to skip epoll_prepare when nothing
+ * changed. */
+ GPollFD *last_fds;
+ guint last_nfds;
+ /* An array of fds that failed epoll_ctl and fall back to ppoll. Rare
+ * case too. */
+ GPollFD *g_poll_fds;
+ guint g_poll_nfds;
+ int *g_poll_fd_idx;
+ int epollfd;
+ } epoll_state;
+#endif
+
};
/* Used internally to synchronize aio_poll against qemu_bh_schedule. */
diff --git a/include/qemu/main-loop.h b/include/qemu/main-loop.h
index 62c68c0..82afb4e 100644
--- a/include/qemu/main-loop.h
+++ b/include/qemu/main-loop.h
@@ -307,5 +307,6 @@ void qemu_iohandler_poll(GArray *pollfds, int rc);
QEMUBH *qemu_bh_new(QEMUBHFunc *cb, void *opaque);
void qemu_bh_schedule_idle(QEMUBH *bh);
+int qemu_epoll(AioContext *ctx, GPollFD *fds, guint nfds, int64_t timeout);
#endif
diff --git a/qemu-epoll.c b/qemu-epoll.c
new file mode 100644
index 0000000..9225003
--- /dev/null
+++ b/qemu-epoll.c
@@ -0,0 +1,163 @@
+/*
+ * QEMU Event Loop
+ *
+ * Copyright (c) 2014 Red Hat, Inc.
+ *
+ * Authors:
+ * Fam Zheng <famz@redhat.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <sys/epoll.h>
+#include "qemu/main-loop.h"
+
+static bool g_poll_fds_changed(const GPollFD *fds_a, const guint nfds_a,
+ const GPollFD *fds_b, const guint nfds_b)
+{
+ int i;
+
+ if (nfds_a != nfds_b) {
+ return true;
+ }
+ if (!!fds_a != !!fds_b) {
+ return true;
+ }
+ for (i = 0; i < nfds_a; i++) {
+ if (fds_a[i].fd != fds_b[i].fd ||
+ fds_a[i].events != fds_b[i].events) {
+ return true;
+ }
+ }
+ return false;
+}
+
+static inline int io_condition_from_epoll_events(int e)
+{
+ return (e & EPOLLIN ? G_IO_IN : 0) |
+ (e & EPOLLOUT ? G_IO_OUT : 0) |
+ (e & EPOLLERR ? G_IO_ERR : 0) |
+ (e & EPOLLHUP ? G_IO_HUP : 0);
+}
+
+static inline void epoll_event_from_g_poll_fd(struct epoll_event *event,
+ GPollFD *fd)
+{
+ int e = fd->events;
+
+ event->events = (e & G_IO_IN ? EPOLLIN : 0) |
+ (e & G_IO_OUT ? EPOLLOUT : 0) |
+ (e & G_IO_ERR ? EPOLLERR : 0) |
+ (e & G_IO_HUP ? EPOLLHUP : 0);
+ event->data.ptr = fd;
+}
+
+static int epoll_prepare(int epollfd,
+ GPollFD *fds, guint nfds,
+ GPollFD **g_poll_fds,
+ guint *g_poll_nfds,
+ int **g_poll_fd_idx)
+{
+ int i;
+
+ GPollFD *pfds = NULL;
+ int npfds = 0;
+ int *idx = NULL;
+
+ for (i = 0; i < nfds; i++) {
+ int r;
+ struct epoll_event event;
+ epoll_event_from_g_poll_fd(&event, &fds[i]);
+
+ r = epoll_ctl(epollfd, EPOLL_CTL_ADD, fds[i].fd, &event);
+ if (r) {
+ /* Some fds may not support epoll, fall back and add them to
+ * ppoll_fds */
+ pfds = g_renew(GPollFD, pfds, npfds + 1);
+ pfds[npfds] = fds[i];
+ idx = g_renew(int, idx, npfds + 1);
+ idx[npfds] = i;
+ npfds++;
+ }
+ }
+
+ *g_poll_fds = pfds;
+ *g_poll_nfds = npfds;
+ *g_poll_fd_idx = idx;
+
+ return epollfd;
+}
+
+int qemu_epoll(AioContext *ctx, GPollFD *fds, guint nfds, int64_t timeout)
+{
+ struct EpollState *e = &ctx->epoll_state;
+
+ const int max_events = 40;
+ struct epoll_event events[max_events];
+ int ret = 0;
+ int r, i;
+
+ if (!e->last_fds || g_poll_fds_changed(fds, nfds,
+ e->last_fds, e->last_nfds)) {
+ if (e->last_fds) {
+ close(e->epollfd);
+ }
+ e->epollfd = epoll_create(1);
+ if (e->epollfd < 0) {
+ perror("epoll_create");
+ abort();
+ }
+ g_free(e->g_poll_fds);
+ g_free(e->g_poll_fd_idx);
+ e->epollfd = epoll_prepare(e->epollfd, fds, nfds,
+ &e->g_poll_fds,
+ &e->g_poll_nfds,
+ &e->g_poll_fd_idx);
+ g_free(e->last_fds);
+ e->last_fds = g_memdup(fds, nfds * sizeof(GPollFD));
+ e->last_nfds = nfds;
+ }
+ if (e->g_poll_nfds) {
+ ret = g_poll(e->g_poll_fds, e->g_poll_nfds,
+ qemu_timeout_ns_to_ms(timeout));
+ if (ret < 0) {
+ return ret;
+ }
+ /* Sync revents back to original fds */
+ for (i = 0; i < ret; i++) {
+ GPollFD *fd = &fds[e->g_poll_fd_idx[i]];
+ assert(fd->fd == e->g_poll_fds[i].fd);
+ fd->revents = e->g_poll_fds[i].revents;
+ }
+ }
+
+ r = epoll_wait(e->epollfd, events, max_events,
+ qemu_timeout_ns_to_ms(timeout));
+ if (r < 0) {
+ return r;
+ }
+
+ for (i = 0; i < r; i++) {
+ GPollFD *gpfd = events[i].data.ptr;
+ gpfd->revents = io_condition_from_epoll_events(events[i].events);
+ }
+
+ ret += r;
+ return ret;
+}
diff --git a/qemu-timer.c b/qemu-timer.c
index 7336b20..635be98 100644
--- a/qemu-timer.c
+++ b/qemu-timer.c
@@ -309,7 +309,9 @@ int qemu_timeout_ns_to_ms(int64_t ns)
*/
int qemu_poll_ns(AioContext *ctx, GPollFD *fds, guint nfds, int64_t timeout)
{
-#ifdef CONFIG_PPOLL
+#ifdef CONFIG_LINUX
+ return qemu_epoll(ctx, fds, nfds, timeout);
+#elif CONFIG_PPOLL
if (timeout < 0) {
return ppoll((struct pollfd *)fds, nfds, NULL, NULL);
} else {
diff --git a/tests/Makefile b/tests/Makefile
index f5de29c..96b9e4a 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -347,7 +347,7 @@ tests/usb-hcd-ohci-test$(EXESUF): tests/usb-hcd-ohci-test.o
tests/usb-hcd-uhci-test$(EXESUF): tests/usb-hcd-uhci-test.o
tests/usb-hcd-ehci-test$(EXESUF): tests/usb-hcd-ehci-test.o $(libqos-pc-obj-y)
tests/usb-hcd-xhci-test$(EXESUF): tests/usb-hcd-xhci-test.o
-tests/vhost-user-test$(EXESUF): tests/vhost-user-test.o qemu-char.o qemu-timer.o $(qtest-obj-y)
+tests/vhost-user-test$(EXESUF): tests/vhost-user-test.o qemu-char.o qemu-timer.o qemu-epoll.o $(qtest-obj-y)
tests/qemu-iotests/socket_scm_helper$(EXESUF): tests/qemu-iotests/socket_scm_helper.o
tests/test-qemu-opts$(EXESUF): tests/test-qemu-opts.o libqemuutil.a libqemustub.a
--
1.9.3
^ permalink raw reply related [flat|nested] 4+ messages in thread