From: Fam Zheng <famz@redhat.com>
To: qemu-devel@nongnu.org
Cc: kwolf@redhat.com, Paolo Bonzini <pbonzini@redhat.com>,
Stefan Hajnoczi <stefanha@redhat.com>
Subject: [Qemu-devel] [PATCH] main-loop: Use epoll on Linux
Date: Mon, 29 Sep 2014 13:26:29 +0800 [thread overview]
Message-ID: <1411968389-7353-1-git-send-email-famz@redhat.com> (raw)
A new implementation of qemu_poll_ns based on epoll is introduced here
to address the slowness of g_poll and ppoll when the number of fds is
high.
On my laptop this reduces the response time of virtio-blk on top of a
null-aio device from 32 us to 29 us with few fds (~10), and from 48 us
to 32 us with more fds (for example when virtio-serial is plugged and
~64 more io handlers are enabled).
Signed-off-by: Fam Zheng <famz@redhat.com>
---
Makefile.objs | 1 +
include/qemu/main-loop.h | 1 +
qemu-epoll.c | 165 +++++++++++++++++++++++++++++++++++++++++++++++
qemu-timer.c | 4 +-
tests/Makefile | 2 +-
5 files changed, 171 insertions(+), 2 deletions(-)
create mode 100644 qemu-epoll.c
diff --git a/Makefile.objs b/Makefile.objs
index 97db978..52ee086 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -9,6 +9,7 @@ util-obj-y = util/ qobject/ qapi/ qapi-types.o qapi-visit.o qapi-event.o
block-obj-y = async.o thread-pool.o
block-obj-y += nbd.o block.o blockjob.o
block-obj-y += main-loop.o iohandler.o qemu-timer.o
+block-obj-$(CONFIG_LINUX) += qemu-epoll.o
block-obj-$(CONFIG_POSIX) += aio-posix.o
block-obj-$(CONFIG_WIN32) += aio-win32.o
block-obj-y += block/
diff --git a/include/qemu/main-loop.h b/include/qemu/main-loop.h
index 62c68c0..eb01b95 100644
--- a/include/qemu/main-loop.h
+++ b/include/qemu/main-loop.h
@@ -307,5 +307,6 @@ void qemu_iohandler_poll(GArray *pollfds, int rc);
QEMUBH *qemu_bh_new(QEMUBHFunc *cb, void *opaque);
void qemu_bh_schedule_idle(QEMUBH *bh);
+int qemu_epoll(GPollFD *fds, guint nfds, int64_t timeout);
#endif
diff --git a/qemu-epoll.c b/qemu-epoll.c
new file mode 100644
index 0000000..89ec12a
--- /dev/null
+++ b/qemu-epoll.c
@@ -0,0 +1,165 @@
+/*
+ * QEMU Event Loop
+ *
+ * Copyright (c) 2014 Red Hat, Inc.
+ *
+ * Authors:
+ * Fam Zheng <famz@redhat.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <sys/epoll.h>
+#include "qemu/main-loop.h"
+
+/* Tell whether two GPollFD arrays describe different poll sets.
+ *
+ * The sets differ when their lengths differ, when exactly one of the array
+ * pointers is NULL, or when any pair of entries at the same position has a
+ * different fd or a different requested event mask.  The revents fields are
+ * not compared.
+ */
+static bool g_poll_fds_changed(const GPollFD *fds_a, const guint nfds_a,
+                               const GPollFD *fds_b, const guint nfds_b)
+{
+    int pos = 0;
+
+    if (nfds_a != nfds_b || (fds_a == NULL) != (fds_b == NULL)) {
+        return true;
+    }
+    while (pos < nfds_a) {
+        const GPollFD *a = &fds_a[pos];
+        const GPollFD *b = &fds_b[pos];
+
+        if (a->fd != b->fd || a->events != b->events) {
+            return true;
+        }
+        pos++;
+    }
+    return false;
+}
+
+/* Translate an epoll event mask (EPOLL* bits) into the matching GLib
+ * condition bits (G_IO_*).  Bits that are not listed here are dropped.
+ *
+ * This must stay the exact inverse of epoll_event_from_g_poll_fd(): an event
+ * that is requested but not translated back would wake the loop without
+ * ever showing up in revents.
+ */
+static inline int g_io_condition_from_epoll_events(int e)
+{
+    return (e & EPOLLIN ? G_IO_IN : 0) |
+           (e & EPOLLPRI ? G_IO_PRI : 0) |
+           (e & EPOLLOUT ? G_IO_OUT : 0) |
+           (e & EPOLLERR ? G_IO_ERR : 0) |
+           (e & EPOLLHUP ? G_IO_HUP : 0);
+}
+
+/* Fill @event so that it requests the conditions asked for in @fd->events.
+ *
+ * @event: epoll_event to initialise
+ * @fd: poll descriptor to translate; its address is stored in
+ *      @event->data.ptr so the entry can be found again when epoll_wait()
+ *      reports the event.  The pointer is only valid for as long as the
+ *      caller's GPollFD array stays at the same address.
+ *
+ * G_IO_PRI is translated as well, so that users polling for urgent data get
+ * it reported.  G_IO_NVAL is not translated.
+ */
+static inline void epoll_event_from_g_poll_fd(struct epoll_event *event,
+                                              GPollFD *fd)
+{
+    int e = fd->events;
+
+    event->events = (e & G_IO_IN ? EPOLLIN : 0) |
+                    (e & G_IO_PRI ? EPOLLPRI : 0) |
+                    (e & G_IO_OUT ? EPOLLOUT : 0) |
+                    (e & G_IO_ERR ? EPOLLERR : 0) |
+                    (e & G_IO_HUP ? EPOLLHUP : 0);
+    event->data.ptr = fd;
+}
+
+/* Register every entry of @fds with @epollfd and collect the ones that
+ * cannot be registered.
+ *
+ * @epollfd: epoll instance to add the descriptors to
+ * @fds, @nfds: the poll set supplied by the caller
+ * @g_poll_fds: in/out; the previous fallback array is freed and replaced by
+ *              a newly allocated copy of the entries for which epoll_ctl()
+ *              failed (NULL when there are none)
+ * @g_poll_nfds: out; number of entries in *@g_poll_fds
+ * @g_poll_fd_idx: in/out; the previous index array is freed and replaced by
+ *                 a newly allocated array where element n is the position
+ *                 in @fds of fallback entry n (NULL when there are none)
+ *
+ * Returns: @epollfd, unchanged.
+ */
+static int epoll_prepare(int epollfd,
+                         GPollFD *fds, guint nfds,
+                         GPollFD **g_poll_fds,
+                         guint *g_poll_nfds,
+                         int **g_poll_fd_idx)
+{
+    guint i;
+
+    GPollFD *pfds = NULL;
+    guint npfds = 0;
+    int *idx = NULL;
+
+    for (i = 0; i < nfds; i++) {
+        struct epoll_event event;
+
+        epoll_event_from_g_poll_fd(&event, &fds[i]);
+        if (epoll_ctl(epollfd, EPOLL_CTL_ADD, fds[i].fd, &event) == 0) {
+            continue;
+        }
+
+        /* Some fds may not support epoll, fall back and add them to the
+         * array that is handed to g_poll instead */
+        pfds = g_renew(GPollFD, pfds, npfds + 1);
+        pfds[npfds] = fds[i];
+        idx = g_renew(int, idx, npfds + 1);
+        idx[npfds] = (int)i;
+        npfds++;
+    }
+
+    /* Release both arrays from the previous round; dropping only the fd
+     * array would leak the index array on every rebuild. */
+    g_free(*g_poll_fds);
+    g_free(*g_poll_fd_idx);
+    *g_poll_fds = pfds;
+    *g_poll_nfds = npfds;
+    *g_poll_fd_idx = idx;
+
+    return epollfd;
+}
+
+/* poll()-like wait on @fds, implemented with epoll.
+ *
+ * @fds, @nfds: the poll set; revents of every entry is rewritten
+ * @timeout: passed through qemu_timeout_ns_to_ms() to obtain the timeout
+ *           handed to g_poll()/epoll_wait()
+ *
+ * The epoll set is cached between calls and rebuilt only when the poll set
+ * changes.  Descriptors that epoll_ctl() rejects are polled with g_poll().
+ *
+ * Returns: the number of entries with events on success, or the negative
+ * return value of g_poll()/epoll_wait() on failure.
+ *
+ * All cached state lives in function-local statics, so this function is not
+ * reentrant.
+ * NOTE(review): callers polling from several threads would share this state;
+ * confirm that qemu_poll_ns() is only reached from one thread at a time.
+ */
+int qemu_epoll(GPollFD *fds, guint nfds, int64_t timeout)
+{
+    /* Upper bound on events fetched per epoll_wait() call.  Anything beyond
+     * that stays pending in the epoll set and is returned by a later call. */
+    enum { MAX_EVENTS = 40 };
+    /* A copy of last fd array, used to skip epoll_prepare when nothing
+     * changed. */
+    static GPollFD *last_fds;
+    static guint last_nfds;
+    /* Address of the caller's array at the last rebuild.  The epoll set
+     * stores pointers into that array, so a moved array needs a rebuild even
+     * if its contents are the same. */
+    static GPollFD *last_fds_addr;
+    /* An array of fds that failed epoll_ctl and fall back to g_poll. Rare
+     * case. */
+    static GPollFD *g_poll_fds;
+    static guint g_poll_nfds;
+    static int *g_poll_fd_idx;
+    /* g_poll_fds plus one trailing entry for epollfd itself. */
+    static GPollFD *g_poll_scratch;
+    static int epollfd = -1;
+    struct epoll_event events[MAX_EVENTS];
+    int timeout_ms = qemu_timeout_ns_to_ms(timeout);
+    int ret = 0;
+    int r, k;
+    guint i;
+    bool rebuild;
+
+    /* An empty set only needs a rebuild when the previous one was not
+     * empty; comparing contents is pointless (and g_memdup() yields NULL
+     * for it, which must not force a rebuild on every call). */
+    rebuild = epollfd < 0 || nfds != last_nfds ||
+              (nfds != 0 &&
+               (fds != last_fds_addr ||
+                g_poll_fds_changed(fds, nfds, last_fds, last_nfds)));
+    if (rebuild) {
+        if (epollfd >= 0) {
+            close(epollfd);
+        }
+        epollfd = epoll_create(1);
+        if (epollfd < 0) {
+            perror("epoll_create");
+            abort();
+        }
+        epollfd = epoll_prepare(epollfd, fds, nfds, &g_poll_fds, &g_poll_nfds,
+                                &g_poll_fd_idx);
+        g_free(last_fds);
+        last_fds = g_memdup(fds, nfds * sizeof(GPollFD));
+        last_fds_addr = fds;
+        last_nfds = nfds;
+
+        if (g_poll_nfds) {
+            g_poll_scratch = g_renew(GPollFD, g_poll_scratch,
+                                     g_poll_nfds + 1);
+            for (i = 0; i < g_poll_nfds; i++) {
+                g_poll_scratch[i] = g_poll_fds[i];
+            }
+            g_poll_scratch[g_poll_nfds].fd = epollfd;
+            g_poll_scratch[g_poll_nfds].events = G_IO_IN;
+            g_poll_scratch[g_poll_nfds].revents = 0;
+        }
+    }
+
+    /* Like poll(), report "no events" for every fd that is not ready
+     * instead of leaving whatever the caller had in revents. */
+    for (i = 0; i < nfds; i++) {
+        fds[i].revents = 0;
+    }
+
+    if (g_poll_nfds) {
+        /* Wait on the fallback fds and on the epoll fd in one go: an epoll
+         * fd polls readable when it has events pending, so neither set can
+         * starve the other while we sleep. */
+        r = g_poll(g_poll_scratch, g_poll_nfds + 1, timeout_ms);
+        if (r < 0) {
+            return r;
+        }
+        /* Sync revents back to original fds.  All fallback entries are
+         * visited because the ready ones can be anywhere in the array. */
+        for (i = 0; i < g_poll_nfds; i++) {
+            GPollFD *fd = &fds[g_poll_fd_idx[i]];
+            assert(fd->fd == g_poll_scratch[i].fd);
+            fd->revents = g_poll_scratch[i].revents;
+            if (fd->revents) {
+                ret++;
+            }
+        }
+        /* The wait already happened above; only collect what is pending. */
+        timeout_ms = 0;
+    }
+
+    r = epoll_wait(epollfd, events, MAX_EVENTS, timeout_ms);
+    if (r < 0) {
+        return r;
+    }
+
+    for (k = 0; k < r; k++) {
+        GPollFD *gpfd = events[k].data.ptr;
+        gpfd->revents = g_io_condition_from_epoll_events(events[k].events);
+    }
+
+    ret += r;
+    return ret;
+}
diff --git a/qemu-timer.c b/qemu-timer.c
index 00a5d35..c100b8f 100644
--- a/qemu-timer.c
+++ b/qemu-timer.c
@@ -309,7 +309,9 @@ int qemu_timeout_ns_to_ms(int64_t ns)
*/
int qemu_poll_ns(GPollFD *fds, guint nfds, int64_t timeout)
{
-#ifdef CONFIG_PPOLL
+#ifdef CONFIG_LINUX
+ return qemu_epoll(fds, nfds, timeout);
+#elif CONFIG_PPOLL
if (timeout < 0) {
return ppoll((struct pollfd *)fds, nfds, NULL, NULL);
} else {
diff --git a/tests/Makefile b/tests/Makefile
index f5de29c..96b9e4a 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -347,7 +347,7 @@ tests/usb-hcd-ohci-test$(EXESUF): tests/usb-hcd-ohci-test.o
tests/usb-hcd-uhci-test$(EXESUF): tests/usb-hcd-uhci-test.o
tests/usb-hcd-ehci-test$(EXESUF): tests/usb-hcd-ehci-test.o $(libqos-pc-obj-y)
tests/usb-hcd-xhci-test$(EXESUF): tests/usb-hcd-xhci-test.o
-tests/vhost-user-test$(EXESUF): tests/vhost-user-test.o qemu-char.o qemu-timer.o $(qtest-obj-y)
+tests/vhost-user-test$(EXESUF): tests/vhost-user-test.o qemu-char.o qemu-timer.o qemu-epoll.o $(qtest-obj-y)
tests/qemu-iotests/socket_scm_helper$(EXESUF): tests/qemu-iotests/socket_scm_helper.o
tests/test-qemu-opts$(EXESUF): tests/test-qemu-opts.o libqemuutil.a libqemustub.a
--
1.9.3
next reply other threads:[~2014-09-29 5:26 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
2014-09-29 5:26 Fam Zheng [this message]
2014-09-29 9:17 ` [Qemu-devel] [PATCH] main-loop: Use epoll on Linux Fam Zheng
2014-09-29 10:28 ` Stefan Hajnoczi
2014-09-30 2:51 ` Fam Zheng
2014-09-29 10:29 ` Stefan Hajnoczi
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1411968389-7353-1-git-send-email-famz@redhat.com \
--to=famz@redhat.com \
--cc=kwolf@redhat.com \
--cc=pbonzini@redhat.com \
--cc=qemu-devel@nongnu.org \
--cc=stefanha@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).