[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Qemu-devel] [PATCH v3 2/2] main-loop: Use epoll on Linux
From: |
Fam Zheng |
Subject: |
[Qemu-devel] [PATCH v3 2/2] main-loop: Use epoll on Linux |
Date: |
Mon, 27 Oct 2014 15:30:48 +0800 |
A new implementation for qemu_poll_ns based on epoll is introduced here
to address the slowness of g_poll and ppoll when the number of fds is
high.
On my laptop this reduces the response time of virtio-blk on top of a
null-aio device from 32 us to 29 us with few fds (~10), and from 48 us
to 32 us with more fds (for example when virtio-serial is plugged and
~64 more io handlers are enabled).
Signed-off-by: Fam Zheng <address@hidden>
---
Makefile.objs | 1 +
include/qemu/main-loop.h | 1 +
include/qemu/timer.h | 10 +++
qemu-epoll.c | 161 +++++++++++++++++++++++++++++++++++++++++++++++
qemu-timer.c | 4 +-
tests/Makefile | 2 +-
6 files changed, 177 insertions(+), 2 deletions(-)
create mode 100644 qemu-epoll.c
diff --git a/Makefile.objs b/Makefile.objs
index 18fd35c..db4a487 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -9,6 +9,7 @@ util-obj-y = util/ qobject/ qapi/ qapi-types.o qapi-visit.o
qapi-event.o
block-obj-y = async.o thread-pool.o
block-obj-y += nbd.o block.o blockjob.o
block-obj-y += main-loop.o iohandler.o qemu-timer.o
+block-obj-$(CONFIG_LINUX) += qemu-epoll.o
block-obj-$(CONFIG_POSIX) += aio-posix.o
block-obj-$(CONFIG_WIN32) += aio-win32.o
block-obj-y += block/
diff --git a/include/qemu/main-loop.h b/include/qemu/main-loop.h
index 62c68c0..d51bf4d 100644
--- a/include/qemu/main-loop.h
+++ b/include/qemu/main-loop.h
@@ -307,5 +307,6 @@ void qemu_iohandler_poll(GArray *pollfds, int rc);
QEMUBH *qemu_bh_new(QEMUBHFunc *cb, void *opaque);
void qemu_bh_schedule_idle(QEMUBH *bh);
+int qemu_epoll(QEMUPollContext *ctx, GPollFD *fds, guint nfds, int64_t
timeout);
#endif
diff --git a/include/qemu/timer.h b/include/qemu/timer.h
index be2a4a3..4097bc0 100644
--- a/include/qemu/timer.h
+++ b/include/qemu/timer.h
@@ -646,6 +646,16 @@ void timer_put(QEMUFile *f, QEMUTimer *ts);
int qemu_timeout_ns_to_ms(int64_t ns);
typedef struct {
+ /* A copy of the fd array passed to the previous qemu_epoll() call, used
+ * to skip epoll_prepare() when nothing changed. */
+ GPollFD *last_fds;
+ guint last_nfds;
+ /* Fds that epoll_ctl() refused to add; qemu_epoll() polls these with
+ * g_poll() instead. Expected to be rare. */
+ GPollFD *g_poll_fds;
+ guint g_poll_nfds;
+ /* g_poll_fd_idx[i] is the index of g_poll_fds[i] in the caller's array. */
+ int *g_poll_fd_idx;
+ /* The epoll instance created by qemu_epoll(). */
+ int epollfd;
} QEMUPollContext;
/**
diff --git a/qemu-epoll.c b/qemu-epoll.c
new file mode 100644
index 0000000..505a3be
--- /dev/null
+++ b/qemu-epoll.c
@@ -0,0 +1,161 @@
+/*
+ * QEMU Event Loop
+ *
+ * Copyright (c) 2014 Red Hat, Inc.
+ *
+ * Authors:
+ * Fam Zheng <address@hidden>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <sys/epoll.h>
+#include "qemu/main-loop.h"
+
+/*
+ * Compare two GPollFD arrays by the fields that matter for registration.
+ *
+ * Returns true when the arrays differ in length, when exactly one of them
+ * is NULL, or when any pair of entries differs in .fd or .events.  The
+ * .revents field is ignored.  Two NULL arrays of equal length compare as
+ * unchanged.
+ */
+static bool g_poll_fds_changed(const GPollFD *fds_a, const guint nfds_a,
+                               const GPollFD *fds_b, const guint nfds_b)
+{
+    guint i; /* unsigned, to match the guint lengths it is compared with */
+
+    if (nfds_a != nfds_b) {
+        return true;
+    }
+    if (!!fds_a != !!fds_b) {
+        return true;
+    }
+    if (!fds_a) {
+        /* Both arrays are NULL: nothing to compare, and nothing that may
+         * be dereferenced below. */
+        return false;
+    }
+    for (i = 0; i < nfds_a; i++) {
+        if (fds_a[i].fd != fds_b[i].fd ||
+            fds_a[i].events != fds_b[i].events) {
+            return true;
+        }
+    }
+    return false;
+}
+
+/*
+ * Translate an epoll event mask into the matching G_IO_* condition bits.
+ * Only IN, OUT, ERR and HUP are mapped; every other bit of @e is dropped.
+ */
+static inline int io_condition_from_epoll_events(int e)
+{
+    int cond = 0;
+
+    if (e & EPOLLIN) {
+        cond |= G_IO_IN;
+    }
+    if (e & EPOLLOUT) {
+        cond |= G_IO_OUT;
+    }
+    if (e & EPOLLERR) {
+        cond |= G_IO_ERR;
+    }
+    if (e & EPOLLHUP) {
+        cond |= G_IO_HUP;
+    }
+    return cond;
+}
+
+/*
+ * Fill @event from @fd: the requested G_IO_* bits (IN, OUT, ERR, HUP) are
+ * mapped to their EPOLL* counterparts, and the user data pointer is set to
+ * @fd itself so the entry can be found again when the event is reported.
+ */
+static inline void epoll_event_from_g_poll_fd(struct epoll_event *event,
+                                              GPollFD *fd)
+{
+    const int wanted = fd->events;
+
+    event->events = 0;
+    if (wanted & G_IO_IN) {
+        event->events |= EPOLLIN;
+    }
+    if (wanted & G_IO_OUT) {
+        event->events |= EPOLLOUT;
+    }
+    if (wanted & G_IO_ERR) {
+        event->events |= EPOLLERR;
+    }
+    if (wanted & G_IO_HUP) {
+        event->events |= EPOLLHUP;
+    }
+    event->data.ptr = fd;
+}
+
+/*
+ * Register every entry of @fds with the epoll instance @epollfd.
+ *
+ * Each registration carries a pointer to its @fds entry (see
+ * epoll_event_from_g_poll_fd), so @fds must stay allocated and in place for
+ * as long as @epollfd is in use.
+ *
+ * Entries that epoll_ctl() rejects are copied into a newly allocated array
+ * returned in *@g_poll_fds; *@g_poll_fd_idx[k] records the index in @fds of
+ * fallback entry k and *@g_poll_nfds is the number of such entries.  When
+ * there is at least one fallback entry, the array holds one extra trailing
+ * slot (not counted in *@g_poll_nfds) that polls @epollfd itself for input,
+ * so that a single g_poll() call can wait on both sets of fds.  When there
+ * are none, both output pointers are set to NULL.
+ *
+ * Returns @epollfd.
+ */
+static int epoll_prepare(int epollfd,
+                         GPollFD *fds, guint nfds,
+                         GPollFD **g_poll_fds,
+                         guint *g_poll_nfds,
+                         int **g_poll_fd_idx)
+{
+    GPollFD *pfds = NULL;
+    int *idx = NULL;
+    guint npfds = 0;
+    guint i;
+
+    for (i = 0; i < nfds; i++) {
+        struct epoll_event event;
+
+        epoll_event_from_g_poll_fd(&event, &fds[i]);
+        if (epoll_ctl(epollfd, EPOLL_CTL_ADD, fds[i].fd, &event) == 0) {
+            continue;
+        }
+        /* Some fds may not support epoll; fall back and add them to the
+         * g_poll array.  Keep room for the trailing epollfd slot. */
+        pfds = g_renew(GPollFD, pfds, npfds + 2);
+        pfds[npfds] = fds[i];
+        idx = g_renew(int, idx, npfds + 1);
+        idx[npfds] = i;
+        npfds++;
+    }
+
+    if (npfds) {
+        pfds[npfds].fd = epollfd;
+        pfds[npfds].events = G_IO_IN;
+        pfds[npfds].revents = 0;
+    }
+
+    *g_poll_fds = pfds;
+    *g_poll_nfds = npfds;
+    *g_poll_fd_idx = idx;
+
+    return epollfd;
+}
+
+/*
+ * Wait for events on @fds using epoll, with poll()-like results.
+ *
+ * @ctx caches the epoll instance between calls; it is rebuilt only when
+ * @fds/@nfds differ (in .fd or .events) from the previous call.
+ * NOTE(review): this relies on @ctx starting out zero-initialized
+ * (last_fds, g_poll_fds and g_poll_fd_idx all NULL) - confirm at the
+ * allocation site.
+ *
+ * On return, .revents of every entry in @fds is either 0 or the reported
+ * conditions.  @timeout is in nanoseconds and is converted with
+ * qemu_timeout_ns_to_ms().
+ *
+ * Returns the number of entries with events, or a negative value if
+ * g_poll() or epoll_wait() failed.  Aborts if epoll_create() fails.
+ */
+int qemu_epoll(QEMUPollContext *ctx, GPollFD *fds, guint nfds, int64_t timeout)
+{
+    enum { MAX_EVENTS = 40 };
+    struct epoll_event events[MAX_EVENTS];
+    int timeout_ms = qemu_timeout_ns_to_ms(timeout);
+    int ret = 0;
+    int r, e;
+    guint i;
+
+    if (!ctx->last_fds || nfds != ctx->last_nfds ||
+        (nfds && g_poll_fds_changed(fds, nfds,
+                                    ctx->last_fds, ctx->last_nfds))) {
+        if (ctx->last_fds) {
+            close(ctx->epollfd);
+        }
+        ctx->epollfd = epoll_create(1);
+        if (ctx->epollfd < 0) {
+            perror("epoll_create");
+            abort();
+        }
+        g_free(ctx->g_poll_fds);
+        g_free(ctx->g_poll_fd_idx);
+        g_free(ctx->last_fds);
+        /* Always allocate, even for nfds == 0: a non-NULL last_fds is what
+         * marks epollfd as open, so the next rebuild closes it instead of
+         * leaking it. */
+        ctx->last_fds = g_new(GPollFD, nfds ? nfds : 1);
+        for (i = 0; i < nfds; i++) {
+            ctx->last_fds[i] = fds[i];
+        }
+        ctx->last_nfds = nfds;
+        /* Register the private copy rather than the caller's array: the
+         * pointers stored in epoll then stay valid even if a later call
+         * passes an identical array at a different address. */
+        epoll_prepare(ctx->epollfd, ctx->last_fds, nfds,
+                      &ctx->g_poll_fds,
+                      &ctx->g_poll_nfds,
+                      &ctx->g_poll_fd_idx);
+    }
+
+    /* Like poll(): entries without events must read back as 0. */
+    for (i = 0; i < nfds; i++) {
+        fds[i].revents = 0;
+    }
+
+    if (ctx->g_poll_nfds) {
+        /* The slot after the fallback fds polls the epoll fd, so this one
+         * call wakes up for either kind of fd. */
+        r = g_poll(ctx->g_poll_fds, ctx->g_poll_nfds + 1, timeout_ms);
+        if (r < 0) {
+            return r;
+        }
+        /* Sync revents back to original fds */
+        for (i = 0; i < ctx->g_poll_nfds; i++) {
+            GPollFD *fd;
+
+            if (!ctx->g_poll_fds[i].revents) {
+                continue;
+            }
+            fd = &fds[ctx->g_poll_fd_idx[i]];
+            assert(fd->fd == ctx->g_poll_fds[i].fd);
+            fd->revents = ctx->g_poll_fds[i].revents;
+            ret++;
+        }
+        /* The wait already happened above; only collect epoll events. */
+        timeout_ms = 0;
+    }
+
+    r = epoll_wait(ctx->epollfd, events, MAX_EVENTS, timeout_ms);
+    if (r < 0) {
+        return r;
+    }
+
+    for (e = 0; e < r; e++) {
+        /* data.ptr points into ctx->last_fds; map it back to the same
+         * index in the caller's array. */
+        GPollFD *gpfd = events[e].data.ptr;
+
+        fds[gpfd - ctx->last_fds].revents =
+            io_condition_from_epoll_events(events[e].events);
+    }
+
+    ret += r;
+    return ret;
+}
diff --git a/qemu-timer.c b/qemu-timer.c
index fe78fdf..fbb9ded 100644
--- a/qemu-timer.c
+++ b/qemu-timer.c
@@ -310,7 +310,9 @@ int qemu_timeout_ns_to_ms(int64_t ns)
int qemu_poll_ns(QEMUPollContext *ctx, GPollFD *fds,
guint nfds, int64_t timeout)
{
-#ifdef CONFIG_PPOLL
+#ifdef CONFIG_LINUX
+ return qemu_epoll(ctx, fds, nfds, timeout);
+#elif CONFIG_PPOLL
if (timeout < 0) {
return ppoll((struct pollfd *)fds, nfds, NULL, NULL);
} else {
diff --git a/tests/Makefile b/tests/Makefile
index 16f0e4c..575ffd2 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -348,7 +348,7 @@ tests/usb-hcd-ohci-test$(EXESUF): tests/usb-hcd-ohci-test.o
$(libqos-usb-obj-y)
tests/usb-hcd-uhci-test$(EXESUF): tests/usb-hcd-uhci-test.o $(libqos-usb-obj-y)
tests/usb-hcd-ehci-test$(EXESUF): tests/usb-hcd-ehci-test.o $(libqos-usb-obj-y)
tests/usb-hcd-xhci-test$(EXESUF): tests/usb-hcd-xhci-test.o $(libqos-usb-obj-y)
-tests/vhost-user-test$(EXESUF): tests/vhost-user-test.o qemu-char.o
qemu-timer.o $(qtest-obj-y)
+tests/vhost-user-test$(EXESUF): tests/vhost-user-test.o qemu-char.o
qemu-timer.o qemu-epoll.o $(qtest-obj-y)
tests/qemu-iotests/socket_scm_helper$(EXESUF):
tests/qemu-iotests/socket_scm_helper.o
tests/test-qemu-opts$(EXESUF): tests/test-qemu-opts.o libqemuutil.a
libqemustub.a
--
1.9.3