* Re: [PATCH] Switch to poll in xenconsoled's io loop.
2013-01-03 17:14 [PATCH] Switch to poll in xenconsoled's io loop Wei Liu
@ 2013-01-03 18:22 ` Mats Petersson
2013-01-04 12:30 ` Wei Liu
2013-01-04 15:58 ` [PATCH V2] Switch from select() to poll() in xenconsoled's IO loop Wei Liu
` (2 subsequent siblings)
3 siblings, 1 reply; 21+ messages in thread
From: Mats Petersson @ 2013-01-03 18:22 UTC (permalink / raw)
To: xen-devel
On 03/01/13 17:14, Wei Liu wrote:
> The original implementation utilies select(). In Linux select() typically
> supports up to 1024 file descriptors. This can be a problem when user tries to
> boot up many guests. Switching to poll() has minimum impact on existing code
> and has better scalibility.
>
> Up to 8192 file descriptors are supported in the current implementation.
>
> Signed-off-by: Wei Liu <wei.liu2@citrix.com>
> ---
> tools/console/daemon/io.c | 90 +++++++++++++++++++++++++--------------------
> 1 file changed, 50 insertions(+), 40 deletions(-)
>
> diff --git a/tools/console/daemon/io.c b/tools/console/daemon/io.c
> index 48fe151..4e3c55c 100644
> --- a/tools/console/daemon/io.c
> +++ b/tools/console/daemon/io.c
> @@ -28,7 +28,7 @@
> #include <stdlib.h>
> #include <errno.h>
> #include <string.h>
> -#include <sys/select.h>
> +#include <poll.h>
> #include <fcntl.h>
> #include <unistd.h>
> #include <termios.h>
> @@ -930,7 +930,6 @@ static void handle_log_reload(void)
>
> void handle_io(void)
> {
> - fd_set readfds, writefds;
> int ret;
>
> if (log_hv) {
> @@ -959,21 +958,33 @@ void handle_io(void)
>
> for (;;) {
> struct domain *d, *n;
> - int max_fd = -1;
> - struct timeval timeout;
> + int poll_timeout; /* timeout in milliseconds */
> struct timespec ts;
> long long now, next_timeout = 0;
>
> - FD_ZERO(&readfds);
> - FD_ZERO(&writefds);
> -
> - FD_SET(xs_fileno(xs), &readfds);
> - max_fd = MAX(xs_fileno(xs), max_fd);
> -
> - if (log_hv) {
> - FD_SET(xc_evtchn_fd(xce_handle), &readfds);
> - max_fd = MAX(xc_evtchn_fd(xce_handle), max_fd);
> - }
> +#define MAX_POLL_FDS 8192
> + static struct pollfd fds[MAX_POLL_FDS];
> + static struct pollfd *fd_to_pollfd[MAX_POLL_FDS];
> + int nr_fds;
> +#define SET_FDS(_fd, _events) do { \
> + if (_fd >= MAX_POLL_FDS) \
> + break; \
> + fds[nr_fds].fd = (_fd); \
> + fds[nr_fds].events = (_events); \
> + fd_to_pollfd[(_fd)] = &fds[nr_fds]; \
> + nr_fds++; \
> + } while (0)
> +#define FD_REVENTS(_fd) (((_fd) < MAX_POLL_FDS && fd_to_pollfd[(_fd)]) ? \
> + fd_to_pollfd[(_fd)]->revents : 0)
> +
> + nr_fds = 0;
> + memset(fds, 0, sizeof(fds));
> + memset(fd_to_pollfd, 0, sizeof(fd_to_pollfd));
> +
> + SET_FDS(xs_fileno(xs), POLLIN);
> +
> + if (log_hv)
> + SET_FDS(xc_evtchn_fd(xce_handle), POLLIN);
>
Would it not make sense to use dynamically allocated memory instead -
that way, when we run out of 8192, there is nothing to change.
At the very least, can we have some sort of error output when it goes
above the limit, so that we don't just silently "miss" a few consoles
out from the poll.
--
Mats
> if (clock_gettime(CLOCK_MONOTONIC, &ts) < 0)
> return;
> @@ -982,11 +993,7 @@ void handle_io(void)
> /* Re-calculate any event counter allowances & unblock
> domains with new allowance */
> for (d = dom_head; d; d = d->next) {
> - /* Add 5ms of fuzz since select() often returns
> - a couple of ms sooner than requested. Without
> - the fuzz we typically do an extra spin in select()
> - with a 1/2 ms timeout every other iteration */
> - if ((now+5) > d->next_period) {
> + if (now > d->next_period) {
> d->next_period = now + RATE_LIMIT_PERIOD;
> if (d->event_count >= RATE_LIMIT_ALLOWANCE) {
> (void)xc_evtchn_unmask(d->xce_handle, d->local_port);
> @@ -1006,74 +1013,73 @@ void handle_io(void)
> !d->buffer.max_capacity ||
> d->buffer.size < d->buffer.max_capacity) {
> int evtchn_fd = xc_evtchn_fd(d->xce_handle);
> - FD_SET(evtchn_fd, &readfds);
> - max_fd = MAX(evtchn_fd, max_fd);
> + SET_FDS(evtchn_fd, POLLIN);
> }
> }
>
> if (d->master_fd != -1) {
> + short events = 0;
> if (!d->is_dead && ring_free_bytes(d))
> - FD_SET(d->master_fd, &readfds);
> + events |= POLLIN;
>
> if (!buffer_empty(&d->buffer))
> - FD_SET(d->master_fd, &writefds);
> - max_fd = MAX(d->master_fd, max_fd);
> + events |= POLLOUT;
> +
> + if (events)
> + SET_FDS(d->master_fd, events);
> }
> }
>
> /* If any domain has been rate limited, we need to work
> - out what timeout to supply to select */
> + out what timeout to supply to poll */
> if (next_timeout) {
> long long duration = (next_timeout - now);
> if (duration <= 0) /* sanity check */
> duration = 1;
> - timeout.tv_sec = duration / 1000;
> - timeout.tv_usec = ((duration - (timeout.tv_sec * 1000))
> - * 1000);
> + poll_timeout = (int)duration;
> }
>
> - ret = select(max_fd + 1, &readfds, &writefds, 0,
> - next_timeout ? &timeout : NULL);
> + ret = poll(fds, nr_fds, next_timeout ? poll_timeout : -1);
>
> if (log_reload) {
> handle_log_reload();
> log_reload = 0;
> }
>
> - /* Abort if select failed, except for EINTR cases
> + /* Abort if poll failed, except for EINTR cases
> which indicate a possible log reload */
> if (ret == -1) {
> if (errno == EINTR)
> continue;
> - dolog(LOG_ERR, "Failure in select: %d (%s)",
> + dolog(LOG_ERR, "Failure in poll: %d (%s)",
> errno, strerror(errno));
> break;
> }
>
> - if (log_hv && FD_ISSET(xc_evtchn_fd(xce_handle), &readfds))
> + if (log_hv && FD_REVENTS(xc_evtchn_fd(xce_handle)) & POLLIN)
> handle_hv_logs();
>
> if (ret <= 0)
> continue;
>
> - if (FD_ISSET(xs_fileno(xs), &readfds))
> + if (FD_REVENTS(xs_fileno(xs)) & POLLIN)
> handle_xs();
>
> for (d = dom_head; d; d = n) {
> n = d->next;
> if (d->event_count < RATE_LIMIT_ALLOWANCE) {
> if (d->xce_handle != NULL &&
> - FD_ISSET(xc_evtchn_fd(d->xce_handle),
> - &readfds))
> + FD_REVENTS(xc_evtchn_fd(d->xce_handle)) &
> + POLLIN)
> handle_ring_read(d);
> }
>
> - if (d->master_fd != -1 && FD_ISSET(d->master_fd,
> - &readfds))
> + if (d->master_fd != -1 &&
> + FD_REVENTS(d->master_fd) & POLLIN)
> handle_tty_read(d);
>
> - if (d->master_fd != -1 && FD_ISSET(d->master_fd,
> - &writefds))
> + if (d->master_fd != -1 &&
> + FD_REVENTS(d->master_fd) & POLLOUT)
> handle_tty_write(d);
>
> if (d->last_seen != enum_pass)
> @@ -1082,6 +1088,10 @@ void handle_io(void)
> if (d->is_dead)
> cleanup_domain(d);
> }
> +
> +#undef MAX_POLL_FDS
> +#undef SET_FDS
> +#undef FD_REVENTS
> }
>
> out:
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] Switch to poll in xenconsoled's io loop.
2013-01-03 18:22 ` Mats Petersson
@ 2013-01-04 12:30 ` Wei Liu
0 siblings, 0 replies; 21+ messages in thread
From: Wei Liu @ 2013-01-04 12:30 UTC (permalink / raw)
To: Mats Petersson; +Cc: wei.liu2, xen-devel@lists.xen.org
On Thu, 2013-01-03 at 18:22 +0000, Mats Petersson wrote:
> On 03/01/13 17:14, Wei Liu wrote:
> > The original implementation utilies select(). In Linux select() typically
> > supports up to 1024 file descriptors. This can be a problem when user tries to
> > boot up many guests. Switching to poll() has minimum impact on existing code
> > and has better scalibility.
> >
> > Up to 8192 file descriptors are supported in the current implementation.
> >
> > Signed-off-by: Wei Liu <wei.liu2@citrix.com>
> > ---
> > tools/console/daemon/io.c | 90 +++++++++++++++++++++++++--------------------
> > 1 file changed, 50 insertions(+), 40 deletions(-)
> >
> > diff --git a/tools/console/daemon/io.c b/tools/console/daemon/io.c
> > index 48fe151..4e3c55c 100644
> > --- a/tools/console/daemon/io.c
> > +++ b/tools/console/daemon/io.c
> > @@ -28,7 +28,7 @@
> > #include <stdlib.h>
> > #include <errno.h>
> > #include <string.h>
> > -#include <sys/select.h>
> > +#include <poll.h>
> > #include <fcntl.h>
> > #include <unistd.h>
> > #include <termios.h>
> > @@ -930,7 +930,6 @@ static void handle_log_reload(void)
> >
> > void handle_io(void)
> > {
> > - fd_set readfds, writefds;
> > int ret;
> >
> > if (log_hv) {
> > @@ -959,21 +958,33 @@ void handle_io(void)
> >
> > for (;;) {
> > struct domain *d, *n;
> > - int max_fd = -1;
> > - struct timeval timeout;
> > + int poll_timeout; /* timeout in milliseconds */
> > struct timespec ts;
> > long long now, next_timeout = 0;
> >
> > - FD_ZERO(&readfds);
> > - FD_ZERO(&writefds);
> > -
> > - FD_SET(xs_fileno(xs), &readfds);
> > - max_fd = MAX(xs_fileno(xs), max_fd);
> > -
> > - if (log_hv) {
> > - FD_SET(xc_evtchn_fd(xce_handle), &readfds);
> > - max_fd = MAX(xc_evtchn_fd(xce_handle), max_fd);
> > - }
> > +#define MAX_POLL_FDS 8192
> > + static struct pollfd fds[MAX_POLL_FDS];
> > + static struct pollfd *fd_to_pollfd[MAX_POLL_FDS];
> > + int nr_fds;
> > +#define SET_FDS(_fd, _events) do { \
> > + if (_fd >= MAX_POLL_FDS) \
> > + break; \
> > + fds[nr_fds].fd = (_fd); \
> > + fds[nr_fds].events = (_events); \
> > + fd_to_pollfd[(_fd)] = &fds[nr_fds]; \
> > + nr_fds++; \
> > + } while (0)
> > +#define FD_REVENTS(_fd) (((_fd) < MAX_POLL_FDS && fd_to_pollfd[(_fd)]) ? \
> > + fd_to_pollfd[(_fd)]->revents : 0)
> > +
> > + nr_fds = 0;
> > + memset(fds, 0, sizeof(fds));
> > + memset(fd_to_pollfd, 0, sizeof(fd_to_pollfd));
> > +
> > + SET_FDS(xs_fileno(xs), POLLIN);
> > +
> > + if (log_hv)
> > + SET_FDS(xc_evtchn_fd(xce_handle), POLLIN);
> >
> Would it not make sense to use dynamically allocated memory instead -
> that way, when we run out of 8192, there is nothing to change.
Writing a new version to use dynamically allocated memory.
Wei.
^ permalink raw reply [flat|nested] 21+ messages in thread
* [PATCH V2] Switch from select() to poll() in xenconsoled's IO loop.
2013-01-03 17:14 [PATCH] Switch to poll in xenconsoled's io loop Wei Liu
2013-01-03 18:22 ` Mats Petersson
@ 2013-01-04 15:58 ` Wei Liu
2013-01-04 16:08 ` Ian Campbell
2013-01-04 17:17 ` [PATCH V3] " Wei Liu
2013-01-07 14:28 ` [PATCH V4] " Wei Liu
3 siblings, 1 reply; 21+ messages in thread
From: Wei Liu @ 2013-01-04 15:58 UTC (permalink / raw)
To: xen-devel@lists.xen.org; +Cc: wei.liu2
In Linux select() typically supports up to 1024 file descriptors. This can be
a problem when user tries to boot up many guests. Switching to poll() has
minimum impact on existing code and has better scalibility.
Tracking arrays are dynamically allocated / reallocated. If the tracking
arrays fail to expand, we just ignore the incoming fd.
Signed-off-by: Wei Liu <wei.liu2@citrix.com>
---
tools/console/daemon/io.c | 159 ++++++++++++++++++++++++++++++++++-----------
1 file changed, 121 insertions(+), 38 deletions(-)
diff --git a/tools/console/daemon/io.c b/tools/console/daemon/io.c
index 48fe151..830fc18 100644
--- a/tools/console/daemon/io.c
+++ b/tools/console/daemon/io.c
@@ -28,7 +28,7 @@
#include <stdlib.h>
#include <errno.h>
#include <string.h>
-#include <sys/select.h>
+#include <poll.h>
#include <fcntl.h>
#include <unistd.h>
#include <termios.h>
@@ -928,9 +928,98 @@ static void handle_log_reload(void)
}
}
+
+/* Should have at least max_fd + 1 elements */
+#define DEFAULT_ARRAY_SIZE 1024
+#define GROWTH_LENGTH 512
+static struct pollfd *fds;
+static struct pollfd **fd_to_pollfd;
+static unsigned int current_array_size;
+static unsigned int nr_fds;
+
+static int initialize_pollfd_arrays(void)
+{
+ fds = (struct pollfd *)
+ malloc(sizeof(struct pollfd) * DEFAULT_ARRAY_SIZE);
+ if (!fds)
+ goto fail;
+ fd_to_pollfd = (struct pollfd **)
+ malloc(sizeof(struct pollfd *) * DEFAULT_ARRAY_SIZE);
+ if (!fd_to_pollfd)
+ goto fail;
+ memset(fds, 0, sizeof(struct pollfd) * DEFAULT_ARRAY_SIZE);
+ memset(fd_to_pollfd, 0, sizeof(struct pollfd *) * DEFAULT_ARRAY_SIZE);
+ current_array_size = DEFAULT_ARRAY_SIZE;
+ return 0;
+fail:
+ free(fds);
+ free(fd_to_pollfd);
+ return -ENOMEM;
+}
+
+static void destroy_pollfd_arrays(void)
+{
+ free(fds);
+ free(fd_to_pollfd);
+ current_array_size = 0;
+}
+
+static void set_fds(int fd, short events)
+{
+ if (current_array_size < fd+1) {
+ struct pollfd *p1 = NULL;
+ struct pollfd **p2 = NULL;
+ unsigned int newsize = current_array_size;
+
+ do { newsize += GROWTH_LENGTH; } while (newsize < fd+1);
+
+ p1 = realloc(fds, sizeof(struct pollfd)*newsize);
+ if (!p1)
+ goto fail;
+ fds = p1;
+
+ p2 = realloc(fd_to_pollfd, sizeof(struct pollfd *)*newsize);
+ if (!p2)
+ goto fail;
+ fd_to_pollfd = p2;
+
+ memset(&fds[0] + current_array_size, 0,
+ sizeof(struct pollfd) * (newsize-current_array_size));
+ memset(&fd_to_pollfd[0] + current_array_size, 0,
+ sizeof(struct pollfd *) * (newsize-current_array_size));
+ current_array_size = newsize;
+ }
+
+ fds[nr_fds].fd = fd;
+ fds[nr_fds].events = events;
+ fd_to_pollfd[fd] = &fds[nr_fds];
+ nr_fds++;
+
+ return;
+fail:
+ dolog(LOG_ERR, "realloc failed, ignoring fd %d\n", fd);
+ return;
+}
+
+static short fd_revents(int fd)
+{
+ if (fd >= current_array_size)
+ return 0;
+ if (fd_to_pollfd[fd] == NULL)
+ return 0;
+ return fd_to_pollfd[fd]->revents;
+}
+
+static void reset_fds(void)
+{
+ nr_fds = 0;
+ memset(fds, 0, sizeof(struct pollfd) * current_array_size);
+ memset(fd_to_pollfd, 0,
+ sizeof(struct pollfd *) * current_array_size);
+}
+
void handle_io(void)
{
- fd_set readfds, writefds;
int ret;
if (log_hv) {
@@ -957,23 +1046,21 @@ void handle_io(void)
}
}
+ if (initialize_pollfd_arrays())
+ goto out;
+
for (;;) {
struct domain *d, *n;
- int max_fd = -1;
- struct timeval timeout;
+ int poll_timeout; /* timeout in milliseconds */
struct timespec ts;
long long now, next_timeout = 0;
- FD_ZERO(&readfds);
- FD_ZERO(&writefds);
+ reset_fds();
- FD_SET(xs_fileno(xs), &readfds);
- max_fd = MAX(xs_fileno(xs), max_fd);
+ set_fds(xs_fileno(xs), POLLIN);
- if (log_hv) {
- FD_SET(xc_evtchn_fd(xce_handle), &readfds);
- max_fd = MAX(xc_evtchn_fd(xce_handle), max_fd);
- }
+ if (log_hv)
+ set_fds(xc_evtchn_fd(xce_handle), POLLIN);
if (clock_gettime(CLOCK_MONOTONIC, &ts) < 0)
return;
@@ -982,11 +1069,7 @@ void handle_io(void)
/* Re-calculate any event counter allowances & unblock
domains with new allowance */
for (d = dom_head; d; d = d->next) {
- /* Add 5ms of fuzz since select() often returns
- a couple of ms sooner than requested. Without
- the fuzz we typically do an extra spin in select()
- with a 1/2 ms timeout every other iteration */
- if ((now+5) > d->next_period) {
+ if (now > d->next_period) {
d->next_period = now + RATE_LIMIT_PERIOD;
if (d->event_count >= RATE_LIMIT_ALLOWANCE) {
(void)xc_evtchn_unmask(d->xce_handle, d->local_port);
@@ -1006,74 +1089,73 @@ void handle_io(void)
!d->buffer.max_capacity ||
d->buffer.size < d->buffer.max_capacity) {
int evtchn_fd = xc_evtchn_fd(d->xce_handle);
- FD_SET(evtchn_fd, &readfds);
- max_fd = MAX(evtchn_fd, max_fd);
+ set_fds(evtchn_fd, POLLIN);
}
}
if (d->master_fd != -1) {
+ short events = 0;
if (!d->is_dead && ring_free_bytes(d))
- FD_SET(d->master_fd, &readfds);
+ events |= POLLIN;
if (!buffer_empty(&d->buffer))
- FD_SET(d->master_fd, &writefds);
- max_fd = MAX(d->master_fd, max_fd);
+ events |= POLLOUT;
+
+ if (events)
+ set_fds(d->master_fd, events);
}
}
/* If any domain has been rate limited, we need to work
- out what timeout to supply to select */
+ out what timeout to supply to poll */
if (next_timeout) {
long long duration = (next_timeout - now);
if (duration <= 0) /* sanity check */
duration = 1;
- timeout.tv_sec = duration / 1000;
- timeout.tv_usec = ((duration - (timeout.tv_sec * 1000))
- * 1000);
+ poll_timeout = (int)duration;
}
- ret = select(max_fd + 1, &readfds, &writefds, 0,
- next_timeout ? &timeout : NULL);
+ ret = poll(fds, nr_fds, next_timeout ? poll_timeout : -1);
if (log_reload) {
handle_log_reload();
log_reload = 0;
}
- /* Abort if select failed, except for EINTR cases
+ /* Abort if poll failed, except for EINTR cases
which indicate a possible log reload */
if (ret == -1) {
if (errno == EINTR)
continue;
- dolog(LOG_ERR, "Failure in select: %d (%s)",
+ dolog(LOG_ERR, "Failure in poll: %d (%s)",
errno, strerror(errno));
break;
}
- if (log_hv && FD_ISSET(xc_evtchn_fd(xce_handle), &readfds))
+ if (log_hv && fd_revents(xc_evtchn_fd(xce_handle)) & POLLIN)
handle_hv_logs();
if (ret <= 0)
continue;
- if (FD_ISSET(xs_fileno(xs), &readfds))
+ if (fd_revents(xs_fileno(xs)) & POLLIN)
handle_xs();
for (d = dom_head; d; d = n) {
n = d->next;
if (d->event_count < RATE_LIMIT_ALLOWANCE) {
if (d->xce_handle != NULL &&
- FD_ISSET(xc_evtchn_fd(d->xce_handle),
- &readfds))
+ fd_revents(xc_evtchn_fd(d->xce_handle)) &
+ POLLIN)
handle_ring_read(d);
}
- if (d->master_fd != -1 && FD_ISSET(d->master_fd,
- &readfds))
+ if (d->master_fd != -1 &&
+ fd_revents(d->master_fd) & POLLIN)
handle_tty_read(d);
- if (d->master_fd != -1 && FD_ISSET(d->master_fd,
- &writefds))
+ if (d->master_fd != -1 &&
+ fd_revents(d->master_fd) & POLLOUT)
handle_tty_write(d);
if (d->last_seen != enum_pass)
@@ -1084,6 +1166,7 @@ void handle_io(void)
}
}
+ destroy_pollfd_arrays();
out:
if (log_hv_fd != -1) {
close(log_hv_fd);
--
1.7.10.4
^ permalink raw reply related [flat|nested] 21+ messages in thread
* Re: [PATCH V2] Switch from select() to poll() in xenconsoled's IO loop.
2013-01-04 15:58 ` [PATCH V2] Switch from select() to poll() in xenconsoled's IO loop Wei Liu
@ 2013-01-04 16:08 ` Ian Campbell
2013-01-04 16:38 ` Wei Liu
0 siblings, 1 reply; 21+ messages in thread
From: Ian Campbell @ 2013-01-04 16:08 UTC (permalink / raw)
To: Wei Liu; +Cc: xen-devel@lists.xen.org
> +static int initialize_pollfd_arrays(void)
> +{
> + fds = (struct pollfd *)
> + malloc(sizeof(struct pollfd) * DEFAULT_ARRAY_SIZE);
> + if (!fds)
> + goto fail;
> + fd_to_pollfd = (struct pollfd **)
> + malloc(sizeof(struct pollfd *) * DEFAULT_ARRAY_SIZE);
> + if (!fd_to_pollfd)
> + goto fail;
> + memset(fds, 0, sizeof(struct pollfd) * DEFAULT_ARRAY_SIZE);
> + memset(fd_to_pollfd, 0, sizeof(struct pollfd *) * DEFAULT_ARRAY_SIZE);
> + current_array_size = DEFAULT_ARRAY_SIZE;
> + return 0;
> +fail:
> + free(fds);
> + free(fd_to_pollfd);
> + return -ENOMEM;
> +}
> +
> +static void destroy_pollfd_arrays(void)
> +{
> + free(fds);
> + free(fd_to_pollfd);
> + current_array_size = 0;
> +}
> +
> +static void set_fds(int fd, short events)
> +{
> + if (current_array_size < fd+1) {
> + struct pollfd *p1 = NULL;
> + struct pollfd **p2 = NULL;
> + unsigned int newsize = current_array_size;
> +
> + do { newsize += GROWTH_LENGTH; } while (newsize < fd+1);
Steal #define ROUNDUP from tools/libxc/xc_linux_osdep.c.
> +
> + p1 = realloc(fds, sizeof(struct pollfd)*newsize);
> + if (!p1)
> + goto fail;
> + fds = p1;
> +
> + p2 = realloc(fd_to_pollfd, sizeof(struct pollfd *)*newsize);
realloc(NULL, ...) is the same as malloc() so I think you can initialise
current_array_size to 0 and the various pointers to NULL and avoid the
need for initialize_pollfd_arrays -- i.e. it will just grow from 0 on
the first use here.
> for (;;) {
> struct domain *d, *n;
> - int max_fd = -1;
> - struct timeval timeout;
> + int poll_timeout; /* timeout in milliseconds */
> struct timespec ts;
> long long now, next_timeout = 0;
>
> - FD_ZERO(&readfds);
> - FD_ZERO(&writefds);
> + reset_fds();
>
> - FD_SET(xs_fileno(xs), &readfds);
> - max_fd = MAX(xs_fileno(xs), max_fd);
> + set_fds(xs_fileno(xs), POLLIN);
Do you know, or can you track, the maximum fd over time?
If you can then you could likely make use of automatic stack allocations
(struct pollfd fds[max_fd]) and therefore avoid the pain of manual
memory management.
Not sure what the semantics of those are inside a for loop where max_fd
can change but worst case you could put the content of the loop into a
function.
Ian.
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH V2] Switch from select() to poll() in xenconsoled's IO loop.
2013-01-04 16:08 ` Ian Campbell
@ 2013-01-04 16:38 ` Wei Liu
2013-01-04 16:51 ` Mats Petersson
0 siblings, 1 reply; 21+ messages in thread
From: Wei Liu @ 2013-01-04 16:38 UTC (permalink / raw)
To: Ian Campbell; +Cc: wei.liu2, xen-devel@lists.xen.org
On Fri, 2013-01-04 at 16:08 +0000, Ian Campbell wrote:
> > +static int initialize_pollfd_arrays(void)
> > +{
> > + fds = (struct pollfd *)
> > + malloc(sizeof(struct pollfd) * DEFAULT_ARRAY_SIZE);
> > + if (!fds)
> > + goto fail;
> > + fd_to_pollfd = (struct pollfd **)
> > + malloc(sizeof(struct pollfd *) * DEFAULT_ARRAY_SIZE);
> > + if (!fd_to_pollfd)
> > + goto fail;
> > + memset(fds, 0, sizeof(struct pollfd) * DEFAULT_ARRAY_SIZE);
> > + memset(fd_to_pollfd, 0, sizeof(struct pollfd *) * DEFAULT_ARRAY_SIZE);
> > + current_array_size = DEFAULT_ARRAY_SIZE;
> > + return 0;
> > +fail:
> > + free(fds);
> > + free(fd_to_pollfd);
> > + return -ENOMEM;
> > +}
> > +
> > +static void destroy_pollfd_arrays(void)
> > +{
> > + free(fds);
> > + free(fd_to_pollfd);
> > + current_array_size = 0;
> > +}
> > +
> > +static void set_fds(int fd, short events)
> > +{
> > + if (current_array_size < fd+1) {
> > + struct pollfd *p1 = NULL;
> > + struct pollfd **p2 = NULL;
> > + unsigned int newsize = current_array_size;
> > +
> > + do { newsize += GROWTH_LENGTH; } while (newsize < fd+1);
>
> Steal #define ROUNDUP from tools/libxc/xc_linux_osdep.c.
> > +
> > + p1 = realloc(fds, sizeof(struct pollfd)*newsize);
> > + if (!p1)
> > + goto fail;
> > + fds = p1;
> > +
> > + p2 = realloc(fd_to_pollfd, sizeof(struct pollfd *)*newsize);
>
> realloc(NULL, ...) is the same as malloc() so I think you can initialise
> current_array_size to 0 and the various pointers to NULL and avoid the
> need for initialize_pollfd_arrays -- i.e. it will just grow from 0 on
> the first use here.
>
Huh, good idea.
> > for (;;) {
> > struct domain *d, *n;
> > - int max_fd = -1;
> > - struct timeval timeout;
> > + int poll_timeout; /* timeout in milliseconds */
> > struct timespec ts;
> > long long now, next_timeout = 0;
> >
> > - FD_ZERO(&readfds);
> > - FD_ZERO(&writefds);
> > + reset_fds();
> >
> > - FD_SET(xs_fileno(xs), &readfds);
> > - max_fd = MAX(xs_fileno(xs), max_fd);
> > + set_fds(xs_fileno(xs), POLLIN);
>
> Do you know, or can you track, the maximum fd over time?
> If you can then you could likely make use of automatic stack allocations
> (struct pollfd fds[max_fd]) and therefore avoid the pain of manual
> memory management.
Stack size is subject to system setting. Typically it is limited to 8MB
in Linux as shown by `ulimit -s`. This is actually very small compared
to heap space.
Of course we can make xenconsoled special among all the processes... But
that's not ideal.
Wei.
> Not sure what the semantics of those are inside a for loop where max_fd
> can change but worst case you could put the content of the loop into a
> function.
>
> Ian.
>
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH V2] Switch from select() to poll() in xenconsoled's IO loop.
2013-01-04 16:38 ` Wei Liu
@ 2013-01-04 16:51 ` Mats Petersson
0 siblings, 0 replies; 21+ messages in thread
From: Mats Petersson @ 2013-01-04 16:51 UTC (permalink / raw)
To: xen-devel
On 04/01/13 16:38, Wei Liu wrote:
> On Fri, 2013-01-04 at 16:08 +0000, Ian Campbell wrote:
>>> +static int initialize_pollfd_arrays(void)
>>> +{
>>> + fds = (struct pollfd *)
>>> + malloc(sizeof(struct pollfd) * DEFAULT_ARRAY_SIZE);
>>> + if (!fds)
>>> + goto fail;
>>> + fd_to_pollfd = (struct pollfd **)
>>> + malloc(sizeof(struct pollfd *) * DEFAULT_ARRAY_SIZE);
I believe it's considered "bad" to cast the result of malloc... Unless
you are using a C++ compiler to compile C - in which case it's
necessary. But then it should really be converted to "new" and "delete"
where relevant. [Although that does cause problems with realloc!]
I do notice you are not casting realloc, so I expect it's safe to remove
the casts here too.
Of course, if you remove these calls to malloc and just use realloc in
the first place, you can ignore this comment! ;)
>>> + if (!fd_to_pollfd)
>>> + goto fail;
>>> + memset(fds, 0, sizeof(struct pollfd) * DEFAULT_ARRAY_SIZE);
>>> + memset(fd_to_pollfd, 0, sizeof(struct pollfd *) * DEFAULT_ARRAY_SIZE);
>>> + current_array_size = DEFAULT_ARRAY_SIZE;
>>> + return 0;
>>> +fail:
>>> + free(fds);
>>> + free(fd_to_pollfd);
>>> + return -ENOMEM;
>>> +}
>>> +
>>> +static void destroy_pollfd_arrays(void)
>>> +{
>>> + free(fds);
>>> + free(fd_to_pollfd);
>>> + current_array_size = 0;
>>> +}
>>> +
>>> +static void set_fds(int fd, short events)
>>> +{
>>> + if (current_array_size < fd+1) {
>>> + struct pollfd *p1 = NULL;
>>> + struct pollfd **p2 = NULL;
>>> + unsigned int newsize = current_array_size;
>>> +
>>> + do { newsize += GROWTH_LENGTH; } while (newsize < fd+1);
>> Steal #define ROUNDUP from tools/libxc/xc_linux_osdep.c.
>>> +
>>> + p1 = realloc(fds, sizeof(struct pollfd)*newsize);
>>> + if (!p1)
>>> + goto fail;
>>> + fds = p1;
>>> +
>>> + p2 = realloc(fd_to_pollfd, sizeof(struct pollfd *)*newsize);
>> realloc(NULL, ...) is the same as malloc() so I think you can initialise
>> current_array_size to 0 and the various pointers to NULL and avoid the
>> need for initialize_pollfd_arrays -- i.e. it will just grow from 0 on
>> the first use here.
>>
> Huh, good idea.
>
>>> for (;;) {
>>> struct domain *d, *n;
>>> - int max_fd = -1;
>>> - struct timeval timeout;
>>> + int poll_timeout; /* timeout in milliseconds */
>>> struct timespec ts;
>>> long long now, next_timeout = 0;
>>>
>>> - FD_ZERO(&readfds);
>>> - FD_ZERO(&writefds);
>>> + reset_fds();
>>>
>>> - FD_SET(xs_fileno(xs), &readfds);
>>> - max_fd = MAX(xs_fileno(xs), max_fd);
>>> + set_fds(xs_fileno(xs), POLLIN);
>> Do you know, or can you track, the maximum fd over time?
>> If you can then you could likely make use of automatic stack allocations
>> (struct pollfd fds[max_fd]) and therefore avoid the pain of manual
>> memory management.
> Stack size is subject to system setting. Typically it is limited to 8MB
> in Linux as shown by `ulimit -s`. This is actually very small compared
> to heap space.
Not to mention that if you run out of heapspace, you can "do something
about it" (even if it's just print an error message and exit, it's
better than "Stack overflow crash with no message at all").
I prefer allocations for that reason.
On the other hand, the number of fds we can fit in 8MB (or 4MB) is
probably more than we'd ever get from running many guests with
consoles.... And it wouldn't be hard to add a "set stack size to 16MB"
or something like that as part of "start xenconsoled".
--
Mats
>
> Of course we can make xenconsoled special among all the processes... But
> that's not ideal.
>
>
> Wei.
>
>> Not sure what the semantics of those are inside a for loop where max_fd
>> can change but worst case you could put the content of the loop into a
>> function.
>>
>> Ian.
>>
>
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel
>
>
^ permalink raw reply [flat|nested] 21+ messages in thread
* [PATCH V3] Switch from select() to poll() in xenconsoled's IO loop.
2013-01-03 17:14 [PATCH] Switch to poll in xenconsoled's io loop Wei Liu
2013-01-03 18:22 ` Mats Petersson
2013-01-04 15:58 ` [PATCH V2] Switch from select() to poll() in xenconsoled's IO loop Wei Liu
@ 2013-01-04 17:17 ` Wei Liu
2013-01-07 10:20 ` Ian Campbell
2013-01-07 14:28 ` [PATCH V4] " Wei Liu
3 siblings, 1 reply; 21+ messages in thread
From: Wei Liu @ 2013-01-04 17:17 UTC (permalink / raw)
To: xen-devel@lists.xen.org; +Cc: wei.liu2
In Linux select() typically supports up to 1024 file descriptors. This can be
a problem when user tries to boot up many guests. Switching to poll() has
minimum impact on existing code and has better scalibility.
Tracking arrays are dynamically allocated / reallocated. If the tracking
arrays fail to expand, we just ignore the incoming fd.
Change from V2:
* remove unnecessary malloc in initialize_pollfd_arrays
* use ROUND_UP to get new size of arrays
Signed-off-by: Wei Liu <wei.liu2@citrix.com>
---
tools/console/daemon/io.c | 147 +++++++++++++++++++++++++++++++++------------
1 file changed, 109 insertions(+), 38 deletions(-)
diff --git a/tools/console/daemon/io.c b/tools/console/daemon/io.c
index 48fe151..0c53d30 100644
--- a/tools/console/daemon/io.c
+++ b/tools/console/daemon/io.c
@@ -28,7 +28,7 @@
#include <stdlib.h>
#include <errno.h>
#include <string.h>
-#include <sys/select.h>
+#include <poll.h>
#include <fcntl.h>
#include <unistd.h>
#include <termios.h>
@@ -928,9 +928,87 @@ static void handle_log_reload(void)
}
}
+
+/* Should have at least max_fd + 1 elements */
+static struct pollfd *fds;
+static struct pollfd **fd_to_pollfd;
+static unsigned int current_array_size;
+static unsigned int nr_fds;
+
+static void initialize_pollfd_arrays(void)
+{
+ fds = NULL;
+ fd_to_pollfd = NULL;
+ current_array_size = 0;
+}
+
+static void destroy_pollfd_arrays(void)
+{
+ free(fds);
+ free(fd_to_pollfd);
+ current_array_size = 0;
+}
+
+#define ROUNDUP(_x,_w) (((unsigned long)(_x)+(1UL<<(_w))-1) & ~((1UL<<(_w))-1))
+static void set_fds(int fd, short events)
+{
+ if (current_array_size < fd+1) {
+ struct pollfd *p1 = NULL;
+ struct pollfd **p2 = NULL;
+ unsigned long newsize;
+
+ /* Round up to 2^8 boundary, in practice this just
+ * make newsize larger than current_array_size.
+ */
+ newsize = ROUNDUP(fd+1, 8);
+
+ p1 = realloc(fds, sizeof(struct pollfd)*newsize);
+ if (!p1)
+ goto fail;
+ fds = p1;
+
+ p2 = realloc(fd_to_pollfd, sizeof(struct pollfd *)*newsize);
+ if (!p2)
+ goto fail;
+ fd_to_pollfd = p2;
+
+ memset(&fds[0] + current_array_size, 0,
+ sizeof(struct pollfd) * (newsize-current_array_size));
+ memset(&fd_to_pollfd[0] + current_array_size, 0,
+ sizeof(struct pollfd *) * (newsize-current_array_size));
+ current_array_size = newsize;
+ }
+
+ fds[nr_fds].fd = fd;
+ fds[nr_fds].events = events;
+ fd_to_pollfd[fd] = &fds[nr_fds];
+ nr_fds++;
+
+ return;
+fail:
+ dolog(LOG_ERR, "realloc failed, ignoring fd %d\n", fd);
+ return;
+}
+
+static short fd_revents(int fd)
+{
+ if (fd >= current_array_size)
+ return 0;
+ if (fd_to_pollfd[fd] == NULL)
+ return 0;
+ return fd_to_pollfd[fd]->revents;
+}
+
+static void reset_fds(void)
+{
+ nr_fds = 0;
+ memset(fds, 0, sizeof(struct pollfd) * current_array_size);
+ memset(fd_to_pollfd, 0,
+ sizeof(struct pollfd *) * current_array_size);
+}
+
void handle_io(void)
{
- fd_set readfds, writefds;
int ret;
if (log_hv) {
@@ -957,23 +1035,20 @@ void handle_io(void)
}
}
+ initialize_pollfd_arrays();
+
for (;;) {
struct domain *d, *n;
- int max_fd = -1;
- struct timeval timeout;
+ int poll_timeout; /* timeout in milliseconds */
struct timespec ts;
long long now, next_timeout = 0;
- FD_ZERO(&readfds);
- FD_ZERO(&writefds);
+ reset_fds();
- FD_SET(xs_fileno(xs), &readfds);
- max_fd = MAX(xs_fileno(xs), max_fd);
+ set_fds(xs_fileno(xs), POLLIN);
- if (log_hv) {
- FD_SET(xc_evtchn_fd(xce_handle), &readfds);
- max_fd = MAX(xc_evtchn_fd(xce_handle), max_fd);
- }
+ if (log_hv)
+ set_fds(xc_evtchn_fd(xce_handle), POLLIN);
if (clock_gettime(CLOCK_MONOTONIC, &ts) < 0)
return;
@@ -982,11 +1057,7 @@ void handle_io(void)
/* Re-calculate any event counter allowances & unblock
domains with new allowance */
for (d = dom_head; d; d = d->next) {
- /* Add 5ms of fuzz since select() often returns
- a couple of ms sooner than requested. Without
- the fuzz we typically do an extra spin in select()
- with a 1/2 ms timeout every other iteration */
- if ((now+5) > d->next_period) {
+ if (now > d->next_period) {
d->next_period = now + RATE_LIMIT_PERIOD;
if (d->event_count >= RATE_LIMIT_ALLOWANCE) {
(void)xc_evtchn_unmask(d->xce_handle, d->local_port);
@@ -1006,74 +1077,73 @@ void handle_io(void)
!d->buffer.max_capacity ||
d->buffer.size < d->buffer.max_capacity) {
int evtchn_fd = xc_evtchn_fd(d->xce_handle);
- FD_SET(evtchn_fd, &readfds);
- max_fd = MAX(evtchn_fd, max_fd);
+ set_fds(evtchn_fd, POLLIN);
}
}
if (d->master_fd != -1) {
+ short events = 0;
if (!d->is_dead && ring_free_bytes(d))
- FD_SET(d->master_fd, &readfds);
+ events |= POLLIN;
if (!buffer_empty(&d->buffer))
- FD_SET(d->master_fd, &writefds);
- max_fd = MAX(d->master_fd, max_fd);
+ events |= POLLOUT;
+
+ if (events)
+ set_fds(d->master_fd, events);
}
}
/* If any domain has been rate limited, we need to work
- out what timeout to supply to select */
+ out what timeout to supply to poll */
if (next_timeout) {
long long duration = (next_timeout - now);
if (duration <= 0) /* sanity check */
duration = 1;
- timeout.tv_sec = duration / 1000;
- timeout.tv_usec = ((duration - (timeout.tv_sec * 1000))
- * 1000);
+ poll_timeout = (int)duration;
}
- ret = select(max_fd + 1, &readfds, &writefds, 0,
- next_timeout ? &timeout : NULL);
+ ret = poll(fds, nr_fds, next_timeout ? poll_timeout : -1);
if (log_reload) {
handle_log_reload();
log_reload = 0;
}
- /* Abort if select failed, except for EINTR cases
+ /* Abort if poll failed, except for EINTR cases
which indicate a possible log reload */
if (ret == -1) {
if (errno == EINTR)
continue;
- dolog(LOG_ERR, "Failure in select: %d (%s)",
+ dolog(LOG_ERR, "Failure in poll: %d (%s)",
errno, strerror(errno));
break;
}
- if (log_hv && FD_ISSET(xc_evtchn_fd(xce_handle), &readfds))
+ if (log_hv && fd_revents(xc_evtchn_fd(xce_handle)) & POLLIN)
handle_hv_logs();
if (ret <= 0)
continue;
- if (FD_ISSET(xs_fileno(xs), &readfds))
+ if (fd_revents(xs_fileno(xs)) & POLLIN)
handle_xs();
for (d = dom_head; d; d = n) {
n = d->next;
if (d->event_count < RATE_LIMIT_ALLOWANCE) {
if (d->xce_handle != NULL &&
- FD_ISSET(xc_evtchn_fd(d->xce_handle),
- &readfds))
+ fd_revents(xc_evtchn_fd(d->xce_handle)) &
+ POLLIN)
handle_ring_read(d);
}
- if (d->master_fd != -1 && FD_ISSET(d->master_fd,
- &readfds))
+ if (d->master_fd != -1 &&
+ fd_revents(d->master_fd) & POLLIN)
handle_tty_read(d);
- if (d->master_fd != -1 && FD_ISSET(d->master_fd,
- &writefds))
+ if (d->master_fd != -1 &&
+ fd_revents(d->master_fd) & POLLOUT)
handle_tty_write(d);
if (d->last_seen != enum_pass)
@@ -1084,6 +1154,7 @@ void handle_io(void)
}
}
+ destroy_pollfd_arrays();
out:
if (log_hv_fd != -1) {
close(log_hv_fd);
--
1.7.10.4
^ permalink raw reply related [flat|nested] 21+ messages in thread
* Re: [PATCH V3] Switch from select() to poll() in xenconsoled's IO loop.
2013-01-04 17:17 ` [PATCH V3] " Wei Liu
@ 2013-01-07 10:20 ` Ian Campbell
2013-01-07 12:12 ` Wei Liu
0 siblings, 1 reply; 21+ messages in thread
From: Ian Campbell @ 2013-01-07 10:20 UTC (permalink / raw)
To: Wei Liu; +Cc: xen-devel@lists.xen.org
On Fri, 2013-01-04 at 17:17 +0000, Wei Liu wrote:
> In Linux select() typically supports up to 1024 file descriptors. This can be
> a problem when user tries to boot up many guests. Switching to poll() has
> minimum impact on existing code and has better scalibility.
>
> Tracking arrays are dynamically allocated / reallocated. If the tracking
> arrays fail to expand, we just ignore the incoming fd.
>
> Change from V2:
> * remove unnecessary malloc in initialize_pollfd_arrays
> * use ROUND_UP to get new size of arrays
>
> Signed-off-by: Wei Liu <wei.liu2@citrix.com>
> ---
> tools/console/daemon/io.c | 147 +++++++++++++++++++++++++++++++++------------
> 1 file changed, 109 insertions(+), 38 deletions(-)
>
> diff --git a/tools/console/daemon/io.c b/tools/console/daemon/io.c
> index 48fe151..0c53d30 100644
> --- a/tools/console/daemon/io.c
> +++ b/tools/console/daemon/io.c
> @@ -28,7 +28,7 @@
> #include <stdlib.h>
> #include <errno.h>
> #include <string.h>
> -#include <sys/select.h>
> +#include <poll.h>
> #include <fcntl.h>
> #include <unistd.h>
> #include <termios.h>
> @@ -928,9 +928,87 @@ static void handle_log_reload(void)
> }
> }
>
> +
> +/* Should have at least max_fd + 1 elements */
> +static struct pollfd *fds;
> +static struct pollfd **fd_to_pollfd;
> +static unsigned int current_array_size;
> +static unsigned int nr_fds;
> +
> +static void initialize_pollfd_arrays(void)
> +{
> + fds = NULL;
> + fd_to_pollfd = NULL;
> + current_array_size = 0;
> +}
These can all be done as part of the definition of the variables.
> +
> +static void destroy_pollfd_arrays(void)
> +{
> + free(fds);
> + free(fd_to_pollfd);
> + current_array_size = 0;
> +}
Having done the above I'd be inclined to inline this at the single
callsite.
> +
> +#define ROUNDUP(_x,_w) (((unsigned long)(_x)+(1UL<<(_w))-1) & ~((1UL<<(_w))-1))
> +static void set_fds(int fd, short events)
> +{
> + if (current_array_size < fd+1) {
> + struct pollfd *p1 = NULL;
> + struct pollfd **p2 = NULL;
> + unsigned long newsize;
> +
> + /* Round up to 2^8 boundary, in practice this just
> + * make newsize larger than current_array_size.
> + */
> + newsize = ROUNDUP(fd+1, 8);
> +
> + p1 = realloc(fds, sizeof(struct pollfd)*newsize);
> + if (!p1)
> + goto fail;
> + fds = p1;
> +
> + p2 = realloc(fd_to_pollfd, sizeof(struct pollfd *)*newsize);
> + if (!p2)
> + goto fail;
> + fd_to_pollfd = p2;
Do these two arrays really need to be the same size?
I can see why fd_to_pollfd needs to be as big as the highest fd we are
polling but the fds array only actually needs to scale with the total
*number* of fds we are polling. Is it the case that the set of fds we
are waiting on is pretty densely packed?
Or maybe even if the fds are sparse it doesn't matter and this is
simpler at the cost of a little bit of extra memory usage?
> +
> + memset(&fds[0] + current_array_size, 0,
> + sizeof(struct pollfd) * (newsize-current_array_size));
> + memset(&fd_to_pollfd[0] + current_array_size, 0,
> + sizeof(struct pollfd *) * (newsize-current_array_size));
> + current_array_size = newsize;
> + }
> +
> + fds[nr_fds].fd = fd;
> + fds[nr_fds].events = events;
> + fd_to_pollfd[fd] = &fds[nr_fds];
> + nr_fds++;
> +
> + return;
> +fail:
> + dolog(LOG_ERR, "realloc failed, ignoring fd %d\n", fd);
> + return;
> +}
[...]
> - if (log_hv && FD_ISSET(xc_evtchn_fd(xce_handle), &readfds))
> + if (log_hv && fd_revents(xc_evtchn_fd(xce_handle)) & POLLIN)
> handle_hv_logs();
>
> if (ret <= 0)
> continue;
>
> - if (FD_ISSET(xs_fileno(xs), &readfds))
> + if (fd_revents(xs_fileno(xs)) & POLLIN)
It seems like both this and the logging fd could be handled with a
struct pollfd * variable specifically for each. Which combined with...
> handle_xs();
>
> for (d = dom_head; d; d = n) {
> n = d->next;
> if (d->event_count < RATE_LIMIT_ALLOWANCE) {
> if (d->xce_handle != NULL &&
> - FD_ISSET(xc_evtchn_fd(d->xce_handle),
> - &readfds))
> + fd_revents(xc_evtchn_fd(d->xce_handle)) &
... adding some struct pollfd pointers to struct domain (for this and
master_fd) would remove the need for the fd_to_pollfd lookup array
altogether.
> + POLLIN)
> handle_ring_read(d);
> }
>
> - if (d->master_fd != -1 && FD_ISSET(d->master_fd,
> - &readfds))
> + if (d->master_fd != -1 &&
> + fd_revents(d->master_fd) & POLLIN)
> handle_tty_read(d);
>
> - if (d->master_fd != -1 && FD_ISSET(d->master_fd,
> - &writefds))
> + if (d->master_fd != -1 &&
> + fd_revents(d->master_fd) & POLLOUT)
> handle_tty_write(d);
>
> if (d->last_seen != enum_pass)
[...]
Ian.
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH V3] Switch from select() to poll() in xenconsoled's IO loop.
2013-01-07 10:20 ` Ian Campbell
@ 2013-01-07 12:12 ` Wei Liu
2013-01-07 12:16 ` Ian Campbell
0 siblings, 1 reply; 21+ messages in thread
From: Wei Liu @ 2013-01-07 12:12 UTC (permalink / raw)
To: Ian Campbell; +Cc: wei.liu2, xen-devel@lists.xen.org
On Mon, 2013-01-07 at 10:20 +0000, Ian Campbell wrote:
> On Fri, 2013-01-04 at 17:17 +0000, Wei Liu wrote:
> > In Linux select() typically supports up to 1024 file descriptors. This can be
> > a problem when user tries to boot up many guests. Switching to poll() has
> > minimum impact on existing code and has better scalibility.
> >
> > Tracking arrays are dynamically allocated / reallocated. If the tracking
> > arrays fail to expand, we just ignore the incoming fd.
> >
> > Change from V2:
> > * remove unnecessary malloc in initialize_pollfd_arrays
> > * use ROUND_UP to get new size of arrays
> >
> > Signed-off-by: Wei Liu <wei.liu2@citrix.com>
> > ---
> > tools/console/daemon/io.c | 147 +++++++++++++++++++++++++++++++++------------
> > 1 file changed, 109 insertions(+), 38 deletions(-)
> >
> > diff --git a/tools/console/daemon/io.c b/tools/console/daemon/io.c
> > index 48fe151..0c53d30 100644
> > --- a/tools/console/daemon/io.c
> > +++ b/tools/console/daemon/io.c
> > @@ -28,7 +28,7 @@
> > #include <stdlib.h>
> > #include <errno.h>
> > #include <string.h>
> > -#include <sys/select.h>
> > +#include <poll.h>
> > #include <fcntl.h>
> > #include <unistd.h>
> > #include <termios.h>
> > @@ -928,9 +928,87 @@ static void handle_log_reload(void)
> > }
> > }
> >
> > +
> > +/* Should have at least max_fd + 1 elements */
> > +static struct pollfd *fds;
> > +static struct pollfd **fd_to_pollfd;
> > +static unsigned int current_array_size;
> > +static unsigned int nr_fds;
> > +
> > +static void initialize_pollfd_arrays(void)
> > +{
> > + fds = NULL;
> > + fd_to_pollfd = NULL;
> > + current_array_size = 0;
> > +}
>
> These can all be done as part of the definition of the variables.
>
> > +
> > +static void destroy_pollfd_arrays(void)
> > +{
> > + free(fds);
> > + free(fd_to_pollfd);
> > + current_array_size = 0;
> > +}
>
> Having done the above I'd be inclined to inline this at the single
> callsite.
>
I left the initialize() there just to pair with destroy(). But as you
suggested, I can eliminate both.
> > +
> > +#define ROUNDUP(_x,_w) (((unsigned long)(_x)+(1UL<<(_w))-1) & ~((1UL<<(_w))-1))
> > +static void set_fds(int fd, short events)
> > +{
> > + if (current_array_size < fd+1) {
> > + struct pollfd *p1 = NULL;
> > + struct pollfd **p2 = NULL;
> > + unsigned long newsize;
> > +
> > + /* Round up to 2^8 boundary, in practice this just
> > + * make newsize larger than current_array_size.
> > + */
> > + newsize = ROUNDUP(fd+1, 8);
> > +
> > + p1 = realloc(fds, sizeof(struct pollfd)*newsize);
> > + if (!p1)
> > + goto fail;
> > + fds = p1;
> > +
> > + p2 = realloc(fd_to_pollfd, sizeof(struct pollfd *)*newsize);
> > + if (!p2)
> > + goto fail;
> > + fd_to_pollfd = p2;
>
> Do these two arrays really need to be the same size?
>
Not necessary. I will see what I can do.
> I can see why fd_to_pollfd needs to be as big as the highest fd we are
> polling but the fds array only actually needs to scale with the total
> *number* of fds we are polling. Is it the case that the set of fds we
> are waiting on is pretty densely packed?
>
It depends. If we have many guests up then destroy them all, then the
arrays can be sparse. There is no way to shrink the arrays. But see
below.
> Or maybe even if the fds are sparse it doesn't matter and this is
> simpler at the cost of a little bit of extra memory usage?
>
>
Even we have thousands of guests, the tracking arrays only occupy a few
megabytes.
> > +
> > + memset(&fds[0] + current_array_size, 0,
> > + sizeof(struct pollfd) * (newsize-current_array_size));
> > + memset(&fd_to_pollfd[0] + current_array_size, 0,
> > + sizeof(struct pollfd *) * (newsize-current_array_size));
> > + current_array_size = newsize;
> > + }
> > +
> > + fds[nr_fds].fd = fd;
> > + fds[nr_fds].events = events;
> > + fd_to_pollfd[fd] = &fds[nr_fds];
> > + nr_fds++;
> > +
> > + return;
> > +fail:
> > + dolog(LOG_ERR, "realloc failed, ignoring fd %d\n", fd);
> > + return;
> > +}
> [...]
> > - if (log_hv && FD_ISSET(xc_evtchn_fd(xce_handle), &readfds))
> > + if (log_hv && fd_revents(xc_evtchn_fd(xce_handle)) & POLLIN)
> > handle_hv_logs();
> >
> > if (ret <= 0)
> > continue;
> >
> > - if (FD_ISSET(xs_fileno(xs), &readfds))
> > + if (fd_revents(xs_fileno(xs)) & POLLIN)
>
> It seems like both this and the logging fd could be handled with a
> struct pollfd * variable specifically for each. Which combined with...
>
> > handle_xs();
> >
> > for (d = dom_head; d; d = n) {
> > n = d->next;
> > if (d->event_count < RATE_LIMIT_ALLOWANCE) {
> > if (d->xce_handle != NULL &&
> > - FD_ISSET(xc_evtchn_fd(d->xce_handle),
> > - &readfds))
> > + fd_revents(xc_evtchn_fd(d->xce_handle)) &
>
> ... adding some struct pollfd pointers to struct domain (for this and
> master_fd) would remove the need for the fd_to_pollfd lookup array
> altogether.
>
Yes, this is possible. But my original thought was to make as little
impact to original code as possible, so I didn't touch struct domain and
choose to add extra tracking facilities to make it just work.
If we hit poll() bottleneck we can move to libevent or libev, which
requires big surgery on existing code.
Of course I can do that as well if that suits you.
Wei.
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH V3] Switch from select() to poll() in xenconsoled's IO loop.
2013-01-07 12:12 ` Wei Liu
@ 2013-01-07 12:16 ` Ian Campbell
0 siblings, 0 replies; 21+ messages in thread
From: Ian Campbell @ 2013-01-07 12:16 UTC (permalink / raw)
To: Wei Liu; +Cc: xen-devel@lists.xen.org
> > > handle_xs();
> > >
> > > for (d = dom_head; d; d = n) {
> > > n = d->next;
> > > if (d->event_count < RATE_LIMIT_ALLOWANCE) {
> > > if (d->xce_handle != NULL &&
> > > - FD_ISSET(xc_evtchn_fd(d->xce_handle),
> > > - &readfds))
> > > + fd_revents(xc_evtchn_fd(d->xce_handle)) &
> >
> > ... adding some struct pollfd pointers to struct domain (for this and
> > master_fd) would remove the need for the fd_to_pollfd lookup array
> > altogether.
> >
>
> Yes, this is possible. But my original thought was to make as little
> impact to original code as possible, so I didn't touch struct domain and
> choose to add extra tracking facilities to make it just work.
I think adding a few pointers to struct domain would be less code
changed over all since you wouldn't need to create/manage one of the
arrays at all.
> If we hit poll() bottleneck we can move to libevent or libev, which
> requires big surgery on existing code.
>
> Of course I can do that as well if that suits you.
>
>
> Wei.
>
^ permalink raw reply [flat|nested] 21+ messages in thread
* [PATCH V4] Switch from select() to poll() in xenconsoled's IO loop
2013-01-03 17:14 [PATCH] Switch to poll in xenconsoled's io loop Wei Liu
` (2 preceding siblings ...)
2013-01-04 17:17 ` [PATCH V3] " Wei Liu
@ 2013-01-07 14:28 ` Wei Liu
2013-01-07 14:39 ` Ian Campbell
2013-01-07 14:41 ` Mats Petersson
3 siblings, 2 replies; 21+ messages in thread
From: Wei Liu @ 2013-01-07 14:28 UTC (permalink / raw)
To: xen-devel@lists.xen.org; +Cc: wei.liu2
In Linux select() typically supports up to 1024 file descriptors. This can be
a problem when user tries to boot up many guests. Switching to poll() has
minimum impact on existing code and has better scalibility.
pollfd array is dynamically allocated / reallocated. If the array fails to
expand, we just ignore the incoming fd.
Change from V2:
* remove unnecessary malloc in initialize_pollfd_arrays
* use ROUND_UP to get new size of arrays
Change from V3:
* remove initialize and destroy function for array
* embedded tracking structure in struct domain, eliminate fd_to_pollfd
Signed-off-by: Wei Liu <wei.liu2@citrix.com>
---
tools/console/daemon/io.c | 119 ++++++++++++++++++++++++++++--------------
tools/console/daemon/utils.c | 2 +
tools/console/daemon/utils.h | 2 +
3 files changed, 85 insertions(+), 38 deletions(-)
diff --git a/tools/console/daemon/io.c b/tools/console/daemon/io.c
index 48fe151..80f7144 100644
--- a/tools/console/daemon/io.c
+++ b/tools/console/daemon/io.c
@@ -28,7 +28,7 @@
#include <stdlib.h>
#include <errno.h>
#include <string.h>
-#include <sys/select.h>
+#include <poll.h>
#include <fcntl.h>
#include <unistd.h>
#include <termios.h>
@@ -69,6 +69,7 @@ static int log_hv_fd = -1;
static evtchn_port_or_error_t log_hv_evtchn = -1;
static xc_interface *xch; /* why does xenconsoled have two xc handles ? */
static xc_evtchn *xce_handle = NULL;
+static struct pollfd *xce_pollfd = NULL;
struct buffer {
char *data;
@@ -81,7 +82,9 @@ struct buffer {
struct domain {
int domid;
int master_fd;
+ struct pollfd *master_pollfd;
int slave_fd;
+ struct pollfd *slave_pollfd;
int log_fd;
bool is_dead;
unsigned last_seen;
@@ -92,6 +95,7 @@ struct domain {
evtchn_port_or_error_t local_port;
evtchn_port_or_error_t remote_port;
xc_evtchn *xce_handle;
+ struct pollfd *xce_pollfd;
struct xencons_interface *interface;
int event_count;
long long next_period;
@@ -928,9 +932,52 @@ static void handle_log_reload(void)
}
}
+static struct pollfd *fds;
+static unsigned int current_array_size;
+static unsigned int nr_fds;
+
+#define ROUNDUP(_x,_w) (((unsigned long)(_x)+(1UL<<(_w))-1) & ~((1UL<<(_w))-1))
+static struct pollfd *set_fds(int fd, short events)
+{
+ struct pollfd *ret;
+ if (current_array_size < nr_fds + 1) {
+ struct pollfd *new_fds = NULL;
+ unsigned long newsize;
+
+ /* Round up to 2^8 boundary, in practice this just
+ * make newsize larger than current_array_size.
+ */
+ newsize = ROUNDUP(nr_fds + 1, 8);
+
+ new_fds = realloc(fds, sizeof(struct pollfd)*newsize);
+ if (!new_fds)
+ goto fail;
+ fds = new_fds;
+
+ memset(&fds[0] + current_array_size, 0,
+ sizeof(struct pollfd) * (newsize-current_array_size));
+ current_array_size = newsize;
+ }
+
+ fds[nr_fds].fd = fd;
+ fds[nr_fds].events = events;
+ ret = &fds[nr_fds];
+ nr_fds++;
+
+ return ret;
+fail:
+ dolog(LOG_ERR, "realloc failed, ignoring fd %d\n", fd);
+ return NULL;
+}
+
+static void reset_fds(void)
+{
+ nr_fds = 0;
+ memset(fds, 0, sizeof(struct pollfd) * current_array_size);
+}
+
void handle_io(void)
{
- fd_set readfds, writefds;
int ret;
if (log_hv) {
@@ -959,21 +1006,16 @@ void handle_io(void)
for (;;) {
struct domain *d, *n;
- int max_fd = -1;
- struct timeval timeout;
+ int poll_timeout; /* timeout in milliseconds */
struct timespec ts;
long long now, next_timeout = 0;
- FD_ZERO(&readfds);
- FD_ZERO(&writefds);
+ reset_fds();
- FD_SET(xs_fileno(xs), &readfds);
- max_fd = MAX(xs_fileno(xs), max_fd);
+ xs_pollfd = set_fds(xs_fileno(xs), POLLIN);
- if (log_hv) {
- FD_SET(xc_evtchn_fd(xce_handle), &readfds);
- max_fd = MAX(xc_evtchn_fd(xce_handle), max_fd);
- }
+ if (log_hv)
+ xce_pollfd = set_fds(xc_evtchn_fd(xce_handle), POLLIN);
if (clock_gettime(CLOCK_MONOTONIC, &ts) < 0)
return;
@@ -982,11 +1024,7 @@ void handle_io(void)
/* Re-calculate any event counter allowances & unblock
domains with new allowance */
for (d = dom_head; d; d = d->next) {
- /* Add 5ms of fuzz since select() often returns
- a couple of ms sooner than requested. Without
- the fuzz we typically do an extra spin in select()
- with a 1/2 ms timeout every other iteration */
- if ((now+5) > d->next_period) {
+ if (now > d->next_period) {
d->next_period = now + RATE_LIMIT_PERIOD;
if (d->event_count >= RATE_LIMIT_ALLOWANCE) {
(void)xc_evtchn_unmask(d->xce_handle, d->local_port);
@@ -1006,74 +1044,76 @@ void handle_io(void)
!d->buffer.max_capacity ||
d->buffer.size < d->buffer.max_capacity) {
int evtchn_fd = xc_evtchn_fd(d->xce_handle);
- FD_SET(evtchn_fd, &readfds);
- max_fd = MAX(evtchn_fd, max_fd);
+ d->xce_pollfd = set_fds(evtchn_fd, POLLIN);
}
}
if (d->master_fd != -1) {
+ short events = 0;
if (!d->is_dead && ring_free_bytes(d))
- FD_SET(d->master_fd, &readfds);
+ events |= POLLIN;
if (!buffer_empty(&d->buffer))
- FD_SET(d->master_fd, &writefds);
- max_fd = MAX(d->master_fd, max_fd);
+ events |= POLLOUT;
+
+ if (events)
+ d->master_pollfd =
+ set_fds(d->master_fd, events);
}
}
/* If any domain has been rate limited, we need to work
- out what timeout to supply to select */
+ out what timeout to supply to poll */
if (next_timeout) {
long long duration = (next_timeout - now);
if (duration <= 0) /* sanity check */
duration = 1;
- timeout.tv_sec = duration / 1000;
- timeout.tv_usec = ((duration - (timeout.tv_sec * 1000))
- * 1000);
+ poll_timeout = (int)duration;
}
- ret = select(max_fd + 1, &readfds, &writefds, 0,
- next_timeout ? &timeout : NULL);
+ ret = poll(fds, nr_fds, next_timeout ? poll_timeout : -1);
if (log_reload) {
handle_log_reload();
log_reload = 0;
}
- /* Abort if select failed, except for EINTR cases
+ /* Abort if poll failed, except for EINTR cases
which indicate a possible log reload */
if (ret == -1) {
if (errno == EINTR)
continue;
- dolog(LOG_ERR, "Failure in select: %d (%s)",
+ dolog(LOG_ERR, "Failure in poll: %d (%s)",
errno, strerror(errno));
break;
}
- if (log_hv && FD_ISSET(xc_evtchn_fd(xce_handle), &readfds))
+ if (log_hv && xce_pollfd && xce_pollfd->revents & POLLIN)
handle_hv_logs();
if (ret <= 0)
continue;
- if (FD_ISSET(xs_fileno(xs), &readfds))
+ if (xs_pollfd && xs_pollfd->revents & POLLIN)
handle_xs();
for (d = dom_head; d; d = n) {
n = d->next;
if (d->event_count < RATE_LIMIT_ALLOWANCE) {
if (d->xce_handle != NULL &&
- FD_ISSET(xc_evtchn_fd(d->xce_handle),
- &readfds))
+ d->xce_pollfd != NULL &&
+ d->xce_pollfd->revents & POLLIN)
handle_ring_read(d);
}
- if (d->master_fd != -1 && FD_ISSET(d->master_fd,
- &readfds))
+ if (d->master_fd != -1 &&
+ d->master_pollfd &&
+ d->master_pollfd->revents & POLLIN)
handle_tty_read(d);
- if (d->master_fd != -1 && FD_ISSET(d->master_fd,
- &writefds))
+ if (d->master_fd != -1 &&
+ d->master_pollfd &&
+ d->master_pollfd->revents & POLLOUT)
handle_tty_write(d);
if (d->last_seen != enum_pass)
@@ -1084,6 +1124,9 @@ void handle_io(void)
}
}
+ free(fds);
+ current_array_size = 0;
+
out:
if (log_hv_fd != -1) {
close(log_hv_fd);
diff --git a/tools/console/daemon/utils.c b/tools/console/daemon/utils.c
index aab6f42..d0b1b00 100644
--- a/tools/console/daemon/utils.c
+++ b/tools/console/daemon/utils.c
@@ -33,11 +33,13 @@
#include <sys/un.h>
#include <string.h>
#include <signal.h>
+#include <poll.h>
#include "xenctrl.h"
#include "utils.h"
struct xs_handle *xs;
+struct pollfd *xs_pollfd;
xc_interface *xc;
static void child_exit(int sig)
diff --git a/tools/console/daemon/utils.h b/tools/console/daemon/utils.h
index 8725dcd..8e72dab 100644
--- a/tools/console/daemon/utils.h
+++ b/tools/console/daemon/utils.h
@@ -24,6 +24,7 @@
#include <stdbool.h>
#include <syslog.h>
#include <stdio.h>
+#include <poll.h>
#include <xenctrl.h>
#include <xenstore.h>
@@ -32,6 +33,7 @@ void daemonize(const char *pidfile);
bool xen_setup(void);
extern struct xs_handle *xs;
+extern struct pollfd *xs_pollfd;
extern xc_interface *xc;
#if 1
--
1.7.10.4
^ permalink raw reply related [flat|nested] 21+ messages in thread
* Re: [PATCH V4] Switch from select() to poll() in xenconsoled's IO loop
2013-01-07 14:28 ` [PATCH V4] " Wei Liu
@ 2013-01-07 14:39 ` Ian Campbell
2013-01-07 14:44 ` Wei Liu
2013-01-07 14:52 ` Ian Jackson
2013-01-07 14:41 ` Mats Petersson
1 sibling, 2 replies; 21+ messages in thread
From: Ian Campbell @ 2013-01-07 14:39 UTC (permalink / raw)
To: Wei Liu; +Cc: Ian Jackson, xen-devel@lists.xen.org
On Mon, 2013-01-07 at 14:28 +0000, Wei Liu wrote
> @@ -69,6 +69,7 @@ static int log_hv_fd = -1;
> static evtchn_port_or_error_t log_hv_evtchn = -1;
> static xc_interface *xch; /* why does xenconsoled have two xc handles ? */
> static xc_evtchn *xce_handle = NULL;
> +static struct pollfd *xce_pollfd = NULL;
>
> struct buffer {
> char *data;
[...]
> @@ -32,6 +33,7 @@ void daemonize(const char *pidfile);
> bool xen_setup(void);
>
> extern struct xs_handle *xs;
> +extern struct pollfd *xs_pollfd;
> extern xc_interface *xc;
xs_pollfd and the xce_pollfd can both be local to the handle_io
function, I think.
>
> - if (d->master_fd != -1 && FD_ISSET(d->master_fd,
> - &readfds))
> + if (d->master_fd != -1 &&
> + d->master_pollfd &&
> + d->master_pollfd->revents & POLLIN)
> handle_tty_read(d);
>
> - if (d->master_fd != -1 && FD_ISSET(d->master_fd,
> - &writefds))
> + if (d->master_fd != -1 &&
> + d->master_pollfd &&
> + d->master_pollfd->revents & POLLOUT)
> handle_tty_write(d);
>
> if (d->last_seen != enum_pass)
This is probably one for Ian J but I wonder if you need to handle
POLLERR or POLLHUP here. ISTR some of oddness WRT these when Ian
implemented the libxl event subsystem.
Ian.
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH V4] Switch from select() to poll() in xenconsoled's IO loop
2013-01-07 14:39 ` Ian Campbell
@ 2013-01-07 14:44 ` Wei Liu
2013-01-07 14:52 ` Ian Jackson
1 sibling, 0 replies; 21+ messages in thread
From: Wei Liu @ 2013-01-07 14:44 UTC (permalink / raw)
To: Ian Campbell; +Cc: Ian Jackson, wei.liu2, xen-devel@lists.xen.org
On Mon, 2013-01-07 at 14:39 +0000, Ian Campbell wrote:
> On Mon, 2013-01-07 at 14:28 +0000, Wei Liu wrote
> > @@ -69,6 +69,7 @@ static int log_hv_fd = -1;
> > static evtchn_port_or_error_t log_hv_evtchn = -1;
> > static xc_interface *xch; /* why does xenconsoled have two xc handles ? */
> > static xc_evtchn *xce_handle = NULL;
> > +static struct pollfd *xce_pollfd = NULL;
> >
> > struct buffer {
> > char *data;
> [...]
> > @@ -32,6 +33,7 @@ void daemonize(const char *pidfile);
> > bool xen_setup(void);
> >
> > extern struct xs_handle *xs;
> > +extern struct pollfd *xs_pollfd;
> > extern xc_interface *xc;
>
> xs_pollfd and the xce_pollfd can both be local to the handle_io
> function, I think.
>
> >
> > - if (d->master_fd != -1 && FD_ISSET(d->master_fd,
> > - &readfds))
> > + if (d->master_fd != -1 &&
> > + d->master_pollfd &&
> > + d->master_pollfd->revents & POLLIN)
> > handle_tty_read(d);
> >
> > - if (d->master_fd != -1 && FD_ISSET(d->master_fd,
> > - &writefds))
> > + if (d->master_fd != -1 &&
> > + d->master_pollfd &&
> > + d->master_pollfd->revents & POLLOUT)
> > handle_tty_write(d);
> >
> > if (d->last_seen != enum_pass)
>
> This is probably one for Ian J but I wonder if you need to handle
> POLLERR or POLLHUP here. ISTR some of oddness WRT these when Ian
> implemented the libxl event subsystem.
>
Ian J, can you elaborate on this?
Wei.
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH V4] Switch from select() to poll() in xenconsoled's IO loop
2013-01-07 14:39 ` Ian Campbell
2013-01-07 14:44 ` Wei Liu
@ 2013-01-07 14:52 ` Ian Jackson
1 sibling, 0 replies; 21+ messages in thread
From: Ian Jackson @ 2013-01-07 14:52 UTC (permalink / raw)
To: Ian Campbell; +Cc: Wei Liu, xen-devel@lists.xen.org
Ian Campbell writes ("Re: [Xen-devel] [PATCH V4] Switch from select() to poll() in xenconsoled's IO loop"):
> On Mon, 2013-01-07 at 14:28 +0000, Wei Liu wrote
> > - if (d->master_fd != -1 && FD_ISSET(d->master_fd,
> > - &readfds))
...
> This is probably one for Ian J but I wonder if you need to handle
> POLLERR or POLLHUP here. ISTR some of oddness WRT these when Ian
> implemented the libxl event subsystem.
I haven't read the patch in question but:
You certainly need to check for POLLERR and POLLHUP. They appear in
revents even if not requested. Normally they can be treated as fatal
for the descriptor.
You should also think about POLLPRI. Often it's correct to request it
and then treat it as fatal for the descriptor if it occurs.
Ian.
(I haven't read the patch I'm afraid...)
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH V4] Switch from select() to poll() in xenconsoled's IO loop
2013-01-07 14:28 ` [PATCH V4] " Wei Liu
2013-01-07 14:39 ` Ian Campbell
@ 2013-01-07 14:41 ` Mats Petersson
2013-01-07 15:01 ` Wei Liu
1 sibling, 1 reply; 21+ messages in thread
From: Mats Petersson @ 2013-01-07 14:41 UTC (permalink / raw)
To: xen-devel
On 07/01/13 14:28, Wei Liu wrote:
> In Linux select() typically supports up to 1024 file descriptors. This can be
> a problem when user tries to boot up many guests. Switching to poll() has
> minimum impact on existing code and has better scalibility.
>
> pollfd array is dynamically allocated / reallocated. If the array fails to
> expand, we just ignore the incoming fd.
>
> Change from V2:
> * remove unnecessary malloc in initialize_pollfd_arrays
> * use ROUND_UP to get new size of arrays
>
> Change from V3:
> * remove initialize and destroy function for array
> * embedded tracking structure in struct domain, eliminate fd_to_pollfd
>
> Signed-off-by: Wei Liu <wei.liu2@citrix.com>
> ---
> tools/console/daemon/io.c | 119 ++++++++++++++++++++++++++++--------------
> tools/console/daemon/utils.c | 2 +
> tools/console/daemon/utils.h | 2 +
> 3 files changed, 85 insertions(+), 38 deletions(-)
>
> diff --git a/tools/console/daemon/io.c b/tools/console/daemon/io.c
> index 48fe151..80f7144 100644
> --- a/tools/console/daemon/io.c
> +++ b/tools/console/daemon/io.c
> @@ -28,7 +28,7 @@
> #include <stdlib.h>
> #include <errno.h>
> #include <string.h>
> -#include <sys/select.h>
> +#include <poll.h>
> #include <fcntl.h>
> #include <unistd.h>
> #include <termios.h>
> @@ -69,6 +69,7 @@ static int log_hv_fd = -1;
> static evtchn_port_or_error_t log_hv_evtchn = -1;
> static xc_interface *xch; /* why does xenconsoled have two xc handles ? */
> static xc_evtchn *xce_handle = NULL;
> +static struct pollfd *xce_pollfd = NULL;
>
> struct buffer {
> char *data;
> @@ -81,7 +82,9 @@ struct buffer {
> struct domain {
> int domid;
> int master_fd;
> + struct pollfd *master_pollfd;
> int slave_fd;
> + struct pollfd *slave_pollfd;
> int log_fd;
> bool is_dead;
> unsigned last_seen;
> @@ -92,6 +95,7 @@ struct domain {
> evtchn_port_or_error_t local_port;
> evtchn_port_or_error_t remote_port;
> xc_evtchn *xce_handle;
> + struct pollfd *xce_pollfd;
> struct xencons_interface *interface;
> int event_count;
> long long next_period;
> @@ -928,9 +932,52 @@ static void handle_log_reload(void)
> }
> }
>
> +static struct pollfd *fds;
> +static unsigned int current_array_size;
> +static unsigned int nr_fds;
> +
> +#define ROUNDUP(_x,_w) (((unsigned long)(_x)+(1UL<<(_w))-1) & ~((1UL<<(_w))-1))
> +static struct pollfd *set_fds(int fd, short events)
> +{
> + struct pollfd *ret;
> + if (current_array_size < nr_fds + 1) {
> + struct pollfd *new_fds = NULL;
> + unsigned long newsize;
> +
> + /* Round up to 2^8 boundary, in practice this just
> + * make newsize larger than current_array_size.
> + */
> + newsize = ROUNDUP(nr_fds + 1, 8);
> +
> + new_fds = realloc(fds, sizeof(struct pollfd)*newsize);
> + if (!new_fds)
> + goto fail;
> + fds = new_fds;
> +
> + memset(&fds[0] + current_array_size, 0,
> + sizeof(struct pollfd) * (newsize-current_array_size));
> + current_array_size = newsize;
> + }
> +
> + fds[nr_fds].fd = fd;
> + fds[nr_fds].events = events;
> + ret = &fds[nr_fds];
> + nr_fds++;
> +
> + return ret;
> +fail:
> + dolog(LOG_ERR, "realloc failed, ignoring fd %d\n", fd);
> + return NULL;
> +}
> +
> +static void reset_fds(void)
> +{
> + nr_fds = 0;
> + memset(fds, 0, sizeof(struct pollfd) * current_array_size);
> +}
> +
> void handle_io(void)
> {
> - fd_set readfds, writefds;
> int ret;
>
> if (log_hv) {
> @@ -959,21 +1006,16 @@ void handle_io(void)
>
> for (;;) {
> struct domain *d, *n;
> - int max_fd = -1;
> - struct timeval timeout;
> + int poll_timeout; /* timeout in milliseconds */
> struct timespec ts;
> long long now, next_timeout = 0;
>
> - FD_ZERO(&readfds);
> - FD_ZERO(&writefds);
> + reset_fds();
>
> - FD_SET(xs_fileno(xs), &readfds);
> - max_fd = MAX(xs_fileno(xs), max_fd);
> + xs_pollfd = set_fds(xs_fileno(xs), POLLIN);
>
> - if (log_hv) {
> - FD_SET(xc_evtchn_fd(xce_handle), &readfds);
> - max_fd = MAX(xc_evtchn_fd(xce_handle), max_fd);
> - }
> + if (log_hv)
> + xce_pollfd = set_fds(xc_evtchn_fd(xce_handle), POLLIN);
>
> if (clock_gettime(CLOCK_MONOTONIC, &ts) < 0)
> return;
> @@ -982,11 +1024,7 @@ void handle_io(void)
> /* Re-calculate any event counter allowances & unblock
> domains with new allowance */
> for (d = dom_head; d; d = d->next) {
> - /* Add 5ms of fuzz since select() often returns
> - a couple of ms sooner than requested. Without
> - the fuzz we typically do an extra spin in select()
> - with a 1/2 ms timeout every other iteration */
> - if ((now+5) > d->next_period) {
> + if (now > d->next_period) {
Is poll more accurate than select? I would have thought that they were
based on the same timing, and thus equally "fuzzy"?
> d->next_period = now + RATE_LIMIT_PERIOD;
> if (d->event_count >= RATE_LIMIT_ALLOWANCE) {
> (void)xc_evtchn_unmask(d->xce_handle, d->local_port);
> @@ -1006,74 +1044,76 @@ void handle_io(void)
> !d->buffer.max_capacity ||
> d->buffer.size < d->buffer.max_capacity) {
> int evtchn_fd = xc_evtchn_fd(d->xce_handle);
> - FD_SET(evtchn_fd, &readfds);
> - max_fd = MAX(evtchn_fd, max_fd);
> + d->xce_pollfd = set_fds(evtchn_fd, POLLIN);
> }
> }
>
> if (d->master_fd != -1) {
> + short events = 0;
> if (!d->is_dead && ring_free_bytes(d))
> - FD_SET(d->master_fd, &readfds);
> + events |= POLLIN;
>
> if (!buffer_empty(&d->buffer))
> - FD_SET(d->master_fd, &writefds);
> - max_fd = MAX(d->master_fd, max_fd);
> + events |= POLLOUT;
> +
> + if (events)
> + d->master_pollfd =
> + set_fds(d->master_fd, events);
> }
> }
>
> /* If any domain has been rate limited, we need to work
> - out what timeout to supply to select */
> + out what timeout to supply to poll */
> if (next_timeout) {
> long long duration = (next_timeout - now);
> if (duration <= 0) /* sanity check */
> duration = 1;
> - timeout.tv_sec = duration / 1000;
> - timeout.tv_usec = ((duration - (timeout.tv_sec * 1000))
> - * 1000);
> + poll_timeout = (int)duration;
> }
>
> - ret = select(max_fd + 1, &readfds, &writefds, 0,
> - next_timeout ? &timeout : NULL);
> + ret = poll(fds, nr_fds, next_timeout ? poll_timeout : -1);
>
> if (log_reload) {
> handle_log_reload();
> log_reload = 0;
> }
>
> - /* Abort if select failed, except for EINTR cases
> + /* Abort if poll failed, except for EINTR cases
> which indicate a possible log reload */
> if (ret == -1) {
> if (errno == EINTR)
> continue;
> - dolog(LOG_ERR, "Failure in select: %d (%s)",
> + dolog(LOG_ERR, "Failure in poll: %d (%s)",
> errno, strerror(errno));
> break;
> }
>
> - if (log_hv && FD_ISSET(xc_evtchn_fd(xce_handle), &readfds))
> + if (log_hv && xce_pollfd && xce_pollfd->revents & POLLIN)
> handle_hv_logs();
>
> if (ret <= 0)
> continue;
>
> - if (FD_ISSET(xs_fileno(xs), &readfds))
> + if (xs_pollfd && xs_pollfd->revents & POLLIN)
> handle_xs();
>
> for (d = dom_head; d; d = n) {
> n = d->next;
> if (d->event_count < RATE_LIMIT_ALLOWANCE) {
> if (d->xce_handle != NULL &&
> - FD_ISSET(xc_evtchn_fd(d->xce_handle),
> - &readfds))
> + d->xce_pollfd != NULL &&
> + d->xce_pollfd->revents & POLLIN)
> handle_ring_read(d);
> }
>
> - if (d->master_fd != -1 && FD_ISSET(d->master_fd,
> - &readfds))
> + if (d->master_fd != -1 &&
> + d->master_pollfd &&
> + d->master_pollfd->revents & POLLIN)
> handle_tty_read(d);
>
> - if (d->master_fd != -1 && FD_ISSET(d->master_fd,
> - &writefds))
> + if (d->master_fd != -1 &&
> + d->master_pollfd &&
> + d->master_pollfd->revents & POLLOUT)
> handle_tty_write(d);
>
> if (d->last_seen != enum_pass)
> @@ -1084,6 +1124,9 @@ void handle_io(void)
> }
> }
>
> + free(fds);
> + current_array_size = 0;
> +
> out:
> if (log_hv_fd != -1) {
> close(log_hv_fd);
> diff --git a/tools/console/daemon/utils.c b/tools/console/daemon/utils.c
> index aab6f42..d0b1b00 100644
> --- a/tools/console/daemon/utils.c
> +++ b/tools/console/daemon/utils.c
> @@ -33,11 +33,13 @@
> #include <sys/un.h>
> #include <string.h>
> #include <signal.h>
> +#include <poll.h>
>
> #include "xenctrl.h"
> #include "utils.h"
>
> struct xs_handle *xs;
> +struct pollfd *xs_pollfd;
I don't see this used outside of io.c - am I missing something?
Adding more dependencies between different source files seems
unnecessary...
--
Mats
> xc_interface *xc;
>
> static void child_exit(int sig)
> diff --git a/tools/console/daemon/utils.h b/tools/console/daemon/utils.h
> index 8725dcd..8e72dab 100644
> --- a/tools/console/daemon/utils.h
> +++ b/tools/console/daemon/utils.h
> @@ -24,6 +24,7 @@
> #include <stdbool.h>
> #include <syslog.h>
> #include <stdio.h>
> +#include <poll.h>
> #include <xenctrl.h>
>
> #include <xenstore.h>
> @@ -32,6 +33,7 @@ void daemonize(const char *pidfile);
> bool xen_setup(void);
>
> extern struct xs_handle *xs;
> +extern struct pollfd *xs_pollfd;
> extern xc_interface *xc;
>
> #if 1
> --
> 1.7.10.4
>
>
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel
>
>
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH V4] Switch from select() to poll() in xenconsoled's IO loop
2013-01-07 14:41 ` Mats Petersson
@ 2013-01-07 15:01 ` Wei Liu
2013-01-07 15:06 ` Mats Petersson
2013-01-07 15:16 ` Ian Campbell
0 siblings, 2 replies; 21+ messages in thread
From: Wei Liu @ 2013-01-07 15:01 UTC (permalink / raw)
To: Mats Petersson; +Cc: wei.liu2, xen-devel@lists.xen.org
On Mon, 2013-01-07 at 14:41 +0000, Mats Petersson wrote:
> > return;
> > @@ -982,11 +1024,7 @@ void handle_io(void)
> > /* Re-calculate any event counter allowances & unblock
> > domains with new allowance */
> > for (d = dom_head; d; d = d->next) {
> > - /* Add 5ms of fuzz since select() often returns
> > - a couple of ms sooner than requested. Without
> > - the fuzz we typically do an extra spin in select()
> > - with a 1/2 ms timeout every other iteration */
> > - if ((now+5) > d->next_period) {
> > + if (now > d->next_period) {
> Is poll more accurate than select? I would have thought that they were
> based on the same timing, and thus equally "fuzzy"?
Is there any actual proof that the fuzz is needed? Specs of both
select() and poll() don't seem to mention this behaviour at all.
> > #include "utils.h"
> >
> > struct xs_handle *xs;
> > +struct pollfd *xs_pollfd;
> I don't see this used outside of io.c - am I missing something?
>
> Adding more dependencies between different source files seems
> unnecessary...
>
Fixed.
Wei.
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH V4] Switch from select() to poll() in xenconsoled's IO loop
2013-01-07 15:01 ` Wei Liu
@ 2013-01-07 15:06 ` Mats Petersson
2013-01-07 15:17 ` Ian Campbell
2013-01-07 15:16 ` Ian Campbell
1 sibling, 1 reply; 21+ messages in thread
From: Mats Petersson @ 2013-01-07 15:06 UTC (permalink / raw)
To: Wei Liu; +Cc: xen-devel@lists.xen.org
On 07/01/13 15:01, Wei Liu wrote:
> On Mon, 2013-01-07 at 14:41 +0000, Mats Petersson wrote:
>
>>> return;
>>> @@ -982,11 +1024,7 @@ void handle_io(void)
>>> /* Re-calculate any event counter allowances & unblock
>>> domains with new allowance */
>>> for (d = dom_head; d; d = d->next) {
>>> - /* Add 5ms of fuzz since select() often returns
>>> - a couple of ms sooner than requested. Without
>>> - the fuzz we typically do an extra spin in select()
>>> - with a 1/2 ms timeout every other iteration */
>>> - if ((now+5) > d->next_period) {
>>> + if (now > d->next_period) {
>> Is poll more accurate than select? I would have thought that they were
>> based on the same timing, and thus equally "fuzzy"?
> Is there any actual proof that the fuzz is needed? Specs of both
> select() and poll() don't seem to mention this behaviour at all.
That's a good question. I don't know. The tricky part with this sort of
thing is that it may well depend on configurations, hardware
differences, etc, so you may find that it works just fine on your
test-box, but some big customer with Another-brand Co's servers don't
work, because there is some subtle difference in hardware. Or it stops
working if you have more than X number of CPU's. If you are convinced
it's fine as it is, then by all means. I'm just thinking that it
probably wasn't put there "by accident".
--
Mats
>
>>> #include "utils.h"
>>>
>>> struct xs_handle *xs;
>>> +struct pollfd *xs_pollfd;
>> I don't see this used outside of io.c - am I missing something?
>>
>> Adding more dependencies between different source files seems
>> unnecessary...
>>
> Fixed.
>
>
> Wei.
>
>
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH V4] Switch from select() to poll() in xenconsoled's IO loop
2013-01-07 15:06 ` Mats Petersson
@ 2013-01-07 15:17 ` Ian Campbell
0 siblings, 0 replies; 21+ messages in thread
From: Ian Campbell @ 2013-01-07 15:17 UTC (permalink / raw)
To: Mats Petersson; +Cc: Wei Liu, xen-devel@lists.xen.org
On Mon, 2013-01-07 at 15:06 +0000, Mats Petersson wrote:
> On 07/01/13 15:01, Wei Liu wrote:
> > On Mon, 2013-01-07 at 14:41 +0000, Mats Petersson wrote:
> >
> >>> return;
> >>> @@ -982,11 +1024,7 @@ void handle_io(void)
> >>> /* Re-calculate any event counter allowances & unblock
> >>> domains with new allowance */
> >>> for (d = dom_head; d; d = d->next) {
> >>> - /* Add 5ms of fuzz since select() often returns
> >>> - a couple of ms sooner than requested. Without
> >>> - the fuzz we typically do an extra spin in select()
> >>> - with a 1/2 ms timeout every other iteration */
> >>> - if ((now+5) > d->next_period) {
> >>> + if (now > d->next_period) {
> >> Is poll more accurate than select? I would have thought that they were
> >> based on the same timing, and thus equally "fuzzy"?
> > Is there any actual proof that the fuzz is needed? Specs of both
> > select() and poll() don't seem to mention this behaviour at all.
> That's a good question. I don't know. The tricky part with this sort of
> thing is that it may well depend on configurations, hardware
> differences, etc, so you may find that it works just fine on your
> test-box, but some big customer with Another-brand Co's servers don't
> work, because there is some subtle difference in hardware. Or it stops
> working if you have more than X number of CPU's. If you are convinced
> it's fine as it is, then by all means. I'm just thinking that it
> probably wasn't put there "by accident".
There's certainly an argument for removing it in a separate changeset
though in case it does cause issues.
Ian
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH V4] Switch from select() to poll() in xenconsoled's IO loop
2013-01-07 15:01 ` Wei Liu
2013-01-07 15:06 ` Mats Petersson
@ 2013-01-07 15:16 ` Ian Campbell
2013-01-07 15:24 ` Wei Liu
1 sibling, 1 reply; 21+ messages in thread
From: Ian Campbell @ 2013-01-07 15:16 UTC (permalink / raw)
To: Wei Liu; +Cc: Mats Petersson, xen-devel@lists.xen.org
On Mon, 2013-01-07 at 15:01 +0000, Wei Liu wrote:
> On Mon, 2013-01-07 at 14:41 +0000, Mats Petersson wrote:
>
> > > return;
> > > @@ -982,11 +1024,7 @@ void handle_io(void)
> > > /* Re-calculate any event counter allowances & unblock
> > > domains with new allowance */
> > > for (d = dom_head; d; d = d->next) {
> > > - /* Add 5ms of fuzz since select() often returns
> > > - a couple of ms sooner than requested. Without
> > > - the fuzz we typically do an extra spin in select()
> > > - with a 1/2 ms timeout every other iteration */
> > > - if ((now+5) > d->next_period) {
> > > + if (now > d->next_period) {
> > Is poll more accurate than select? I would have thought that they were
> > based on the same timing, and thus equally "fuzzy"?
>
> Is there any actual proof that the fuzz is needed? Specs of both
> select() and poll() don't seem to mention this behaviour at all.
I agree that it seems pretty dubious but it might be interesting to see
what strace shows, specifically if it shows this extra spin every other
iteration.
The fuzz was introduced in 16257:955ee4fa1345 "Rate-limit activity
caused by each domU." but the commit log doesn't make any reference to
the reason for it.
It may just have been a kernel bug at around the time that patch was
authored, but google doesn't seem to show any evidence of such a bug
ever being widespread (i.e. I don't find any references to it).
Ian.
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH V4] Switch from select() to poll() in xenconsoled's IO loop
2013-01-07 15:16 ` Ian Campbell
@ 2013-01-07 15:24 ` Wei Liu
0 siblings, 0 replies; 21+ messages in thread
From: Wei Liu @ 2013-01-07 15:24 UTC (permalink / raw)
To: Ian Campbell; +Cc: Mats Petersson, wei.liu2, xen-devel@lists.xen.org
On Mon, 2013-01-07 at 15:16 +0000, Ian Campbell wrote:
> On Mon, 2013-01-07 at 15:01 +0000, Wei Liu wrote:
> > On Mon, 2013-01-07 at 14:41 +0000, Mats Petersson wrote:
> >
> > > > return;
> > > > @@ -982,11 +1024,7 @@ void handle_io(void)
> > > > /* Re-calculate any event counter allowances & unblock
> > > > domains with new allowance */
> > > > for (d = dom_head; d; d = d->next) {
> > > > - /* Add 5ms of fuzz since select() often returns
> > > > - a couple of ms sooner than requested. Without
> > > > - the fuzz we typically do an extra spin in select()
> > > > - with a 1/2 ms timeout every other iteration */
> > > > - if ((now+5) > d->next_period) {
> > > > + if (now > d->next_period) {
> > > Is poll more accurate than select? I would have thought that they were
> > > based on the same timing, and thus equally "fuzzy"?
> >
> > Is there any actual proof that the fuzz is needed? Specs of both
> > select() and poll() don't seem to mention this behaviour at all.
>
> I agree that it seems pretty dubious but it might be interesting to see
> what strace shows, specifically if it shows this extra spin every other
> iteration.
>
> The fuzz was introduced in 16257:955ee4fa1345 "Rate-limit activity
> caused by each domU." but the commit log doesn't make any reference to
> the reason for it.
>
> It may just have been a kernel bug at around the time that patch was
> authored, but google doesn't seem to show any evidence of such a bug
> ever being widespread (i.e. I don't find any references to it).
>
Then I will leave the fuzz here, and do another patch to remove it.
I will also add reference to CS 16257 in the comment.
Wei.
^ permalink raw reply [flat|nested] 21+ messages in thread