* [PATCH 2/4] eventpoll: split out wait handling
2022-10-07 16:56 [PATCHSET RFC 0/4] Add support for epoll min_wait Jens Axboe
2022-10-07 16:56 ` [PATCH 1/4] eventpoll: cleanup branches around sleeping for events Jens Axboe
@ 2022-10-07 16:56 ` Jens Axboe
2022-10-07 16:56 ` [PATCH 3/4] eventpoll: move expires to epoll_wq Jens Axboe
2022-10-07 16:56 ` [PATCH 4/4] eventpoll: add support for min-wait Jens Axboe
3 siblings, 0 replies; 5+ messages in thread
From: Jens Axboe @ 2022-10-07 16:56 UTC (permalink / raw)
To: linux-kernel, netdev; +Cc: Jens Axboe
In preparation for making changes to how wakeups and sleeps are done,
move the timeout scheduling into a helper and manage it rather than
rely on schedule_hrtimeout_range().
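For reference, the caller-side change in ep_poll() condenses to the
following (pieced together from the hunks below, not a literal excerpt;
surrounding error handling omitted):

    /* before: the core helper arms the hrtimer and tears it down again */
    timed_out = !schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS);

    /* after: epoll owns an on-stack hrtimer inside struct epoll_wq, so
     * later patches in the series can change how and when it fires
     */
    ep_schedule(ep, &ewq, to, slack);
    /* ewq.timed_out is set by ep_timer() when the timer expires */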
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
fs/eventpoll.c | 70 ++++++++++++++++++++++++++++++++++++++++----------
1 file changed, 56 insertions(+), 14 deletions(-)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 8a75ae70e312..01b9dab2b68c 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1762,6 +1762,47 @@ static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry,
return ret;
}
+struct epoll_wq {
+ wait_queue_entry_t wait;
+ struct hrtimer timer;
+ bool timed_out;
+};
+
+static enum hrtimer_restart ep_timer(struct hrtimer *timer)
+{
+ struct epoll_wq *ewq = container_of(timer, struct epoll_wq, timer);
+ struct task_struct *task = ewq->wait.private;
+
+ ewq->timed_out = true;
+ wake_up_process(task);
+ return HRTIMER_NORESTART;
+}
+
+static void ep_schedule(struct eventpoll *ep, struct epoll_wq *ewq, ktime_t *to,
+ u64 slack)
+{
+ if (ewq->timed_out)
+ return;
+ if (to && *to == 0) {
+ ewq->timed_out = true;
+ return;
+ }
+ if (!to) {
+ schedule();
+ return;
+ }
+
+ hrtimer_init_on_stack(&ewq->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ ewq->timer.function = ep_timer;
+ hrtimer_set_expires_range_ns(&ewq->timer, *to, slack);
+ hrtimer_start_expires(&ewq->timer, HRTIMER_MODE_ABS);
+
+ schedule();
+
+ hrtimer_cancel(&ewq->timer);
+ destroy_hrtimer_on_stack(&ewq->timer);
+}
+
/**
* ep_poll - Retrieves ready events, and delivers them to the caller-supplied
* event buffer.
@@ -1782,13 +1823,15 @@ static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry,
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
int maxevents, struct timespec64 *timeout)
{
- int res, eavail, timed_out = 0;
+ int res, eavail;
u64 slack = 0;
- wait_queue_entry_t wait;
ktime_t expires, *to = NULL;
+ struct epoll_wq ewq;
lockdep_assert_irqs_enabled();
+ ewq.timed_out = false;
+
if (timeout && (timeout->tv_sec | timeout->tv_nsec)) {
slack = select_estimate_accuracy(timeout);
to = &expires;
@@ -1798,7 +1841,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
* Avoid the unnecessary trip to the wait queue loop, if the
* caller specified a non blocking operation.
*/
- timed_out = 1;
+ ewq.timed_out = 1;
}
/*
@@ -1823,10 +1866,10 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
return res;
}
- if (timed_out)
+ if (ewq.timed_out)
return 0;
- eavail = ep_busy_loop(ep, timed_out);
+ eavail = ep_busy_loop(ep, ewq.timed_out);
if (eavail)
continue;
@@ -1850,8 +1893,8 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
* performance issue if a process is killed, causing all of its
* threads to wake up without being removed normally.
*/
- init_wait(&wait);
- wait.func = ep_autoremove_wake_function;
+ init_wait(&ewq.wait);
+ ewq.wait.func = ep_autoremove_wake_function;
write_lock_irq(&ep->lock);
/*
@@ -1870,10 +1913,9 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
*/
eavail = ep_events_available(ep);
if (!eavail) {
- __add_wait_queue_exclusive(&ep->wq, &wait);
+ __add_wait_queue_exclusive(&ep->wq, &ewq.wait);
write_unlock_irq(&ep->lock);
- timed_out = !schedule_hrtimeout_range(to, slack,
- HRTIMER_MODE_ABS);
+ ep_schedule(ep, &ewq, to, slack);
} else {
write_unlock_irq(&ep->lock);
}
@@ -1887,7 +1929,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
*/
eavail = 1;
- if (!list_empty_careful(&wait.entry)) {
+ if (!list_empty_careful(&ewq.wait.entry)) {
write_lock_irq(&ep->lock);
/*
* If the thread timed out and is not on the wait queue,
@@ -1896,9 +1938,9 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
* Thus, when wait.entry is empty, it needs to harvest
* events.
*/
- if (timed_out)
- eavail = list_empty(&wait.entry);
- __remove_wait_queue(&ep->wq, &wait);
+ if (ewq.timed_out)
+ eavail = list_empty(&ewq.wait.entry);
+ __remove_wait_queue(&ep->wq, &ewq.wait);
write_unlock_irq(&ep->lock);
}
}
--
2.35.1
* [PATCH 4/4] eventpoll: add support for min-wait
2022-10-07 16:56 [PATCHSET RFC 0/4] Add support for epoll min_wait Jens Axboe
` (2 preceding siblings ...)
2022-10-07 16:56 ` [PATCH 3/4] eventpoll: move expires to epoll_wq Jens Axboe
@ 2022-10-07 16:56 ` Jens Axboe
3 siblings, 0 replies; 5+ messages in thread
From: Jens Axboe @ 2022-10-07 16:56 UTC (permalink / raw)
To: linux-kernel, netdev; +Cc: Jens Axboe
Rather than just having a timeout value for waiting on events, add
EPOLL_CTL_MIN_WAIT to allow setting a minimum time that epoll_wait()
should always wait for events to arrive.

To improve efficiency under medium load, some production workloads
inject artificial timers or sleeps before calling epoll_wait() to get
better batching and higher efficiency. While this does help, it's not
as efficient as it could be. By supporting this directly in
epoll_wait(), we avoid the extra context switches and the scheduler
and timer overhead.
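In application terms, this lets the sleep hack be replaced by a
one-time epoll_ctl() call. A minimal sketch, assuming this series is
applied; it is a fragment rather than a complete program (see the full
test case below), efd/events/maxevents/n are illustrative names, the fd
argument is ignored for this op, and the min wait time is passed in
microseconds via data.u64:

    /* old workaround: sleep briefly before reaping so more events queue up */
    usleep(200);
    n = epoll_wait(efd, events, maxevents, -1);

    /* with EPOLL_CTL_MIN_WAIT: ask this epoll instance to keep waiting
     * ~200 usec for more events before returning; set once, applies to
     * subsequent epoll_wait() calls
     */
    struct epoll_event ev = { .data.u64 = 200 };
    if (epoll_ctl(efd, EPOLL_CTL_MIN_WAIT, -1, &ev) < 0)
        perror("EPOLL_CTL_MIN_WAIT");
    n = epoll_wait(efd, events, maxevents, -1);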
As an example, running an A/B test on an identical workload at
~370K requests/second, without this change and with the sleep hack
mentioned above (using 200 usec as the timeout), we're doing 310K-340K
non-voluntary context switches per second, with 27-34% idle CPU on the
host. With the sleep hack removed and epoll min_wait set to the same
200 usec value, we handle the exact same load at 292K-315K
non-voluntary context switches and 33-41% idle CPU, a substantial win.
Basic test case:
#include <stdio.h>
#include <unistd.h>
#include <pthread.h>
#include <sys/epoll.h>

#ifndef EPOLL_CTL_MIN_WAIT
#define EPOLL_CTL_MIN_WAIT	4	/* from the patched uapi header */
#endif

struct d {
	int p1, p2;
};

static void *fn(void *data)
{
	struct d *d = data;
	char b = 0x89;

	/* Generate 2 events over ~20 msec */
	usleep(10000);
	write(d->p1, &b, sizeof(b));
	usleep(10000);
	write(d->p2, &b, sizeof(b));
	return NULL;
}

int main(int argc, char *argv[])
{
	struct epoll_event ev, events[2];
	pthread_t thread;
	int p1[2], p2[2];
	struct d d;
	int efd, ret;

	efd = epoll_create1(0);
	if (efd < 0) {
		perror("epoll_create");
		return 1;
	}
	if (pipe(p1) < 0) {
		perror("pipe");
		return 1;
	}
	if (pipe(p2) < 0) {
		perror("pipe");
		return 1;
	}

	ev.events = EPOLLIN;
	ev.data.fd = p1[0];
	if (epoll_ctl(efd, EPOLL_CTL_ADD, p1[0], &ev) < 0) {
		perror("epoll add");
		return 1;
	}
	ev.events = EPOLLIN;
	ev.data.fd = p2[0];
	if (epoll_ctl(efd, EPOLL_CTL_ADD, p2[0], &ev) < 0) {
		perror("epoll add");
		return 1;
	}

	/* always wait at least 200 msec for events */
	ev.data.u64 = 200000;
	if (epoll_ctl(efd, EPOLL_CTL_MIN_WAIT, -1, &ev) < 0) {
		perror("epoll set min_wait");
		return 1;
	}

	d.p1 = p1[1];
	d.p2 = p2[1];
	pthread_create(&thread, NULL, fn, &d);

	/* expect to get 2 events here rather than just 1 */
	ret = epoll_wait(efd, events, 2, -1);
	printf("epoll_wait=%d\n", ret);
	return 0;
}
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
fs/eventpoll.c | 132 ++++++++++++++++++++++++++-------
include/linux/eventpoll.h | 2 +-
include/uapi/linux/eventpoll.h | 1 +
3 files changed, 109 insertions(+), 26 deletions(-)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 79aa61a951df..ccb8400e2252 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -39,6 +39,11 @@
#include <linux/rculist.h>
#include <net/busy_poll.h>
+/*
+ * If a default min_wait timeout is desired, set this to non-zero. In usecs.
+ */
+#define EPOLL_DEF_MIN_WAIT 0
+
/*
* LOCKING:
* There are three level of locking required by epoll :
@@ -117,6 +122,9 @@ struct eppoll_entry {
/* The "base" pointer is set to the container "struct epitem" */
struct epitem *base;
+ /* min wait time if (min_wait_ts) & 1 != 0 */
+ ktime_t min_wait_ts;
+
/*
* Wait queue item that will be linked to the target file wait
* queue head.
@@ -217,6 +225,9 @@ struct eventpoll {
u64 gen;
struct hlist_head refs;
+ /* min wait for epoll_wait() */
+ unsigned int min_wait_ts;
+
#ifdef CONFIG_NET_RX_BUSY_POLL
/* used to track busy poll napi_id */
unsigned int napi_id;
@@ -953,6 +964,7 @@ static int ep_alloc(struct eventpoll **pep)
ep->rbr = RB_ROOT_CACHED;
ep->ovflist = EP_UNACTIVE_PTR;
ep->user = user;
+ ep->min_wait_ts = EPOLL_DEF_MIN_WAIT;
*pep = ep;
@@ -1747,6 +1759,32 @@ static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms)
return to;
}
+struct epoll_wq {
+ wait_queue_entry_t wait;
+ struct hrtimer timer;
+ ktime_t timeout_ts;
+ ktime_t min_wait_ts;
+ struct eventpoll *ep;
+ bool timed_out;
+ int maxevents;
+ int wakeups;
+};
+
+static bool ep_should_min_wait(struct epoll_wq *ewq)
+{
+ if (ewq->min_wait_ts & 1) {
+ /* just an approximation */
+ if (++ewq->wakeups >= ewq->maxevents)
+ goto stop_wait;
+ if (ktime_before(ktime_get_ns(), ewq->min_wait_ts))
+ return true;
+ }
+
+stop_wait:
+ ewq->min_wait_ts &= ~(u64) 1;
+ return false;
+}
+
/*
* autoremove_wake_function, but remove even on failure to wake up, because we
* know that default_wake_function/ttwu will only fail if the thread is already
@@ -1756,27 +1794,37 @@ static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms)
static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry,
unsigned int mode, int sync, void *key)
{
- int ret = default_wake_function(wq_entry, mode, sync, key);
+ struct epoll_wq *ewq = container_of(wq_entry, struct epoll_wq, wait);
+ int ret;
+
+ /*
+ * If min wait time hasn't been satisfied yet, keep waiting
+ */
+ if (ep_should_min_wait(ewq))
+ return 0;
+ ret = default_wake_function(wq_entry, mode, sync, key);
list_del_init(&wq_entry->entry);
return ret;
}
-struct epoll_wq {
- wait_queue_entry_t wait;
- struct hrtimer timer;
- ktime_t timeout_ts;
- bool timed_out;
-};
-
static enum hrtimer_restart ep_timer(struct hrtimer *timer)
{
struct epoll_wq *ewq = container_of(timer, struct epoll_wq, timer);
struct task_struct *task = ewq->wait.private;
+ const bool is_min_wait = ewq->min_wait_ts & 1;
+
+ if (!is_min_wait || ep_events_available(ewq->ep)) {
+ if (!is_min_wait)
+ ewq->timed_out = true;
+ ewq->min_wait_ts &= ~(u64) 1;
+ wake_up_process(task);
+ return HRTIMER_NORESTART;
+ }
- ewq->timed_out = true;
- wake_up_process(task);
- return HRTIMER_NORESTART;
+ ewq->min_wait_ts &= ~(u64) 1;
+ hrtimer_set_expires_range_ns(&ewq->timer, ewq->timeout_ts, 0);
+ return HRTIMER_RESTART;
}
static void ep_schedule(struct eventpoll *ep, struct epoll_wq *ewq, ktime_t *to,
@@ -1831,12 +1879,14 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
lockdep_assert_irqs_enabled();
+ ewq.ep = ep;
ewq.timed_out = false;
+ ewq.maxevents = maxevents;
+ ewq.wakeups = 0;
if (timeout && (timeout->tv_sec | timeout->tv_nsec)) {
slack = select_estimate_accuracy(timeout);
- to = &ewq.timeout_ts;
- *to = timespec64_to_ktime(*timeout);
+ ewq.timeout_ts = timespec64_to_ktime(*timeout);
} else if (timeout) {
/*
* Avoid the unnecessary trip to the wait queue loop, if the
@@ -1845,6 +1895,21 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
ewq.timed_out = 1;
}
+ /*
+ * If min_wait is set for this epoll instance, note the min_wait
+ * time. Ensure the lowest bit is set in ewq.min_wait_ts, that's
+ * the state bit for whether or not min_wait is enabled.
+ */
+ if (ep->min_wait_ts) {
+ ewq.min_wait_ts = ktime_add_us(ktime_get_ns(),
+ ep->min_wait_ts);
+ ewq.min_wait_ts |= (u64) 1;
+ to = &ewq.min_wait_ts;
+ } else {
+ ewq.min_wait_ts = 0;
+ to = &ewq.timeout_ts;
+ }
+
/*
* This call is racy: We may or may not see events that are being added
* to the ready list under the lock (e.g., in IRQ callbacks). For cases
@@ -1913,7 +1978,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
* important.
*/
eavail = ep_events_available(ep);
- if (!eavail) {
+ if (!eavail || ewq.min_wait_ts & 1) {
__add_wait_queue_exclusive(&ep->wq, &ewq.wait);
write_unlock_irq(&ep->lock);
ep_schedule(ep, &ewq, to, slack);
@@ -2111,6 +2176,31 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
if (!f.file)
goto error_return;
+ /*
+ * We have to check that the file structure underneath the file
+ * descriptor the user passed to us _is_ an eventpoll file.
+ */
+ error = -EINVAL;
+ if (!is_file_epoll(f.file))
+ goto error_fput;
+
+ /*
+ * At this point it is safe to assume that the "private_data" contains
+ * our own data structure.
+ */
+ ep = f.file->private_data;
+
+ /*
+ * Handle EPOLL_CTL_MIN_WAIT upfront as we don't need to care about
+ * the fd being passed in.
+ */
+ if (op == EPOLL_CTL_MIN_WAIT) {
+ /* return old value */
+ error = ep->min_wait_ts;
+ ep->min_wait_ts = epds->data;
+ goto error_fput;
+ }
+
/* Get the "struct file *" for the target file */
tf = fdget(fd);
if (!tf.file)
@@ -2126,12 +2216,10 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
ep_take_care_of_epollwakeup(epds);
/*
- * We have to check that the file structure underneath the file descriptor
- * the user passed to us _is_ an eventpoll file. And also we do not permit
- * adding an epoll file descriptor inside itself.
+ * We do not permit adding an epoll file descriptor inside itself.
*/
error = -EINVAL;
- if (f.file == tf.file || !is_file_epoll(f.file))
+ if (f.file == tf.file)
goto error_tgt_fput;
/*
@@ -2147,12 +2235,6 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
goto error_tgt_fput;
}
- /*
- * At this point it is safe to assume that the "private_data" contains
- * our own data structure.
- */
- ep = f.file->private_data;
-
/*
* When we insert an epoll file descriptor inside another epoll file
* descriptor, there is the chance of creating closed loops, which are
@@ -2251,7 +2333,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
{
struct epoll_event epds;
- if (ep_op_has_event(op) &&
+ if ((ep_op_has_event(op) || op == EPOLL_CTL_MIN_WAIT) &&
copy_from_user(&epds, event, sizeof(struct epoll_event)))
return -EFAULT;
diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
index 3337745d81bd..cbef635cb7e4 100644
--- a/include/linux/eventpoll.h
+++ b/include/linux/eventpoll.h
@@ -59,7 +59,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
static inline int ep_op_has_event(int op)
{
- return op != EPOLL_CTL_DEL;
+ return op != EPOLL_CTL_DEL && op != EPOLL_CTL_MIN_WAIT;
}
#else
diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
index 8a3432d0f0dc..81ecb1ca36e0 100644
--- a/include/uapi/linux/eventpoll.h
+++ b/include/uapi/linux/eventpoll.h
@@ -26,6 +26,7 @@
#define EPOLL_CTL_ADD 1
#define EPOLL_CTL_DEL 2
#define EPOLL_CTL_MOD 3
+#define EPOLL_CTL_MIN_WAIT 4
/* Epoll event masks */
#define EPOLLIN (__force __poll_t)0x00000001
--
2.35.1