* [RFC PATCH v5 1/2] eventfd: add configurable per-fd counter maximum for flow control
2026-04-08 17:24 [RFC PATCH v5 0/2] eventfd: add configurable maximum counter value for flow control wen.yang
@ 2026-04-08 17:24 ` wen.yang
2026-04-08 17:24 ` [RFC PATCH v5 2/2] selftests/eventfd: add EFD_IOC_{SET,GET}_MAXIMUM tests wen.yang
1 sibling, 0 replies; 3+ messages in thread
From: wen.yang @ 2026-04-08 17:24 UTC (permalink / raw)
To: Christian Brauner, Jan Kara, Alexander Viro
Cc: linux-fsdevel, linux-kernel, Wen Yang, Jens Axboe
From: Wen Yang <wen.yang@linux.dev>
In non-semaphore mode, write(2) accumulates into the counter and a
single read(2) drains it entirely. A producer issuing repeated
write(1) calls coalesces N signals into the counter; each write
succeeds immediately regardless of whether the consumer has processed
earlier events. With no effective cap below ULLONG_MAX (~1.8×10¹⁹), the
counter grows unchecked, consumer lag is invisible to the producer,
and in tight loops both sides burn CPU at 100% even though the consumer
is not keeping up. Without a maximum, the batch size seen by each
read(2) is also unbounded: a slow consumer may drain thousands of
accumulated signals in one call, losing visibility into how far behind
it has fallen.
Introduce two ioctl commands:
EFD_IOC_SET_MAXIMUM (_IOW('J', 0, __u64))
Set the overflow threshold. A write(2) that would push the counter
to or beyond this value blocks (EAGAIN for O_NONBLOCK fds).
Returns -EINVAL if the requested maximum is <= the current counter.
Wakes any blocked writers so they re-evaluate the new limit without
waiting for the next read(2).
EFD_IOC_GET_MAXIMUM (_IOR('J', 1, __u64))
Return the current threshold. Defaults to ULLONG_MAX, preserving
the original unlimited behaviour. The value is also visible in
/proc/self/fdinfo as "eventfd-maximum".
The maximum acts as the overflow level, exactly as ULLONG_MAX did in the
original design: the kernel-internal eventfd_signal() path may still
raise the counter to maximum (triggering EPOLLERR), while userspace
writes are capped at maximum-1.
This follows the backpressure pattern established by pipe(2): writers
block when the buffer is full, and capacity is adjustable via
fcntl(F_SETPIPE_SZ). POSIX message queues apply the same model:
mq_send(3) blocks when the queue depth reaches mq_maxmsg.
The following self-contained program covers three benchmarks. Build and
run with: gcc -O2 -pthread bench.c -o bench && ./bench
/* bench.c */
#define _GNU_SOURCE
#include <errno.h>
#include <inttypes.h>
#include <poll.h>
#include <pthread.h>
#include <sched.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <time.h>
#include <unistd.h>
#include <linux/eventfd.h>
/* Benchmark tuning parameters. */
#define SECS 5 /* bench 1 wall-clock duration, seconds */
#define MAX 10ULL /* eventfd counter maximum under test */
#define LAT_N 5000 /* bench 2: number of timed events */
#define COAL_N 10000ULL /* bench 3: total signals written */
#define WINT 100000ULL /* 100 µs → 10 K events/s */
#define RSLT 125000ULL /* 125 µs → ~8 K events/s */
/* helpers */
/* CPU time consumed by the calling thread so far, in milliseconds. */
static uint64_t cpu_ms(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
	return (uint64_t)ts.tv_sec * 1000u + (uint64_t)(ts.tv_nsec / 1000000);
}
/* Current CLOCK_MONOTONIC time in nanoseconds. */
static uint64_t mono_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;
}
/*
 * Apply a counter maximum to an eventfd. A value of 0 is the bench's
 * sentinel for "leave the default (ULLONG_MAX)" and skips the ioctl.
 * The ioctl return value is not checked, so on a kernel without
 * EFD_IOC_SET_MAXIMUM the run proceeds with stock behaviour.
 */
static void set_max(int fd, uint64_t m)
{
	if (m == 0)
		return;
	ioctl(fd, EFD_IOC_SET_MAXIMUM, &m);
}
/*
 * Render a maximum value for table output into b: 0 is the sentinel for
 * "no limit configured" and prints as "ULLONG_MAX"; any other value is
 * printed in decimal. b must provide at least 24 bytes, enough for the
 * longest 64-bit decimal plus the terminator.
 */
static void maxstr(char *b, uint64_t m)
{
	if (m == 0) {
		snprintf(b, 24, "ULLONG_MAX");
		return;
	}
	snprintf(b, 24, "%llu", (unsigned long long)m);
}
/* bench 1: burst/CPU savings */
/*
 * Writer strategies under test:
 *  BLOCKING - plain blocking write(2); the kernel sleeps the writer when
 *             the configured maximum would be reached;
 *  SPIN     - O_NONBLOCK write(2) retried in a tight loop on EAGAIN;
 *  POLL_OUT - O_NONBLOCK write(2) gated on poll(POLLOUT).
 */
enum mode { BLOCKING, SPIN, POLL_OUT };
/* Shared between the writer/reader threads and run_burst(). */
static int burst_fd; static volatile int stop;
static enum mode wmode;
/* Thread results: per-thread CPU ms, EAGAIN count, operation counts. */
static uint64_t wcpu, rcpu, neagain, nwrites, nreads;
/*
 * Writer thread for bench 1: issues write(1) as fast as the selected
 * mode allows until 'stop' is set, then publishes its CPU time and
 * operation counts through the shared result globals.
 */
static void *burst_writer(void *_) {
(void)_;
uint64_t v=1, n=0, ea=0, t0=cpu_ms();
struct pollfd p={.fd=burst_fd,.events=POLLOUT};
while (!stop) {
/* BLOCKING: rely on the kernel to sleep us at the counter maximum. */
if (wmode==BLOCKING) { if (write(burst_fd,&v,8)==8) n++; }
/* SPIN: busy-retry; every EAGAIN is counted as wasted work. */
else if (wmode==SPIN) { if (write(burst_fd,&v,8)<0 && errno==EAGAIN) ea++; else n++; }
/* POLL_OUT: wait (20 ms slices) for writability before each write. */
else { while (!stop && !(poll(&p,1,20)>0 && p.revents&POLLOUT));
if (write(burst_fd,&v,8)==8) n++; }
}
wcpu=cpu_ms()-t0; neagain=ea; nwrites=n; return NULL;
}
/*
 * Reader thread for bench 1: drains at most one read(2) per ~1 ms
 * (usleep) to simulate a slow consumer, and keeps draining after 'stop'
 * until a zero-timeout poll() reports the fd empty.
 */
static void *burst_reader(void *_) {
(void)_;
struct pollfd p={.fd=burst_fd,.events=POLLIN};
uint64_t v, nr=0, t0=cpu_ms();
while (stop==0 || (poll(&p,1,0)>0 && p.revents&POLLIN))
if (poll(&p,1,5)>0 && read(burst_fd,&v,8)==8) { nr++; usleep(1000); }
rcpu=cpu_ms()-t0; nreads=nr; return NULL;
}
/*
 * Run one bench-1 configuration for SECS seconds and print a result row.
 * 'max' of 0 leaves the eventfd at its default (unlimited) maximum.
 * Writer and reader are pinned to separate CPUs so their CPU-time
 * columns are not distorted by migration.
 */
static void run_burst(const char *lbl, enum mode m, uint64_t max) {
/* SPIN/POLL_OUT need a non-blocking fd; BLOCKING uses a plain one. */
burst_fd=eventfd(0, m!=BLOCKING ? EFD_CLOEXEC|EFD_NONBLOCK : EFD_CLOEXEC);
set_max(burst_fd, max); wmode=m; stop=0;
pthread_t w,r;
pthread_create(&r,NULL,burst_reader,NULL); pthread_create(&w,NULL,burst_writer,NULL);
cpu_set_t c;
CPU_ZERO(&c); CPU_SET(0,&c); pthread_setaffinity_np(r,sizeof(c),&c);
CPU_ZERO(&c); CPU_SET(1,&c); pthread_setaffinity_np(w,sizeof(c),&c);
sleep(SECS); stop=1;
pthread_join(w,NULL); pthread_join(r,NULL); close(burst_fd);
char mb[24]; maxstr(mb, max);
printf(" %-22s %-12s %8llu %8llu %10llu %10llu %8llu\n", lbl, mb,
(unsigned long long)wcpu, (unsigned long long)rcpu,
(unsigned long long)neagain,
(unsigned long long)nwrites, (unsigned long long)nreads);
}
/* bench 2: latency tail (EFD_SEMAPHORE) */
static int latency_fd;
/* Per-event timestamps: wts[i] = write time, rts[i] = read time (ns). */
static uint64_t wts[LAT_N], rts[LAT_N];
/*
 * Writer thread for bench 2: posts one event every WINT ns (10 K/s),
 * pacing with a busy-wait for precision, and stamps each write just
 * before issuing it. The write(2) retries on EINTR; on a blocking fd it
 * sleeps when the configured maximum is reached.
 */
static void *latency_writer(void *_) {
(void)_; uint64_t v=1, next=mono_ns();
for (int i=0; i<LAT_N; i++) {
while (mono_ns()<next); next+=WINT;
wts[i]=mono_ns();
int r; do { r=write(latency_fd,&v,8); } while (r<0 && errno==EINTR);
}
return NULL;
}
/*
 * Reader thread for bench 2: consumes one event per read (semaphore
 * mode), stamps its arrival, then busy-waits RSLT ns (~8 K/s) to model
 * a consumer slightly slower than the producer.
 */
static void *latency_reader(void *_) {
(void)_; struct pollfd p={.fd=latency_fd,.events=POLLIN}; uint64_t v;
for (int i=0; i<LAT_N; i++) {
/* 200 ms poll slices until the next event is readable. */
while (poll(&p,1,200)<=0 || !(p.revents&POLLIN));
(void)read(latency_fd,&v,8); rts[i]=mono_ns();
uint64_t w=mono_ns()+RSLT; while (mono_ns()<w);
}
return NULL;
}
/* qsort(3) comparator for uint64_t values, ascending, overflow-safe. */
static int cmp64(const void *a, const void *b)
{
	const uint64_t lhs = *(const uint64_t *)a;
	const uint64_t rhs = *(const uint64_t *)b;

	if (lhs < rhs)
		return -1;
	return lhs > rhs ? 1 : 0;
}
/*
 * Run one bench-2 configuration: pair the paced writer with the slower
 * reader on a semaphore-mode eventfd, then sort the per-event
 * write-to-read latencies and print the p99 / p99.9 / max tail in µs.
 * 'max' of 0 leaves the default (unlimited) maximum in place.
 */
static void run_latency(uint64_t max) {
latency_fd=eventfd(0, EFD_CLOEXEC|EFD_SEMAPHORE); set_max(latency_fd, max);
pthread_t w,r;
pthread_create(&r,NULL,latency_reader,NULL); pthread_create(&w,NULL,latency_writer,NULL);
pthread_join(w,NULL); pthread_join(r,NULL); close(latency_fd);
uint64_t lat[LAT_N];
for (int i=0; i<LAT_N; i++) lat[i]=rts[i]-wts[i];
qsort(lat,LAT_N,sizeof(lat[0]),cmp64);
char mb[24]; maxstr(mb, max);
printf(" %-12s %10.0f %10.0f %10.0f\n", mb,
lat[LAT_N*99/100]/1000.0, lat[LAT_N*999/1000]/1000.0,
lat[LAT_N-1]/1000.0);
}
/* bench 3: coalescing (non-EFD_SEMAPHORE) */
static int coal_fd;
/* Number of read(2) calls it took the reader to drain COAL_N signals. */
static uint64_t coal_reads;
/*
 * Writer for bench 3: issues COAL_N individual write(1) calls as fast
 * as the configured maximum allows, retrying on EINTR.
 */
static void *coal_writer(void *_) {
(void)_; uint64_t v=1;
for (uint64_t i=0; i<COAL_N; i++) {
int r; do { r=write(coal_fd,&v,8); } while (r<0 && errno==EINTR);
}
return NULL;
}
/*
 * Reader for bench 3: in non-semaphore mode each read(2) drains the
 * whole counter, so 'v' is the batch size; stop once COAL_N signals
 * have been accounted for. A busy-wait of RSLT ns models the per-batch
 * processing cost.
 */
static void *coal_reader(void *_) {
(void)_; uint64_t v, nr=0, tot=0;
while (tot < COAL_N) {
(void)read(coal_fd,&v,8); nr++; tot+=v;
uint64_t w=mono_ns()+RSLT; while(mono_ns()<w);
}
coal_reads=nr; return NULL;
}
/*
 * Run one bench-3 configuration and print total writes, total reads and
 * the average batch size (signals drained per read). 'max' of 0 leaves
 * the default (unlimited) maximum in place.
 */
static void run_coalesce(uint64_t max) {
coal_fd=eventfd(0, EFD_CLOEXEC); set_max(coal_fd, max);
pthread_t w,r;
pthread_create(&r,NULL,coal_reader,NULL); pthread_create(&w,NULL,coal_writer,NULL);
pthread_join(w,NULL); pthread_join(r,NULL); close(coal_fd);
char mb[24]; maxstr(mb, max);
printf(" %-12s %10llu %8llu %10.1f\n", mb,
(unsigned long long)COAL_N,
(unsigned long long)coal_reads,
(double)COAL_N/coal_reads);
}
/* main */
/*
 * Drive the three benchmarks in sequence and print their tables. In the
 * maximum-value arrays a 0 entry means "default ULLONG_MAX" — see
 * set_max() and maxstr() for the sentinel handling.
 */
int main(void) {
printf("\nBench 1 – burst/CPU (writer vs reader, %ds)\n", SECS);
printf(" %-22s %-12s %8s %8s %10s %10s %8s\n",
"writer_mode","maximum","wcpu_ms","rcpu_ms","EAGAIN","writes","reads");
printf(" -----------------------------------------------------------------------\n");
run_burst("blocking, no limit", BLOCKING, 0);
run_burst("blocking", BLOCKING, MAX);
run_burst("O_NONBLOCK+spin", SPIN, MAX);
run_burst("O_NONBLOCK+poll", POLL_OUT, MAX);
printf("\nBench 2 – latency tail"
" (EFD_SEMAPHORE, 10K/s writer, ~8K/s reader, %d events)\n",
LAT_N);
printf(" %-12s %10s %10s %10s\n",
"maximum","p99_us","p999_us","max_us");
printf(" -------------------------------------------\n");
static const uint64_t mv[]={0,100,10};
for (int i=0;i<3;i++) run_latency(mv[i]);
printf("\nBench 3 – coalescing"
" (non-EFD_SEMAPHORE, %llu writes, 125us/read reader)\n",
(unsigned long long)COAL_N);
printf(" %-12s %10s %8s %10s\n",
"maximum","writes","reads","avg_batch");
printf(" -----------------------------------------------\n");
static const uint64_t cv[]={0,100,10};
for (int i=0;i<3;i++) run_coalesce(cv[i]);
}
On a 4-core x86_64 (writer and reader pinned to separate CPUs,
reader sleeps 1 ms between reads to simulate processing time):
Bench 1 – burst/CPU (writer vs reader, 5s)
writer_mode maximum wcpu_ms rcpu_ms EAGAIN writes reads
-------------------------------------------------------------------------
blocking, no limit ULLONG_MAX 5002 132 0 6517388 4506
blocking 10 133 150 0 40456 4496
O_NONBLOCK+spin 10 4999 126 5789340 40568 4508
O_NONBLOCK+poll 10 189 151 0 40519 4503
Bench 2 – latency tail (EFD_SEMAPHORE, 10K/s writer, ~8K/s reader, 5000 events)
maximum p99_us p999_us max_us
-------------------------------------------
ULLONG_MAX 141218 142477 142588
100 13298 13320 13334
10 1719 2378 2381
Bench 3 – coalescing (non-EFD_SEMAPHORE, 10000 writes, 125us/read reader)
maximum writes reads avg_batch
-----------------------------------------------
ULLONG_MAX 10000 79 126.6
100 10000 105 95.2
10 10000 1121 8.9
With maximum=10: burst CPU drops >97% (5002 ms → 133 ms); latency p999
drops ~60x (142 ms → 2.4 ms); coalescing batch size is bounded to 9
(vs 127 without a limit), so the consumer always knows the backlog is
small. O_NONBLOCK+spin bypasses flow control entirely — use
poll(POLLOUT)+write to get the same benefit as a blocking write while
still multiplexing other fds in a single poll(2) call.
Signed-off-by: Wen Yang <wen.yang@linux.dev>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
.../userspace-api/ioctl/ioctl-number.rst | 1 +
fs/eventfd.c | 74 ++++++++++++++++---
include/uapi/linux/eventfd.h | 6 ++
3 files changed, 69 insertions(+), 12 deletions(-)
diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
index 331223761fff..d233559179b1 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -170,6 +170,7 @@ Code Seq# Include File Comments
'I' all linux/isdn.h conflict!
'I' 00-0F drivers/isdn/divert/isdn_divert.h conflict!
'I' 40-4F linux/mISDNif.h conflict!
+'J' 00-01 linux/eventfd.h eventfd ioctl
'K' all linux/kd.h
'L' 00-1F linux/loop.h conflict!
'L' 10-1F drivers/scsi/mpt3sas/mpt3sas_ctl.h conflict!
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 3219e0d596fe..11985d07e904 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -39,6 +39,7 @@ struct eventfd_ctx {
* also, adds to the "count" counter and issue a wakeup.
*/
__u64 count;
+ __u64 maximum;
unsigned int flags;
int id;
};
@@ -49,9 +50,9 @@ struct eventfd_ctx {
* @mask: [in] poll mask
*
* This function is supposed to be called by the kernel in paths that do not
- * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
- * value, and we signal this as overflow condition by returning a EPOLLERR
- * to poll(2).
+ * allow sleeping. In this function we allow the counter to reach the maximum
+ * value (ctx->maximum), and we signal this as overflow condition by returning
+ * a EPOLLERR to poll(2).
*/
void eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask)
{
@@ -70,7 +71,7 @@ void eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask)
spin_lock_irqsave(&ctx->wqh.lock, flags);
current->in_eventfd = 1;
- if (ctx->count < ULLONG_MAX)
+ if (ctx->count < ctx->maximum)
ctx->count++;
if (waitqueue_active(&ctx->wqh))
wake_up_locked_poll(&ctx->wqh, EPOLLIN | mask);
@@ -119,7 +120,7 @@ static __poll_t eventfd_poll(struct file *file, poll_table *wait)
{
struct eventfd_ctx *ctx = file->private_data;
__poll_t events = 0;
- u64 count;
+ u64 count, max;
poll_wait(file, &ctx->wqh, wait);
@@ -162,12 +163,13 @@ static __poll_t eventfd_poll(struct file *file, poll_table *wait)
* eventfd_poll returns 0
*/
count = READ_ONCE(ctx->count);
+ max = READ_ONCE(ctx->maximum);
if (count > 0)
events |= EPOLLIN;
- if (count == ULLONG_MAX)
+ if (count == max)
events |= EPOLLERR;
- if (ULLONG_MAX - 1 > count)
+ if (max - 1 > count)
events |= EPOLLOUT;
return events;
@@ -244,6 +246,11 @@ static ssize_t eventfd_read(struct kiocb *iocb, struct iov_iter *to)
return sizeof(ucnt);
}
+static inline bool eventfd_is_writable(struct eventfd_ctx *ctx, __u64 cnt)
+{
+ return ctx->maximum > ctx->count && ctx->maximum - ctx->count > cnt;
+}
+
static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
loff_t *ppos)
{
@@ -259,11 +266,11 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
return -EINVAL;
spin_lock_irq(&ctx->wqh.lock);
res = -EAGAIN;
- if (ULLONG_MAX - ctx->count > ucnt)
+ if (eventfd_is_writable(ctx, ucnt))
res = sizeof(ucnt);
else if (!(file->f_flags & O_NONBLOCK)) {
res = wait_event_interruptible_locked_irq(ctx->wqh,
- ULLONG_MAX - ctx->count > ucnt);
+ eventfd_is_writable(ctx, ucnt));
if (!res)
res = sizeof(ucnt);
}
@@ -283,22 +290,62 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
static void eventfd_show_fdinfo(struct seq_file *m, struct file *f)
{
struct eventfd_ctx *ctx = f->private_data;
- __u64 cnt;
+ __u64 cnt, max;
spin_lock_irq(&ctx->wqh.lock);
cnt = ctx->count;
+ max = ctx->maximum;
spin_unlock_irq(&ctx->wqh.lock);
seq_printf(m,
"eventfd-count: %16llx\n"
"eventfd-id: %d\n"
- "eventfd-semaphore: %d\n",
+ "eventfd-semaphore: %d\n"
+ "eventfd-maximum: %16llx\n",
cnt,
ctx->id,
- !!(ctx->flags & EFD_SEMAPHORE));
+ !!(ctx->flags & EFD_SEMAPHORE),
+ max);
}
#endif
+static long eventfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ struct eventfd_ctx *ctx = file->private_data;
+ void __user *argp = (void __user *)arg;
+ __u64 max;
+ int ret;
+
+ switch (cmd) {
+ case EFD_IOC_SET_MAXIMUM:
+ if (copy_from_user(&max, argp, sizeof(max)))
+ return -EFAULT;
+
+ spin_lock_irq(&ctx->wqh.lock);
+ if (ctx->count >= max) {
+ ret = -EINVAL;
+ } else {
+ ctx->maximum = max;
+ ret = 0;
+ /* wake blocked writers that may now fit within the new maximum */
+ if (waitqueue_active(&ctx->wqh))
+ wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
+ }
+ spin_unlock_irq(&ctx->wqh.lock);
+ return ret;
+
+ case EFD_IOC_GET_MAXIMUM:
+ spin_lock_irq(&ctx->wqh.lock);
+ max = ctx->maximum;
+ spin_unlock_irq(&ctx->wqh.lock);
+
+ return copy_to_user(argp, &max, sizeof(max)) ? -EFAULT : 0;
+
+ default:
+ return -ENOTTY;
+ }
+}
+
static const struct file_operations eventfd_fops = {
#ifdef CONFIG_PROC_FS
.show_fdinfo = eventfd_show_fdinfo,
@@ -307,6 +354,8 @@ static const struct file_operations eventfd_fops = {
.poll = eventfd_poll,
.read_iter = eventfd_read,
.write = eventfd_write,
+ .unlocked_ioctl = eventfd_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
.llseek = noop_llseek,
};
@@ -395,6 +444,7 @@ static int do_eventfd(unsigned int count, int flags)
kref_init(&ctx->kref);
init_waitqueue_head(&ctx->wqh);
ctx->count = count;
+ ctx->maximum = ULLONG_MAX;
ctx->flags = flags;
flags &= EFD_SHARED_FCNTL_FLAGS;
diff --git a/include/uapi/linux/eventfd.h b/include/uapi/linux/eventfd.h
index 2eb9ab6c32f3..ba46b746f597 100644
--- a/include/uapi/linux/eventfd.h
+++ b/include/uapi/linux/eventfd.h
@@ -3,9 +3,15 @@
#define _UAPI_LINUX_EVENTFD_H
#include <linux/fcntl.h>
+#include <linux/ioctl.h>
+#include <linux/types.h>
#define EFD_SEMAPHORE (1 << 0)
#define EFD_CLOEXEC O_CLOEXEC
#define EFD_NONBLOCK O_NONBLOCK
+/* Flow-control ioctls: configure the per-fd counter maximum. */
+#define EFD_IOC_SET_MAXIMUM _IOW('J', 0, __u64)
+#define EFD_IOC_GET_MAXIMUM _IOR('J', 1, __u64)
+
#endif /* _UAPI_LINUX_EVENTFD_H */
--
2.25.1
^ permalink raw reply related [flat|nested] 3+ messages in thread* [RFC PATCH v5 2/2] selftests/eventfd: add EFD_IOC_{SET,GET}_MAXIMUM tests
2026-04-08 17:24 [RFC PATCH v5 0/2] eventfd: add configurable maximum counter value for flow control wen.yang
2026-04-08 17:24 ` [RFC PATCH v5 1/2] eventfd: add configurable per-fd counter maximum " wen.yang
@ 2026-04-08 17:24 ` wen.yang
1 sibling, 0 replies; 3+ messages in thread
From: wen.yang @ 2026-04-08 17:24 UTC (permalink / raw)
To: Christian Brauner, Jan Kara, Alexander Viro
Cc: linux-fsdevel, linux-kernel, Wen Yang
From: Wen Yang <wen.yang@linux.dev>
Add correctness tests for the flow-control ioctls introduced in the
preceding commit. Cover the GET/SET round-trip, EINVAL when the
requested maximum does not exceed the current counter, EAGAIN on an
O_NONBLOCK fd when a write would reach the configured maximum, EPOLLOUT
gating at maximum-1, /proc/self/fdinfo exposure of the "eventfd-maximum"
field, and ENOTTY for unrecognised ioctl commands.
Signed-off-by: Wen Yang <wen.yang@linux.dev>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
.../filesystems/eventfd/eventfd_test.c | 238 +++++++++++++++++-
1 file changed, 237 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/filesystems/eventfd/eventfd_test.c b/tools/testing/selftests/filesystems/eventfd/eventfd_test.c
index 1b48f267157d..9e33780f5330 100644
--- a/tools/testing/selftests/filesystems/eventfd/eventfd_test.c
+++ b/tools/testing/selftests/filesystems/eventfd/eventfd_test.c
@@ -5,12 +5,22 @@
#include <fcntl.h>
#include <asm/unistd.h>
#include <linux/time_types.h>
+#include <stdarg.h>
+#include <stdint.h>
+#include <string.h>
#include <unistd.h>
#include <assert.h>
#include <signal.h>
#include <pthread.h>
#include <sys/epoll.h>
-#include <sys/eventfd.h>
+#include <sys/ioctl.h>
+/*
+ * Prevent <asm-generic/fcntl.h> (pulled in via <linux/eventfd.h> ->
+ * <linux/fcntl.h> -> <asm/fcntl.h>) from redefining struct flock and
+ * friends that are already provided by the system <fcntl.h> above.
+ */
+#define _ASM_GENERIC_FCNTL_H
+#include <linux/eventfd.h>
#include "kselftest_harness.h"
#define EVENTFD_TEST_ITERATIONS 100000UL
@@ -308,4 +318,230 @@ TEST(eventfd_check_read_with_semaphore)
close(fd);
}
+/*
+ * The default maximum is ULLONG_MAX, matching the original behaviour.
+ */
+TEST(eventfd_check_ioctl_get_maximum_default)
+{
+ uint64_t max;
+ int fd, ret;
+
+ fd = sys_eventfd2(0, EFD_NONBLOCK);
+ ASSERT_GE(fd, 0);
+
+ ret = ioctl(fd, EFD_IOC_GET_MAXIMUM, &max);
+ EXPECT_EQ(ret, 0);
+ EXPECT_EQ(max, UINT64_MAX);
+
+ close(fd);
+}
+
+/*
+ * EFD_IOC_SET_MAXIMUM and EFD_IOC_GET_MAXIMUM round-trip.
+ */
+TEST(eventfd_check_ioctl_set_get_maximum)
+{
+ uint64_t max;
+ int fd, ret;
+
+ fd = sys_eventfd2(0, EFD_NONBLOCK);
+ ASSERT_GE(fd, 0);
+
+ max = 1000;
+ ret = ioctl(fd, EFD_IOC_SET_MAXIMUM, &max);
+ EXPECT_EQ(ret, 0);
+
+ max = 0;
+ ret = ioctl(fd, EFD_IOC_GET_MAXIMUM, &max);
+ EXPECT_EQ(ret, 0);
+ EXPECT_EQ(max, 1000);
+
+ close(fd);
+}
+
+/*
+ * Setting a maximum that is less than or equal to the current counter
+ * must fail with EINVAL.
+ */
+TEST(eventfd_check_ioctl_set_maximum_invalid)
+{
+ uint64_t value = 5, max;
+ ssize_t size;
+ int fd, ret;
+
+ fd = sys_eventfd2(0, EFD_NONBLOCK);
+ ASSERT_GE(fd, 0);
+
+ /* write 5 into the counter */
+ size = write(fd, &value, sizeof(value));
+ EXPECT_EQ(size, (ssize_t)sizeof(value));
+
+ /* setting maximum == count (5) must fail */
+ max = 5;
+ ret = ioctl(fd, EFD_IOC_SET_MAXIMUM, &max);
+ EXPECT_EQ(ret, -1);
+ EXPECT_EQ(errno, EINVAL);
+
+ /* setting maximum < count (3 < 5) must also fail */
+ max = 3;
+ ret = ioctl(fd, EFD_IOC_SET_MAXIMUM, &max);
+ EXPECT_EQ(ret, -1);
+ EXPECT_EQ(errno, EINVAL);
+
+ /* setting maximum > count (10 > 5) must succeed */
+ max = 10;
+ ret = ioctl(fd, EFD_IOC_SET_MAXIMUM, &max);
+ EXPECT_EQ(ret, 0);
+
+ close(fd);
+}
+
+/*
+ * Writes that would push the counter to or beyond maximum must return
+ * EAGAIN on a non-blocking fd. After a read drains the counter the
+ * write should succeed again.
+ */
+TEST(eventfd_check_ioctl_write_blocked_at_maximum)
+{
+ uint64_t value, max_val = 5;
+ ssize_t size;
+ int fd, ret;
+
+ fd = sys_eventfd2(0, EFD_NONBLOCK);
+ ASSERT_GE(fd, 0);
+
+ ret = ioctl(fd, EFD_IOC_SET_MAXIMUM, &max_val);
+ ASSERT_EQ(ret, 0);
+
+ /* write 4 — counter becomes 4, one slot before maximum */
+ value = 4;
+ size = write(fd, &value, sizeof(value));
+ EXPECT_EQ(size, (ssize_t)sizeof(value));
+
+ /*
+ * Writing 1 more would reach maximum (4+1 == 5 == maximum), which
+ * is the overflow level. The write must block, i.e. return EAGAIN
+ * in non-blocking mode.
+ */
+ value = 1;
+ size = write(fd, &value, sizeof(value));
+ EXPECT_EQ(size, -1);
+ EXPECT_EQ(errno, EAGAIN);
+
+ /* drain the counter */
+ size = read(fd, &value, sizeof(value));
+ EXPECT_EQ(size, (ssize_t)sizeof(value));
+ EXPECT_EQ(value, 4);
+
+ /* now the write must succeed (counter was reset to 0) */
+ value = 1;
+ size = write(fd, &value, sizeof(value));
+ EXPECT_EQ(size, (ssize_t)sizeof(value));
+
+ close(fd);
+}
+
+/*
+ * Verify that EPOLLOUT is correctly gated by the configured maximum:
+ * it should be clear when count >= maximum - 1, and set again after a read.
+ */
+TEST(eventfd_check_ioctl_poll_epollout)
+{
+ struct epoll_event ev, events[2];
+ uint64_t value, max_val = 5;
+ ssize_t sz;
+ int fd, epfd, nfds, ret;
+
+ fd = sys_eventfd2(0, EFD_NONBLOCK);
+ ASSERT_GE(fd, 0);
+
+ epfd = epoll_create1(0);
+ ASSERT_GE(epfd, 0);
+
+ ev.events = EPOLLIN | EPOLLOUT | EPOLLERR;
+ ev.data.fd = fd;
+ ret = epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev);
+ ASSERT_EQ(ret, 0);
+
+ ret = ioctl(fd, EFD_IOC_SET_MAXIMUM, &max_val);
+ ASSERT_EQ(ret, 0);
+
+ /* fresh fd: EPOLLOUT must be set (count=0 < maximum-1=4) */
+ nfds = epoll_wait(epfd, events, 2, 0);
+ EXPECT_EQ(nfds, 1);
+ EXPECT_TRUE(!!(events[0].events & EPOLLOUT));
+
+ /* write 4 — count reaches maximum-1=4, EPOLLOUT must clear */
+ value = 4;
+ sz = write(fd, &value, sizeof(value));
+ EXPECT_EQ(sz, (ssize_t)sizeof(value));
+
+ nfds = epoll_wait(epfd, events, 2, 0);
+ EXPECT_EQ(nfds, 1);
+ EXPECT_FALSE(!!(events[0].events & EPOLLOUT));
+ EXPECT_TRUE(!!(events[0].events & EPOLLIN));
+
+ /* drain counter — EPOLLOUT must reappear */
+ sz = read(fd, &value, sizeof(value));
+ EXPECT_EQ(sz, (ssize_t)sizeof(value));
+
+ nfds = epoll_wait(epfd, events, 2, 0);
+ EXPECT_EQ(nfds, 1);
+ EXPECT_TRUE(!!(events[0].events & EPOLLOUT));
+
+ close(epfd);
+ close(fd);
+}
+
+/*
+ * /proc/self/fdinfo must expose the configured maximum.
+ */
+TEST(eventfd_check_fdinfo_maximum)
+{
+ struct error err = {0};
+ uint64_t max_val = 12345;
+ int fd, ret;
+
+ fd = sys_eventfd2(0, 0);
+ ASSERT_GE(fd, 0);
+
+ /* before setting: default should be ULLONG_MAX */
+ ret = verify_fdinfo(fd, &err, "eventfd-maximum: ", 17,
+ "%16llx\n", (unsigned long long)UINT64_MAX);
+ if (ret != 0)
+ ksft_print_msg("eventfd-maximum default check failed: %s\n",
+ err.msg);
+ EXPECT_EQ(ret, 0);
+
+ ret = ioctl(fd, EFD_IOC_SET_MAXIMUM, &max_val);
+ ASSERT_EQ(ret, 0);
+
+ memset(&err, 0, sizeof(err));
+ ret = verify_fdinfo(fd, &err, "eventfd-maximum: ", 17,
+ "%16llx\n", (unsigned long long)max_val);
+ if (ret != 0)
+ ksft_print_msg("eventfd-maximum after set check failed: %s\n",
+ err.msg);
+ EXPECT_EQ(ret, 0);
+
+ close(fd);
+}
+
+/*
+ * An unrecognised ioctl must return ENOTTY (not EINVAL or ENOENT).
+ */
+TEST(eventfd_check_ioctl_unknown)
+{
+ int fd, ret;
+
+ fd = sys_eventfd2(0, 0);
+ ASSERT_GE(fd, 0);
+
+ ret = ioctl(fd, _IO('J', 0xff));
+ EXPECT_EQ(ret, -1);
+ EXPECT_EQ(errno, ENOTTY);
+
+ close(fd);
+}
+
TEST_HARNESS_MAIN
--
2.25.1
^ permalink raw reply related [flat|nested] 3+ messages in thread