All of lore.kernel.org
 help / color / mirror / Atom feed
From: wen.yang@linux.dev
To: Christian Brauner <brauner@kernel.org>, Jan Kara <jack@suse.cz>,
	Alexander Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org,
	Wen Yang <wen.yang@linux.dev>, Jens Axboe <axboe@kernel.dk>
Subject: [RFC PATCH v5 1/2] eventfd: add configurable per-fd counter maximum for flow control
Date: Thu,  9 Apr 2026 01:24:48 +0800	[thread overview]
Message-ID: <530e8b5e22e08f8459d335eaf31ff78b999fa5cf.1775668339.git.wen.yang@linux.dev> (raw)
In-Reply-To: <cover.1775668339.git.wen.yang@linux.dev>

From: Wen Yang <wen.yang@linux.dev>

In non-semaphore mode, write(2) accumulates into the counter and a
single read(2) drains it entirely. A producer issuing repeated
write(1) calls coalesces N signals into the counter; each write
succeeds immediately regardless of whether the consumer has processed
earlier events. With no bound below ULLONG_MAX (~1.8×10¹⁹), the
counter grows without bound, consumer lag is invisible to the producer,
and in tight loops both sides burn CPU at 100% even though the consumer
is not keeping up. Without a maximum, the batch size seen by each
read(2) is also unbounded: a slow consumer may drain thousands of
accumulated signals in one call, losing visibility into how far behind
it has fallen.

Introduce two ioctl commands:

  EFD_IOC_SET_MAXIMUM  (_IOW('J', 0, __u64))
    Set the overflow threshold. A write(2) that would push the counter
    to or beyond this value blocks (EAGAIN for O_NONBLOCK fds).
    Returns -EINVAL if the requested maximum is <= the current counter.
    Wakes any blocked writers so they re-evaluate the new limit without
    waiting for the next read(2).

  EFD_IOC_GET_MAXIMUM  (_IOR('J', 1, __u64))
    Return the current threshold. Defaults to ULLONG_MAX, preserving
    the original unlimited behaviour. The value is also visible in
    /proc/self/fdinfo as "eventfd-maximum".

The maximum acts as the overflow level, exactly as ULLONG_MAX did in the
original design: the kernel-internal eventfd_signal() path may still
raise the counter to maximum (triggering EPOLLERR), while userspace
writes are capped at maximum-1.

This follows the backpressure pattern established by pipe(2): writers
block when the buffer is full, and capacity is adjustable via
fcntl(F_SETPIPE_SZ). POSIX message queues apply the same model:
mq_send(3) blocks when the queue depth reaches mq_maxmsg.

The following self-contained program covers three benchmarks. Build and
run with:  gcc -O2 bench.c -o bench -lpthread && ./bench

  /* bench.c */
  #define _GNU_SOURCE
  #include <errno.h>
  #include <inttypes.h>
  #include <poll.h>
  #include <pthread.h>
  #include <sched.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <sys/eventfd.h>
  #include <sys/ioctl.h>
  #include <time.h>
  #include <unistd.h>
  #include <linux/eventfd.h>

  #define SECS  5
  #define MAX   10ULL
  #define LAT_N 5000
  #define COAL_N 10000ULL
  #define WINT  100000ULL   /* 100 µs → 10 K events/s */
  #define RSLT  125000ULL   /* 125 µs → ~8 K events/s */

  /* helpers */
  /* Per-thread CPU time consumed so far, in milliseconds. */
  static uint64_t cpu_ms(void) {
  	struct timespec ts;

  	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
  	return (uint64_t)ts.tv_sec * 1000 + (uint64_t)ts.tv_nsec / 1000000;
  }
  /* Monotonic wall-clock timestamp in nanoseconds. */
  static uint64_t mono_ns(void) {
  	struct timespec ts;

  	clock_gettime(CLOCK_MONOTONIC, &ts);
  	return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;
  }
  /*
   * Apply the per-fd counter maximum via EFD_IOC_SET_MAXIMUM.
   * m == 0 means "leave the kernel default (ULLONG_MAX, i.e. no limit)".
   * A failing ioctl is reported on stderr so that a run on a kernel
   * without this patch is not silently benchmarked with no flow
   * control (which would make every "maximum" row meaningless).
   */
  static void set_max(int fd, uint64_t m) {
  	if (m && ioctl(fd, EFD_IOC_SET_MAXIMUM, &m) < 0)
  		perror("EFD_IOC_SET_MAXIMUM");
  }
  /* Render a maximum value for table output; 0 denotes "no limit". */
  static void maxstr(char *b, uint64_t m) {
  	if (m == 0) {
  		snprintf(b, 24, "ULLONG_MAX");
  		return;
  	}
  	snprintf(b, 24, "%llu", (unsigned long long)m);
  }

  /* bench 1: burst/CPU savings — writer throughput/CPU vs a slow reader */
  enum mode { BLOCKING, SPIN, POLL_OUT };	/* writer strategies under test */
  static int burst_fd; static volatile int stop;	/* NOTE(review): plain volatile stop flag, not _Atomic — racy in theory, adequate for a bench */
  static enum mode wmode;
  static uint64_t wcpu, rcpu, neagain, nwrites, nreads;	/* results published by the threads, read after join */

  /*
   * Writer thread for bench 1: issues write(2)s of 1 as fast as the
   * selected mode allows until run_burst() raises 'stop'.
   *  - BLOCKING: plain blocking write; sleeps in the kernel while the
   *    counter would exceed the configured maximum.
   *  - SPIN:     O_NONBLOCK write in a busy loop; EAGAINs are counted
   *    separately (note: other write errors are counted as writes).
   *  - POLL_OUT: waits for POLLOUT (20 ms slices) before each write.
   * Publishes its CPU time (ms), EAGAIN count and write count.
   */
  static void *burst_writer(void *_) {
  	(void)_;
  	uint64_t v=1, n=0, ea=0, t0=cpu_ms();
  	struct pollfd p={.fd=burst_fd,.events=POLLOUT};
  	while (!stop) {
  		if      (wmode==BLOCKING) { if (write(burst_fd,&v,8)==8) n++; }
  		else if (wmode==SPIN)     { if (write(burst_fd,&v,8)<0 && errno==EAGAIN) ea++; else n++; }
  		else { while (!stop && !(poll(&p,1,20)>0 && p.revents&POLLOUT));
  		       if (write(burst_fd,&v,8)==8) n++; }
  	}
  	wcpu=cpu_ms()-t0; neagain=ea; nwrites=n; return NULL;
  }
  /*
   * Reader thread for bench 1: drains the eventfd, sleeping 1 ms after
   * each successful read to simulate per-event processing time.  After
   * 'stop' it keeps reading while poll() still reports POLLIN, so the
   * counter is fully drained before the thread publishes its CPU time
   * and read count.
   */
  static void *burst_reader(void *_) {
  	(void)_;
  	struct pollfd p={.fd=burst_fd,.events=POLLIN};
  	uint64_t v, nr=0, t0=cpu_ms();
  	while (stop==0 || (poll(&p,1,0)>0 && p.revents&POLLIN))
  		if (poll(&p,1,5)>0 && read(burst_fd,&v,8)==8) { nr++; usleep(1000); }
  	rcpu=cpu_ms()-t0; nreads=nr; return NULL;
  }
  /*
   * Run one bench-1 configuration: create the eventfd (non-blocking for
   * the SPIN/POLL_OUT modes), spawn reader/writer threads pinned to
   * CPUs 0 and 1, let them run for SECS seconds, then print one result
   * row.  max == 0 keeps the kernel default (no limit).
   */
  static void run_burst(const char *lbl, enum mode m, uint64_t max) {
  	burst_fd=eventfd(0, m!=BLOCKING ? EFD_CLOEXEC|EFD_NONBLOCK : EFD_CLOEXEC);
  	if (burst_fd < 0) { perror("eventfd"); exit(1); }	/* was unchecked: threads would spin on fd -1 */
  	set_max(burst_fd, max); wmode=m; stop=0;
  	pthread_t w,r;
  	pthread_create(&r,NULL,burst_reader,NULL); pthread_create(&w,NULL,burst_writer,NULL);
  	/* NOTE(review): affinity is applied after the threads start, so the
  	 * first few iterations may run unpinned; harmless over a 5 s run. */
  	cpu_set_t c;
  	CPU_ZERO(&c); CPU_SET(0,&c); pthread_setaffinity_np(r,sizeof(c),&c);
  	CPU_ZERO(&c); CPU_SET(1,&c); pthread_setaffinity_np(w,sizeof(c),&c);
  	sleep(SECS); stop=1;
  	pthread_join(w,NULL); pthread_join(r,NULL); close(burst_fd);
  	char mb[24]; maxstr(mb, max);
  	printf("  %-22s  %-12s  %8llu  %8llu  %10llu  %10llu  %8llu\n", lbl, mb,
  	       (unsigned long long)wcpu, (unsigned long long)rcpu,
  	       (unsigned long long)neagain,
  	       (unsigned long long)nwrites, (unsigned long long)nreads);
  }

  /* bench 2: latency tail (EFD_SEMAPHORE) */
  static int latency_fd;
  static uint64_t wts[LAT_N], rts[LAT_N];	/* per-event write/read timestamps (ns, CLOCK_MONOTONIC) */

  /*
   * Paced writer for bench 2: emits one event every WINT ns (10 K/s),
   * busy-waiting to the next deadline, and records the timestamp just
   * before each write.  The blocking write(2) stalls whenever the
   * counter hits the configured maximum, which is exactly the
   * backpressure the benchmark measures.
   */
  static void *latency_writer(void *_) {
  	(void)_; uint64_t v=1, next=mono_ns();
  	for (int i=0; i<LAT_N; i++) {
  		while (mono_ns()<next); next+=WINT;
  		wts[i]=mono_ns();
  		int r; do { r=write(latency_fd,&v,8); } while (r<0 && errno==EINTR);
  	}
  	return NULL;
  }
  /*
   * Reader for bench 2: waits for each event with poll(), records the
   * read timestamp, then busy-waits RSLT ns (~8 K events/s) to emulate
   * a consumer slightly slower than the 10 K/s producer, so a backlog
   * builds without flow control.
   */
  static void *latency_reader(void *_) {
  	(void)_; struct pollfd p={.fd=latency_fd,.events=POLLIN}; uint64_t v;
  	for (int i=0; i<LAT_N; i++) {
  		while (poll(&p,1,200)<=0 || !(p.revents&POLLIN));
  		(void)read(latency_fd,&v,8); rts[i]=mono_ns();
  		uint64_t w=mono_ns()+RSLT; while (mono_ns()<w);
  	}
  	return NULL;
  }
  /* qsort comparator for uint64_t, ascending; avoids subtraction overflow. */
  static int cmp64(const void *a, const void *b) {
  	const uint64_t lhs = *(const uint64_t *)a;
  	const uint64_t rhs = *(const uint64_t *)b;

  	if (lhs < rhs)
  		return -1;
  	return lhs > rhs ? 1 : 0;
  }
  /*
   * Run one bench-2 configuration: semaphore-mode eventfd, paced writer
   * vs slower reader, then sort the per-event one-way latencies and
   * print p99/p999/max in microseconds.  max == 0 keeps the kernel
   * default (no limit).
   * NOTE(review): eventfd(2) return value is unchecked — acceptable for
   * a throwaway benchmark, but worth confirming.
   */
  static void run_latency(uint64_t max) {
  	latency_fd=eventfd(0, EFD_CLOEXEC|EFD_SEMAPHORE); set_max(latency_fd, max);
  	pthread_t w,r;
  	pthread_create(&r,NULL,latency_reader,NULL); pthread_create(&w,NULL,latency_writer,NULL);
  	pthread_join(w,NULL); pthread_join(r,NULL); close(latency_fd);
  	uint64_t lat[LAT_N];
  	for (int i=0; i<LAT_N; i++) lat[i]=rts[i]-wts[i];
  	qsort(lat,LAT_N,sizeof(lat[0]),cmp64);
  	char mb[24]; maxstr(mb, max);
  	printf("  %-12s  %10.0f  %10.0f  %10.0f\n", mb,
  	       lat[LAT_N*99/100]/1000.0, lat[LAT_N*999/1000]/1000.0,
  	       lat[LAT_N-1]/1000.0);
  }

  /* bench 3: coalescing (non-EFD_SEMAPHORE) */
  static int coal_fd;
  static uint64_t coal_reads;	/* number of read(2) calls it took to drain COAL_N signals */

  /* Writer thread for bench 3: COAL_N back-to-back write(1) calls,
   * retrying on EINTR.  With no maximum these coalesce into the counter
   * faster than the reader drains them. */
  static void *coal_writer(void *_) {
  	(void)_;
  	uint64_t one = 1;

  	for (uint64_t done = 0; done < COAL_N; done++) {
  		int rc;

  		do {
  			rc = write(coal_fd, &one, 8);
  		} while (rc < 0 && errno == EINTR);
  	}
  	return NULL;
  }
  /*
   * Reader thread for bench 3: each read(2) returns the whole
   * accumulated batch in 'v' (non-semaphore mode drains the counter).
   * It busy-waits RSLT ns after each read to simulate processing, and
   * stops once all COAL_N signals are accounted for, publishing the
   * number of reads needed (writes/reads = average batch size).
   */
  static void *coal_reader(void *_) {
  	(void)_; uint64_t v, nr=0, tot=0;
  	while (tot < COAL_N) {
  		(void)read(coal_fd,&v,8); nr++; tot+=v;
  		uint64_t w=mono_ns()+RSLT; while(mono_ns()<w);
  	}
  	coal_reads=nr; return NULL;
  }
  /*
   * Run one bench-3 configuration: plain (counter) eventfd, fast writer
   * vs 125 µs/read reader, then print total writes, reads and average
   * batch size per read.  max == 0 keeps the kernel default (no limit).
   * NOTE(review): eventfd(2) return value is unchecked — acceptable for
   * a throwaway benchmark, but worth confirming.
   */
  static void run_coalesce(uint64_t max) {
  	coal_fd=eventfd(0, EFD_CLOEXEC); set_max(coal_fd, max);
  	pthread_t w,r;
  	pthread_create(&r,NULL,coal_reader,NULL); pthread_create(&w,NULL,coal_writer,NULL);
  	pthread_join(w,NULL); pthread_join(r,NULL); close(coal_fd);
  	char mb[24]; maxstr(mb, max);
  	printf("  %-12s  %10llu  %8llu  %10.1f\n", mb,
  	       (unsigned long long)COAL_N,
  	       (unsigned long long)coal_reads,
  	       (double)COAL_N/coal_reads);
  }

  /* main: run the three benchmark tables in sequence. */
  int main(void) {
  	printf("\nBench 1 – burst/CPU  (writer vs reader, %ds)\n", SECS);
  	printf("  %-22s  %-12s  %8s  %8s  %10s  %10s  %8s\n",
  	       "writer_mode","maximum","wcpu_ms","rcpu_ms","EAGAIN","writes","reads");
  	printf("  -----------------------------------------------------------------------\n");
  	run_burst("blocking, no limit", BLOCKING, 0);
  	run_burst("blocking",          BLOCKING, MAX);
  	run_burst("O_NONBLOCK+spin",   SPIN,     MAX);
  	run_burst("O_NONBLOCK+poll",   POLL_OUT, MAX);

  	printf("\nBench 2 – latency tail"
  	       "  (EFD_SEMAPHORE, 10K/s writer, ~8K/s reader, %d events)\n",
  	       LAT_N);
  	printf("  %-12s  %10s  %10s  %10s\n",
  	       "maximum","p99_us","p999_us","max_us");
  	printf("  -------------------------------------------\n");
  	/* 0 selects the kernel default (no limit); see set_max() */
  	static const uint64_t mv[]={0,100,10};
  	/* derive the bound from the table so adding a value can't be missed */
  	for (size_t i=0; i<sizeof(mv)/sizeof(mv[0]); i++) run_latency(mv[i]);

  	printf("\nBench 3 – coalescing"
  	       "  (non-EFD_SEMAPHORE, %llu writes, 125us/read reader)\n",
  	       (unsigned long long)COAL_N);
  	printf("  %-12s  %10s  %8s  %10s\n",
  	       "maximum","writes","reads","avg_batch");
  	printf("  -----------------------------------------------\n");
  	static const uint64_t cv[]={0,100,10};
  	for (size_t i=0; i<sizeof(cv)/sizeof(cv[0]); i++) run_coalesce(cv[i]);
  	return 0;
  }

On a 4-core x86_64 (writer and reader pinned to separate CPUs,
reader sleeps 1 ms between reads to simulate processing time):

  Bench 1 – burst/CPU  (writer vs reader, 5s)
  writer_mode             maximum       wcpu_ms  rcpu_ms      EAGAIN      writes    reads
  -------------------------------------------------------------------------
  blocking, no limit      ULLONG_MAX       5002      132           0     6517388     4506
  blocking                10                133      150           0       40456     4496
  O_NONBLOCK+spin         10               4999      126     5789340       40568     4508
  O_NONBLOCK+poll         10                189      151           0       40519     4503

  Bench 2 – latency tail  (EFD_SEMAPHORE, 10K/s writer, ~8K/s reader, 5000 events)
  maximum        p99_us    p999_us     max_us
  -------------------------------------------
  ULLONG_MAX     141218     142477     142588
  100             13298      13320      13334
  10               1719       2378       2381

  Bench 3 – coalescing  (non-EFD_SEMAPHORE, 10000 writes, 125us/read reader)
  maximum           writes    reads   avg_batch
  -----------------------------------------------
  ULLONG_MAX         10000       79      126.6
  100                10000      105       95.2
  10                 10000     1121        8.9

With maximum=10: burst CPU drops >97% (5002 ms → 133 ms); latency p999
drops ~60x (142 ms → 2.4 ms); the coalescing batch size is bounded to 9
(vs an average of ~127 with no limit), so the consumer always knows the
backlog is small. O_NONBLOCK+spin bypasses flow control entirely — use
poll(POLLOUT)+write to get the same benefit as a blocking write while
still multiplexing other fds in a single poll(2) call.

Signed-off-by: Wen Yang <wen.yang@linux.dev>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
 .../userspace-api/ioctl/ioctl-number.rst      |  1 +
 fs/eventfd.c                                  | 74 ++++++++++++++++---
 include/uapi/linux/eventfd.h                  |  6 ++
 3 files changed, 69 insertions(+), 12 deletions(-)

diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
index 331223761fff..d233559179b1 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -170,6 +170,7 @@ Code  Seq#    Include File                                             Comments
 'I'   all    linux/isdn.h                                              conflict!
 'I'   00-0F  drivers/isdn/divert/isdn_divert.h                         conflict!
 'I'   40-4F  linux/mISDNif.h                                           conflict!
+'J'   00-01  linux/eventfd.h                                           eventfd ioctl
 'K'   all    linux/kd.h
 'L'   00-1F  linux/loop.h                                              conflict!
 'L'   10-1F  drivers/scsi/mpt3sas/mpt3sas_ctl.h                        conflict!
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 3219e0d596fe..11985d07e904 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -39,6 +39,7 @@ struct eventfd_ctx {
 	 * also, adds to the "count" counter and issue a wakeup.
 	 */
 	__u64 count;
+	__u64 maximum;
 	unsigned int flags;
 	int id;
 };
@@ -49,9 +50,9 @@ struct eventfd_ctx {
  * @mask: [in] poll mask
  *
  * This function is supposed to be called by the kernel in paths that do not
- * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
- * value, and we signal this as overflow condition by returning a EPOLLERR
- * to poll(2).
+ * allow sleeping. In this function we allow the counter to reach the maximum
+ * value (ctx->maximum), and we signal this as overflow condition by returning
+ * a EPOLLERR to poll(2).
  */
 void eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask)
 {
@@ -70,7 +71,7 @@ void eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask)
 
 	spin_lock_irqsave(&ctx->wqh.lock, flags);
 	current->in_eventfd = 1;
-	if (ctx->count < ULLONG_MAX)
+	if (ctx->count < ctx->maximum)
 		ctx->count++;
 	if (waitqueue_active(&ctx->wqh))
 		wake_up_locked_poll(&ctx->wqh, EPOLLIN | mask);
@@ -119,7 +120,7 @@ static __poll_t eventfd_poll(struct file *file, poll_table *wait)
 {
 	struct eventfd_ctx *ctx = file->private_data;
 	__poll_t events = 0;
-	u64 count;
+	u64 count, max;
 
 	poll_wait(file, &ctx->wqh, wait);
 
@@ -162,12 +163,13 @@ static __poll_t eventfd_poll(struct file *file, poll_table *wait)
 	 *     eventfd_poll returns 0
 	 */
 	count = READ_ONCE(ctx->count);
+	max = READ_ONCE(ctx->maximum);
 
 	if (count > 0)
 		events |= EPOLLIN;
-	if (count == ULLONG_MAX)
+	if (count == max)
 		events |= EPOLLERR;
-	if (ULLONG_MAX - 1 > count)
+	if (max - 1 > count)
 		events |= EPOLLOUT;
 
 	return events;
@@ -244,6 +246,11 @@ static ssize_t eventfd_read(struct kiocb *iocb, struct iov_iter *to)
 	return sizeof(ucnt);
 }
 
+static inline bool eventfd_is_writable(struct eventfd_ctx *ctx, __u64 cnt)
+{
+	return ctx->maximum > ctx->count && ctx->maximum - ctx->count > cnt;
+}
+
 static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
 			     loff_t *ppos)
 {
@@ -259,11 +266,11 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
 		return -EINVAL;
 	spin_lock_irq(&ctx->wqh.lock);
 	res = -EAGAIN;
-	if (ULLONG_MAX - ctx->count > ucnt)
+	if (eventfd_is_writable(ctx, ucnt))
 		res = sizeof(ucnt);
 	else if (!(file->f_flags & O_NONBLOCK)) {
 		res = wait_event_interruptible_locked_irq(ctx->wqh,
-				ULLONG_MAX - ctx->count > ucnt);
+				eventfd_is_writable(ctx, ucnt));
 		if (!res)
 			res = sizeof(ucnt);
 	}
@@ -283,22 +290,62 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
 static void eventfd_show_fdinfo(struct seq_file *m, struct file *f)
 {
 	struct eventfd_ctx *ctx = f->private_data;
-	__u64 cnt;
+	__u64 cnt, max;
 
 	spin_lock_irq(&ctx->wqh.lock);
 	cnt = ctx->count;
+	max = ctx->maximum;
 	spin_unlock_irq(&ctx->wqh.lock);
 
 	seq_printf(m,
 		   "eventfd-count: %16llx\n"
 		   "eventfd-id: %d\n"
-		   "eventfd-semaphore: %d\n",
+		   "eventfd-semaphore: %d\n"
+		   "eventfd-maximum: %16llx\n",
 		   cnt,
 		   ctx->id,
-		   !!(ctx->flags & EFD_SEMAPHORE));
+		   !!(ctx->flags & EFD_SEMAPHORE),
+		   max);
 }
 #endif
 
+static long eventfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct eventfd_ctx *ctx = file->private_data;
+	void __user *argp = (void __user *)arg;
+	__u64 max;
+	int ret;
+
+	switch (cmd) {
+	case EFD_IOC_SET_MAXIMUM:
+		if (copy_from_user(&max, argp, sizeof(max)))
+			return -EFAULT;
+
+		spin_lock_irq(&ctx->wqh.lock);
+		if (ctx->count >= max) {
+			ret = -EINVAL;
+		} else {
+			ctx->maximum = max;
+			ret = 0;
+			/* wake blocked writers that may now fit within the new maximum */
+			if (waitqueue_active(&ctx->wqh))
+				wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
+		}
+		spin_unlock_irq(&ctx->wqh.lock);
+		return ret;
+
+	case EFD_IOC_GET_MAXIMUM:
+		spin_lock_irq(&ctx->wqh.lock);
+		max = ctx->maximum;
+		spin_unlock_irq(&ctx->wqh.lock);
+
+		return copy_to_user(argp, &max, sizeof(max)) ? -EFAULT : 0;
+
+	default:
+		return -ENOTTY;
+	}
+}
+
 static const struct file_operations eventfd_fops = {
 #ifdef CONFIG_PROC_FS
 	.show_fdinfo	= eventfd_show_fdinfo,
@@ -307,6 +354,8 @@ static const struct file_operations eventfd_fops = {
 	.poll		= eventfd_poll,
 	.read_iter	= eventfd_read,
 	.write		= eventfd_write,
+	.unlocked_ioctl	= eventfd_ioctl,
+	.compat_ioctl	= compat_ptr_ioctl,
 	.llseek		= noop_llseek,
 };
 
@@ -395,6 +444,7 @@ static int do_eventfd(unsigned int count, int flags)
 	kref_init(&ctx->kref);
 	init_waitqueue_head(&ctx->wqh);
 	ctx->count = count;
+	ctx->maximum = ULLONG_MAX;
 	ctx->flags = flags;
 
 	flags &= EFD_SHARED_FCNTL_FLAGS;
diff --git a/include/uapi/linux/eventfd.h b/include/uapi/linux/eventfd.h
index 2eb9ab6c32f3..ba46b746f597 100644
--- a/include/uapi/linux/eventfd.h
+++ b/include/uapi/linux/eventfd.h
@@ -3,9 +3,15 @@
 #define _UAPI_LINUX_EVENTFD_H
 
 #include <linux/fcntl.h>
+#include <linux/ioctl.h>
+#include <linux/types.h>
 
 #define EFD_SEMAPHORE (1 << 0)
 #define EFD_CLOEXEC O_CLOEXEC
 #define EFD_NONBLOCK O_NONBLOCK
 
+/* Flow-control ioctls: configure the per-fd counter maximum. */
+#define EFD_IOC_SET_MAXIMUM	_IOW('J', 0, __u64)
+#define EFD_IOC_GET_MAXIMUM	_IOR('J', 1, __u64)
+
 #endif /* _UAPI_LINUX_EVENTFD_H */
-- 
2.25.1


  reply	other threads:[~2026-04-08 17:25 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-08 17:24 [RFC PATCH v5 0/2] eventfd: add configurable maximum counter value for flow control wen.yang
2026-04-08 17:24 ` wen.yang [this message]
2026-04-08 17:24 ` [RFC PATCH v5 2/2] selftests/eventfd: add EFD_IOC_{SET,GET}_MAXIMUM tests wen.yang

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=530e8b5e22e08f8459d335eaf31ff78b999fa5cf.1775668339.git.wen.yang@linux.dev \
    --to=wen.yang@linux.dev \
    --cc=axboe@kernel.dk \
    --cc=brauner@kernel.org \
    --cc=jack@suse.cz \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=viro@zeniv.linux.org.uk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.