[PATCH] epoll more scalable than poll

linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH] epoll more scalable than poll
@ 2002-10-28 19:14 Hanna Linder
  2002-10-28 20:10 ` Hanna Linder
                   ` (2 more replies)
  0 siblings, 3 replies; 22+ messages in thread
From: Hanna Linder @ 2002-10-28 19:14 UTC (permalink / raw)
  To: torvalds; +Cc: linux-kernel, davidel, linux-aio, lse-tech

--On Friday, October 18, 2002 15:05:50 -0700 Linus Torvalds <torvalds@transmeta.com> wrote:

> I like it noticeably better as a system call, so it's maybe worth 
> discussing. It's not going to happen before I leave (very early tomorrow 
> morning), but if people involved agree on this and clean patches to 
> actiually add the code (not just system call stubs) can be made..
> 
> 		Linus

Linus,

	The results of our testing show not only does the system call 
interface to epoll perform as well as the /dev interface but also that epoll 
is many times better than standard poll. No other implementations of poll 
have performed as well as epoll in any measure. Testing details and results 
are published here, please take a minute to check it out: http://lse.sourceforge.net/epoll/index.html
	Davide Libenzi finished the system call interface to epoll including
the changes Andrew Morton requested early last week. See that thread here: http://marc.theaimsgroup.com/?t=103516170500003&r=1&w=2
	Please consider sys_epoll for inclusion in the next 2.5 kernel release.
The results clearly show epoll is the most scalable of all the existing poll implementations and the impact on existing code is minimal. 

Thank you.

Hanna 

ps- Did I mention there is a web site? http://lse.sf.net/epoll/index.html

-----
diff -Nru linux-2.5.44.vanilla/arch/i386/kernel/entry.S linux-2.5.44.epoll/arch/i386/kernel/entry.S
--- linux-2.5.44.vanilla/arch/i386/kernel/entry.S	Fri Oct 18 21:01:19 2002
+++ linux-2.5.44.epoll/arch/i386/kernel/entry.S	Sat Oct 19 21:16:19 2002
@@ -737,6 +737,10 @@
 	.long sys_free_hugepages
 	.long sys_exit_group
 	.long sys_lookup_dcookie
+	.long sys_epoll_create
+	.long sys_epoll_ctl	/* 255 */
+	.long sys_epoll_wait
+
 
 	.rept NR_syscalls-(.-sys_call_table)/4
 		.long sys_ni_syscall
diff -Nru linux-2.5.44.vanilla/drivers/char/Makefile linux-2.5.44.epoll/drivers/char/Makefile
--- linux-2.5.44.vanilla/drivers/char/Makefile	Fri Oct 18 21:02:32 2002
+++ linux-2.5.44.epoll/drivers/char/Makefile	Tue Oct 22 10:08:40 2002
@@ -7,14 +7,14 @@
 #
 FONTMAPFILE = cp437.uni
 
-obj-y	 += mem.o tty_io.o n_tty.o tty_ioctl.o pty.o misc.o random.o
+obj-y	 += mem.o tty_io.o n_tty.o tty_ioctl.o pty.o misc.o random.o eventpoll.o
 
 # All of the (potential) objects that export symbols.
 # This list comes from 'grep -l EXPORT_SYMBOL *.[hc]'.
 
 export-objs     :=	busmouse.o vt.o generic_serial.o ip2main.o \
 			ite_gpio.o keyboard.o misc.o nvram.o random.o rtc.o \
-			selection.o sonypi.o sysrq.o tty_io.o tty_ioctl.o
+			selection.o sonypi.o sysrq.o tty_io.o tty_ioctl.o eventpoll.o
 
 obj-$(CONFIG_VT) += vt_ioctl.o vc_screen.o consolemap.o consolemap_deftbl.o selection.o keyboard.o
 obj-$(CONFIG_HW_CONSOLE) += vt.o defkeymap.o
diff -Nru linux-2.5.44.vanilla/drivers/char/eventpoll.c linux-2.5.44.epoll/drivers/char/eventpoll.c
--- linux-2.5.44.vanilla/drivers/char/eventpoll.c	Wed Dec 31 16:00:00 1969
+++ linux-2.5.44.epoll/drivers/char/eventpoll.c	Sun Oct 27 15:23:47 2002
@@ -0,0 +1,1136 @@
+/*
+ *  drivers/char/eventpoll.c ( Efficient event polling implementation )
+ *  Copyright (C) 2001,...,2002  Davide Libenzi
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  Davide Libenzi <davidel@xmailserver.org>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/signal.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/poll.h>
+#include <linux/miscdevice.h>
+#include <linux/random.h>
+#include <linux/smp_lock.h>
+#include <linux/wrapper.h>
+#include <linux/string.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/wait.h>
+#include <linux/fcblist.h>
+#include <asm/bitops.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/mman.h>
+#include <asm/atomic.h>
+#include <linux/eventpoll.h>
+
+
+
+#define EVENTPOLLFS_MAGIC 0x03111965 /* My birthday should work for this :) */
+
+#define DEBUG_EPOLL 0
+
+#if DEBUG_EPOLL > 0
+#define DPRINTK(x) printk x
+#define DNPRINTK(n, x) do { if ((n) <= DEBUG_EPOLL) printk x; } while (0)
+#else /* #if DEBUG_EPOLL > 0 */
+#define DPRINTK(x) (void) 0
+#define DNPRINTK(n, x) (void) 0
+#endif /* #if DEBUG_EPOLL > 0 */
+
+#define DEBUG_DPI 0
+
+#if DEBUG_DPI != 0
+#define DPI_SLAB_DEBUG (SLAB_DEBUG_FREE | SLAB_RED_ZONE /* | SLAB_POISON */)
+#else /* #if DEBUG_DPI != 0 */
+#define DPI_SLAB_DEBUG 0
+#endif /* #if DEBUG_DPI != 0 */
+
+#define INITIAL_HASH_BITS 7
+#define MAX_HASH_BITS 18
+#define RESIZE_LENGTH 2
+
+#define DPI_MEM_ALLOC()	(struct epitem *) kmem_cache_alloc(dpi_cache, SLAB_KERNEL)
+#define DPI_MEM_FREE(p) kmem_cache_free(dpi_cache, p)
+#define IS_FILE_EPOLL(f) ((f)->f_op == &eventpoll_fops)
+
+
+
+typedef unsigned long long event_version_t;
+
+struct eventpoll {
+	rwlock_t lock;
+	wait_queue_head_t wq;
+	wait_queue_head_t poll_wait;
+	struct list_head *hash;
+	unsigned int hbits;
+	unsigned int hmask;
+	atomic_t hents;
+	atomic_t resize;
+	int numpages;
+	char **pages;
+	char *pages0[MAX_EVENTPOLL_PAGES];
+	char *pages1[MAX_EVENTPOLL_PAGES];
+	unsigned long vmabase;
+	atomic_t mmapped;
+	int eventcnt;
+	event_version_t ver;
+};
+
+struct epitem {
+	struct list_head llink;
+	struct eventpoll *ep;
+	struct file *file;
+	struct pollfd pfd;
+	int index;
+	event_version_t ver;
+};
+
+
+
+
+static int ep_getfd(int *efd, struct inode **einode, struct file **efile);
+static int ep_alloc_pages(char **pages, int numpages);
+static int ep_free_pages(char **pages, int numpages);
+static int ep_init(struct eventpoll *ep);
+static void ep_free(struct eventpoll *ep);
+static struct epitem *ep_find_nl(struct eventpoll *ep, int fd);
+static struct epitem *ep_find(struct eventpoll *ep, int fd);
+static int ep_hashresize(struct eventpoll *ep, unsigned long *kflags);
+static int ep_insert(struct eventpoll *ep, struct pollfd *pfd);
+static int ep_remove(struct eventpoll *ep, struct epitem *dpi);
+static void notify_proc(struct file *file, void *data, unsigned long *local, long *event);
+static int open_eventpoll(struct inode *inode, struct file *file);
+static int close_eventpoll(struct inode *inode, struct file *file);
+static unsigned int poll_eventpoll(struct file *file, poll_table *wait);
+static int write_eventpoll(struct file *file, const char *buffer, size_t count,
+						   loff_t *ppos);
+static int ep_poll(struct eventpoll *ep, struct evpoll *dvp);
+static int ep_do_alloc_pages(struct eventpoll *ep, int numpages);
+static int ioctl_eventpoll(struct inode *inode, struct file *file,
+						   unsigned int cmd, unsigned long arg);
+static void eventpoll_mm_open(struct vm_area_struct * vma);
+static void eventpoll_mm_close(struct vm_area_struct * vma);
+static int mmap_eventpoll(struct file *file, struct vm_area_struct *vma);
+static int eventpollfs_delete_dentry(struct dentry *dentry);
+static struct inode *get_eventpoll_inode(void);
+static struct super_block *eventpollfs_get_sb(struct file_system_type *fs_type,
+											  int flags, char *dev_name, void *data);
+
+
+
+
+static kmem_cache_t *dpi_cache;
+static struct vfsmount *eventpoll_mnt;
+
+static struct file_operations eventpoll_fops = {
+	.write		= write_eventpoll,
+	.ioctl		= ioctl_eventpoll,
+	.mmap		= mmap_eventpoll,
+	.open		= open_eventpoll,
+	.release	= close_eventpoll,
+	.poll		= poll_eventpoll
+};
+
+static struct vm_operations_struct eventpoll_mmap_ops = {
+	.open		= eventpoll_mm_open,
+	.close		= eventpoll_mm_close,
+};
+
+static struct miscdevice eventpoll_miscdev = {
+	EVENTPOLL_MINOR, "eventpoll", &eventpoll_fops
+};
+
+static struct file_system_type eventpoll_fs_type = {
+	.name		= "eventpollfs",
+	.get_sb		= eventpollfs_get_sb,
+	.kill_sb	= kill_anon_super,
+};
+
+static struct dentry_operations eventpollfs_dentry_operations = {
+	.d_delete	= eventpollfs_delete_dentry,
+};
+
+
+/* Create an eventpoll file sized for maxfds descriptors and map its event pages read-only; returns the new fd or -errno. */
+asmlinkage int sys_epoll_create(int maxfds)
+{
+	int error = -EINVAL, fd;
+	unsigned long addr;
+	struct inode *inode;
+	struct file *file;
+	struct eventpoll *ep;
+
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
+				 current, maxfds));
+
+	if (maxfds < 0 || maxfds > MAX_FDS_IN_EVENTPOLL)	/* a negative count must not reach the page-size macros */
+		goto eexit_1;
+	error = ep_getfd(&fd, &inode, &file);	/* on success the fd is already installed */
+	if (error)
+		goto eexit_1;
+	error = open_eventpoll(inode, file);	/* allocates the eventpoll and stores it in file->private_data */
+	if (error)
+		goto eexit_2;
+	ep = file->private_data;
+	error = ep_do_alloc_pages(ep, EP_FDS_PAGES(maxfds + 1));
+	if (error)
+		goto eexit_2;
+	down_write(&current->mm->mmap_sem);
+	addr = do_mmap_pgoff(file, 0, EP_MAP_SIZE(maxfds + 1), PROT_READ,
+						 MAP_PRIVATE, 0);
+	up_write(&current->mm->mmap_sem);
+	error = PTR_ERR((void *) addr);	/* only meaningful when IS_ERR() holds below */
+	if (IS_ERR((void *) addr))
+		goto eexit_2;
+
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
+				 current, maxfds, fd));
+
+	return fd;
+
+eexit_2:
+	sys_close(fd);	/* the fd was installed by ep_getfd(), so undo through the normal close path */
+eexit_1:
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
+				 current, maxfds, error));
+	return error;
+}
+EXPORT_SYMBOL(sys_epoll_create);
+
+
+asmlinkage int sys_epoll_ctl(int epfd, int op, int fd, unsigned int events)
+{
+	int error = -EBADF;
+	struct file *file;
+	struct eventpoll *ep;
+	struct epitem *dpi;
+	struct pollfd pfd;
+
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %u)\n",
+				 current, epfd, op, fd, events));
+
+	file = fget(epfd);
+	if (!file)
+		goto eexit_1;
+	error = -EINVAL;
+	if (!IS_FILE_EPOLL(file))
+		goto eexit_2;
+
+	ep = file->private_data;
+
+	pfd.fd = fd;
+	pfd.events = events;
+	pfd.revents = 0;
+
+	dpi = ep_find(ep, fd);
+
+	error = -EINVAL;
+	switch (op) {
+	case EP_CTL_ADD:
+		if (!dpi)
+			error = ep_insert(ep, &pfd);
+		else
+			error = -EEXIST;
+		break;
+	case EP_CTL_DEL:
+		if (dpi)
+			error = ep_remove(ep, dpi);
+		else
+			error = -ENOENT;
+		break;
+	case EP_CTL_MOD:
+		if (dpi) {
+			dpi->pfd.events = events;
+			error = 0;
+		} else
+			error = -ENOENT;
+		break;
+	}
+
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %u) = %d\n",
+				 current, epfd, op, fd, events, error));
+
+eexit_2:
+	fput(file);
+eexit_1:
+	return error;
+}
+EXPORT_SYMBOL(sys_epoll_ctl);
+
+
+asmlinkage int sys_epoll_wait(int epfd, struct pollfd const **events, int timeout)
+{
+	int error = -EBADF;
+	void *eaddr;
+	struct file *file;
+	struct eventpoll *ep;
+	struct evpoll dvp;
+
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d)\n",
+				 current, epfd, events, timeout));
+
+	file = fget(epfd);
+	if (!file)
+		goto eexit_1;
+	error = -EINVAL;
+	if (!IS_FILE_EPOLL(file))
+		goto eexit_2;
+
+	ep = file->private_data;
+
+	error = -EINVAL;
+	if (!atomic_read(&ep->mmapped))
+		goto eexit_2;
+
+	dvp.ep_timeout = timeout;
+	error = ep_poll(ep, &dvp);
+	if (error > 0) {
+		eaddr = (void *) (ep->vmabase + dvp.ep_resoff);
+		if (copy_to_user(events, &eaddr, sizeof(struct pollfd *)))
+			error = -EFAULT;
+	}
+
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d) = %d\n",
+				 current, epfd, events, timeout, error));
+
+eexit_2:
+	fput(file);
+eexit_1:
+	return error;
+}
+EXPORT_SYMBOL(sys_epoll_wait);
+
+/* Allocate a file, eventpollfs inode, dentry and fd, wire them together and install the fd; returns 0 or -errno. */
+static int ep_getfd(int *efd, struct inode **einode, struct file **efile)
+{
+	struct qstr this;
+	char name[32];
+	struct dentry *dentry;
+	struct inode *inode;
+	struct file *file;
+	int error, fd;
+
+	error = -ENFILE;
+	file = get_empty_filp();
+	if (!file)
+		goto eexit_1;
+
+	inode = get_eventpoll_inode();
+	error = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		goto eexit_2;
+
+	error = get_unused_fd();
+	if (error < 0)
+		goto eexit_3;
+	fd = error;
+
+	error = -ENOMEM;
+	sprintf(name, "[%lu]", inode->i_ino);	/* fits: at most 20 digits plus brackets and NUL in 32 bytes */
+	this.name = name;
+	this.len = strlen(name);
+	this.hash = inode->i_ino;
+	dentry = d_alloc(eventpoll_mnt->mnt_sb->s_root, &this);
+	if (!dentry)
+		goto eexit_4;
+	dentry->d_op = &eventpollfs_dentry_operations;
+	d_add(dentry, inode);
+	file->f_vfsmnt = mntget(eventpoll_mnt);	/* one file, so exactly one mount reference (two were taken and one leaked) */
+	file->f_dentry = dentry;	/* the file takes over the d_alloc() reference; an extra dget() was never dropped */
+
+	file->f_pos = 0;
+	file->f_flags = O_RDWR;
+	file->f_op = &eventpoll_fops;
+	file->f_mode = FMODE_READ | FMODE_WRITE;
+	file->f_version = 0;
+	file->private_data = NULL;	/* filled in later by open_eventpoll() */
+
+	fd_install(fd, file);
+
+	*efd = fd;
+	*einode = inode;
+	*efile = file;
+	return 0;
+
+eexit_4:
+	put_unused_fd(fd);
+eexit_3:
+	iput(inode);
+eexit_2:
+	put_filp(file);
+eexit_1:
+	return error;
+}
+
+
+static int ep_alloc_pages(char **pages, int numpages)
+{
+	int ii;
+
+	for (ii = 0; ii < numpages; ii++) {
+		pages[ii] = (char *) __get_free_pages(GFP_KERNEL, 0);
+		if (!pages[ii]) {
+			for (--ii; ii >= 0; ii--) {
+				ClearPageReserved(virt_to_page(pages[ii]));
+				free_pages((unsigned long) pages[ii], 0);
+			}
+			return -ENOMEM;
+		}
+		SetPageReserved(virt_to_page(pages[ii]));
+	}
+	return 0;
+}
+
+
+static int ep_free_pages(char **pages, int numpages)
+{
+	int ii;
+
+	for (ii = 0; ii < numpages; ii++) {
+		ClearPageReserved(virt_to_page(pages[ii]));
+		free_pages((unsigned long) pages[ii], 0);
+	}
+	return 0;
+}
+
+
+static int ep_init(struct eventpoll *ep)
+{
+	int ii, hentries;
+
+	rwlock_init(&ep->lock);
+	init_waitqueue_head(&ep->wq);
+	init_waitqueue_head(&ep->poll_wait);
+	ep->hbits = INITIAL_HASH_BITS;
+	ep->hmask = (1 << ep->hbits) - 1;
+	atomic_set(&ep->hents, 0);
+	atomic_set(&ep->resize, 0);
+	atomic_set(&ep->mmapped, 0);
+	ep->numpages = 0;
+	ep->vmabase = 0;
+	ep->pages = ep->pages0;
+	ep->eventcnt = 0;
+	ep->ver = 1;
+
+	hentries = ep->hmask + 1;
+	if (!(ep->hash = (struct list_head *) vmalloc(hentries * sizeof(struct list_head))))
+		return -ENOMEM;
+
+	for (ii = 0; ii < hentries; ii++)
+		INIT_LIST_HEAD(&ep->hash[ii]);
+
+	return 0;
+}
+
+
+static void ep_free(struct eventpoll *ep)
+{
+	int ii;
+	struct list_head *lnk;
+
+	for (ii = 0; ii <= ep->hmask; ii++) {
+		while ((lnk = list_first(&ep->hash[ii]))) {
+			struct epitem *dpi = list_entry(lnk, struct epitem, llink);
+
+			file_notify_delcb(dpi->file, notify_proc);
+			list_del(lnk);
+			DPI_MEM_FREE(dpi);
+		}
+	}
+	vfree(ep->hash);
+	if (ep->numpages > 0) {
+		ep_free_pages(ep->pages0, ep->numpages);
+		ep_free_pages(ep->pages1, ep->numpages);
+	}
+}
+
+
+static struct epitem *ep_find_nl(struct eventpoll *ep, int fd)
+{
+	struct epitem *dpi = NULL;
+	struct list_head *lsthead, *lnk;
+
+	lsthead = &ep->hash[fd & ep->hmask];
+	list_for_each(lnk, lsthead) {
+		dpi = list_entry(lnk, struct epitem, llink);
+
+		if (dpi->pfd.fd == fd) break;
+		dpi = NULL;
+	}
+
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_find(%d) -> %p\n", current, fd, dpi));
+
+	return dpi;
+}
+
+
+static struct epitem *ep_find(struct eventpoll *ep, int fd)
+{
+	struct epitem *dpi;
+	unsigned long flags;
+
+	read_lock_irqsave(&ep->lock, flags);
+
+	dpi = ep_find_nl(ep, fd);
+
+	read_unlock_irqrestore(&ep->lock, flags);
+
+	return dpi;
+}
+
+
+static int ep_hashresize(struct eventpoll *ep, unsigned long *kflags)
+{
+	struct list_head *hash, *oldhash;
+	unsigned int hbits = ep->hbits + 1;
+	unsigned int hmask = (1 << hbits) - 1;
+	int ii, res, hentries = hmask + 1;
+	unsigned long flags = *kflags;
+
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_hashresize(%p) bits=%u\n", current, ep, hbits));
+
+	write_unlock_irqrestore(&ep->lock, flags);
+
+	res = -ENOMEM;
+	if (!(hash = (struct list_head *) vmalloc(hentries * sizeof(struct list_head)))) {
+		write_lock_irqsave(&ep->lock, flags);
+		goto eexit_1;
+	}
+
+	for (ii = 0; ii < hentries; ii++)
+		INIT_LIST_HEAD(&hash[ii]);
+
+	write_lock_irqsave(&ep->lock, flags);
+
+	oldhash = ep->hash;
+	for (ii = 0; ii <= ep->hmask; ii++) {
+		struct list_head *oldhead = &oldhash[ii], *lnk;
+
+		while ((lnk = list_first(oldhead))) {
+			struct epitem *dpi = list_entry(lnk, struct epitem, llink);
+
+			list_del(lnk);
+			list_add(lnk, &hash[dpi->pfd.fd & hmask]);
+		}
+	}
+
+	ep->hash = hash;
+	ep->hbits = hbits;
+	ep->hmask = hmask;
+
+	write_unlock_irqrestore(&ep->lock, flags);
+	vfree(oldhash);
+	write_lock_irqsave(&ep->lock, flags);
+
+	res = 0;
+eexit_1:
+	*kflags = flags;
+	atomic_dec(&ep->resize);
+	return res;
+}
+
+/* Start watching pfd->fd on ep: allocate an epitem, link it into the hash and register notify_proc on the file; returns 0 or -errno. */
+static int ep_insert(struct eventpoll *ep, struct pollfd *pfd)
+{
+	int error;
+	struct epitem *dpi;
+	struct file *file;
+	unsigned long flags;
+
+	if (atomic_read(&ep->hents) >= (ep->numpages * POLLFD_X_PAGE))	/* item count is capped by the event-page capacity */
+		return -E2BIG;
+
+	file = fget(pfd->fd);
+	if (!file)
+		return -EBADF;
+
+	error = -ENOMEM;
+	if (!(dpi = DPI_MEM_ALLOC()))
+		goto eexit_1;
+
+	INIT_LIST_HEAD(&dpi->llink);
+	dpi->ep = ep;
+	dpi->file = file;
+	dpi->pfd = *pfd;
+	dpi->index = -1;
+	dpi->ver = ep->ver - 1;	/* differs from ep->ver, so notify_proc treats the item as not yet queued */
+
+	write_lock_irqsave(&ep->lock, flags);
+
+	list_add(&dpi->llink, &ep->hash[pfd->fd & ep->hmask]);
+	atomic_inc(&ep->hents);
+
+	if (!atomic_read(&ep->resize) &&
+		(atomic_read(&ep->hents) >> ep->hbits) > RESIZE_LENGTH &&
+		ep->hbits < MAX_HASH_BITS) {
+		atomic_inc(&ep->resize);	/* ep_hashresize() decrements this again before returning */
+		ep_hashresize(ep, &flags);	/* may drop and retake ep->lock; its result is ignored, the old table stays on failure */
+	}
+
+	write_unlock_irqrestore(&ep->lock, flags);
+
+	file_notify_addcb(file, notify_proc, dpi);	/* NOTE(review): can return -ENOMEM, which is ignored; the item then stays hashed but is never notified */
+
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %d)\n", current, ep, pfd->fd));
+
+	error = 0;
+eexit_1:
+	fput(file);	/* drops the fget() reference on every path; dpi->file keeps only the raw pointer */
+
+	return error;
+}
+
+
+static int ep_remove(struct eventpoll *ep, struct epitem *dpi)
+{
+	unsigned long flags;
+	struct pollfd *pfd, *lpfd;
+	struct epitem *ldpi;
+
+	file_notify_delcb(dpi->file, notify_proc);
+
+	write_lock_irqsave(&ep->lock, flags);
+
+	list_del(&dpi->llink);
+	atomic_dec(&ep->hents);
+
+	if (dpi->index >= 0 && dpi->ver == ep->ver && dpi->index < ep->eventcnt) {
+		pfd = (struct pollfd *) (ep->pages[EVENT_PAGE_INDEX(dpi->index)] +
+								 EVENT_PAGE_OFFSET(dpi->index));
+		if (pfd->fd == dpi->pfd.fd && dpi->index < --ep->eventcnt) {
+			lpfd = (struct pollfd *) (ep->pages[EVENT_PAGE_INDEX(ep->eventcnt)] +
+									  EVENT_PAGE_OFFSET(ep->eventcnt));
+			*pfd = *lpfd;
+
+			if ((ldpi = ep_find_nl(ep, pfd->fd))) ldpi->index = dpi->index;
+		}
+	}
+
+	write_unlock_irqrestore(&ep->lock, flags);
+
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %d)\n",
+				 current, ep, dpi->pfd.fd));
+
+	DPI_MEM_FREE(dpi);
+
+	return 0;
+}
+
+
+static void notify_proc(struct file *file, void *data, unsigned long *local, long *event)
+{
+	struct epitem *dpi = data;
+	struct eventpoll *ep = dpi->ep;
+	struct pollfd *pfd;
+
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: notify(%p, %p, %ld, %ld) ep=%p\n",
+				 current, file, data, event[0], event[1], ep));
+
+	write_lock(&ep->lock);
+	if (!(dpi->pfd.events & event[1]))
+		goto out;
+
+	if (dpi->index < 0 || dpi->ver != ep->ver) {
+		if (ep->eventcnt >= (ep->numpages * POLLFD_X_PAGE))
+			goto out;
+		dpi->index = ep->eventcnt++;
+		dpi->ver = ep->ver;
+		pfd = (struct pollfd *) (ep->pages[EVENT_PAGE_INDEX(dpi->index)] +
+								 EVENT_PAGE_OFFSET(dpi->index));
+		*pfd = dpi->pfd;
+	} else {
+		pfd = (struct pollfd *) (ep->pages[EVENT_PAGE_INDEX(dpi->index)] +
+								 EVENT_PAGE_OFFSET(dpi->index));
+		if (pfd->fd != dpi->pfd.fd) {
+			if (ep->eventcnt >= (ep->numpages * POLLFD_X_PAGE))
+				goto out;
+			dpi->index = ep->eventcnt++;
+			pfd = (struct pollfd *) (ep->pages[EVENT_PAGE_INDEX(dpi->index)] +
+									 EVENT_PAGE_OFFSET(dpi->index));
+			*pfd = dpi->pfd;
+		}
+	}
+
+	pfd->revents |= (pfd->events & event[1]);
+
+	if (waitqueue_active(&ep->wq))
+		wake_up(&ep->wq);
+	if (waitqueue_active(&ep->poll_wait))
+		wake_up(&ep->poll_wait);
+out:
+	write_unlock(&ep->lock);
+}
+
+
+static int open_eventpoll(struct inode *inode, struct file *file)
+{
+	int res;
+	struct eventpoll *ep;
+
+	if (!(ep = kmalloc(sizeof(struct eventpoll), GFP_KERNEL)))
+		return -ENOMEM;
+
+	memset(ep, 0, sizeof(*ep));
+	if ((res = ep_init(ep))) {
+		kfree(ep);
+		return res;
+	}
+
+	file->private_data = ep;
+
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: open() ep=%p\n", current, ep));
+	return 0;
+}
+
+
+static int close_eventpoll(struct inode *inode, struct file *file)
+{
+	struct eventpoll *ep = file->private_data;
+
+	if (ep) {
+		ep_free(ep);
+		kfree(ep);
+	}
+
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: close() ep=%p\n", current, ep));
+	return 0;
+}
+
+
+static unsigned int poll_eventpoll(struct file *file, poll_table *wait)
+{
+	struct eventpoll *ep = file->private_data;
+
+	poll_wait(file, &ep->poll_wait, wait);
+	if (ep->eventcnt)
+		return POLLIN | POLLRDNORM;
+
+	return 0;
+}
+
+/* write() handler: consumes an array of struct pollfd and adds, updates or (POLLREMOVE) removes each fd; returns bytes accepted or -errno. */
+static int write_eventpoll(struct file *file, const char *buffer, size_t count,
+						   loff_t *ppos)
+{
+	int rcount;
+	struct eventpoll *ep = file->private_data;
+	struct epitem *dpi;
+	struct pollfd pfd;
+
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: write(%p, %d)\n", current, ep, count));
+
+	rcount = -EINVAL;
+	if (count % sizeof(struct pollfd))	/* only whole pollfd records are accepted */
+		goto eexit_1;
+
+	if ((rcount = verify_area(VERIFY_READ, buffer, count)))
+		goto eexit_1;
+
+	rcount = 0;
+
+	while (count > 0) {
+		if (__copy_from_user(&pfd, buffer, sizeof(pfd))) {
+			rcount = -EFAULT;
+			goto eexit_1;
+		}
+
+		dpi = ep_find(ep, pfd.fd);
+
+		/* pfd.fd comes from user space: a negative value must not index files->fd[]; treat it like any other dead fd */
+		if (pfd.fd < 0 || pfd.fd >= current->files->max_fds || !current->files->fd[pfd.fd])
+			pfd.events = POLLREMOVE;
+		if (pfd.events & POLLREMOVE) {
+			if (dpi) {
+				ep_remove(ep, dpi);
+				rcount += sizeof(pfd);
+			}
+		}
+		else if (dpi) {
+			dpi->pfd.events = pfd.events;	/* already watched: only the event mask changes */
+			rcount += sizeof(pfd);
+		} else {
+			pfd.revents = 0;
+			if (!ep_insert(ep, &pfd))	/* a failed insert is skipped silently, not counted */
+				rcount += sizeof(pfd);
+		}
+
+		buffer += sizeof(pfd);
+		count -= sizeof(pfd);
+	}
+
+eexit_1:
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: write(%p, %d) = %d\n",
+				 current, ep, count, rcount));
+
+	return rcount;
+}
+
+
+static int ep_poll(struct eventpoll *ep, struct evpoll *dvp)
+{
+	int res = 0;
+	long timeout;
+	unsigned long flags;
+	wait_queue_t wait;
+
+	if (!atomic_read(&ep->mmapped))
+		return -EINVAL;
+
+	write_lock_irqsave(&ep->lock, flags);
+
+	res = 0;
+	if (!ep->eventcnt) {
+		init_waitqueue_entry(&wait, current);
+		add_wait_queue(&ep->wq, &wait);
+		timeout = dvp->ep_timeout == -1 || dvp->ep_timeout > MAX_SCHEDULE_TIMEOUT / HZ ?
+			MAX_SCHEDULE_TIMEOUT: (dvp->ep_timeout * HZ) / 1000; 
+		for (;;) {
+			set_current_state(TASK_INTERRUPTIBLE);
+			if (ep->eventcnt || !timeout)
+				break;
+			if (signal_pending(current)) {
+				res = -EINTR;
+				break;
+			}
+
+			write_unlock_irqrestore(&ep->lock, flags);
+			timeout = schedule_timeout(timeout);
+			write_lock_irqsave(&ep->lock, flags);
+		}
+		remove_wait_queue(&ep->wq, &wait);
+
+		set_current_state(TASK_RUNNING);
+	}
+
+	if (!res && ep->eventcnt) {
+		res = ep->eventcnt;
+		ep->eventcnt = 0;
+		++ep->ver;
+		if (ep->pages == ep->pages0) {
+			ep->pages = ep->pages1;
+			dvp->ep_resoff = 0;
+		} else {
+			ep->pages = ep->pages0;
+			dvp->ep_resoff = ep->numpages * PAGE_SIZE;
+		}
+	}
+
+	write_unlock_irqrestore(&ep->lock, flags);
+
+	return res;
+}
+
+
+static int ep_do_alloc_pages(struct eventpoll *ep, int numpages)
+{
+	int res, pgalloc, pgcpy;
+	unsigned long flags;
+	char **pages, **pages0, **pages1;
+
+	if (atomic_read(&ep->mmapped))
+		return -EBUSY;
+	if (numpages > MAX_EVENTPOLL_PAGES)
+		return -EINVAL;
+
+	pgalloc = numpages - ep->numpages;
+	if ((pages = (char **) vmalloc(2 * (pgalloc + 1) * sizeof(char *))) == NULL)
+		return -ENOMEM;
+	pages0 = &pages[0];
+	pages1 = &pages[pgalloc + 1];
+
+	if ((res = ep_alloc_pages(pages0, pgalloc)))
+		goto eexit_1;
+
+	if ((res = ep_alloc_pages(pages1, pgalloc))) {
+		ep_free_pages(pages0, pgalloc);
+		goto eexit_1;
+	}
+
+	write_lock_irqsave(&ep->lock, flags);
+	pgcpy = (ep->numpages + pgalloc) > numpages ? numpages - ep->numpages: pgalloc;
+	if (pgcpy > 0) {
+		memcpy(&ep->pages0[ep->numpages], pages0, pgcpy * sizeof(char *));
+		memcpy(&ep->pages1[ep->numpages], pages1, pgcpy * sizeof(char *));
+		ep->numpages += pgcpy;
+	}
+	write_unlock_irqrestore(&ep->lock, flags);
+
+	if (pgcpy < pgalloc) {
+		if (pgcpy < 0)
+			pgcpy = 0;
+		ep_free_pages(&pages0[pgcpy], pgalloc - pgcpy);
+		ep_free_pages(&pages1[pgcpy], pgalloc - pgcpy);
+	}
+
+eexit_1:
+	vfree(pages);
+	return res;
+}
+
+
+static int ioctl_eventpoll(struct inode *inode, struct file *file,
+						   unsigned int cmd, unsigned long arg)
+{
+	int res;
+	struct eventpoll *ep = file->private_data;
+	struct epitem *dpi;
+	unsigned long flags;
+	struct pollfd pfd;
+	struct evpoll dvp;
+
+	switch (cmd) {
+	case EP_ALLOC:
+		res = ep_do_alloc_pages(ep, EP_FDS_PAGES(arg));
+
+		DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ioctl(%p, EP_ALLOC, %lu) == %d\n",
+					 current, ep, arg, res));
+		return res;
+
+	case EP_FREE:
+		if (atomic_read(&ep->mmapped))
+			return -EBUSY;
+
+		res = -EINVAL;
+		write_lock_irqsave(&ep->lock, flags);
+		if (ep->numpages > 0) {
+			ep_free_pages(ep->pages0, ep->numpages);
+			ep_free_pages(ep->pages1, ep->numpages);
+			ep->numpages = 0;
+			ep->pages = ep->pages0;
+			res = 0;
+		}
+		write_unlock_irqrestore(&ep->lock, flags);
+
+		DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ioctl(%p, EP_FREE) == %d\n",
+					 current, ep, res));
+		return res;
+
+	case EP_POLL:
+		if (copy_from_user(&dvp, (void *) arg, sizeof(struct evpoll)))
+			return -EFAULT;
+
+		DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ioctl(%p, EP_POLL, %d)\n",
+					 current, ep, dvp.ep_timeout));
+
+		res = ep_poll(ep, &dvp);
+
+		DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ioctl(%p, EP_POLL, %d) == %d\n",
+					 current, ep, dvp.ep_timeout, res));
+
+		if (res > 0 && copy_to_user((void *) arg, &dvp, sizeof(struct evpoll)))
+			res = -EFAULT;
+
+		return res;
+
+	case EP_ISPOLLED:
+		if (copy_from_user(&pfd, (void *) arg, sizeof(struct pollfd)))
+			return 0;
+
+		read_lock_irqsave(&ep->lock, flags);
+
+		res = 0;
+		if (!(dpi = ep_find_nl(ep, pfd.fd)))
+			goto is_not_polled;
+
+		pfd = dpi->pfd;
+		res = 1;
+
+	is_not_polled:
+		read_unlock_irqrestore(&ep->lock, flags);
+
+		if (res)
+			copy_to_user((void *) arg, &pfd, sizeof(struct pollfd));
+
+		DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ioctl(%p, EP_ISPOLLED, %d) == %d\n",
+					 current, ep, pfd.fd, res));
+		return res;
+	}
+
+	return -EINVAL;
+}
+
+
+static void eventpoll_mm_open(struct vm_area_struct * vma)
+{
+	struct file *file = vma->vm_file;
+	struct eventpoll *ep = file->private_data;
+
+	if (ep) atomic_inc(&ep->mmapped);
+
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: mm_open(%p)\n", current, ep));
+}
+
+
+static void eventpoll_mm_close(struct vm_area_struct * vma)
+{
+	struct file *file = vma->vm_file;
+	struct eventpoll *ep = file->private_data;
+
+	if (ep && atomic_dec_and_test(&ep->mmapped))
+		ep->vmabase = 0;
+
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: mm_close(%p)\n", current, ep));
+}
+
+
+static int mmap_eventpoll(struct file *file, struct vm_area_struct *vma)
+{
+	struct eventpoll *ep = file->private_data;
+	unsigned long start;
+	int ii, res, numpages;
+	size_t mapsize;
+
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: mmap(%p, %lx, %lx)\n",
+				 current, ep, vma->vm_start, vma->vm_pgoff << PAGE_SHIFT));
+
+	if (vma->vm_flags & VM_WRITE)
+		return -EACCES;
+	if ((vma->vm_pgoff << PAGE_SHIFT) != 0)
+		return -EINVAL;
+
+	mapsize = PAGE_ALIGN(vma->vm_end - vma->vm_start);
+	numpages = mapsize >> PAGE_SHIFT;
+
+	res = -EINVAL;
+	if (numpages != (2 * ep->numpages))
+		goto eexit_1;
+
+	start = vma->vm_start;
+	for (ii = 0; ii < ep->numpages; ii++) {
+		if ((res = remap_page_range(vma, start, __pa(ep->pages0[ii]),
+									PAGE_SIZE, vma->vm_page_prot)))
+    		goto eexit_1;
+		start += PAGE_SIZE;
+	}
+	for (ii = 0; ii < ep->numpages; ii++) {
+		if ((res = remap_page_range(vma, start, __pa(ep->pages1[ii]),
+									PAGE_SIZE, vma->vm_page_prot)))
+    		goto eexit_1;
+		start += PAGE_SIZE;
+	}
+	vma->vm_ops = &eventpoll_mmap_ops;
+	ep->vmabase = vma->vm_start;
+	atomic_set(&ep->mmapped, 1);
+	res = 0;
+eexit_1:
+
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: mmap(%p, %lx, %lx) == %d\n",
+				 current, ep, vma->vm_start, vma->vm_pgoff << PAGE_SHIFT, res));
+	return res;
+}
+
+
+static int eventpollfs_delete_dentry(struct dentry *dentry)
+{
+
+	return 1;
+}
+
+
+static struct inode *get_eventpoll_inode(void)
+{
+	int error = -ENOMEM;
+	struct inode *inode = new_inode(eventpoll_mnt->mnt_sb);
+
+	if (!inode)
+		goto eexit_1;
+
+	inode->i_fop = &eventpoll_fops;
+
+	/*
+	 * Mark the inode dirty from the very beginning,
+	 * that way it will never be moved to the dirty
+	 * list because "mark_inode_dirty()" will think
+	 * that it already _is_ on the dirty list.
+	 */
+	inode->i_state = I_DIRTY;
+	inode->i_mode = S_IRUSR | S_IWUSR;
+	inode->i_uid = current->fsuid;
+	inode->i_gid = current->fsgid;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	inode->i_blksize = PAGE_SIZE;
+	return inode;
+
+eexit_1:
+	return ERR_PTR(error);
+}
+
+
+static struct super_block *eventpollfs_get_sb(struct file_system_type *fs_type,
+											  int flags, char *dev_name, void *data)
+{
+
+	return get_sb_pseudo(fs_type, "eventpoll:", NULL, EVENTPOLLFS_MAGIC);
+}
+
+
+static int __init eventpoll_init(void)
+{
+	int error;
+
+	error = -ENOMEM;
+	dpi_cache = kmem_cache_create("eventpoll",
+								  sizeof(struct epitem),
+								  __alignof__(struct epitem),
+								  DPI_SLAB_DEBUG, NULL, NULL);
+	if (!dpi_cache)
+		goto eexit_1;
+
+	error = register_filesystem(&eventpoll_fs_type);
+	if (error)
+		goto eexit_2;
+
+	eventpoll_mnt = kern_mount(&eventpoll_fs_type);
+	error = PTR_ERR(eventpoll_mnt);
+	if (IS_ERR(eventpoll_mnt))
+		goto eexit_3;
+
+	error = misc_register(&eventpoll_miscdev);
+	if (error)
+		goto eexit_4;
+
+	printk(KERN_INFO "[%p] eventpoll: driver installed.\n", current);
+
+	return error;
+
+eexit_4:
+	mntput(eventpoll_mnt);
+eexit_3:
+	unregister_filesystem(&eventpoll_fs_type);
+eexit_2:
+	kmem_cache_destroy(dpi_cache);
+eexit_1:
+
+	return error;
+}
+
+static void __exit eventpoll_exit(void)
+{
+	unregister_filesystem(&eventpoll_fs_type);
+	mntput(eventpoll_mnt);
+	misc_deregister(&eventpoll_miscdev);
+	kmem_cache_destroy(dpi_cache);
+}
+
+module_init(eventpoll_init);
+module_exit(eventpoll_exit);
+
+MODULE_LICENSE("GPL");
+
+
diff -Nru linux-2.5.44.vanilla/fs/Makefile linux-2.5.44.epoll/fs/Makefile
--- linux-2.5.44.vanilla/fs/Makefile	Fri Oct 18 21:01:57 2002
+++ linux-2.5.44.epoll/fs/Makefile	Sat Oct 19 12:05:48 2002
@@ -6,14 +6,14 @@
 # 
 
 export-objs :=	open.o dcache.o buffer.o bio.o inode.o dquot.o mpage.o aio.o \
-                fcntl.o read_write.o dcookies.o
+                fcntl.o read_write.o dcookies.o fcblist.o
 
 obj-y :=	open.o read_write.o devices.o file_table.o buffer.o \
 		bio.o super.o block_dev.o char_dev.o stat.o exec.o pipe.o \
 		namei.o fcntl.o ioctl.o readdir.o select.o fifo.o locks.o \
 		dcache.o inode.o attr.o bad_inode.o file.o dnotify.o \
 		filesystems.o namespace.o seq_file.o xattr.o libfs.o \
-		fs-writeback.o mpage.o direct-io.o aio.o
+		fs-writeback.o mpage.o direct-io.o aio.o fcblist.o
 
 ifneq ($(CONFIG_NFSD),n)
 ifneq ($(CONFIG_NFSD),)
diff -Nru linux-2.5.44.vanilla/fs/fcblist.c linux-2.5.44.epoll/fs/fcblist.c
--- linux-2.5.44.vanilla/fs/fcblist.c	Wed Dec 31 16:00:00 1969
+++ linux-2.5.44.epoll/fs/fcblist.c	Sun Oct 27 15:23:07 2002
@@ -0,0 +1,135 @@
+/*
+ *  linux/fs/fcblist.c ( File event callbacks handling )
+ *  Copyright (C) 2001,...,2002  Davide Libenzi
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  Davide Libenzi <davidel@xmailserver.org>
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/poll.h>
+#include <asm/bitops.h>
+#include <linux/fcblist.h>
+
+
+long ion_band_table[NSIGPOLL] = {	/* ION_* code per band; sk_wake_async() indexes by (band - POLL_IN) */
+	ION_IN,		/* POLL_IN */
+	ION_OUT,	/* POLL_OUT */
+	ION_IN,		/* POLL_MSG */
+	ION_ERR,	/* POLL_ERR */
+	0,			/* POLL_PRI: no ION_* code is defined for it */
+	ION_HUP		/* POLL_HUP */
+};
+EXPORT_SYMBOL(ion_band_table);
+
+long poll_band_table[NSIGPOLL] = {	/* poll mask per band; sk_wake_async() indexes by (band - POLL_IN) */
+	POLLIN | POLLRDNORM,			/* POLL_IN */
+	POLLOUT | POLLWRNORM | POLLWRBAND,	/* POLL_OUT */
+	POLLIN | POLLRDNORM | POLLMSG,		/* POLL_MSG */
+	POLLERR,				/* POLL_ERR */
+	POLLPRI | POLLRDBAND,			/* POLL_PRI */
+	POLLHUP | POLLERR			/* POLL_HUP */
+};
+EXPORT_SYMBOL(poll_band_table);
+
+
+/*
+ * Run every callback registered on filep, handing each one the event array.
+ * The list is walked under the f_cblock read lock (taken with irqsave).
+ */
+void file_notify_event(struct file *filep, long *event)
+{
+	unsigned long irqflags;
+	struct list_head *pos;
+
+	fcblist_read_lock(filep, irqflags);
+	list_for_each(pos, &filep->f_cblist) {
+		struct fcb_struct *cb = list_entry(pos, struct fcb_struct, llink);
+		cb->cbproc(filep, cb->data, cb->local, event);
+	}
+	fcblist_read_unlock(filep, irqflags);
+}
+EXPORT_SYMBOL(file_notify_event);
+
+
+/*
+ * Append a callback entry (cbproc, data) to filep's list; returns 0 or -ENOMEM.
+ */
+int file_notify_addcb(struct file *filep,
+					  void (*cbproc)(struct file *, void *, unsigned long *, long *), void *data)
+{
+	unsigned long irqflags;
+	struct fcb_struct *node = kmalloc(sizeof(*node), GFP_KERNEL);
+
+	if (node == NULL)
+		return -ENOMEM;
+	memset(node, 0, sizeof(*node));		/* local[] starts out zeroed */
+	node->cbproc = cbproc;
+	node->data = data;
+	fcblist_write_lock(filep, irqflags);
+	list_add_tail(&node->llink, &filep->f_cblist);
+	fcblist_write_unlock(filep, irqflags);
+	return 0;
+}
+EXPORT_SYMBOL(file_notify_addcb);
+
+
+/* Unlink and free the first entry on filep whose handler is cbproc. */
+int file_notify_delcb(struct file *filep,
+					  void (*cbproc)(struct file *, void *, unsigned long *, long *))
+{
+	unsigned long irqflags;
+	struct list_head *pos;
+	struct fcb_struct *victim = NULL;
+
+	fcblist_write_lock(filep, irqflags);
+	list_for_each(pos, &filep->f_cblist) {
+		struct fcb_struct *cb = list_entry(pos, struct fcb_struct, llink);
+		if (cb->cbproc == cbproc) {
+			victim = cb;
+			list_del(pos);
+			break;
+		}
+	}
+	fcblist_write_unlock(filep, irqflags);
+
+	if (victim == NULL)
+		return -ENOENT;		/* no entry was registered with cbproc */
+	kfree(victim);			/* freed only after the lock is released */
+	return 0;
+}
+EXPORT_SYMBOL(file_notify_delcb);
+
+
+/*
+ * Empty filep's callback list, freeing every entry.  The write lock is
+ * released around each kfree() and re-taken before the list is read again.
+ */
+void file_notify_cleanup(struct file *filep)
+{
+	unsigned long irqflags;
+	struct list_head *head = &filep->f_cblist;
+
+	fcblist_write_lock(filep, irqflags);
+	while (head->next != head) {
+		struct fcb_struct *cb = list_entry(head->next, struct fcb_struct, llink);
+		list_del(&cb->llink);
+		fcblist_write_unlock(filep, irqflags);
+		kfree(cb);
+		fcblist_write_lock(filep, irqflags);
+	}
+	fcblist_write_unlock(filep, irqflags);
+}
+EXPORT_SYMBOL(file_notify_cleanup);
+
diff -Nru linux-2.5.44.vanilla/fs/file_table.c linux-2.5.44.epoll/fs/file_table.c
--- linux-2.5.44.vanilla/fs/file_table.c	Fri Oct 18 21:01:08 2002
+++ linux-2.5.44.epoll/fs/file_table.c	Sat Oct 19 12:01:33 2002
@@ -8,6 +8,7 @@
 #include <linux/string.h>
 #include <linux/slab.h>
 #include <linux/file.h>
+#include <linux/fcblist.h>
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/smp_lock.h>
@@ -58,6 +59,7 @@
 		f->f_gid = current->fsgid;
 		f->f_owner.lock = RW_LOCK_UNLOCKED;
 		list_add(&f->f_list, &anon_list);
+		file_notify_init(f);
 		file_list_unlock();
 		return f;
 	}
@@ -102,6 +104,7 @@
 	filp->f_uid    = current->fsuid;
 	filp->f_gid    = current->fsgid;
 	filp->f_op     = dentry->d_inode->i_fop;
+	file_notify_init(filp);
 	if (filp->f_op->open)
 		return filp->f_op->open(dentry->d_inode, filp);
 	else
@@ -123,6 +126,7 @@
 	struct vfsmount * mnt = file->f_vfsmnt;
 	struct inode * inode = dentry->d_inode;
 
+	file_notify_cleanup(file);
 	locks_remove_flock(file);
 
 	if (file->f_op && file->f_op->release)
diff -Nru linux-2.5.44.vanilla/fs/pipe.c linux-2.5.44.epoll/fs/pipe.c
--- linux-2.5.44.vanilla/fs/pipe.c	Fri Oct 18 21:01:56 2002
+++ linux-2.5.44.epoll/fs/pipe.c	Sat Oct 19 12:32:34 2002
@@ -11,6 +11,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/fs.h>
+#include <linux/fcblist.h>
 
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
@@ -47,7 +48,7 @@
 pipe_read(struct file *filp, char *buf, size_t count, loff_t *ppos)
 {
 	struct inode *inode = filp->f_dentry->d_inode;
-	int do_wakeup;
+	int do_wakeup, pfull;
 	ssize_t ret;
 
 	/* pread is not allowed on pipes. */
@@ -63,6 +64,7 @@
 	down(PIPE_SEM(*inode));
 	for (;;) {
 		int size = PIPE_LEN(*inode);
+		pfull = PIPE_FULL(*inode);
 		if (size) {
 			char *pipebuf = PIPE_BASE(*inode) + PIPE_START(*inode);
 			ssize_t chars = PIPE_MAX_RCHUNK(*inode);
@@ -108,12 +110,18 @@
 			if (!ret) ret = -ERESTARTSYS;
 			break;
 		}
+		/* Send notification message */
+		if (pfull && !PIPE_FULL(*inode) && PIPE_WRITEFILE(*inode))
+			file_send_notify(PIPE_WRITEFILE(*inode), ION_OUT, POLLOUT | POLLWRNORM | POLLWRBAND);
 		if (do_wakeup) {
 			wake_up_interruptible(PIPE_WAIT(*inode));
  			kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
 		}
 		pipe_wait(inode);
 	}
+	/* Send notification message */
+	if (pfull && !PIPE_FULL(*inode) && PIPE_WRITEFILE(*inode))
+		file_send_notify(PIPE_WRITEFILE(*inode), ION_OUT, POLLOUT | POLLWRNORM | POLLWRBAND);
 	up(PIPE_SEM(*inode));
 	/* Signal writers asynchronously that there is more room.  */
 	if (do_wakeup) {
@@ -131,7 +139,7 @@
 	struct inode *inode = filp->f_dentry->d_inode;
 	ssize_t ret;
 	size_t min;
-	int do_wakeup;
+	int do_wakeup, pempty;
 
 	/* pwrite is not allowed on pipes. */
 	if (unlikely(ppos != &filp->f_pos))
@@ -149,6 +157,7 @@
 	down(PIPE_SEM(*inode));
 	for (;;) {
 		int free;
+		pempty = PIPE_EMPTY(*inode);
 		if (!PIPE_READERS(*inode)) {
 			send_sig(SIGPIPE, current, 0);
 			if (!ret) ret = -EPIPE;
@@ -194,6 +203,9 @@
 			if (!ret) ret = -ERESTARTSYS;
 			break;
 		}
+		/* Send notification message */
+		if (pempty && !PIPE_EMPTY(*inode) && PIPE_READFILE(*inode))
+			file_send_notify(PIPE_READFILE(*inode), ION_IN, POLLIN | POLLRDNORM);
 		if (do_wakeup) {
 			wake_up_interruptible_sync(PIPE_WAIT(*inode));
 			kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
@@ -203,6 +215,9 @@
 		pipe_wait(inode);
 		PIPE_WAITING_WRITERS(*inode)--;
 	}
+	/* Send notification message */
+	if (pempty && !PIPE_EMPTY(*inode) && PIPE_READFILE(*inode))
+		file_send_notify(PIPE_READFILE(*inode), ION_IN, POLLIN | POLLRDNORM);
 	up(PIPE_SEM(*inode));
 	if (do_wakeup) {
 		wake_up_interruptible(PIPE_WAIT(*inode));
@@ -266,9 +281,22 @@
 static int
 pipe_release(struct inode *inode, int decr, int decw)
 {
+	struct file *rdfile, *wrfile;
 	down(PIPE_SEM(*inode));
 	PIPE_READERS(*inode) -= decr;
 	PIPE_WRITERS(*inode) -= decw;
+	rdfile = PIPE_READFILE(*inode);
+	wrfile = PIPE_WRITEFILE(*inode);
+ 	if (decr && !PIPE_READERS(*inode)) {
+		PIPE_READFILE(*inode) = NULL;
+		if (wrfile)
+			file_send_notify(wrfile, ION_HUP, POLLHUP);
+	}
+	if (decw && !PIPE_WRITERS(*inode)) {
+		PIPE_WRITEFILE(*inode) = NULL;
+		if (rdfile)
+			file_send_notify(rdfile, ION_HUP, POLLHUP);
+	}
 	if (!PIPE_READERS(*inode) && !PIPE_WRITERS(*inode)) {
 		struct pipe_inode_info *info = inode->i_pipe;
 		inode->i_pipe = NULL;
@@ -488,6 +516,7 @@
 	PIPE_READERS(*inode) = PIPE_WRITERS(*inode) = 0;
 	PIPE_WAITING_WRITERS(*inode) = 0;
 	PIPE_RCOUNTER(*inode) = PIPE_WCOUNTER(*inode) = 1;
+	PIPE_READFILE(*inode) = PIPE_WRITEFILE(*inode) = NULL;
 	*PIPE_FASYNC_READERS(*inode) = *PIPE_FASYNC_WRITERS(*inode) = NULL;
 
 	return inode;
@@ -595,6 +624,9 @@
 	f2->f_op = &write_pipe_fops;
 	f2->f_mode = 2;
 	f2->f_version = 0;
+
+	PIPE_READFILE(*inode) = f1;
+	PIPE_WRITEFILE(*inode) = f2;
 
 	fd_install(i, f1);
 	fd_install(j, f2);
diff -Nru linux-2.5.44.vanilla/include/asm-i386/poll.h linux-2.5.44.epoll/include/asm-i386/poll.h
--- linux-2.5.44.vanilla/include/asm-i386/poll.h	Fri Oct 18 21:01:52 2002
+++ linux-2.5.44.epoll/include/asm-i386/poll.h	Sat Oct 19 12:01:33 2002
@@ -15,6 +15,7 @@
 #define POLLWRNORM	0x0100
 #define POLLWRBAND	0x0200
 #define POLLMSG		0x0400
+#define POLLREMOVE	0x1000
 
 struct pollfd {
 	int fd;
diff -Nru linux-2.5.44.vanilla/include/asm-i386/unistd.h linux-2.5.44.epoll/include/asm-i386/unistd.h
--- linux-2.5.44.vanilla/include/asm-i386/unistd.h	Fri Oct 18 21:02:00 2002
+++ linux-2.5.44.epoll/include/asm-i386/unistd.h	Sat Oct 19 20:23:33 2002
@@ -258,6 +258,9 @@
 #define __NR_free_hugepages	251
 #define __NR_exit_group		252
 #define __NR_lookup_dcookie	253
+#define __NR_sys_epoll_create	254
+#define __NR_sys_epoll_ctl	255
+#define __NR_sys_epoll_wait	256
   
 
 /* user-visible error numbers are in the range -1 - -124: see <asm-i386/errno.h> */
diff -Nru linux-2.5.44.vanilla/include/linux/eventpoll.h linux-2.5.44.epoll/include/linux/eventpoll.h
--- linux-2.5.44.vanilla/include/linux/eventpoll.h	Wed Dec 31 16:00:00 1969
+++ linux-2.5.44.epoll/include/linux/eventpoll.h	Sun Oct 27 15:23:37 2002
@@ -0,0 +1,51 @@
+/*
+ *  include/linux/eventpoll.h ( Efficient event polling implementation )
+ *  Copyright (C) 2001,...,2002  Davide Libenzi
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  Davide Libenzi <davidel@xmailserver.org>
+ *
+ */
+
+#ifndef _LINUX_EVENTPOLL_H
+#define _LINUX_EVENTPOLL_H
+
+
+#define EVENTPOLL_MINOR	124
+#define POLLFD_X_PAGE (PAGE_SIZE / sizeof(struct pollfd))	/* struct pollfd slots per page */
+#define MAX_FDS_IN_EVENTPOLL (1024 * 128)
+#define MAX_EVENTPOLL_PAGES (MAX_FDS_IN_EVENTPOLL / POLLFD_X_PAGE)
+#define EVENT_PAGE_INDEX(n) ((n) / POLLFD_X_PAGE)	/* page that holds slot n */
+#define EVENT_PAGE_REM(n) ((n) % POLLFD_X_PAGE)	/* slot n's position within its page */
+#define EVENT_PAGE_OFFSET(n) (((n) % POLLFD_X_PAGE) * sizeof(struct pollfd))	/* same position, in bytes */
+#define EP_FDS_PAGES(n) (((n) + POLLFD_X_PAGE - 1) / POLLFD_X_PAGE)	/* pages for n slots, rounded up */
+#define EP_MAP_SIZE(n) (EP_FDS_PAGES(n) * PAGE_SIZE * 2)	/* bytes in twice that many pages */
+
+
+struct evpoll {				/* argument type of the EP_POLL ioctl */
+	int ep_timeout;			/* NOTE(review): units are not visible in this header */
+	unsigned long ep_resoff;	/* NOTE(review): presumably a result offset -- confirm */
+};
+
+#define EP_ALLOC _IOR('P', 1, int)
+#define EP_POLL _IOWR('P', 2, struct evpoll)
+#define EP_FREE _IO('P', 3)
+#define EP_ISPOLLED _IOWR('P', 4, struct pollfd)
+
+#define EP_CTL_ADD 1
+#define EP_CTL_DEL 2
+#define EP_CTL_MOD 3
+
+
+asmlinkage int sys_epoll_create(int maxfds);
+asmlinkage int sys_epoll_ctl(int epfd, int op, int fd, unsigned int events);
+asmlinkage int sys_epoll_wait(int epfd, struct pollfd const **events, int timeout);
+
+
+
+#endif
+
diff -Nru linux-2.5.44.vanilla/include/linux/fcblist.h linux-2.5.44.epoll/include/linux/fcblist.h
--- linux-2.5.44.vanilla/include/linux/fcblist.h	Wed Dec 31 16:00:00 1969
+++ linux-2.5.44.epoll/include/linux/fcblist.h	Sun Oct 27 15:23:21 2002
@@ -0,0 +1,73 @@
+/*
+ *  include/linux/fcblist.h ( File event callbacks handling )
+ *  Copyright (C) 2001,...,2002  Davide Libenzi
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  Davide Libenzi <davidel@xmailserver.org>
+ *
+ */
+
+#ifndef __LINUX_FCBLIST_H
+#define __LINUX_FCBLIST_H
+
+#include <linux/config.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+
+
+
+/* file callback notification events */
+#define ION_IN		1
+#define ION_OUT		2
+#define ION_HUP		3
+#define ION_ERR		4
+
+#define FCB_LOCAL_SIZE	4
+
+#define fcblist_read_lock(fp, fl)		read_lock_irqsave(&(fp)->f_cblock, fl)
+#define fcblist_read_unlock(fp, fl)		read_unlock_irqrestore(&(fp)->f_cblock, fl)
+#define fcblist_write_lock(fp, fl)		write_lock_irqsave(&(fp)->f_cblock, fl)
+#define fcblist_write_unlock(fp, fl)	write_unlock_irqrestore(&(fp)->f_cblock, fl)
+
+struct fcb_struct {
+	struct list_head llink;		/* link in the owning file's f_cblist */
+	void (*cbproc)(struct file *, void *, unsigned long *, long *);	/* handler: (file, data, local, event) */
+	void *data;			/* handed back to cbproc as its 2nd argument */
+	unsigned long local[FCB_LOCAL_SIZE];	/* handed to cbproc as its 3rd argument */
+};
+
+
+extern long ion_band_table[];
+extern long poll_band_table[];
+
+
+void file_notify_event(struct file *filep, long *event);
+
+int file_notify_addcb(struct file *filep,
+					  void (*cbproc)(struct file *, void *, unsigned long *, long *), void *data);
+
+int file_notify_delcb(struct file *filep,
+					  void (*cbproc)(struct file *, void *, unsigned long *, long *));
+
+void file_notify_cleanup(struct file *filep);
+
+/* Give filep an empty callback list and initialise the rwlock guarding it. */
+static inline void file_notify_init(struct file *filep)
+{
+	INIT_LIST_HEAD(&filep->f_cblist);
+	rwlock_init(&filep->f_cblock);
+}
+
+static inline void file_send_notify(struct file *filep, long ioevt, long plevt)
+{
+	long evbuf[3] = { ioevt, plevt, -1 };	/* { ioevt, plevt, -1 } as one array */
+	file_notify_event(filep, evbuf);	/* run filep's registered callbacks */
+}
+
+#endif
diff -Nru linux-2.5.44.vanilla/include/linux/fs.h linux-2.5.44.epoll/include/linux/fs.h
--- linux-2.5.44.vanilla/include/linux/fs.h	Fri Oct 18 21:01:18 2002
+++ linux-2.5.44.epoll/include/linux/fs.h	Sat Oct 19 12:01:33 2002
@@ -506,6 +506,10 @@
 
 	/* needed for tty driver, and maybe others */
 	void			*private_data;
+
+	/* file callback list */
+	rwlock_t f_cblock;
+	struct list_head f_cblist;
 };
 extern spinlock_t files_lock;
 #define file_list_lock() spin_lock(&files_lock);
diff -Nru linux-2.5.44.vanilla/include/linux/list.h linux-2.5.44.epoll/include/linux/list.h
--- linux-2.5.44.vanilla/include/linux/list.h	Fri Oct 18 21:01:07 2002
+++ linux-2.5.44.epoll/include/linux/list.h	Sat Oct 19 12:01:33 2002
@@ -319,6 +319,11 @@
 	for (pos = (head)->next, n = pos->next; pos != (head); \
 		pos = n, ({ read_barrier_depends(); 0;}), n = pos->next)
 
+#define list_first(head)	(((head)->next != (head)) ? (head)->next: (struct list_head *) 0)
+#define list_last(head)	(((head)->prev != (head)) ? (head)->prev: (struct list_head *) 0)
+#define list_next(pos, head)	(((pos)->next != (head)) ? (pos)->next: (struct list_head *) 0)
+#define list_prev(pos, head)	(((pos)->prev != (head)) ? (pos)->prev: (struct list_head *) 0)
+
 #endif /* __KERNEL__ || _LVM_H_INCLUDE */
 
 #endif
diff -Nru linux-2.5.44.vanilla/include/linux/pipe_fs_i.h linux-2.5.44.epoll/include/linux/pipe_fs_i.h
--- linux-2.5.44.vanilla/include/linux/pipe_fs_i.h	Fri Oct 18 21:02:24 2002
+++ linux-2.5.44.epoll/include/linux/pipe_fs_i.h	Sat Oct 19 12:01:33 2002
@@ -12,6 +12,8 @@
 	unsigned int waiting_writers;
 	unsigned int r_counter;
 	unsigned int w_counter;
+	struct file *rdfile;
+	struct file *wrfile;
 	struct fasync_struct *fasync_readers;
 	struct fasync_struct *fasync_writers;
 };
@@ -30,6 +32,8 @@
 #define PIPE_WAITING_WRITERS(inode)	((inode).i_pipe->waiting_writers)
 #define PIPE_RCOUNTER(inode)	((inode).i_pipe->r_counter)
 #define PIPE_WCOUNTER(inode)	((inode).i_pipe->w_counter)
+#define PIPE_READFILE(inode)	((inode).i_pipe->rdfile)
+#define PIPE_WRITEFILE(inode)	((inode).i_pipe->wrfile)
 #define PIPE_FASYNC_READERS(inode)     (&((inode).i_pipe->fasync_readers))
 #define PIPE_FASYNC_WRITERS(inode)     (&((inode).i_pipe->fasync_writers))
 
diff -Nru linux-2.5.44.vanilla/include/linux/sys.h linux-2.5.44.epoll/include/linux/sys.h
--- linux-2.5.44.vanilla/include/linux/sys.h	Fri Oct 18 21:01:49 2002
+++ linux-2.5.44.epoll/include/linux/sys.h	Sun Oct 20 15:13:06 2002
@@ -4,7 +4,7 @@
 /*
  * system call entry points ... but not all are defined
  */
-#define NR_syscalls 256
+#define NR_syscalls 260
 
 /*
  * These are system calls that will be removed at some time
diff -Nru linux-2.5.44.vanilla/include/net/sock.h linux-2.5.44.epoll/include/net/sock.h
--- linux-2.5.44.vanilla/include/net/sock.h	Fri Oct 18 21:02:27 2002
+++ linux-2.5.44.epoll/include/net/sock.h	Tue Oct 22 15:57:38 2002
@@ -52,6 +52,9 @@
 #include <asm/atomic.h>
 #include <net/dst.h>
 #include <net/scm.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/fcblist.h>
 
 /*
  * This structure really needs to be cleaned up.
@@ -766,8 +769,13 @@
 
 static inline void sk_wake_async(struct sock *sk, int how, int band)
 {
-	if (sk->socket && sk->socket->fasync_list)
-		sock_wake_async(sk->socket, how, band);
+	if (sk->socket) {
+		if (sk->socket->file)
+			file_send_notify(sk->socket->file, ion_band_table[band - POLL_IN],
+					poll_band_table[band - POLL_IN]);
+		if (sk->socket->fasync_list)
+			sock_wake_async(sk->socket, how, band);
+	}
 }
 
 #define SOCK_MIN_SNDBUF 2048
diff -Nru linux-2.5.44.vanilla/net/ipv4/tcp.c linux-2.5.44.epoll/net/ipv4/tcp.c
--- linux-2.5.44.vanilla/net/ipv4/tcp.c	Fri Oct 18 21:01:19 2002
+++ linux-2.5.44.epoll/net/ipv4/tcp.c	Sat Oct 19 12:01:33 2002
@@ -476,8 +476,8 @@
 		if (sk->sleep && waitqueue_active(sk->sleep))
 			wake_up_interruptible(sk->sleep);
 
-		if (sock->fasync_list && !(sk->shutdown & SEND_SHUTDOWN))
-			sock_wake_async(sock, 2, POLL_OUT);
+		if (!(sk->shutdown & SEND_SHUTDOWN))
+			sk_wake_async(sk, 2, POLL_OUT);
 	}
 }
 





^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] epoll more scalable than poll
  2002-10-28 19:14 Hanna Linder
@ 2002-10-28 20:10 ` Hanna Linder
  2002-10-28 20:56 ` Martin Waitz
  2002-10-28 22:08 ` bert hubert
  2 siblings, 0 replies; 22+ messages in thread
From: Hanna Linder @ 2002-10-28 20:10 UTC (permalink / raw)
  To: torvalds; +Cc: Hanna Linder, linux-kernel, davidel, lse-tech, linux-aio


--On Monday, October 28, 2002 11:14:19 -0800 Hanna Linder <hannal@us.ibm.com> wrote:

> ps- Did I mention there is a web site? http://lse.sf.net/epoll/index.html
> 
> -----
> diff -Nru linux-2.5.44.vanilla/arch/i386/kernel/entry.S linux-2.5.44.epoll/arch/i386/kernel/entry.S

Forgot to include the diffstat for Davide's sys_epoll patch (v11):


 arch/i386/kernel/entry.S  |    4
 drivers/char/Makefile     |    4
 drivers/char/eventpoll.c  | 1136 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/Makefile               |    4
 fs/fcblist.c              |  135 +++++
 fs/file_table.c           |    4
 fs/pipe.c                 |   36 +
 include/asm-i386/poll.h   |    1
 include/asm-i386/unistd.h |    3
 include/linux/eventpoll.h |   51 ++
 include/linux/fcblist.h   |   73 ++
 include/linux/fs.h        |    4
 include/linux/list.h      |    5
 include/linux/pipe_fs_i.h |    4
 include/linux/sys.h       |    2
 include/net/sock.h        |   12
 net/ipv4/tcp.c            |    4
 17 files changed, 1471 insertions(+), 11 deletions(-)


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] epoll more scalable than poll
  2002-10-28 19:14 Hanna Linder
  2002-10-28 20:10 ` Hanna Linder
@ 2002-10-28 20:56 ` Martin Waitz
  2002-10-28 22:02   ` bert hubert
  2002-10-28 22:17   ` Davide Libenzi
  2002-10-28 22:08 ` bert hubert
  2 siblings, 2 replies; 22+ messages in thread
From: Martin Waitz @ 2002-10-28 20:56 UTC (permalink / raw)
  To: Hanna Linder; +Cc: linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1430 bytes --]

hi :)

On Mon, Oct 28, 2002 at 11:14:19AM -0800, Hanna Linder wrote:
> 	The results of our testing show not only does the system call 
> interface to epoll perform as well as the /dev interface but also that epoll 
> is many times better than standard poll. No other implementations of poll 
> have performed as well as epoll in any measure. Testing details and results 
> are published here, please take a minute to check it out: http://lse.sourceforge.net/epoll/index.html
how does this compare to the kqueue mechanism found in {Free,Net}BSD?
(see http://people.freebsd.org/~jlemon/papers/kqueue.pdf)

i especially like their combined event update/event wait,
needing only one syscall per poll while building a changelist in
userspace...

a replacement for poll/select is _really_ needed.
but i think we should use existing interfaces if possible,
to reduce the changes needed in userspace.


-- 
CU,		  / Friedrich-Alexander University Erlangen, Germany
Martin Waitz	//  [Tali on IRCnet]  [tali.home.pages.de] _________
______________/// - - - - - - - - - - - - - - - - - - - - ///
dies ist eine manuell generierte mail, sie beinhaltet    //
tippfehler und ist auch ohne grossbuchstaben gueltig.   /
			    -
Wer bereit ist, grundlegende Freiheiten aufzugeben, um sich 
kurzfristige Sicherheit zu verschaffen, der hat weder Freiheit 
noch Sicherheit verdient.
			Benjamin Franklin  (1706 - 1790)

[-- Attachment #2: Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] epoll more scalable than poll
@ 2002-10-28 21:59 Dan Kegel
  2002-10-28 22:36 ` Dan Kegel
  2002-10-28 22:51 ` Davide Libenzi
  0 siblings, 2 replies; 22+ messages in thread
From: Dan Kegel @ 2002-10-28 21:59 UTC (permalink / raw)
  To: linux-kernel, Martin Waitz

Martin Waitz <tali@admingilde.org> wrote:
> On Mon, Oct 28, 2002 at 11:14:19AM -0800, Hanna Linder wrote:
>> 	The results of our testing show not only does the system call 
>> interface to epoll perform as well as the /dev interface but also that epoll 
>> is many times better than standard poll. No other implementations of poll 
>> have performed as well as epoll in any measure. Testing details and results 
>> are published here, please take a minute to check it out: 
 >> http://lse.sourceforge.net/epoll/index.html
 >
> how does this compare to the kqueue mechanism found in {Free,Net}BSD?
> (see http://people.freebsd.org/~jlemon/papers/kqueue.pdf)
> 
> i especially like their combined event update/event wait,
> needing only one syscall per poll while building a changelist in
> userspace...
> 
> a replacement for poll/select is _really_ needed.
> but i think we should use existing interfaces if possible,
> to reduce the changes needed in userspace.

I'd kinda like to see a unified event queue object
used uniformly for everything.  You might instantiate
several of them in one process (so e.g. libraries could have
their own).

The idea of using the kqueue interface was discussed once before.  See
http://marc.theaimsgroup.com/?l=linux-kernel&m=97236943118139&w=2
for Linus' opinion of kqueues (he doesn't like them much).

Another existing event queue for readiness notification to
be delivered via is Ben's AIO completion notification queue,
but I haven't heard a definitive story about whether
epoll events could be delivered that way.  (The discussion
seems to always veer into a discussion of asynchronous
poll, which is something else.)
- Dan



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] epoll more scalable than poll
  2002-10-28 20:56 ` Martin Waitz
@ 2002-10-28 22:02   ` bert hubert
  2002-10-28 22:15     ` bert hubert
  2002-10-28 22:17   ` Davide Libenzi
  1 sibling, 1 reply; 22+ messages in thread
From: bert hubert @ 2002-10-28 22:02 UTC (permalink / raw)
  To: Hanna Linder, linux-kernel

On Mon, Oct 28, 2002 at 09:56:47PM +0100, Martin Waitz wrote:

> needing only one syscall per poll while building a changelist in
> userspace...

Which is so smashingly successful for iptables. I would very much doubt the
utility of building tables in userspace and then blasting them across,
especially as they will tend to be large when people bother to use epoll.

Regards,

bert

-- 
http://www.PowerDNS.com          Versatile DNS Software & Services
http://lartc.org           Linux Advanced Routing & Traffic Control HOWTO

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] epoll more scalable than poll
  2002-10-28 19:14 Hanna Linder
  2002-10-28 20:10 ` Hanna Linder
  2002-10-28 20:56 ` Martin Waitz
@ 2002-10-28 22:08 ` bert hubert
  2002-10-28 22:29   ` Davide Libenzi
  2 siblings, 1 reply; 22+ messages in thread
From: bert hubert @ 2002-10-28 22:08 UTC (permalink / raw)
  To: Hanna Linder; +Cc: linux-kernel, davidel, linux-aio, lse-tech

On Mon, Oct 28, 2002 at 11:14:19AM -0800, Hanna Linder wrote:

> 	The results of our testing show not only does the system call 
> interface to epoll perform as well as the /dev interface but also that epoll 
> is many times better than standard poll. No other implementations of poll 

Hanna,

Sure that this works? The following trivial program doesn't work on standard input
when I'd expect it to. It just waits until the timeout passes and then
returns 0. It also does not work on a file, which is to be expected,
although 'select' returns with an immediate availability of data on a file
according to SuS.

Furthermore, there is some const weirdness going on, the ephttpd server has
a different second argument to sys_epoll_wait.

I compile this with:
cc -Wall -I/mnt/linux-2.5.44/include/    epoll.c   -o epoll


#include <stdio.h>
#include <errno.h>
#include <asm/page.h> 
#include <asm/poll.h> 
#include <linux/linkage.h>
#include <linux/eventpoll.h>
#include <linux/unistd.h>

#define __sys_epoll_create(maxfds) _syscall1(int, sys_epoll_create, int, maxfds)
#define __sys_epoll_ctl(epfd, op, fd, events) _syscall4(int, sys_epoll_ctl, \
int, epfd, int, op, int, fd, unsigned int, events)

#define __sys_epoll_wait(epfd, events, timeout) _syscall3(int, sys_epoll_wait, \
int, epfd, struct pollfd const **, events, int, timeout)

__sys_epoll_create(maxfds)
__sys_epoll_ctl(epfd, op, fd, events)
__sys_epoll_wait(epfd, events, timeout)

// asmlinkage int sys_epoll_wait(int epfd, struct pollfd const **events, int timeout);

int main()
{
	int kdpfd;
	struct pollfd const *pfds;
	int nfds;
	int timeout=2;
	
	if ((kdpfd = sys_epoll_create(10)) < 0) {
        	perror("sys_epoll_create");
                return -1;
        }
        if (sys_epoll_ctl(kdpfd, EP_CTL_ADD, 0, POLLIN ) < 0) {
		fprintf(stderr, "sys_epoll set insertion error: fd=%d\n", 0);

		return -1;
	}                                        

	nfds = sys_epoll_wait(kdpfd, &pfds, timeout * 1000);	
	fprintf(stderr,"sys_epoll_wait returned: %d\n",nfds);
	return 0;
}

-- 
http://www.PowerDNS.com          Versatile DNS Software & Services
http://lartc.org           Linux Advanced Routing & Traffic Control HOWTO

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] epoll more scalable than poll
  2002-10-28 22:02   ` bert hubert
@ 2002-10-28 22:15     ` bert hubert
  0 siblings, 0 replies; 22+ messages in thread
From: bert hubert @ 2002-10-28 22:15 UTC (permalink / raw)
  To: Hanna Linder, linux-kernel

On Mon, Oct 28, 2002 at 11:02:49PM +0100, bert hubert wrote:
> On Mon, Oct 28, 2002 at 09:56:47PM +0100, Martin Waitz wrote:
> 
> > needing only one syscall per poll while building a changelist in
> > userspace...
> 
> Which is so smashingly succesful for iptables. I would very much doubt the
> utility of building tables in userspace and them blasting them across,
> especially as they will tend to be large when people bother to use epoll.

Never mind, I should have read what you wrote. Building changesets in
userspace may have some utility.

-- 
http://www.PowerDNS.com          Versatile DNS Software & Services
http://lartc.org           Linux Advanced Routing & Traffic Control HOWTO

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] epoll more scalable than poll
  2002-10-28 20:56 ` Martin Waitz
  2002-10-28 22:02   ` bert hubert
@ 2002-10-28 22:17   ` Davide Libenzi
  1 sibling, 0 replies; 22+ messages in thread
From: Davide Libenzi @ 2002-10-28 22:17 UTC (permalink / raw)
  To: Martin Waitz; +Cc: Linux Kernel Mailing List

On Mon, 28 Oct 2002, Martin Waitz wrote:

> hi :)
>
> On Mon, Oct 28, 2002 at 11:14:19AM -0800, Hanna Linder wrote:
> > 	The results of our testing show not only does the system call
> > interface to epoll perform as well as the /dev interface but also that epoll
> > is many times better than standard poll. No other implementations of poll
> > have performed as well as epoll in any measure. Testing details and results
> > are published here, please take a minute to check it out: http://lse.sourceforge.net/epoll/index.html
> how does this compare to the kqueue mechanism found in {Free,Net}BSD?
> (see http://people.freebsd.org/~jlemon/papers/kqueue.pdf)
>
> i especially like their combined event update/event wait,
> needing only one syscall per poll while building a changelist in
> userspace...
>
> a replacement for poll/select is _really_ needed.
> but i think we should use existing interfaces if possible,
> to reduce the changes needed in userspace.

KQueue has not been tested simply because it does not ( to my knowledge )
have patches to apply to lk. I'd expect kqueue to scale in a similar way
to sys_epoll though, where by "similar" I mean not suffering performance
drops under high connection load. About the interface, it looks pretty simple to me :

http://www.xmailserver.org/linux-patches/epoll_create.txt
http://www.xmailserver.org/linux-patches/epoll_ctl.txt
http://www.xmailserver.org/linux-patches/epoll_wait.txt




- Davide



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] epoll more scalable than poll
  2002-10-28 22:08 ` bert hubert
@ 2002-10-28 22:29   ` Davide Libenzi
  2002-10-28 23:44     ` Jamie Lokier
  0 siblings, 1 reply; 22+ messages in thread
From: Davide Libenzi @ 2002-10-28 22:29 UTC (permalink / raw)
  To: bert hubert; +Cc: Hanna Linder, Linux Kernel Mailing List, linux-aio, lse-tech

On Mon, 28 Oct 2002, bert hubert wrote:

> On Mon, Oct 28, 2002 at 11:14:19AM -0800, Hanna Linder wrote:
>
> > 	The results of our testing show not only does the system call
> > interface to epoll perform as well as the /dev interface but also that epoll
> > is many times better than standard poll. No other implementations of poll
>
> Hanna,
>
> Sure that this works? The following trivial program doesn't work on stdinput
> when I'd expect it to. It just waits until the timeout passes end then
> returns 0. It also does not work on a file, which is to be expected,
> although 'select' returns with an immediate availability of data on a file
> according to SuS.
>
> Furthermore, there is some const weirdness going on, the ephttpd server has
> a different second argument to sys_epoll_wait.

sys_epoll, by plugging directly into the existing kernel architecture,
supports sockets and pipes. It does not support, and there are not even
plans to support, other devices like tty, where poll() and select() work
flawlessly. Since the sys_epoll ( and /dev/epoll ) fd supports standard polling, you
can mix sys_epoll handling with other methods like poll() and the AIO's
POLL function when it is ready. For example, for devices that sys_epoll
intentionally does not support, you can use a method like :

        put_sys_epoll_fd_inside_XXX();
        ...
        wait_for_XXX_events();
        ...
        if (XXX_event_fd() == sys_epoll_fd) {
                sys_epoll_wait();
                for_each_sys_epoll_event {
                        handle_fd_event();
                }
        }




- Davide




^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] epoll more scalable than poll
  2002-10-28 21:59 [PATCH] epoll more scalable than poll Dan Kegel
@ 2002-10-28 22:36 ` Dan Kegel
  2002-10-28 22:51 ` Davide Libenzi
  1 sibling, 0 replies; 22+ messages in thread
From: Dan Kegel @ 2002-10-28 22:36 UTC (permalink / raw)
  To: Dan Kegel; +Cc: linux-kernel, Martin Waitz

Dan Kegel wrote:
> The idea of using the kqueue interface was discussed once before.  See
> http://marc.theaimsgroup.com/?l=linux-kernel&m=97236943118139&w=2
> for Linus' opinion of kqueues (he doesn't like them much).

Hang on - reading again, I wonder if the main reason he didn't like
kqueue is because it allowed for multiple event queues
(so libraries don't need to be tightly integrated into
the main program, for instance).  He preferred one queue
and callbacks.

However, I think Linus admitted later on that nobody liked his
callback idea, so maybe he'd be receptive to the multiple
event queue idea now.

Um, I assume Ben's aio stuff allows multiple completion queues, right?
- Dan

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] epoll more scalable than poll
  2002-10-28 21:59 [PATCH] epoll more scalable than poll Dan Kegel
  2002-10-28 22:36 ` Dan Kegel
@ 2002-10-28 22:51 ` Davide Libenzi
  2002-10-28 23:03   ` Dan Kegel
  1 sibling, 1 reply; 22+ messages in thread
From: Davide Libenzi @ 2002-10-28 22:51 UTC (permalink / raw)
  To: Dan Kegel; +Cc: Linux Kernel Mailing List

On Mon, 28 Oct 2002, Dan Kegel wrote:

> Another existing event queue for readiness notification to
> be delivered via is Ben's AIO completion notification queue,
> but I haven't heard a definitive story about whether
> epoll events could be delivered that way.  (The discussion
> seems to always veer into a discussion of asynchronous
> poll, which is something else.)

Yep Dan, Ben proposed that approach, but we did not have the time to test it.
The way sys_epoll returns events is very efficient, as you can
see in the scalability page ( pipetest ) that Hanna and her team set up :

http://lse.sourceforge.net/epoll/index.html



- Davide



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] epoll more scalable than poll
  2002-10-28 22:51 ` Davide Libenzi
@ 2002-10-28 23:03   ` Dan Kegel
  0 siblings, 0 replies; 22+ messages in thread
From: Dan Kegel @ 2002-10-28 23:03 UTC (permalink / raw)
  To: Davide Libenzi; +Cc: Linux Kernel Mailing List

Davide Libenzi wrote:
> On Mon, 28 Oct 2002, Dan Kegel wrote:
>>Another existing event queue for readiness notification to
>>be delivered via is Ben's AIO completion notification queue,
>>but I haven't heard a definitive story about whether
>>epoll events could be delivered that way.  (The discussion
>>seems to always veer into a discussion of asynchronous
>>poll, which is something else.)
> 
> Yep Dan, Ben proposed that approach that we did not have the time to test.
> The way of returning events of sys_epoll is very efficent, like you can
> see in the scalability page ( pipetest ) that Hanna and her team setup :
> 
> http://lse.sourceforge.net/epoll/index.html

I do like those results.  If, however, the unified approach performs
as well, it might be good to go with it to reduce the number of
interfaces, as Martin suggested.  (Though he was suggesting kqueue as
the preferred interface, not Ben's aio...)
- Dan


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] epoll more scalable than poll
  2002-10-28 22:29   ` Davide Libenzi
@ 2002-10-28 23:44     ` Jamie Lokier
  2002-10-29  0:02       ` Davide Libenzi
  2002-10-29  0:03       ` bert hubert
  0 siblings, 2 replies; 22+ messages in thread
From: Jamie Lokier @ 2002-10-28 23:44 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: bert hubert, Hanna Linder, Linux Kernel Mailing List, linux-aio,
	lse-tech

Davide Libenzi wrote:
> sys_epoll, by plugging directly in the existing kernel architecture,
> supports sockets and pipes. It does not support and there're not even
> plans to support other devices like tty, where poll() and select() works
> flawlessy. Since the sys_epoll ( and /dev/epoll ) fd support standard polling, you
> can mix sys_epoll handling with other methods like poll() and the AIO's
> POLL function when it'll be ready. For example, for devices that sys_epoll
> intentionally does not support, you can use a method like :

:( I was hoping sys_epoll would be scalable without increasing the
number of system calls per event.

Is it too much work to support all kinds of fd?  It would be rather a
good thing IMHO.

I'm thinking that a typical generic event handling library (like in a
typical home grown server) takes a set of fds and event handling
callbacks.  sys_epoll is obviously not so trivial to use in place of a
poll() loop, because the library needs to fstat() each fd that is
registered to decide if epoll will return events for that fd.

For that to work, it's important that you can determine, through
fstat(), whether sys_epoll will actually return events for the fd, or
whether a sigqueue event is needed to trigger the epoll read.

So, is it exactly _all_ sockets and pipes, and nothing else?

Btw, is the set of fd types supported by epoll the same as the set of
fd types supported by SIGIO?  That would be convenient - and logical.

thanks,
-- Jamie (who thinks a lot about fast web servers)

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] epoll more scalable than poll
  2002-10-28 23:44     ` Jamie Lokier
@ 2002-10-29  0:02       ` Davide Libenzi
  2002-10-29  1:51         ` Jamie Lokier
  2002-10-29  0:03       ` bert hubert
  1 sibling, 1 reply; 22+ messages in thread
From: Davide Libenzi @ 2002-10-29  0:02 UTC (permalink / raw)
  To: Jamie Lokier; +Cc: Linux Kernel Mailing List, linux-aio, lse-tech

On Mon, 28 Oct 2002, Jamie Lokier wrote:

> Davide Libenzi wrote:
> > sys_epoll, by plugging directly in the existing kernel architecture,
> > supports sockets and pipes. It does not support and there're not even
> > plans to support other devices like tty, where poll() and select() works
> > flawlessy. Since the sys_epoll ( and /dev/epoll ) fd support standard polling, you
> > can mix sys_epoll handling with other methods like poll() and the AIO's
> > POLL function when it'll be ready. For example, for devices that sys_epoll
> > intentionally does not support, you can use a method like :
>
> :( I was hoping sys_epoll would be scalable without increasing the
> number of system calls per event.
>
> Is it too much work to support all kinds of fd?  It would be rather a
> good thing IMHO.
>
> I'm thinking that a typical generic event handling library (like in a
> typical home grown server) takes a set of fds and event handling
> callbacks.  sys_epoll is obviously not so trivial to use in place of a
> poll() loop, because the library needs to fstat() each fd that is
> registered to decide if epoll will return events for that fd.
>
> For that to work, it's important that you can determine, through
> fstat(), whether sys_epoll will actually return events for the fd, or
> whether a sigqueue event is needed to trigger the epoll read.
>
> So, is it exactly _all_ sockets and pipes, and nothing else?
>
> Btw, is the set of fd types supported by epoll the same as the set of
> fd types supported by SIGIO?  That would be convenient - and logical.

Jamie, doing that is not a real problem. The fact is that sys_epoll aims
to solve issues where scalability over a huge number of fds is involved. By
covering sockets ( network connections ) and pipes ( cgi execution ) you
address a pretty wide range of scalability cases. Usually you know where an
fd comes from, so you're typically able to handle it correctly. Those reasons,
together with the fact that I did not want to introduce revolutions in the
kernel, drove me to limit the sys_epoll coverage to sockets and pipes.



- Davide




^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] epoll more scalable than poll
  2002-10-28 23:44     ` Jamie Lokier
  2002-10-29  0:02       ` Davide Libenzi
@ 2002-10-29  0:03       ` bert hubert
  2002-10-29  0:20         ` Davide Libenzi
  2002-10-29  0:48         ` Jamie Lokier
  1 sibling, 2 replies; 22+ messages in thread
From: bert hubert @ 2002-10-29  0:03 UTC (permalink / raw)
  To: Jamie Lokier
  Cc: Davide Libenzi, Hanna Linder, Linux Kernel Mailing List,
	linux-aio, lse-tech

On Mon, Oct 28, 2002 at 11:44:34PM +0000, Jamie Lokier wrote:

> :( I was hoping sys_epoll would be scalable without increasing the
> number of system calls per event.

I see only one call per event? sys_epoll_wait. Perhaps sys_epoll_ctl to
register a new interest.

Regards,

bert
-- 
http://www.PowerDNS.com          Versatile DNS Software & Services
http://lartc.org           Linux Advanced Routing & Traffic Control HOWTO

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] epoll more scalable than poll
  2002-10-29  0:03       ` bert hubert
@ 2002-10-29  0:20         ` Davide Libenzi
  2002-10-29  0:48         ` Jamie Lokier
  1 sibling, 0 replies; 22+ messages in thread
From: Davide Libenzi @ 2002-10-29  0:20 UTC (permalink / raw)
  To: bert hubert; +Cc: Linux Kernel Mailing List, lse-tech

On Tue, 29 Oct 2002, bert hubert wrote:

> On Mon, Oct 28, 2002 at 11:44:34PM +0000, Jamie Lokier wrote:
>
> > :( I was hoping sys_epoll would be scalable without increasing the
> > number of system calls per event.
>
> I see only one call per event? sys_epoll_wait. Perhaps sys_epoll_ctl to
> register a new interest.

In theory you can register the fd at creation time with the full interest
set and you can leave it in there for its whole life without having to
call epoll_ctl() at every switch between read/write. It's true that you could
receive false events, but by studying the frequency of those false events
on a "very high loaded" HTTP server, they turned out to be both very rare
and to have no noticeable effect on the server performance.

- Davide

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] epoll more scalable than poll
  2002-10-29  0:03       ` bert hubert
  2002-10-29  0:20         ` Davide Libenzi
@ 2002-10-29  0:48         ` Jamie Lokier
  2002-10-29  1:53           ` Jamie Lokier
  1 sibling, 1 reply; 22+ messages in thread
From: Jamie Lokier @ 2002-10-29  0:48 UTC (permalink / raw)
  To: bert hubert, Davide Libenzi, Hanna Linder,
	Linux Kernel Mailing List, linux-aio, lse-tech

bert hubert wrote:
> > :( I was hoping sys_epoll would be scalable without increasing the
> > number of system calls per event.
> 
> I see only one call per event? sys_epoll_wait. Perhaps sys_epoll_ctl to
> register a new interest.

As Davide pointed out, you need a second call before the sys_epoll_wait
if you're waiting for fds that epoll doesn't support (such as a tty).

-- Jamie

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] epoll more scalable than poll
  2002-10-29  0:02       ` Davide Libenzi
@ 2002-10-29  1:51         ` Jamie Lokier
  2002-10-29  5:06           ` Davide Libenzi
  0 siblings, 1 reply; 22+ messages in thread
From: Jamie Lokier @ 2002-10-29  1:51 UTC (permalink / raw)
  To: Davide Libenzi; +Cc: Linux Kernel Mailing List, linux-aio, lse-tech

Davide Libenzi wrote:
> Jamie, doing that is not a real problem. The fact is that sys_epoll aimed
> to solve issues where scalability on huge number of fds is involved. By
> covering sockets ( network connections ) and pipes ( cgi execution ) you
> have a pretty wide scalability addressing. Usually you know from where and
> fd born, so you're typically able to handle it correctly. Those reasons,
> togheter with the fact that I did not want to introduce revolutions in the
> kernel, drove me to limit the sys_epoll coverage to sockets and pipes.

Oh I agree this is an acceptable limitation.  Just wondering whether I
can safely depend on an fd being a socket/pipe being sufficient?
I.e. does it work on a non-IP socket, a packet socket, an IPX socket
etc?

It would be good if epoll would at least refuse to register fds that
it can't handle, returning EINVAL for them.  If it's as simple as
socket+pipe, that's a trivial test in ep_insert.

I've just read the /dev/epoll patch.  I think it makes sense, in the
long run, to share infrastructure with that other event notification
subsystem - sigio.  The two should really be interchangeable interfaces
to the same underlying event notification system - not one interface
handling some fds and the other handling different fds.

(Ideally, though, with the new waitqueue wakeup callback functions
that were needed for aio the old fd poll mechanism can be made to
generate events - which epoll and sigio and aio and poll() could all
use - full circle back to a beautiful and harmonious unix world once
more.)

-- Jamie

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] epoll more scalable than poll
  2002-10-29  0:48         ` Jamie Lokier
@ 2002-10-29  1:53           ` Jamie Lokier
  0 siblings, 0 replies; 22+ messages in thread
From: Jamie Lokier @ 2002-10-29  1:53 UTC (permalink / raw)
  To: bert hubert, Davide Libenzi, Hanna Linder,
	Linux Kernel Mailing List, linux-aio, lse-tech

Jamie Lokier wrote:
> bert hubert wrote:
> > > :( I was hoping sys_epoll would be scalable without increasing the
> > > number of system calls per event.
> > 
> > I see only one call per event? sys_epoll_wait. Perhaps sys_epoll_ctl to
> > register a new interest.
> 
> As David pointed out, you need a second call before the sys_epoll_wait
> if you're waiting for fds that epoll doesn't support (such as a tty).
           ^
       insert "also"

-- Jamie

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] epoll more scalable than poll
  2002-10-29  1:51         ` Jamie Lokier
@ 2002-10-29  5:06           ` Davide Libenzi
  2002-10-29 11:20             ` Jamie Lokier
  0 siblings, 1 reply; 22+ messages in thread
From: Davide Libenzi @ 2002-10-29  5:06 UTC (permalink / raw)
  To: Jamie Lokier; +Cc: Linux Kernel Mailing List, lse-tech

On Tue, 29 Oct 2002, Jamie Lokier wrote:

> Davide Libenzi wrote:
> Oh I agree this is an acceptable limitation.  Just wondering whether I
> can safely depend on an fd being a socket/pipe being sufficient?
> I.e. does it work on a non-IP socket, a packet socket, an IPX socket
> etc?

Yes, by plugging the sk_wake_async() that is called from std ->data_ready
and ->write_space of generic socket support, all socket types are
supported. Well, I should say "should" instead of "are" because I never
tested it with sockets different from TCP/IP :)

> It would be good if epoll would at least refuse to register fds that
> it can't handle, returning EINVAL for them.  If it's as simple as
> socket+pipe, that's a trivial test in ep_insert.

This can certainly be implemented if many of you feel that it would be
useful. The clean way to tell whether a file* is of a given type would
be to make the "struct file_operations" of the compatible files ( sockets
and pipes ) non-static and to use something like :

if (f->f_op == ...)

to test the target file type. I'm already doing this to verify the epoll
file descriptor coherence.

> I've just read the /dev/epoll patch.  I think it makes sense, in the
> long run, to share infrastructure with that other event notification
> subsystem - sigio.  The two should really be interchangable interfaces
> to the same underlying event notification system - not one interface
> handling some fds and the other handling different fds.

IMHO sys_epoll is going to be a replacement for rt-signals, because it
scales better, it collapses events and does not have the overflowing queue
problem.

> (Ideally, though, with the new waitqueue wakeup callback functions
> that were needed for aio the old fd poll mechanism can be made to
> generate events - which epoll and sigio and aio and poll() could all
> use - full circle back to a beautiful and harmonious unix world once
> more.)

The sys_epoll interface was coded to use the existing infrastructure without
adding any legacy code to suit the implementation. Basically,
besides the few lines added to fs/pipe.c to support pipes ( rt-signal did
not support them ), the hook lies inside sk_wake_async().

- Davide

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] epoll more scalable than poll
  2002-10-29  5:06           ` Davide Libenzi
@ 2002-10-29 11:20             ` Jamie Lokier
  2002-10-30  0:16               ` Davide Libenzi
  0 siblings, 1 reply; 22+ messages in thread
From: Jamie Lokier @ 2002-10-29 11:20 UTC (permalink / raw)
  To: Davide Libenzi; +Cc: Linux Kernel Mailing List, lse-tech

Davide Libenzi wrote:
> IMHO sys_epoll is going to be a replacement for rt-signals, because it
> scales better, it collapses events and does not have the overflowing queue
> problem.

Scalability is also solved by the signal-per-fd patch, as you know.
The main advantage of epoll is that it's lighter weight than rt-signals.

(IMHO signal-per-fd really ought to be included in 2.6 _anyway_, regardless
of any better mechanism for reading events.)

> The sys_epoll interface was coded to use the existing infrastructure w/out
> adding any legacy code added to suite the implementation. Basically,
> besides the few lines added to fs/pipe.c to support pipes ( rt-signal did
> not support them ), the hook lays inside sk_wake_async().

I agree that was the way to do it for 2.4 and earlier - you have to
work with a range of kernels, and minimum impact.

But now in 2.5 it's appropriate to implement whatever's _right_.

Time for me to take the big plunge and try a 2.5 kernel on my IDE
laptop, I guess :-)

-- Jamie


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] epoll more scalable than poll
  2002-10-29 11:20             ` Jamie Lokier
@ 2002-10-30  0:16               ` Davide Libenzi
  0 siblings, 0 replies; 22+ messages in thread
From: Davide Libenzi @ 2002-10-30  0:16 UTC (permalink / raw)
  To: Jamie Lokier; +Cc: Linux Kernel Mailing List, lse-tech

On Tue, 29 Oct 2002, Jamie Lokier wrote:

> Davide Libenzi wrote:
> > IMHO sys_epoll is going to be a replacement for rt-signals, because it
> > scales better, it collapses events and does not have the overflowing queue
> > problem.
>
> Scalability is also solved by the signal-per-fd patch, as you know.
> The main advantage of epoll is that it's lighter weight than rt-signals.
>
> (IMHO signal-per-fd really ought to be included in 2.6 _anyway_, regardless
> of any better mechanism for reading events.)

It scales pretty well, yes. You have to be careful to build your kernel
with a huge queue to avoid SIGIO, which makes you pay something. Also, it
does not support pipes.

> > The sys_epoll interface was coded to use the existing infrastructure w/out
> > adding any legacy code added to suite the implementation. Basically,
> > besides the few lines added to fs/pipe.c to support pipes ( rt-signal did
> > not support them ), the hook lays inside sk_wake_async().
>
> I agree that was the way to do it for 2.4 and earlier - you have to
> work with a range of kernels, and minimum impact.
>
> But now in 2.5 it's appropriate to implement whatever's _right_.

Yes Jamie, you can add sys_epoll support for other devices but IMHO the
only devices where you're going to have scalability problems due to a huge
number of handled fds are 1) sockets 2) pipes. I feel that devices that do
not go over 100-500 can be easily handled with the fully supportive
poll(). The fact that you can drop a sys_epoll fd inside a poll() set
guarantees you 1) scalability, due to the storing of sockets+pipes inside the
sys_epoll fd 2) compatibility with the full set of devices. This without
screwing up the kernel code much. What do you think ?

- Davide

^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2002-10-30  0:00 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2002-10-28 21:59 [PATCH] epoll more scalable than poll Dan Kegel
2002-10-28 22:36 ` Dan Kegel
2002-10-28 22:51 ` Davide Libenzi
2002-10-28 23:03   ` Dan Kegel
  -- strict thread matches above, loose matches on Subject: below --
2002-10-28 19:14 Hanna Linder
2002-10-28 20:10 ` Hanna Linder
2002-10-28 20:56 ` Martin Waitz
2002-10-28 22:02   ` bert hubert
2002-10-28 22:15     ` bert hubert
2002-10-28 22:17   ` Davide Libenzi
2002-10-28 22:08 ` bert hubert
2002-10-28 22:29   ` Davide Libenzi
2002-10-28 23:44     ` Jamie Lokier
2002-10-29  0:02       ` Davide Libenzi
2002-10-29  1:51         ` Jamie Lokier
2002-10-29  5:06           ` Davide Libenzi
2002-10-29 11:20             ` Jamie Lokier
2002-10-30  0:16               ` Davide Libenzi
2002-10-29  0:03       ` bert hubert
2002-10-29  0:20         ` Davide Libenzi
2002-10-29  0:48         ` Jamie Lokier
2002-10-29  1:53           ` Jamie Lokier

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).