* RE: Linux's implementation of poll() not scalable?
[not found] <39F529CC.2538300@alumni.caltech.edu>
@ 2000-10-30 22:22 ` Mike Jagdis
2000-11-01 16:09 ` Dan Kegel
0 siblings, 1 reply; 10+ messages in thread
From: Mike Jagdis @ 2000-10-30 22:22 UTC (permalink / raw)
To: dank, Linus Torvalds, linux-kernel
[-- Attachment #1: Type: text/plain, Size: 1322 bytes --]
Here's something I did last year and then put on ice, partly
through lack of time and partly because I thought I'd pick
it up for 2.5.
All this talk of event queues misses one thing: we already
have an event queue mechanism. They're called wait queues.
The only problem is that the only on-event action possible
is to wake the process (assuming it was asleep in the first
place). This patch firstly extends the wait queue mechanism
to allow an arbitrary action to be performed. Then I rewrote
the select/poll implementation to use event queueing to avoid
rescanning descriptors that had not changed - and restructured
the loops to be rather more efficient. This approach doesn't
need any changes to driver poll routines, it doesn't need
backwards mapping struct files. It should be fairly easy to
implement a /dev/poll mechanism using this, although I haven't
yet.
Yes, the change to wait queues has a slight cost, but it isn't
great and the main part of it only happens if you actually sleep.
Performance graphs and the lmbench derived test programs I
used are at http://www.purplet.demon.co.uk/linux/select/ (bounce
in and out of the index page 'cos the next and prev buttons
aren't wired up :-) )
Oh, and I updated this patch for 2.4.0-test9.
Comments and opinions are, as always, welcome :-).
Mike
[-- Attachment #2: select.patch --]
[-- Type: application/octet-stream, Size: 30285 bytes --]
diff -urN -x *.[oa] -x .config -x .version -x .depend -x .hdepend -x *.flags -x autoconf.h -x modversions.h -x version.h -x asm -x modules -x config -x soundmodem linux-2.4.0-test9/fs/select.c linux-2.4.0-test9+/fs/select.c
--- linux-2.4.0-test9/fs/select.c Mon Jul 24 06:39:44 2000
+++ linux-2.4.0-test9+/fs/select.c Sun Oct 29 20:53:15 2000
@@ -9,238 +9,325 @@
* flag set in its personality we do *not* modify the given timeout
* parameter to reflect time remaining.
*
- * 24 January 2000
- * Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation
- * of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian).
+ * August 1999 - February 2000
+ * Rewritten to use a wake up callback to queue events
+ * after sleeping, plus general performance enhancements.
+ * -- Mike Jagdis <jaggy@purplet.demon.co.uk>
*/
#include <linux/malloc.h>
#include <linux/smp_lock.h>
+#include <linux/list.h>
#include <linux/poll.h>
#include <linux/file.h>
#include <asm/uaccess.h>
-#define ROUND_UP(x,y) (((x)+(y)-1)/(y))
-#define DEFAULT_POLLMASK (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)
+/* Claim the file_lock semaphore around the whole fdset scan loop
+ * rather than when looking at an fdset bit within the loop (this
+ * is done by fget()/fput()). This holds a read lock on the descriptor
+ * table for longer but reduces overhead in the loop.
+ * Note that poll accesses user space while holding the file lock.
+ * This may be an issue for threaded programs?
+ */
+#define WIDE_FILE_LOCK
-struct poll_table_entry {
- struct file * filp;
- wait_queue_t wait;
- wait_queue_head_t * wait_address;
-};
-
-struct poll_table_page {
- struct poll_table_page * next;
- struct poll_table_entry * entry;
- struct poll_table_entry entries[0];
-};
-#define POLL_TABLE_FULL(table) \
- ((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table))
+#define ROUND_UP(x,y) (((x)+(y)-1)/(y))
+#define DEFAULT_POLLMASK (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)
-/*
- * Ok, Peter made a complicated, but straightforward multiple_wait() function.
- * I have rewritten this, taking some shortcuts: This code may not be easy to
- * follow, but it should be free of race-conditions, and it's practical. If you
- * understand what I'm doing here, then you understand how the linux
- * sleep/wakeup mechanism works.
- *
- * Two very simple procedures, poll_wait() and poll_freewait() make all the
- * work. poll_wait() is an inline-function defined in <linux/poll.h>,
- * as all select/poll functions have to call it to add an entry to the
- * poll table.
- */
-void poll_freewait(poll_table* pt)
+void poll_freewait(poll_table * ptab)
{
- struct poll_table_page * p = pt->table;
- while (p) {
- struct poll_table_entry * entry;
- struct poll_table_page *old;
+ struct poll_table_head *p;
- entry = p->entry;
- do {
- entry--;
- remove_wait_queue(entry->wait_address,&entry->wait);
+ p = ptab->head;
+ while (p) {
+ struct poll_table_entry *entry;
+ entry = (struct poll_table_entry *)(p + 1);
+ while (entry < p->entry) {
+ remove_wait_queue(entry->wait_address, &entry->wait);
fput(entry->filp);
- } while (entry > p->entries);
- old = p;
+ entry++;
+ }
+ p = p->next;
+ }
+ p = ptab->head;
+ while (p) {
+ struct poll_table_head *old = p;
p = p->next;
free_page((unsigned long) old);
}
}
-void __pollwait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
+static int
+__pollwake(wait_queue_t *wq, unsigned int mode, const int sync)
{
- struct poll_table_page *table = p->table;
-
- if (!table || POLL_TABLE_FULL(table)) {
- struct poll_table_page *new_table;
-
- new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
- if (!new_table) {
- p->error = -ENOMEM;
- __set_current_state(TASK_RUNNING);
- return;
+ struct poll_table_entry *p;
+ struct poll_table_head *head;
+ unsigned long flags;
+
+ wq->onwake = NULL;
+ p = (void *)wq - offsetof(struct poll_table_entry, wait);
+ head = (struct poll_table_head *)((unsigned long)p & PAGE_MASK);
+
+ spin_lock_irqsave(&head->poll_table->woken_lock, flags);
+ p->woken_list = head->poll_table->woken_list;
+ head->poll_table->woken_list = p;
+ spin_unlock_irqrestore(&head->poll_table->woken_lock, flags);
+
+ if (wq->task->state & (mode & ~TASK_EXCLUSIVE)) {
+ if (!sync) {
+ wake_up_process(wq->task);
+ goto out;
}
- new_table->entry = new_table->entries;
- new_table->next = table;
- p->table = new_table;
- table = new_table;
+ wake_up_process_synchronous(wq->task);
}
+out:
+ return 0;
+}
- /* Add a new entry */
- {
- struct poll_table_entry * entry = table->entry;
- table->entry = entry+1;
+void __pollwait(struct file * filp, wait_queue_head_t * wait_address, poll_table *ptab)
+{
+ struct poll_table_head * tmp, * p = ptab->last;
+ struct poll_table_entry *last_entry;
+
+ last_entry = (void *)p + PAGE_SIZE - sizeof(struct poll_table_entry);
+ if (p->entry <= last_entry) {
+ struct poll_table_entry *entry;
+ok_table:
+ entry = p->entry++;
get_file(filp);
entry->filp = filp;
+ entry->n = ptab->n;
+ init_waitqueue_entry_onwake(&entry->wait, current, __pollwake);
entry->wait_address = wait_address;
- init_waitqueue_entry(&entry->wait, current);
- add_wait_queue(wait_address,&entry->wait);
+ add_wait_queue(wait_address, &entry->wait);
+ return;
}
-}
-#define __IN(fds, n) (fds->in + n)
-#define __OUT(fds, n) (fds->out + n)
-#define __EX(fds, n) (fds->ex + n)
-#define __RES_IN(fds, n) (fds->res_in + n)
-#define __RES_OUT(fds, n) (fds->res_out + n)
-#define __RES_EX(fds, n) (fds->res_ex + n)
-
-#define BITS(fds, n) (*__IN(fds, n)|*__OUT(fds, n)|*__EX(fds, n))
-
-static int max_select_fd(unsigned long n, fd_set_bits *fds)
-{
- unsigned long *open_fds;
- unsigned long set;
- int max;
-
- /* handle last in-complete long-word first */
- set = ~(~0UL << (n & (__NFDBITS-1)));
- n /= __NFDBITS;
- open_fds = current->files->open_fds->fds_bits+n;
- max = 0;
- if (set) {
- set &= BITS(fds, n);
- if (set) {
- if (!(set & ~*open_fds))
- goto get_max;
- return -EBADF;
- }
- }
- while (n) {
- open_fds--;
- n--;
- set = BITS(fds, n);
- if (!set)
- continue;
- if (set & ~*open_fds)
- return -EBADF;
- if (max)
- continue;
-get_max:
- do {
- max++;
- set >>= 1;
- } while (set);
- max += n * __NFDBITS;
+ tmp = (void *) __get_free_page(GFP_KERNEL);
+ if (tmp) {
+ tmp->entry = (struct poll_table_entry *)(tmp + 1);
+ tmp->next = NULL;
+ tmp->poll_table = ptab;
+ p->next = tmp;
+ ptab->last = p = tmp;
+ goto ok_table;
}
-
- return max;
+ ptab->error = -ENOMEM;
}
+
#define BIT(i) (1UL << ((i)&(__NFDBITS-1)))
-#define MEM(i,m) ((m)+(unsigned)(i)/__NFDBITS)
-#define ISSET(i,m) (((i)&*(m)) != 0)
-#define SET(i,m) (*(m) |= (i))
#define POLLIN_SET (POLLRDNORM | POLLRDBAND | POLLIN | POLLHUP | POLLERR)
#define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR)
#define POLLEX_SET (POLLPRI)
-int do_select(int n, fd_set_bits *fds, long *timeout)
+int do_select(int n, unsigned long *bits, const int size, long *timeout)
{
- poll_table table, *wait;
- int retval, i, off;
- long __timeout = *timeout;
-
- read_lock(¤t->files->file_lock);
- retval = max_select_fd(n, fds);
- read_unlock(¤t->files->file_lock);
-
- if (retval < 0)
- return retval;
- n = retval;
-
- poll_initwait(&table);
- wait = &table;
- if (!__timeout)
- wait = NULL;
- retval = 0;
- for (;;) {
- set_current_state(TASK_INTERRUPTIBLE);
- for (i = 0 ; i < n; i++) {
- unsigned long bit = BIT(i);
- unsigned long mask;
- struct file *file;
-
- off = i / __NFDBITS;
- if (!(bit & BITS(fds, off)))
- continue;
- file = fget(i);
- mask = POLLNVAL;
- if (file) {
+ int retval;
+ poll_table *wait, wait_table;
+ long __timeout;
+ unsigned long bit;
+ unsigned long fds, *fdl;
+ unsigned long mask;
+ struct file *file;
+
+ wait = NULL;
+ __timeout = *timeout;
+ if (__timeout) {
+ wait_table.head = (void *) __get_free_page(GFP_KERNEL);
+ if (!wait_table.head)
+ return -ENOMEM;
+
+ poll_initwait(&wait_table);
+ wait = &wait_table;
+ }
+
+ wait_table.error = retval = 0;
+
+ if (!n)
+ goto no_setup;
+
+ fdl = bits + (n - 1) / __NFDBITS;
+
+ fds = (*fdl | fdl[size] | fdl[2*size]);
+
+#ifdef WIDE_FILE_LOCK
+ read_lock(¤t->files->file_lock);
+#endif
+ if (!fds)
+ goto next_long;
+
+ mask = (n & (__NFDBITS-1));
+ if (mask) {
+ fds &= ~(~0UL << mask);
+ if (!fds)
+ goto next_long;
+ }
+
+ do {
+ /* At this point we know some bit is set in fds. */
+ wait_table.n = (fdl - bits) * 8*sizeof(unsigned long);
+ bit = 1;
+ do {
+ /* If a long has set bits do we expect most
+ * of them to be set?
+ */
+ if ((fds & bit)) {
+ fds ^= bit;
+#ifdef WIDE_FILE_LOCK
+ file = fcheck(wait_table.n);
+#else
+ file = fget(wait_table.n);
+#endif
+ if (!file)
+ goto out_badf;
mask = DEFAULT_POLLMASK;
if (file->f_op && file->f_op->poll)
mask = file->f_op->poll(file, wait);
+#ifndef WIDE_FILE_LOCK
fput(file);
+#endif
+ /* Branch prediction says that forward
+ * conditional branches are not taken. So...
+ */
+ /* Normally have to wait for input */
+ if (!(mask & POLLIN_SET))
+ goto no_in_check;
+ if ((*fdl & bit)) {
+ fdl[3*size] |= bit;
+ retval++;
+ wait = NULL;
+ }
+no_in_check:
+ /* Normally can output */
+ if (!(fdl[size] & bit))
+ goto no_out_check;
+ if ((mask & POLLOUT_SET)) {
+ fdl[4*size] |= bit;
+ retval++;
+ wait = NULL;
+ }
+no_out_check:
+ /* Normally no exception */
+ if (!(mask & POLLEX_SET))
+ goto no_ex_check;
+ if ((fdl[2*size] & bit)) {
+ fdl[5*size] |= bit;
+ retval++;
+ wait = NULL;
+ }
+no_ex_check:
}
- if ((mask & POLLIN_SET) && ISSET(bit, __IN(fds,off))) {
- SET(bit, __RES_IN(fds,off));
+ bit += bit;
+ wait_table.n++;
+ } while (fds);
+
+next_long:
+ /* Long runs of zero bits tend to be common, especially
+ * at the end of over large sets so we use a tight loop
+ * to skip them.
+ * N.B. We allocated an extra, non-zero, long in front
+ * of bits in sys_select specifically so we can access
+ * below bits and avoid an extra test and branch.
+ */
+ do {
+ fdl--;
+ fds = *fdl | fdl[size] | fdl[2*size];
+ } while (!fds);
+ } while (fdl >= bits);
+
+#ifdef WIDE_FILE_LOCK
+ read_unlock(¤t->files->file_lock);
+#endif
+
+ if (!retval)
+ retval = wait_table.error;
+
+no_setup:
+ while (!retval && __timeout && !signal_pending(current)) {
+ unsigned long flags;
+ struct poll_table_entry *item, *next;
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (wait_table.woken_list == NULL)
+ __timeout = schedule_timeout(__timeout);
+ set_current_state(TASK_RUNNING);
+
+ /* Lift the queued events and reset the queue ready
+ * for anything that happens while we "think".
+ */
+ spin_lock_irqsave(&wait_table.woken_lock, flags);
+ item = wait_table.woken_list;
+ wait_table.woken_list = NULL;
+ spin_unlock_irqrestore(&wait_table.woken_lock, flags);
+
+ for (; item; item=next) {
+ struct poll_table_entry * p = item;
+
+ /* Reset the wait_queue's event pointer before
+ * checking in case it is triggered again.
+ */
+ next = item->woken_list;
+ set_waitqueue_entry_onwake(&p->wait, __pollwake);
+
+ file = p->filp;
+#if 0
+ /* This has to be true or we would never have
+ * done a wait on the file!
+ */
+ if (file->f_op && file->f_op->poll)
+#endif
+ mask = file->f_op->poll(file, NULL);
+
+ fdl = bits + (p->n / __NFDBITS);
+ bit = BIT(p->n);
+
+ /* Branch prediction says that forward
+ * conditional branches are not taken.
+ * Wake ups are dumb. We get woken up every time
+ * more space becomes available even if we are
+ * not interested.
+ */
+ if (!(mask & POLLIN_SET))
+ goto ev_no_in_check;
+ if ((*fdl & bit)) {
+ fdl[3*size] |= bit;
retval++;
- wait = NULL;
}
- if ((mask & POLLOUT_SET) && ISSET(bit, __OUT(fds,off))) {
- SET(bit, __RES_OUT(fds,off));
+ev_no_in_check:
+ if (!(fdl[size] & bit))
+ goto ev_no_out_check;
+ if ((mask & POLLOUT_SET)) {
+ fdl[4*size] |= bit;
retval++;
- wait = NULL;
}
- if ((mask & POLLEX_SET) && ISSET(bit, __EX(fds,off))) {
- SET(bit, __RES_EX(fds,off));
+ev_no_out_check:
+ /* Normally no exception */
+ if (!(mask & POLLEX_SET))
+ goto ev_no_ex_check;
+ if ((fdl[2*size] & bit)) {
+ fdl[5*size] |= bit;
retval++;
- wait = NULL;
}
+ev_no_ex_check:
}
- wait = NULL;
- if (retval || !__timeout || signal_pending(current))
- break;
- if(table.error) {
- retval = table.error;
- break;
- }
- __timeout = schedule_timeout(__timeout);
}
- current->state = TASK_RUNNING;
-
- poll_freewait(&table);
-
- /*
- * Up-to-date the caller timeout.
- */
+out:
+ if (*timeout)
+ poll_freewait(&wait_table);
*timeout = __timeout;
return retval;
-}
-
-static void *select_bits_alloc(int size)
-{
- return kmalloc(6 * size, GFP_KERNEL);
-}
-static void select_bits_free(void *bits, int size)
-{
- kfree(bits);
+out_badf:
+#ifdef WIDE_FILE_LOCK
+ read_unlock(¤t->files->file_lock);
+#endif
+ retval = -EBADF;
+ goto out;
}
/*
@@ -254,13 +341,22 @@
#define MAX_SELECT_SECONDS \
((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1)
+/* How big a set we can use stack for rather than going out to kmalloc.
+ * 512 fds = 388 bytes, 1024 fds = 772 bytes, 2048 fds = 1540 bytes,
+ * 4096 fds = 3068 bytes
+ * Remember: the future call depth is fairly shallow but we need
+ * to leave space for interrupts!
+ */
+#define MAX_LOC_FDS 1024
+#define MAX_LOC_LONGS (1 + 6 * FDS_LONGS(MAX_LOC_FDS))
+
asmlinkage long
sys_select(int n, fd_set *inp, fd_set *outp, fd_set *exp, struct timeval *tvp)
{
- fd_set_bits fds;
- char *bits;
+ unsigned long *data, *bits;
long timeout;
int ret, size;
+ unsigned long ldata[MAX_LOC_LONGS];
timeout = MAX_SCHEDULE_TIMEOUT;
if (tvp) {
@@ -291,29 +387,27 @@
/*
* We need 6 bitmaps (in/out/ex for both incoming and outgoing),
* since we used fdset we need to allocate memory in units of
- * long-words.
+ * long-words. We add an extra, non-zero, long at the beginning
+ * because that lets us avoid an extra test-and-branch within
+ * the loop in do_select.
*/
ret = -ENOMEM;
- size = FDS_BYTES(n);
- bits = select_bits_alloc(size);
- if (!bits)
- goto out_nofds;
- fds.in = (unsigned long *) bits;
- fds.out = (unsigned long *) (bits + size);
- fds.ex = (unsigned long *) (bits + 2*size);
- fds.res_in = (unsigned long *) (bits + 3*size);
- fds.res_out = (unsigned long *) (bits + 4*size);
- fds.res_ex = (unsigned long *) (bits + 5*size);
-
- if ((ret = get_fd_set(n, inp, fds.in)) ||
- (ret = get_fd_set(n, outp, fds.out)) ||
- (ret = get_fd_set(n, exp, fds.ex)))
+ size = FDS_LONGS(n);
+ data = ldata;
+ if (n > MAX_LOC_FDS) {
+ data = kmalloc((1 + 6 * size) * sizeof(unsigned long), GFP_KERNEL);
+ if (!data)
+ goto out_nofds;
+ }
+ *data = 1;
+ bits = data + 1;
+ if ((ret = get_fd_set(n, inp, bits)) ||
+ (ret = get_fd_set(n, outp, bits+size)) ||
+ (ret = get_fd_set(n, exp, bits+2*size)))
goto out;
- zero_fd_set(n, fds.res_in);
- zero_fd_set(n, fds.res_out);
- zero_fd_set(n, fds.res_ex);
+ memset(bits+3*size, 0, 3*size*sizeof(unsigned long));
- ret = do_select(n, &fds, &timeout);
+ ret = do_select(n, bits, size, &timeout);
if (tvp && !(current->personality & STICKY_TIMEOUTS)) {
time_t sec = 0, usec = 0;
@@ -322,8 +416,8 @@
usec = timeout % HZ;
usec *= (1000000/HZ);
}
- put_user(sec, &tvp->tv_sec);
- put_user(usec, &tvp->tv_usec);
+ __put_user(sec, &tvp->tv_sec);
+ __put_user(usec, &tvp->tv_usec);
}
if (ret < 0)
@@ -335,86 +429,148 @@
ret = 0;
}
- set_fd_set(n, inp, fds.res_in);
- set_fd_set(n, outp, fds.res_out);
- set_fd_set(n, exp, fds.res_ex);
+ if (inp)
+ __copy_to_user(inp, bits+3*size, size*sizeof(unsigned long));
+ if (outp)
+ __copy_to_user(outp, bits+4*size, size*sizeof(unsigned long));
+ if (exp)
+ __copy_to_user(exp, bits+5*size, size*sizeof(unsigned long));
out:
- select_bits_free(bits, size);
+ if (n > MAX_LOC_FDS)
+ kfree(data);
out_nofds:
return ret;
}
-#define POLLFD_PER_PAGE ((PAGE_SIZE) / sizeof(struct pollfd))
-
-static void do_pollfd(unsigned int num, struct pollfd * fdpage,
- poll_table ** pwait, int *count)
+static int do_poll(unsigned int nfds, struct pollfd *ufds, long timeout)
{
- int i;
+ int retval;
+ poll_table *wait, wait_table;
+
+ wait = NULL;
+ if (timeout) {
+ wait_table.head = (void *) __get_free_page(GFP_KERNEL);
+ if (!wait_table.head)
+ return -ENOMEM;
- for (i = 0; i < num; i++) {
+ poll_initwait(&wait_table);
+ wait = &wait_table;
+ }
+
+ wait_table.error = retval = 0;
+
+#ifdef WIDE_FILE_LOCK
+ read_lock(¤t->files->file_lock);
+#endif
+ for (wait_table.n = 0; wait_table.n < nfds; wait_table.n++) {
int fd;
- unsigned int mask;
- struct pollfd *fdp;
+ unsigned int mask, events;
mask = 0;
- fdp = fdpage+i;
- fd = fdp->fd;
+ if (__get_user(fd, &ufds[wait_table.n].fd))
+ goto out_fault;
if (fd >= 0) {
- struct file * file = fget(fd);
+ struct file * file;
+#ifdef WIDE_FILE_LOCK
+ file = fcheck(fd);
+#else
+ file = fget(fd);
+#endif
mask = POLLNVAL;
if (file != NULL) {
mask = DEFAULT_POLLMASK;
if (file->f_op && file->f_op->poll)
- mask = file->f_op->poll(file, *pwait);
- mask &= fdp->events | POLLERR | POLLHUP;
+ mask = file->f_op->poll(file, wait);
+#ifndef WIDE_FILE_LOCK
fput(file);
+#endif
+ if (__get_user(events, &ufds[wait_table.n].events))
+ goto out_fault;
+ mask &= events | POLLERR | POLLHUP;
}
if (mask) {
- *pwait = NULL;
- (*count)++;
+ wait = NULL;
+ retval++;
}
}
- fdp->revents = mask;
+ __put_user(mask, &ufds[wait_table.n].revents);
}
-}
-static int do_poll(unsigned int nfds, unsigned int nchunks, unsigned int nleft,
- struct pollfd *fds[], poll_table *wait, long timeout)
-{
- int count = 0;
- poll_table* pt = wait;
+#ifdef WIDE_FILE_LOCK
+ read_unlock(¤t->files->file_lock);
+#endif
- for (;;) {
- unsigned int i;
+ if (!retval)
+ retval = wait_table.error;
+
+ while (!retval && timeout && !signal_pending(current)) {
+ unsigned long flags;
+ struct poll_table_entry *item, *next;
set_current_state(TASK_INTERRUPTIBLE);
- for (i=0; i < nchunks; i++)
- do_pollfd(POLLFD_PER_PAGE, fds[i], &pt, &count);
- if (nleft)
- do_pollfd(nleft, fds[nchunks], &pt, &count);
- pt = NULL;
- if (count || !timeout || signal_pending(current))
- break;
- if(wait->error) {
- return wait->error;
+ if (wait_table.woken_list == NULL)
+ timeout = schedule_timeout(timeout);
+ set_current_state(TASK_RUNNING);
+
+ /* Lift the queued events and reset the queue ready
+ * for anything that happens while we "think".
+ */
+ spin_lock_irqsave(&wait_table.woken_lock, flags);
+ item = wait_table.woken_list;
+ wait_table.woken_list = NULL;
+ spin_unlock_irqrestore(&wait_table.woken_lock, flags);
+
+ for (; item; item=next) {
+ struct poll_table_entry * p = item;
+ unsigned long mask, events;
+ struct file * file;
+
+ /* Reset the wait_queue's event pointer before
+ * checking in case it is triggered again.
+ */
+ next = item->woken_list;
+ set_waitqueue_entry_onwake(&p->wait, __pollwake);
+
+ file = p->filp;
+#if 0
+ /* This has to be true or we would never have
+ * done a wait on the file!
+ */
+ if (file->f_op && file->f_op->poll)
+#endif
+ mask = file->f_op->poll(file, NULL);
+ if (__get_user(events, &ufds[p->n].events))
+ goto out_fault;
+ mask &= events | POLLERR | POLLHUP;
+ if (mask) {
+ retval++;
+ __put_user(mask, &ufds[p->n].revents);
+ }
}
- timeout = schedule_timeout(timeout);
}
- current->state = TASK_RUNNING;
- return count;
+out:
+ if (timeout)
+ poll_freewait(&wait_table);
+ return retval;
+
+out_fault:
+#ifdef WIDE_FILE_LOCK
+ read_unlock(¤t->files->file_lock);
+#endif
+ retval = -EFAULT;
+ goto out;
}
-asmlinkage long sys_poll(struct pollfd * ufds, unsigned int nfds, long timeout)
+asmlinkage long
+sys_poll(struct pollfd * ufds, unsigned int nfds, long timeout)
{
- int i, j, fdcount, err;
- struct pollfd **fds;
- poll_table table, *wait;
- int nchunks, nleft;
+ int err;
/* Do a sanity check on nfds ... */
+ err = -EINVAL;
if (nfds > current->files->max_fds)
- return -EINVAL;
+ goto out;
if (timeout) {
/* Careful about overflow in the intermediate values */
@@ -424,69 +580,13 @@
timeout = MAX_SCHEDULE_TIMEOUT;
}
- poll_initwait(&table);
- wait = &table;
- if (!timeout)
- wait = NULL;
-
- err = -ENOMEM;
- fds = NULL;
- if (nfds != 0) {
- fds = (struct pollfd **)kmalloc(
- (1 + (nfds - 1) / POLLFD_PER_PAGE) * sizeof(struct pollfd *),
- GFP_KERNEL);
- if (fds == NULL)
- goto out;
- }
+ err = verify_area(VERIFY_WRITE, ufds, nfds * sizeof(struct pollfd));
+ if (!err) {
+ err = do_poll(nfds, ufds, timeout);
- nchunks = 0;
- nleft = nfds;
- while (nleft > POLLFD_PER_PAGE) { /* allocate complete PAGE_SIZE chunks */
- fds[nchunks] = (struct pollfd *)__get_free_page(GFP_KERNEL);
- if (fds[nchunks] == NULL)
- goto out_fds;
- nchunks++;
- nleft -= POLLFD_PER_PAGE;
- }
- if (nleft) { /* allocate last PAGE_SIZE chunk, only nleft elements used */
- fds[nchunks] = (struct pollfd *)__get_free_page(GFP_KERNEL);
- if (fds[nchunks] == NULL)
- goto out_fds;
- }
-
- err = -EFAULT;
- for (i=0; i < nchunks; i++)
- if (copy_from_user(fds[i], ufds + i*POLLFD_PER_PAGE, PAGE_SIZE))
- goto out_fds1;
- if (nleft) {
- if (copy_from_user(fds[nchunks], ufds + nchunks*POLLFD_PER_PAGE,
- nleft * sizeof(struct pollfd)))
- goto out_fds1;
- }
-
- fdcount = do_poll(nfds, nchunks, nleft, fds, wait, timeout);
-
- /* OK, now copy the revents fields back to user space. */
- for(i=0; i < nchunks; i++)
- for (j=0; j < POLLFD_PER_PAGE; j++, ufds++)
- __put_user((fds[i] + j)->revents, &ufds->revents);
- if (nleft)
- for (j=0; j < nleft; j++, ufds++)
- __put_user((fds[nchunks] + j)->revents, &ufds->revents);
-
- err = fdcount;
- if (!fdcount && signal_pending(current))
- err = -EINTR;
-
-out_fds1:
- if (nleft)
- free_page((unsigned long)(fds[nchunks]));
-out_fds:
- for (i=0; i < nchunks; i++)
- free_page((unsigned long)(fds[i]));
- if (nfds != 0)
- kfree(fds);
+ if (!err && signal_pending(current))
+ err = -EINTR;
+ }
out:
- poll_freewait(&table);
return err;
}
diff -urN -x *.[oa] -x .config -x .version -x .depend -x .hdepend -x *.flags -x autoconf.h -x modversions.h -x version.h -x asm -x modules -x config -x soundmodem linux-2.4.0-test9/include/linux/poll.h linux-2.4.0-test9+/include/linux/poll.h
--- linux-2.4.0-test9/include/linux/poll.h Mon Oct 2 19:01:39 2000
+++ linux-2.4.0-test9+/include/linux/poll.h Sun Oct 29 21:01:35 2000
@@ -8,15 +8,33 @@
#include <linux/wait.h>
#include <linux/string.h>
#include <linux/mm.h>
+#include <linux/smp_lock.h>
#include <asm/uaccess.h>
-struct poll_table_page;
+
+struct poll_table_entry {
+ int n;
+ struct file * filp;
+ struct poll_table_entry *woken_list;
+ wait_queue_t wait;
+ wait_queue_head_t * wait_address;
+};
typedef struct poll_table_struct {
+ int n;
int error;
- struct poll_table_page * table;
+ spinlock_t woken_lock;
+ struct poll_table_entry *woken_list;
+ struct poll_table_head * head, * last;
} poll_table;
+struct poll_table_head {
+ poll_table *poll_table;
+ struct poll_table_head * next;
+ struct poll_table_entry * entry;
+};
+
+
extern void __pollwait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p);
extern inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
@@ -27,20 +45,16 @@
static inline void poll_initwait(poll_table* pt)
{
+ spin_lock_init(&pt->woken_lock);
+ pt->head->entry = (struct poll_table_entry *)(pt->head + 1);
+ pt->head->poll_table = pt;
+ pt->head->next = NULL;
+ pt->woken_list = NULL;
pt->error = 0;
- pt->table = NULL;
+ pt->last = pt->head;
}
-extern void poll_freewait(poll_table* pt);
-
-/*
- * Scaleable version of the fd_set.
- */
-
-typedef struct {
- unsigned long *in, *out, *ex;
- unsigned long *res_in, *res_out, *res_ex;
-} fd_set_bits;
+extern void poll_freewait(poll_table* pt);
/*
* How many longwords for "nr" bits?
@@ -69,21 +83,6 @@
memset(fdset, 0, nr);
return 0;
}
-
-static inline
-void set_fd_set(unsigned long nr, void *ufdset, unsigned long *fdset)
-{
- if (ufdset)
- __copy_to_user(ufdset, fdset, FDS_BYTES(nr));
-}
-
-static inline
-void zero_fd_set(unsigned long nr, unsigned long *fdset)
-{
- memset(fdset, 0, FDS_BYTES(nr));
-}
-
-extern int do_select(int n, fd_set_bits *fds, long *timeout);
#endif /* KERNEL */
diff -urN -x *.[oa] -x .config -x .version -x .depend -x .hdepend -x *.flags -x autoconf.h -x modversions.h -x version.h -x asm -x modules -x config -x soundmodem linux-2.4.0-test9/include/linux/sched.h linux-2.4.0-test9+/include/linux/sched.h
--- linux-2.4.0-test9/include/linux/sched.h Mon Oct 2 19:01:19 2000
+++ linux-2.4.0-test9+/include/linux/sched.h Sun Oct 29 18:01:27 2000
@@ -541,6 +541,7 @@
extern long FASTCALL(interruptible_sleep_on_timeout(wait_queue_head_t *q,
signed long timeout));
extern void FASTCALL(wake_up_process(struct task_struct * tsk));
+extern void FASTCALL(wake_up_process_synchronous(struct task_struct * tsk));
#define wake_up(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE | TASK_EXCLUSIVE)
#define wake_up_all(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE)
diff -urN -x *.[oa] -x .config -x .version -x .depend -x .hdepend -x *.flags -x autoconf.h -x modversions.h -x version.h -x asm -x modules -x config -x soundmodem linux-2.4.0-test9/include/linux/wait.h linux-2.4.0-test9+/include/linux/wait.h
--- linux-2.4.0-test9/include/linux/wait.h Mon Oct 2 19:01:17 2000
+++ linux-2.4.0-test9+/include/linux/wait.h Sun Oct 29 17:10:05 2000
@@ -16,6 +16,7 @@
#include <linux/spinlock.h>
#include <asm/page.h>
+#include <asm/system.h>
#include <asm/processor.h>
/*
@@ -31,7 +32,7 @@
BUG(); \
} while (0)
-#define CHECK_MAGIC(x) if (x != (long)&(x)) \
+#define CHECK_MAGIC(x) if ((long)x != (long)&(x)) \
{ printk("bad magic %lx (should be %lx), ", (long)x, (long)&(x)); WQ_BUG(); }
#define CHECK_MAGIC_WQHEAD(x) do { \
@@ -43,16 +44,19 @@
} while (0)
#endif
+typedef struct __wait_queue wait_queue_t;
+typedef int (*onwake_func_t)(wait_queue_t *, unsigned int, const int);
+
struct __wait_queue {
unsigned int compiler_warning;
struct task_struct * task;
struct list_head task_list;
+ onwake_func_t onwake;
#if WAITQUEUE_DEBUG
long __magic;
long __waker;
#endif
};
-typedef struct __wait_queue wait_queue_t;
/*
* 'dual' spinlock architecture. Can be switched between spinlock_t and
@@ -109,7 +113,7 @@
#endif
#define __WAITQUEUE_INITIALIZER(name,task) \
- { 0x1234567, task, { NULL, NULL } __WAITQUEUE_DEBUG_INIT(name)}
+ { 0x1234567, task, { NULL, NULL }, (void *)-1 __WAITQUEUE_DEBUG_INIT(name)}
#define DECLARE_WAITQUEUE(name,task) \
wait_queue_t name = __WAITQUEUE_INITIALIZER(name,task)
@@ -134,17 +138,25 @@
#endif
}
-static inline void init_waitqueue_entry(wait_queue_t *q,
- struct task_struct *p)
+static inline void init_waitqueue_entry_onwake(wait_queue_t *q,
+ struct task_struct *p, onwake_func_t onwake)
{
#if WAITQUEUE_DEBUG
if (!q || !p)
WQ_BUG();
#endif
q->task = p;
+ q->onwake = onwake;
#if WAITQUEUE_DEBUG
q->__magic = (long)&q->__magic;
#endif
+}
+
+#define init_waitqueue_entry(q, p) init_waitqueue_entry_onwake(q, p, (void *)-1);
+
+static inline void set_waitqueue_entry_onwake(wait_queue_t *wq, onwake_func_t onwake)
+{
+ wq->onwake = onwake;
}
static inline int waitqueue_active(wait_queue_head_t *q)
diff -urN -x *.[oa] -x .config -x .version -x .depend -x .hdepend -x *.flags -x autoconf.h -x modversions.h -x version.h -x asm -x modules -x config -x soundmodem linux-2.4.0-test9/kernel/sched.c linux-2.4.0-test9+/kernel/sched.c
--- linux-2.4.0-test9/kernel/sched.c Mon Oct 2 19:45:01 2000
+++ linux-2.4.0-test9+/kernel/sched.c Tue Oct 24 11:49:07 2000
@@ -361,7 +361,7 @@
spin_unlock_irqrestore(&runqueue_lock, flags);
}
-static inline void wake_up_process_synchronous(struct task_struct * p)
+inline void wake_up_process_synchronous(struct task_struct * p)
{
unsigned long flags;
@@ -726,38 +726,45 @@
while (tmp != head) {
unsigned int state;
wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
+ onwake_func_t onwake = curr->onwake;
tmp = tmp->next;
#if WAITQUEUE_DEBUG
CHECK_MAGIC(curr->__magic);
#endif
- p = curr->task;
- state = p->state;
- if (state & (mode & ~TASK_EXCLUSIVE)) {
+ if (onwake == (void *)-1) {
+ p = curr->task;
+ state = p->state;
+ if (state & (mode & ~TASK_EXCLUSIVE)) {
#if WAITQUEUE_DEBUG
- curr->__waker = (long)__builtin_return_address(0);
+ curr->__waker = (long)__builtin_return_address(0);
#endif
- /*
- * If waking up from an interrupt context then
- * prefer processes which are affine to this
- * CPU.
- */
- if (irq && (state & mode & TASK_EXCLUSIVE)) {
- if (!best_exclusive)
- best_exclusive = p;
- else if ((p->processor == best_cpu) &&
- (best_exclusive->processor != best_cpu))
+ /*
+ * If waking up from an interrupt context then
+ * prefer processes which are affine to this
+ * CPU.
+ */
+ if (irq && (state & mode & TASK_EXCLUSIVE)) {
+ if (!best_exclusive)
best_exclusive = p;
- } else {
- if (sync)
- wake_up_process_synchronous(p);
- else
- wake_up_process(p);
- if (state & mode & TASK_EXCLUSIVE)
- break;
+ else if ((p->processor == best_cpu) &&
+ (best_exclusive->processor != best_cpu))
+ best_exclusive = p;
+ } else {
+ if (sync)
+ wake_up_process_synchronous(p);
+ else
+ wake_up_process(p);
+ if (state & mode & TASK_EXCLUSIVE)
+ break;
+ }
}
+ continue;
}
+
+ if (onwake && onwake(curr, mode, sync))
+ break;
}
if (best_exclusive)
best_exclusive->state = TASK_RUNNING;
^ permalink raw reply [flat|nested] 10+ messages in thread