* [patch 1/2] ufd v1 - unsequential O(1) fdmap core
@ 2007-06-02 22:59 Davide Libenzi
2007-06-03 21:19 ` Eric Dumazet
0 siblings, 1 reply; 26+ messages in thread
From: Davide Libenzi @ 2007-06-02 22:59 UTC (permalink / raw)
To: Linux Kernel Mailing List
Cc: Linus Torvalds, Andrew Morton, Ulrich Drepper, Ingo Molnar
Core code for the unsequential fdmap implementation. Random allocation,
exact allocation, de-allocation and lookup are all O(1) operations.
The only operation that is O(N) is the strict from-N-up kind of allocation,
but that only used by F_DUPFD and it's definitely not frequently used
(and current code is O(N) too).
Like the "struct fdtable", the unsequential fdmap is RCU friendly too.
Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
- Davide
Index: linux-2.6.mod/fs/fdmap.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.mod/fs/fdmap.c 2007-06-02 15:35:58.000000000 -0700
@@ -0,0 +1,473 @@
+/*
+ * fs/fdmap.c
+ *
+ * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
+ *
+ */
+
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/vmalloc.h>
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/fdmap.h>
+
+#define FDMAP_F_BUSYSLOT (1UL << (BITS_PER_LONG - 1))
+
+#define FDMAP_FD(i) ((i) + 1)
+#define FDMAP_MKUSERFD(i) ((i) - 1)
+
+struct fdmap_defer {
+ spinlock_t lock;
+ struct work_struct wq;
+ struct fd_map *next;
+};
+
+static DEFINE_PER_CPU(struct fdmap_defer, fdmap_defer_list);
+
+static inline void fdmap_init_slothead(struct fd_slot *head)
+{
+ head->prev = head->next = 0;
+}
+
+static inline int fdmap_list_empty(struct fd_slot *head)
+{
+ return head->next == 0;
+}
+
+static void fdmap_insert(struct fd_slot *head, unsigned long inew,
+ unsigned long iprev, unsigned long inext)
+{
+ struct fd_slot *new, *prev, *next;
+
+ prev = head + iprev;
+ next = head + inext;
+ new = head + inew;
+ prev->next = inew;
+ new->next = inext;
+ /*
+ * The insert function is used to re-insert the slot inside
+ * the list of free slots, so basically during fd release time.
+ * The ->next field is used by fdmap_busy_slot() to test if a
+ * slot is allocated or not. We need to make sure the ->next
+ * fields are properly set, before the updates to the ->prev
+ * fields are visible.
+ */
+ smp_wmb();
+ next->prev = inew;
+ new->prev = iprev;
+}
+
+static void fdmap_remove(struct fd_slot *head, unsigned long idel)
+{
+ struct fd_slot *del, *prev, *next;
+
+ del = head + idel;
+ prev = head + del->prev;
+ next = head + del->next;
+ prev->next = del->next;
+ next->prev = del->prev;
+}
+
+static inline void fdmap_add_slot(struct fd_slot *slots, unsigned long inew)
+{
+ fdmap_insert(slots, inew, slots[0].prev, 0);
+}
+
+static inline int fdmap_busy_slot(const struct fd_slot *ptr)
+{
+ smp_rmb();
+ return !!(ptr->next & FDMAP_F_BUSYSLOT);
+}
+
+static void *fdmap_alloc_mem(unsigned long size)
+{
+ if (size <= PAGE_SIZE)
+ return kmalloc(size, GFP_KERNEL);
+ else
+ return vmalloc(size);
+}
+
+static void fdmap_free_mem(void *data, unsigned long size)
+{
+ if (size <= PAGE_SIZE)
+ kfree(data);
+ else
+ vfree(data);
+}
+
+/**
+ * fdmap_file_get - Gets the file pointer associated with a file descriptor
+ *
+ * @fmap: [in] Pointer to the file descriptor map
+ * @fd: [in] File descriptor
+ *
+ * Returns the file pointer associated with the file @fd, or NULL
+ * if no file pointer is still associated with @fd.
+ */
+struct file *fdmap_file_get(struct fd_map *fmap, unsigned int fd)
+{
+ struct fd_slot *ptr;
+
+ ptr = fmap->slots + FDMAP_FD(fd) - fmap->base;
+ if (unlikely(!fdmap_busy_slot(ptr)))
+ return NULL;
+ return (struct file *) ptr->prev;
+}
+
+/**
+ * fdmap_install - Installs a file pointer onto a previously allocated
+ * file descriptor
+ *
+ * @fmap: [in] Pointer to the file descriptor map
+ * @fd: [in] Previously allocated file descriptor
+ * @file: [in] File pointer
+ *
+ */
+void fdmap_install(struct fd_map *fmap, unsigned int fd, struct file *file)
+{
+ smp_wmb();
+ fmap->slots[FDMAP_FD(fd) - fmap->base].prev = (unsigned long) file;
+}
+
+/**
+ * fdmap_newfd - Allocates a new file descriptor
+ *
+ * @fmap: [in] Pointer to the file descriptor map
+ * @fd: [in] File descriptor allocation hint. If @fd == FDMAP_HINT_ANY,
+ * then a random free file descriptor is allocated. If @fd is
+ * FDMAP_HINT_FDUP(FD), then a free file descriptor from FD up,
+ * is allocated. If @fd is FDMAP_HINT_EXACT(FD), the function
+ * tries to allocate the file descriptor FD, if available
+ * @flags: [in] Flags to be associated with the new file descriptor
+ *
+ * Return the newly allocated file descriptor, or a negative number in case
+ * of error.
+ */
+int fdmap_newfd(struct fd_map *fmap, int fd, unsigned long flags)
+{
+ unsigned int i;
+ struct fd_slot *ptr;
+
+ /*
+ * fd == 0 means alloc any
+ * fd < 0 means alloc one from (-fd) up
+ * fd > 0 means alloc exactly (fd - 1), if possible
+ */
+ if (fd) {
+ if (fd > 0) {
+ fd--;
+ if (unlikely(fd < fmap->base))
+ return -EBADF;
+ if (fd >= fmap->base + fmap->size)
+ return -EMFILE;
+ fd = FDMAP_FD(fd);
+ ptr = fmap->slots + fd - fmap->base;
+ if (fdmap_busy_slot(ptr))
+ return -EBADF;
+ } else {
+ i = FDMAP_FD(-fd);
+ if (unlikely(i <= fmap->base))
+ return -EBADF;
+ i -= fmap->base;
+ ptr = fmap->slots + i;
+ for (; i <= fmap->size; i++, ptr++)
+ if (!fdmap_busy_slot(ptr))
+ break;
+ if (i > fmap->size)
+ return -EMFILE;
+ fd = i;
+ }
+ } else {
+ if (unlikely(fdmap_list_empty(&fmap->slots[0])))
+ return -EMFILE;
+ fd = (int) fmap->slots[0].next;
+ ptr = fmap->slots + fd;
+ }
+ fdmap_remove(&fmap->slots[0], fd);
+ /*
+ * We need to make sure that at the time ->next is marked as allocated,
+ * ->prev is properly initialize to NULL. This way the RCU-aware
+ * fdmap_file_get() can return the "correct" invalid NULL value, instead
+ * of garbage.
+ */
+ ptr->prev = 0;
+ smp_wmb();
+ ptr->next = FDMAP_F_BUSYSLOT | flags;
+
+ return fmap->base + FDMAP_MKUSERFD(fd);
+}
+
+/**
+ * fdmap_putfd - Releases a previously allocated file descriptor
+ *
+ * @fmap: [in] Pointer to the file descriptor map
+ * @fd: [in] Previously allocated file descriptor
+ *
+ */
+void fdmap_putfd(struct fd_map *fmap, unsigned int fd)
+{
+ /*
+ * The smp_wmb() inside fdmap_insert() takes care of making
+ * the transaction RCU friendly.
+ */
+ fdmap_add_slot(&fmap->slots[0], FDMAP_FD(fd) - fmap->base);
+}
+
+/**
+ * fdmap_get_fdflags - Retrieves an allocated file descriptor flags
+ *
+ * @fmap: [in] Pointer to the file descriptor map
+ * @fd: [in] Previously allocated file descriptor
+ *
+ * Returns the file descriptor flags, if the descriptor is allocated,
+ * or zero if not.
+ */
+unsigned long fdmap_get_fdflags(struct fd_map *fmap, unsigned int fd)
+{
+ struct fd_slot *ptr;
+
+ ptr = fmap->slots + FDMAP_FD(fd) - fmap->base;
+ if (unlikely(!fdmap_busy_slot(ptr)))
+ return 0;
+
+ return ptr->next & ~FDMAP_F_BUSYSLOT;
+}
+
+/**
+ * fdmap_set_fdflags - Changes an allocated file descriptor flags. It allows
+ * to specify a set of flags to be cleared, together with
+ * a set of flags to be set
+ *
+ * @fmap: [in] Pointer to the file descriptor map
+ * @fd: [in] Previously allocated file descriptor
+ * @fclear: [in] Set of flags to be cleared
+ * @fadd: [in] Set of flags to be set
+ *
+ * Returns the file descriptor flags, if the descriptor is allocated,
+ * or zero if not.
+ */
+int fdmap_set_fdflags(struct fd_map *fmap, unsigned int fd, unsigned long fclear,
+ unsigned long fadd)
+{
+ struct fd_slot *ptr;
+
+ ptr = fmap->slots + FDMAP_FD(fd) - fmap->base;
+ if (unlikely(!fdmap_busy_slot(ptr)))
+ return -EBADF;
+ fclear &= ~FDMAP_F_BUSYSLOT;
+ ptr->next = (ptr->next & ~fclear) | fadd;
+
+ return 0;
+}
+
+/**
+ * fdmap_for_each_file - Enumerates all the file pointers inside the allocated
+ * file descriptors. Only if the file pointer is not NULL
+ * (although the file descriptor may be allocated), the
+ * callback function is invoked
+ *
+ * @fmap: [in] Pointer to the file descriptor map
+ * @proc: [in] Callback to be invoked for every file pointer
+ * @priv: [in] Private data for the callback (passed in the first parameter)
+ *
+ */
+void fdmap_for_each_file(struct fd_map *fmap, int (*proc)(void *, struct file *),
+ void *priv)
+{
+ unsigned int i;
+ struct fd_slot *ptr;
+ struct file *file;
+
+ for (i = 0, ptr = fmap->slots + 1; i < fmap->size; i++, ptr++) {
+ if (fdmap_busy_slot(ptr)) {
+ file = (struct file *) ptr->prev;
+ if (file && (*proc)(priv, file))
+ break;
+ }
+ }
+}
+
+/**
+ * fdmap_next_flag_set - Retrieves the next, not empty set, of allocated file
+ * desciptors having the bit @bit set in their flags
+ *
+ * @fmap: [in] Pointer to the file descriptor map
+ * @bit: [in] Bit number to test for
+ * @start: [in/out] Next position to scan from. Must be set to set to start
+ * a new scan, and it will be updated at every call to this
+ * function
+ * @base: [out] File descriptor base value for the returned set
+ * @fset: [out] Bit set of file desciptors having the bit @bit set in
+ * their flags. Bit #0 of @fset refers to the file desciptor
+ * @base, bit #1 to @base+1, etc...
+ *
+ * Returns a non zero value if the next set is available, or zero if no more
+ * file desciptors with the bit @bit set are available.
+ */
+int fdmap_next_flag_set(struct fd_map *fmap, int bit, unsigned int *start,
+ unsigned int *base, unsigned long *fset)
+{
+ unsigned int i, j;
+ unsigned long f, mask;
+ struct fd_slot *ptr;
+
+ mask = 1 << bit;
+ i = *start;
+ ptr = fmap->slots + FDMAP_FD(i);
+ do {
+ if (i >= fmap->size)
+ return 0;
+ *base = i + fmap->base;
+ for (j = 0, f = 0; i < fmap->size && j < BITS_PER_LONG;
+ i++, j++, ptr++) {
+ if (fdmap_busy_slot(ptr) &&
+ (ptr->next & mask))
+ f |= 1 << j;
+ }
+ } while (!f);
+ *start = i;
+ *fset = f;
+
+ return 1;
+}
+
+/**
+ * fdmap_copy - Copies the content of a file descriptor map to another
+ *
+ * @dfmap: [in/out] Pointer to the destination file descriptor map
+ * @sfmap: [in] Pointer to the source file descriptor map
+ * @count: [out] Pointer to the number of allocated file descriptor
+ * transfered
+ * @dofget [in] Boolean indicating if the function should perform
+ * a get_file() for each trasfered file pointer
+ *
+ */
+void fdmap_copy(struct fd_map *dfmap, const struct fd_map *sfmap,
+ unsigned int *count, int dofget)
+{
+ unsigned int i, bcount;
+ struct fd_slot *dptr;
+ const struct fd_slot *sptr;
+
+ BUG_ON(dfmap->size < sfmap->size);
+
+ fdmap_init_slothead(&dfmap->slots[0]);
+ dptr = dfmap->slots + 1;
+ sptr = sfmap->slots + 1;
+ for (i = 0, bcount = 0; i < sfmap->size; i++, dptr++, sptr++) {
+ if (fdmap_busy_slot(sptr) && sptr->prev) {
+ if (dofget)
+ get_file((struct file *) sptr->prev);
+ *dptr = *sptr;
+ bcount++;
+ } else
+ fdmap_add_slot(&dfmap->slots[0], i + 1);
+ }
+ for (; i < dfmap->size; i++)
+ fdmap_add_slot(&dfmap->slots[0], i + 1);
+ if (count)
+ *count = bcount;
+}
+
+/**
+ * fdmap_alloc - Allocates a new file descriptor map
+ *
+ * @base: [in] Starting value for the file descriptors allocated inside this map
+ * @size: [in] Size of the map
+ * @init: [in] Boolean indicating if the free-map initialization should be done.
+ * When allocating a new must just to be copied over by fdmap_copy(),
+ * it saves time to avoid to go through the whole map memory to
+ * initialize it, when it will be overwritten soon after
+ *
+ */
+struct fd_map *fdmap_alloc(unsigned int base, unsigned int size, int init)
+{
+ unsigned int i;
+ struct fd_map *fmap;
+
+ if ((long) base + (long) size >= INT_MAX ||
+ UINT_MAX / sizeof(struct fd_slot) < size)
+ return NULL;
+ fmap = kzalloc(sizeof(struct fd_map), GFP_KERNEL);
+ if (!fmap)
+ return NULL;
+ fmap->base = base;
+ fmap->size = size;
+ fmap->slots = fdmap_alloc_mem((size + 1) * sizeof(struct fd_slot));
+ if (!fmap->slots) {
+ kfree(fmap);
+ return NULL;
+ }
+ fdmap_init_slothead(&fmap->slots[0]);
+ if (init)
+ for (i = 0; i < size; i++)
+ fdmap_add_slot(&fmap->slots[0], i + 1);
+ INIT_RCU_HEAD(&fmap->rcu);
+
+ return fmap;
+}
+
+static void fdmap_free_rcu(struct rcu_head *rcu)
+{
+ struct fd_map *fmap = container_of(rcu, struct fd_map, rcu);
+ struct fdmap_defer *fddef;
+
+ BUG_ON(!fmap);
+
+ fddef = &get_cpu_var(fdmap_defer_list);
+ spin_lock(&fddef->lock);
+ fmap->next = fddef->next;
+ fddef->next = fmap;
+ schedule_work(&fddef->wq);
+ spin_unlock(&fddef->lock);
+ put_cpu_var(fdmap_defer_list);
+}
+
+/**
+ * fdmap_free - Frees a file descriptor map. File descriptor map deallocation
+ * is done in an RCU way, since file descriptor maps must be RCU
+ * friendly
+ *
+ * @fmap: [in] Pointer to the file descriptor map to be freed
+ *
+ */
+void fdmap_free(struct fd_map *fmap)
+{
+ call_rcu(&fmap->rcu, fdmap_free_rcu);
+}
+
+static void free_fdtable_work(struct work_struct *work)
+{
+ struct fdmap_defer *f = container_of(work, struct fdmap_defer, wq);
+ struct fd_map *fmap, *next;
+
+ spin_lock_bh(&f->lock);
+ fmap = f->next;
+ f->next = NULL;
+ spin_unlock_bh(&f->lock);
+ while (fmap) {
+ next = fmap->next;
+ fdmap_free_mem(fmap->slots,
+ (fmap->size + 1) * sizeof(struct fd_slot));
+ kfree(fmap);
+ fmap = next;
+ }
+}
+
+static int __init fdmap_defer_init(void)
+{
+ int i;
+ struct fdmap_defer *fddef;
+
+ for_each_possible_cpu(i) {
+ fddef = &per_cpu(fdmap_defer_list, i);
+ spin_lock_init(&fddef->lock);
+ INIT_WORK(&fddef->wq, free_fdtable_work);
+ fddef->next = NULL;
+ }
+ return 0;
+}
+__initcall(fdmap_defer_init);
+
Index: linux-2.6.mod/include/linux/fdmap.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.mod/include/linux/fdmap.h 2007-06-02 14:26:05.000000000 -0700
@@ -0,0 +1,92 @@
+/*
+ * include/linux/fdmap.h
+ *
+ * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
+ *
+ */
+
+#ifndef _LINUX_FDMAP_H
+#define _LINUX_FDMAP_H
+
+#include <linux/rcupdate.h>
+#include <linux/fs.h>
+
+/*
+ * Flags for the file map (BITS_PER_LONG - 1) is reserved.
+ */
+#define FDMAP_BIT_CLOEXEC 0
+#define FDMAP_F_CLOEXEC (1UL << FDMAP_BIT_CLOEXEC)
+
+#define FDMAP_HINT_ANY 0
+#define FDMAP_HINT_FDUP(i) (- (int) (i))
+#define FDMAP_HINT_EXACT(i) ((int) (i) + 1)
+
+struct fd_slot {
+ unsigned long prev, next;
+};
+
+struct fd_map {
+ struct fd_map *next;
+ struct rcu_head rcu;
+ unsigned int base;
+ unsigned int size;
+ struct fd_slot *slots;
+};
+
+/**
+ * fdmap_fdof - Tells if a file descriptor value falls inside the range
+ * allowed byt @fmap
+ *
+ * @fmap: [in] Pointer to the file descriptor map
+ * @fd: [in] Previously allocated file descriptor
+ *
+ * Return non-zero if the file descriptor value falls inside the range
+ * allowed byt @fmap, or zero otherwise
+ */
+static inline int fdmap_fdof(struct fd_map *fmap, unsigned int fd)
+{
+ return fd >= fmap->base && fd < fmap->base + fmap->size;
+}
+
+/**
+ * fdmap_basefd - Returns the first file descriptor value that can be
+ * allocated inside this map
+ *
+ * @fmap: [in] Pointer to the file descriptor map
+ *
+ */
+static inline unsigned int fdmap_basefd(const struct fd_map *fmap)
+{
+ return fmap->base;
+}
+
+/**
+ * fdmap_topfd - Returns the file descriptor value after the last
+ * that can be allocated inside this map
+ *
+ * @fmap: [in] Pointer to the file descriptor map
+ *
+ */
+static inline unsigned int fdmap_topfd(const struct fd_map *fmap)
+{
+ return fmap->base + fmap->size;
+}
+
+struct file *fdmap_file_get(struct fd_map *fmap, unsigned int fd);
+void fdmap_install(struct fd_map *fmap, unsigned int fd, struct file *file);
+int fdmap_newfd(struct fd_map *fmap, int fd, unsigned long flags);
+void fdmap_putfd(struct fd_map *fmap, unsigned int fd);
+unsigned long fdmap_get_fdflags(struct fd_map *fmap, unsigned int fd);
+int fdmap_set_fdflags(struct fd_map *fmap, unsigned int fd, unsigned long fclear,
+ unsigned long fadd);
+void fdmap_for_each_file(struct fd_map *fmap, int (*proc)(void *, struct file *),
+ void *priv);
+int fdmap_next_flag_set(struct fd_map *fmap, int bit, unsigned int *start,
+ unsigned int *base, unsigned long *fset);
+void fdmap_copy(struct fd_map *dfmap, const struct fd_map *sfmap,
+ unsigned int *count, int dofget);
+struct fd_map *fdmap_alloc(unsigned int base, unsigned int size, int init);
+void fdmap_free(struct fd_map *fmap);
+
+#endif /* _LINUX_FDMAP_H */
+
Index: linux-2.6.mod/fs/Makefile
===================================================================
--- linux-2.6.mod.orig/fs/Makefile 2007-05-31 17:16:56.000000000 -0700
+++ linux-2.6.mod/fs/Makefile 2007-05-31 17:17:12.000000000 -0700
@@ -5,7 +5,7 @@
# Rewritten to use lists instead of if-statements.
#
-obj-y := open.o read_write.o file_table.o super.o \
+obj-y := open.o read_write.o file_table.o super.o fdmap.o \
char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \
ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \
attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [patch 1/2] ufd v1 - unsequential O(1) fdmap core
2007-06-02 22:59 [patch 1/2] ufd v1 - unsequential O(1) fdmap core Davide Libenzi
@ 2007-06-03 21:19 ` Eric Dumazet
2007-06-03 22:51 ` Davide Libenzi
0 siblings, 1 reply; 26+ messages in thread
From: Eric Dumazet @ 2007-06-03 21:19 UTC (permalink / raw)
To: Davide Libenzi
Cc: Linux Kernel Mailing List, Linus Torvalds, Andrew Morton,
Ulrich Drepper, Ingo Molnar
Davide Libenzi a écrit :
> Core code for the unsequential fdmap implementation. Random allocation,
> exact allocation, de-allocation and lookup are all O(1) operations.
> The only operation that is O(N) is the strict from-N-up kind of allocation,
> but that only used by F_DUPFD and it's definitely not frequently used
> (and current code is O(N) too).
> Like the "struct fdtable", the unsequential fdmap is RCU friendly too.
>
>
>
Could you please provide a diffstat ?
Me think : "Huge patch, and icache pressure for what exact gain ?"
File descriptor allocation is dust compared to socket setup costs and network
stuf. (Not speaking of close() wich is O(1) obviously)
If we want a different file descriptor allocation, why should we use a
parallel structure, and adding one level of complexity ?
Instead of finding the first zero bit in a bitmap, we could just use a cyclic
allocation, ie finding a zero bit from a 'last' position. Keeping fd_count
would help not atempting a findzerobit in the case all bits are known to be set.
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [patch 1/2] ufd v1 - unsequential O(1) fdmap core
2007-06-03 21:19 ` Eric Dumazet
@ 2007-06-03 22:51 ` Davide Libenzi
2007-06-04 6:08 ` Andrew Morton
0 siblings, 1 reply; 26+ messages in thread
From: Davide Libenzi @ 2007-06-03 22:51 UTC (permalink / raw)
To: Eric Dumazet
Cc: Linux Kernel Mailing List, Linus Torvalds, Andrew Morton,
Ulrich Drepper, Ingo Molnar
[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: TEXT/PLAIN; CHARSET=X-UNKNOWN, Size: 1949 bytes --]
On Sun, 3 Jun 2007, Eric Dumazet wrote:
> Davide Libenzi a écrit :
> > Core code for the unsequential fdmap implementation. Random allocation,
> > exact allocation, de-allocation and lookup are all O(1) operations.
> > The only operation that is O(N) is the strict from-N-up kind of allocation,
> > but that only used by F_DUPFD and it's definitely not frequently used
> > (and current code is O(N) too).
> > Like the "struct fdtable", the unsequential fdmap is RCU friendly too.
> >
> >
> >
>
> Could you please provide a diffstat ?
With or w/out the comments? :)
> Me think : "Huge patch, and icache pressure for what exact gain ?"
>
> File descriptor allocation is dust compared to socket setup costs and network
> stuf. (Not speaking of close() wich is O(1) obviously)
>
> If we want a different file descriptor allocation, why should we use a
> parallel structure, and adding one level of complexity ?
>
> Instead of finding the first zero bit in a bitmap, we could just use a cyclic
> allocation, ie finding a zero bit from a 'last' position. Keeping fd_count
> would help not atempting a findzerobit in the case all bits are known to be
> set.
A bitmap allocator made sense because it has the property of making
allocations compact. Once that requirement is relaxed, it does not make
any sense to use it (and you have still to modify it in any case).
I generally agree on code re-use, but that just not the right structure.
You can tweak it how much you like, but you're still doing a search inside
an N-sized bitmap. It's just the *wrong* structure.
I-cache pressure? All the extra code that you see out of fdmap.c/h, comes
from handling two "bitmaps", that you still have to have (unless you
plan to have a single huge botmap that covers legacy and non-sequential
areas). But with a bitmap structure, you're gonna have more D-cache
pressure. And that's a fact.
- Davide
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [patch 1/2] ufd v1 - unsequential O(1) fdmap core
2007-06-03 22:51 ` Davide Libenzi
@ 2007-06-04 6:08 ` Andrew Morton
2007-06-04 8:05 ` Ingo Molnar
0 siblings, 1 reply; 26+ messages in thread
From: Andrew Morton @ 2007-06-04 6:08 UTC (permalink / raw)
To: Davide Libenzi
Cc: Eric Dumazet, Linux Kernel Mailing List, Linus Torvalds,
Ulrich Drepper, Ingo Molnar
On Sun, 3 Jun 2007 15:51:13 -0700 (PDT) Davide Libenzi <davidel@xmailserver.org> wrote:
> A bitmap allocator made sense because it has the property of making
> allocations compact. Once that requirement is relaxed, it does not make
> any sense to use it (and you have still to modify it in any case).
> I generally agree on code re-use, but that just not the right structure.
> You can tweak it how much you like, but you're still doing a search inside
> an N-sized bitmap. It's just the *wrong* structure.
I must say that it's not really clear to me why this fdmap thing was
created. Exactly what problem is it solving, and what properties is it
designed to have?
Could not a (prehaps suitably modified) IDR tree have adequately provided
those properties?
I'm sure it's good stuff, but the patches were presented as if we all know
what they're for. But I don't. Maybe I was asleep at the time.
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [patch 1/2] ufd v1 - unsequential O(1) fdmap core
2007-06-04 6:08 ` Andrew Morton
@ 2007-06-04 8:05 ` Ingo Molnar
2007-06-04 8:09 ` Ingo Molnar
0 siblings, 1 reply; 26+ messages in thread
From: Ingo Molnar @ 2007-06-04 8:05 UTC (permalink / raw)
To: Andrew Morton
Cc: Davide Libenzi, Eric Dumazet, Linux Kernel Mailing List,
Linus Torvalds, Ulrich Drepper
* Andrew Morton <akpm@linux-foundation.org> wrote:
> I must say that it's not really clear to me why this fdmap thing was
> created. Exactly what problem is it solving, and what properties is
> it designed to have?
>
> Could not a (prehaps suitably modified) IDR tree have adequately
> provided those properties?
>
> I'm sure it's good stuff, but the patches were presented as if we all
> know what they're for. But I don't. Maybe I was asleep at the time.
i think this sums it up:
http://www.uwsg.iu.edu/hypermail/linux/kernel/0705.3/2490.html
and some more, with a benchmark as well:
http://linux.derkeiler.com/Mailing-Lists/Kernel/2007-05/msg13685.html
( blame Google for picking up two different lkml archives for the two
searches i did :-)
Ingo
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [patch 1/2] ufd v1 - unsequential O(1) fdmap core
2007-06-04 8:05 ` Ingo Molnar
@ 2007-06-04 8:09 ` Ingo Molnar
2007-06-04 8:34 ` Andrew Morton
0 siblings, 1 reply; 26+ messages in thread
From: Ingo Molnar @ 2007-06-04 8:09 UTC (permalink / raw)
To: Andrew Morton
Cc: Davide Libenzi, Eric Dumazet, Linux Kernel Mailing List,
Linus Torvalds, Ulrich Drepper
* Ingo Molnar <mingo@elte.hu> wrote:
> i think this sums it up:
>
> http://www.uwsg.iu.edu/hypermail/linux/kernel/0705.3/2490.html
i mean this mail started it:
http://linux.derkeiler.com/Mailing-Lists/Kernel/2007-05/msg13070.html
> and some more, with a benchmark as well:
>
> http://linux.derkeiler.com/Mailing-Lists/Kernel/2007-05/msg13685.html
Ingo
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [patch 1/2] ufd v1 - unsequential O(1) fdmap core
2007-06-04 8:09 ` Ingo Molnar
@ 2007-06-04 8:34 ` Andrew Morton
2007-06-04 8:42 ` Ingo Molnar
2007-06-04 10:28 ` Eric Dumazet
0 siblings, 2 replies; 26+ messages in thread
From: Andrew Morton @ 2007-06-04 8:34 UTC (permalink / raw)
To: Ingo Molnar
Cc: Davide Libenzi, Eric Dumazet, Linux Kernel Mailing List,
Linus Torvalds, Ulrich Drepper
On Mon, 4 Jun 2007 10:09:41 +0200 Ingo Molnar <mingo@elte.hu> wrote:
>
> * Ingo Molnar <mingo@elte.hu> wrote:
>
> > i think this sums it up:
> >
> > http://www.uwsg.iu.edu/hypermail/linux/kernel/0705.3/2490.html
>
> i mean this mail started it:
>
> http://linux.derkeiler.com/Mailing-Lists/Kernel/2007-05/msg13070.html
>
> > and some more, with a benchmark as well:
> >
> > http://linux.derkeiler.com/Mailing-Lists/Kernel/2007-05/msg13685.html
>
Yeah, I remember all that but I don't think it provides a suitable
description of what all this code is there for - what problem it is
solving and how it solves it.
If we just want some pseudo-private fd space for glibc to use then I'd have
thought that the existing code could be tweaked to do that: top-down
allocation, start at some high offset, etc. But apparently there's more
to it than this.
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [patch 1/2] ufd v1 - unsequential O(1) fdmap core
2007-06-04 8:34 ` Andrew Morton
@ 2007-06-04 8:42 ` Ingo Molnar
2007-06-04 8:47 ` Andrew Morton
2007-06-04 10:28 ` Eric Dumazet
1 sibling, 1 reply; 26+ messages in thread
From: Ingo Molnar @ 2007-06-04 8:42 UTC (permalink / raw)
To: Andrew Morton
Cc: Davide Libenzi, Eric Dumazet, Linux Kernel Mailing List,
Linus Torvalds, Ulrich Drepper
* Andrew Morton <akpm@linux-foundation.org> wrote:
> If we just want some pseudo-private fd space for glibc to use then I'd
> have thought that the existing code could be tweaked to do that:
> top-down allocation, start at some high offset, etc. But apparently
> there's more to it than this.
top-down has the problem of rlimits: 'where is top' is a variable
notion.
start-at-high-offset using the existing scheme has a 'bitmap size'
problem: even at 2^28 the bitmap size would be 32+ MB. per process (!).
The bitmap could be allocated on demand, but that slows down the current
code, uglifies it, and it would still end up somewhere looking a bit
like Davide's clean new code.
so, instead of trying to mesh this thing into the old fd data structures
which are very much centered around and tailored to the
continuous-allocation usage model, Davide cleanly separated it out into
a separate data structure that fits this independently-allocated usage
model well and leaves the original data structure alone. I'm strongly in
favor of such clean data structure separations.
Ingo
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [patch 1/2] ufd v1 - unsequential O(1) fdmap core
2007-06-04 8:42 ` Ingo Molnar
@ 2007-06-04 8:47 ` Andrew Morton
2007-06-04 13:05 ` Davide Libenzi
0 siblings, 1 reply; 26+ messages in thread
From: Andrew Morton @ 2007-06-04 8:47 UTC (permalink / raw)
To: Ingo Molnar
Cc: Davide Libenzi, Eric Dumazet, Linux Kernel Mailing List,
Linus Torvalds, Ulrich Drepper
On Mon, 4 Jun 2007 10:42:27 +0200 Ingo Molnar <mingo@elte.hu> wrote:
>
> * Andrew Morton <akpm@linux-foundation.org> wrote:
>
> > If we just want some pseudo-private fd space for glibc to use then I'd
> > have thought that the existing code could be tweaked to do that:
> > top-down allocation, start at some high offset, etc. But apparently
> > there's more to it than this.
>
> top-down has the problem of rlimits: 'where is top' is a variable
> notion.
Well, sort-of. rlimits affect the number of open files, not the actual fd
indices. But whatever.
> start-at-high-offset using the existing scheme has a 'bitmap size'
> problem: even at 2^28 the bitmap size would be 32+ MB. per process (!).
> The bitmap could be allocated on demand, but that slows down the current
> code, uglifies it, and it would still end up somewhere looking a bit
> like Davide's clean new code.
OK, so the existing code doesn't support a holey bitmap.
> so, instead of trying to mesh this thing into the old fd data structures
> which are very much centered around and tailored to the
> continuous-allocation usage model, Davide cleanly separated it out into
> a separate data structure that fits this independently-allocated usage
> model well and leaves the original data structure alone. I'm strongly in
> favor of such clean data structure separations.
a) Were IDR trees evaluated and if so, why were they rejected?
b) it's a bit disappointing that this new allocator is only usable for
one specific application. We have a *lot* of places in the kernel which
want allocators of this type. Many of them are open-coded and crappy.
Some use IDR trees.
If we're going to go and add a complete new allocator, it would be
good to position it as a library thing if poss.
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [patch 1/2] ufd v1 - unsequential O(1) fdmap core
2007-06-04 8:34 ` Andrew Morton
2007-06-04 8:42 ` Ingo Molnar
@ 2007-06-04 10:28 ` Eric Dumazet
2007-06-04 12:55 ` Davide Libenzi
1 sibling, 1 reply; 26+ messages in thread
From: Eric Dumazet @ 2007-06-04 10:28 UTC (permalink / raw)
To: Andrew Morton
Cc: Ingo Molnar, Davide Libenzi, Linux Kernel Mailing List,
Linus Torvalds, Ulrich Drepper
On Mon, 4 Jun 2007 01:34:49 -0700
Andrew Morton <akpm@linux-foundation.org> wrote:
> On Mon, 4 Jun 2007 10:09:41 +0200 Ingo Molnar <mingo@elte.hu> wrote:
>
> >
> > * Ingo Molnar <mingo@elte.hu> wrote:
> >
> > > i think this sums it up:
> > >
> > > http://www.uwsg.iu.edu/hypermail/linux/kernel/0705.3/2490.html
> >
> > i mean this mail started it:
> >
> > http://linux.derkeiler.com/Mailing-Lists/Kernel/2007-05/msg13070.html
> >
> > > and some more, with a benchmark as well:
> > >
> > > http://linux.derkeiler.com/Mailing-Lists/Kernel/2007-05/msg13685.html
> >
>
> Yeah, I remember all that but I don't think it provides a suitable
> description of what all this code is there for - what problem it is
> solving and how it solves it.
>
> If we just want some pseudo-private fd space for glibc to use then I'd have
> thought that the existing code could be tweaked to do that: top-down
> allocation, start at some high offset, etc. But apparently there's more
> to it than this.
Goals :
1) libc wants 'private fds'
2) Latencies of get_unused_fd() for huge processes (more than 100.000 file handles)
Point 1) can use a top-down allocation, or use a 'last unused' index.
Point 2) Instead of introducing a *complex* layer, couldnt we improve existing one ?
If the main problem we want to solve is the potentially slow bitmap search,
we could logically divide the open_fds bitmap into pages (4096*8 = 32768 bits per page on i386/x86_64 arches)
We would have to add a new field in 'struct fdtable', pointer to an array of u32 counters, that would count the number of 'one' bits in each PAGE. This array is tiny : 128 bytes only for 1.000.000 file handles
get_unused_fd() could then use this array to select an appropriate page (a page known to have at least one zero bit), then do a find_next_zero_bit() restricted to at most PAGE_SIZE bytes. Max latency would be similar to vm one when clearing a page. If applications use Point 1) hint (asking kernel one fd, not the POSIX low fd), typical latency will be null.
Old applications (still asking for POSIX "give me the lowest fd crap") would benefit from this, without recompilation or glibc change.
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [patch 1/2] ufd v1 - unsequential O(1) fdmap core
2007-06-04 10:28 ` Eric Dumazet
@ 2007-06-04 12:55 ` Davide Libenzi
2007-06-04 13:25 ` Eric Dumazet
0 siblings, 1 reply; 26+ messages in thread
From: Davide Libenzi @ 2007-06-04 12:55 UTC (permalink / raw)
To: Eric Dumazet
Cc: Andrew Morton, Ingo Molnar, Linux Kernel Mailing List,
Linus Torvalds, Ulrich Drepper
On Mon, 4 Jun 2007, Eric Dumazet wrote:
> Goals :
> 1) libc wants 'private fds'
> 2) Latencies of get_unused_fd() for huge processes (more than 100.000 file handles)
>
> Point 1) can use a top-down allocation, or use a 'last unused' index.
>
>
> Point 2) Instead of introducing a *complex* layer, couldnt we improve existing one ?
Complex layer?! It's an array with free slots chained by a double linked
list.
> If the main problem we want to solve is the potentially slow bitmap
> search,
> we could logically divide the open_fds bitmap into pages (4096*8 = 32768
> bits per page on i386/x86_64 arches)
>
> We would have to add a new field in 'struct fdtable', pointer to an
> array of u32 counters, that would count the number of 'one' bits in each
> PAGE. This array is tiny : 128 bytes only for 1.000.000 file handles
>
> get_unused_fd() could then use this array to select an appropriate page
> (a page known to have at least one zero bit), then do a
> find_next_zero_bit() restricted to at most PAGE_SIZE bytes. Max latency
> would be similar to vm one when clearing a page. If applications use
> Point 1) hint (asking kernel one fd, not the POSIX low fd), typical
> latency will be null.
And look at what you're describing here, talking about simplicity. You'd
still need two bitmaps, so you'd still need the out-of-fdmap.c/h code.
You're trying to fit an horse-shoe to a deer :)
The most appropriate structure for this, is an array (O(1) lookup) with
free elements chained by a dbl linked list (O(1) alloc and free). Plus,
the extra pointer can nicely fit other per-allocated-fd flags w/out adding
extra custom flags bitmaps to the fdtable.
- Davide
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [patch 1/2] ufd v1 - unsequential O(1) fdmap core
2007-06-04 8:47 ` Andrew Morton
@ 2007-06-04 13:05 ` Davide Libenzi
2007-06-04 13:30 ` Davide Libenzi
2007-06-04 16:56 ` Andrew Morton
0 siblings, 2 replies; 26+ messages in thread
From: Davide Libenzi @ 2007-06-04 13:05 UTC (permalink / raw)
To: Andrew Morton
Cc: Ingo Molnar, Eric Dumazet, Linux Kernel Mailing List,
Linus Torvalds, Ulrich Drepper
On Mon, 4 Jun 2007, Andrew Morton wrote:
> a) Were IDR trees evaluated and if so, why were they rejected?
>
> b) it's a bit disappointing that this new allocator is only usable for
> one specific application. We have a *lot* of places in the kernel which
> want allocators of this type. Many of them are open-coded and crappy.
> Some use IDR trees.
>
> If we're going to go and add a complete new allocator, it would be
> good to position it as a library thing if poss.
Thank you for pointing me to that, Andrew. I didn't know about it (IDR
trees).
It does not fit AFAICS. Locking should be handled extarnally (the files
struct), must be RCU friendly (proper barriers) since it's used in
lockless code, and must have flags associated to an allocation. And I'm
leaving out the O(1) part, that for something like this, is just silly not
to have it. This is really an array.
- Davide
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [patch 1/2] ufd v1 - unsequential O(1) fdmap core
2007-06-04 12:55 ` Davide Libenzi
@ 2007-06-04 13:25 ` Eric Dumazet
2007-06-04 13:33 ` Davide Libenzi
` (2 more replies)
0 siblings, 3 replies; 26+ messages in thread
From: Eric Dumazet @ 2007-06-04 13:25 UTC (permalink / raw)
To: Davide Libenzi
Cc: Andrew Morton, Ingo Molnar, Linux Kernel Mailing List,
Linus Torvalds, Ulrich Drepper
On Mon, 4 Jun 2007 05:55:42 -0700 (PDT)
Davide Libenzi <davidel@xmailserver.org> wrote:
> On Mon, 4 Jun 2007, Eric Dumazet wrote:
>
> > Goals :
> > 1) libc wants 'private fds'
> > 2) Latencies of get_unused_fd() for huge processes (more than 100.000 file handles)
> >
> > Point 1) can use a top-down allocation, or use a 'last unused' index.
> >
> >
> > Point 2) Instead of introducing a *complex* layer, couldnt we improve existing one ?
>
> Complex layer?! It's an array with free slots chained by a double linked
> list.
>
>
> > If the main problem we want to solve is the potentially slow bitmap
> > search,
> > we could logically divide the open_fds bitmap into pages (4096*8 = 32768
> > bits per page on i386/x86_64 arches)
> >
> > We would have to add a new field in 'struct fdtable', pointer to an
> > array of u32 counters, that would count the number of 'one' bits in each
> > PAGE. This array is tiny : 128 bytes only for 1.000.000 file handles
> >
> > get_unused_fd() could then use this array to select an appropriate page
> > (a page known to have at least one zero bit), then do a
> > find_next_zero_bit() restricted to at most PAGE_SIZE bytes. Max latency
> > would be similar to vm one when clearing a page. If applications use
> > Point 1) hint (asking kernel one fd, not the POSIX low fd), typical
> > latency will be null.
>
> And look at what you're describing here, talking about simplicity. You'd
> still need two bitmaps, so you'd still need the out-of-fdmap.c/h code.
> You're trying to fit an horse-shoe to a deer :)
> The most appropriate structure for this, is an array (O(1) lookup) with
> free elements chained by a dbl linked list (O(1) alloc and free). Plus,
> the extra pointer can nicely fit other per-allocated-fd flags w/out adding
> extra custom flags bitmaps to the fdtable.
>
Bitmaps are already there, you didnt zap them.
Your proposal is going to double size taken by file table, since you need two long words
per file instead of one pointer.
You add conditional branches on very hot spots.
When you open/close a file, you need to access previous and next cells, so you need 3 cache lines, exactly like
current *legacy* code. (one for file pointer, one on each bitmap flags(open/close_on_exec) )
O(1) lookup doesnt imply it needs to be super-fast. You make a confusion about this.
O(128) is still O(1) for instance. Having to search a bit in a PAGE is a sensible compromise, if we dont add overhead
on each fget() calls.
Instead of adding complexity and a pile of new bugs (see how long it takes to bring RCU on files to a stable state), we can take a safe path. Then if it happens to still be a problem, we can consider the painfull way.
I probably can code a < 100 lines patch, later this evening after my day job.
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [patch 1/2] ufd v1 - unsequential O(1) fdmap core
2007-06-04 13:05 ` Davide Libenzi
@ 2007-06-04 13:30 ` Davide Libenzi
2007-06-04 16:56 ` Andrew Morton
1 sibling, 0 replies; 26+ messages in thread
From: Davide Libenzi @ 2007-06-04 13:30 UTC (permalink / raw)
To: Andrew Morton
Cc: Ingo Molnar, Eric Dumazet, Linux Kernel Mailing List,
Linus Torvalds, Ulrich Drepper
On Mon, 4 Jun 2007, Davide Libenzi wrote:
> On Mon, 4 Jun 2007, Andrew Morton wrote:
>
> > a) Were IDR trees evaluated and if so, why were they rejected?
> >
> > b) it's a bit disappointing that this new allocator is only usable for
> > one specific application. We have a *lot* of places in the kernel which
> > want allocators of this type. Many of them are open-coded and crappy.
> > Some use IDR trees.
> >
> > If we're going to go and add a complete new allocator, it would be
> > good to position it as a library thing if poss.
>
> Thank you for pointing me to that, Andrew. I didn't know about it (IDR
> trees).
> It does not fit AFAICS. Locking should be handled extarnally (the files
> struct), must be RCU friendly (proper barriers) since it's used in
> lockless code, and must have flags associated to an allocation. And I'm
> leaving out the O(1) part, that for something like this, is just silly not
> to have it. This is really an array.
What we could do (if we want to cut code), is to add a bitmap to fdmap,
add a flag "FDMAP_SEQUENTIAL to fdmap_newfd(), and cut-out the old fdtable
code by using fdmap for everything. We'd still need two fdmaps though,
since allocation spaces are far apart (and one of them is random).
Basically all the code of fs/file.c will go.
- Davide
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [patch 1/2] ufd v1 - unsequential O(1) fdmap core
2007-06-04 13:25 ` Eric Dumazet
@ 2007-06-04 13:33 ` Davide Libenzi
2007-06-04 13:35 ` Davide Libenzi
2007-06-04 14:12 ` Ingo Molnar
2 siblings, 0 replies; 26+ messages in thread
From: Davide Libenzi @ 2007-06-04 13:33 UTC (permalink / raw)
To: Eric Dumazet
Cc: Andrew Morton, Ingo Molnar, Linux Kernel Mailing List,
Linus Torvalds, Ulrich Drepper
On Mon, 4 Jun 2007, Eric Dumazet wrote:
> Bitmaps are already there, you didnt zap them.
>
> Your proposal is going to double size taken by file table, since you need two long words
> per file instead of one pointer.
>
> You add conditional branches on very hot spots.
>
> When you open/close a file, you need to access previous and next cells, so you need 3 cache lines, exactly like
> current *legacy* code. (one for file pointer, one on each bitmap flags(open/close_on_exec) )
>
> O(1) lookup doesnt imply it needs to be super-fast. You make a confusion about this.
>
> O(128) is still O(1) for instance. Having to search a bit in a PAGE is a sensible compromise, if we dont add overhead
> on each fget() calls.
>
> Instead of adding complexity and a pile of new bugs (see how long it takes to bring RCU on files to a stable state), we can take a safe path. Then if it happens to still be a problem, we can consider the painfull way.
>
> I probably can code a < 100 lines patch, later this evening after my day job.
fdmap.c is 300 lines of code, w/out comments. You're trying to fit the
wrong structure. It is *that* simple.
- Davide
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [patch 1/2] ufd v1 - unsequential O(1) fdmap core
2007-06-04 13:25 ` Eric Dumazet
2007-06-04 13:33 ` Davide Libenzi
@ 2007-06-04 13:35 ` Davide Libenzi
2007-06-04 14:28 ` Eric Dumazet
2007-06-04 14:12 ` Ingo Molnar
2 siblings, 1 reply; 26+ messages in thread
From: Davide Libenzi @ 2007-06-04 13:35 UTC (permalink / raw)
To: Eric Dumazet
Cc: Andrew Morton, Ingo Molnar, Linux Kernel Mailing List,
Linus Torvalds, Ulrich Drepper
On Mon, 4 Jun 2007, Eric Dumazet wrote:
> You add conditional branches on very hot spots.
Keep BS for the ones you argue usually, and that are not able to reply.
You *still* two bitmaps, because allocation spaces are far apart. So the
"if" will still be there.
- Davide
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [patch 1/2] ufd v1 - unsequential O(1) fdmap core
2007-06-04 13:25 ` Eric Dumazet
2007-06-04 13:33 ` Davide Libenzi
2007-06-04 13:35 ` Davide Libenzi
@ 2007-06-04 14:12 ` Ingo Molnar
2007-06-04 14:27 ` Eric Dumazet
2 siblings, 1 reply; 26+ messages in thread
From: Ingo Molnar @ 2007-06-04 14:12 UTC (permalink / raw)
To: Eric Dumazet
Cc: Davide Libenzi, Andrew Morton, Linux Kernel Mailing List,
Linus Torvalds, Ulrich Drepper
* Eric Dumazet <dada1@cosmosbay.com> wrote:
> O(1) lookup doesnt imply it needs to be super-fast. You make a
> confusion about this.
Davide has written many good speedups for the kernel and he is one of
the best scalability experts the Linux kernel has today. You could learn
from Davide a thing or two, instead of lecturing him about O(1) ...
For example, the recent futex.c changes you did in commit 34f01cc1 are,
and unfortunately there's no better word i can find: plain disgusting.
You apparently have plopped the 'fshared' code into the existing logic
via conditionals and have blown up the complexity of the functions for
no good reason - instead of neatly separating them out. You have added
_33_ (thirty-three!) new 'if' branches to futex.c! The feature you
introduced is nice and useful, but for heaven's sake please work on
cleanliness of your code some more and undo that colossal damage ...
preferably before working on other areas of the kernel.
> O(128) is still O(1) for instance. Having to search a bit in a PAGE is
> a sensible compromise, if we dont add overhead on each fget() calls.
hm, i'm not sure what you are talking about here. Look at Davide's stuff
- it's clearly not O(128)...
> You add conditional branches on very hot spots.
>
> When you open/close a file, you need to access previous and next
> cells, so you need 3 cache lines, exactly like current *legacy* code.
> (one for file pointer, one on each bitmap flags(open/close_on_exec) )
the fd spaces will be separated _no matter what_, that is a physical
inevitability of the ABI in question. Whether you hide that into 'extra
complexity by trying to bend bitmaps in a way that the new users dont
need' or do it explicitly like Davide, i'll go for the explicit
separation. Davide's approach, besides being cleaner, simpler and faster
also has the advantage of enabling the possible demoting of the 'legacy'
fd space in the future. Or demoting the 'new' fd space in the future, if
the interface does not take off.
Ingo
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [patch 1/2] ufd v1 - unsequential O(1) fdmap core
2007-06-04 14:12 ` Ingo Molnar
@ 2007-06-04 14:27 ` Eric Dumazet
2007-06-05 20:37 ` Ingo Molnar
0 siblings, 1 reply; 26+ messages in thread
From: Eric Dumazet @ 2007-06-04 14:27 UTC (permalink / raw)
To: Ingo Molnar
Cc: Davide Libenzi, Andrew Morton, Linux Kernel Mailing List,
Linus Torvalds, Ulrich Drepper
On Mon, 4 Jun 2007 16:12:35 +0200
Ingo Molnar <mingo@elte.hu> wrote:
>
> * Eric Dumazet <dada1@cosmosbay.com> wrote:
>
> > O(1) lookup doesnt imply it needs to be super-fast. You make a
> > confusion about this.
>
> Davide has written many good speedups for the kernel and he is one of
> the best scalability experts the Linux kernel has today. You could learn
> from Davide a thing or two, instead of lecturing him about O(1) ...
>
I know who is Davide, thank you :)
> For example, the recent futex.c changes you did in commit 34f01cc1 are,
> and unfortunately there's no better word i can find: plain disgusting.
> You apparently have plopped the 'fshared' code into the existing logic
> via conditionals and have blown up the complexity of the functions for
> no good reason - instead of neatly separating them out. You have added
> _33_ (thirty-three!) new 'if' branches to futex.c! The feature you
> introduced is nice and useful, but for heaven's sake please work on
> cleanliness of your code some more and undo that colossal damage ...
> preferably before working on other areas of the kernel.
>
This code took the normal path for inclusion and discussion. If you find it so horrible,
you should complained before. Fact is that you Acked it :)
If you wanted to make a joke, I find it quite misplaced.
> > O(128) is still O(1) for instance. Having to search a bit in a PAGE is
> > a sensible compromise, if we dont add overhead on each fget() calls.
>
> hm, i'm not sure what you are talking about here. Look at Davide's stuff
> - it's clearly not O(128)...
I am talking about my suggestion to use a bitmap search limited to one page.
So I named it O(128), this clearly should be labeled O(PAGE_SIZE/L1_CACHE_BYTES)
>
> > You add conditional branches on very hot spots.
> >
> > When you open/close a file, you need to access previous and next
> > cells, so you need 3 cache lines, exactly like current *legacy* code.
> > (one for file pointer, one on each bitmap flags(open/close_on_exec) )
>
> the fd spaces will be separated _no matter what_, that is a physical
> inevitability of the ABI in question. Whether you hide that into 'extra
> complexity by trying to bend bitmaps in a way that the new users dont
> need' or do it explicitly like Davide, i'll go for the explicit
> separation. Davide's approach, besides being cleaner, simpler and faster
> also has the advantage of enabling the possible demoting of the 'legacy'
> fd space in the future. Or demoting the 'new' fd space in the future, if
> the interface does not take off.
>
> Ingo
>
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [patch 1/2] ufd v1 - unsequential O(1) fdmap core
2007-06-04 13:35 ` Davide Libenzi
@ 2007-06-04 14:28 ` Eric Dumazet
2007-06-04 14:53 ` Davide Libenzi
0 siblings, 1 reply; 26+ messages in thread
From: Eric Dumazet @ 2007-06-04 14:28 UTC (permalink / raw)
To: Davide Libenzi
Cc: Andrew Morton, Ingo Molnar, Linux Kernel Mailing List,
Linus Torvalds, Ulrich Drepper
On Mon, 4 Jun 2007 06:35:16 -0700 (PDT)
Davide Libenzi <davidel@xmailserver.org> wrote:
> On Mon, 4 Jun 2007, Eric Dumazet wrote:
>
> > You add conditional branches on very hot spots.
>
> Keep BS for the ones you argue usually, and that are not able to reply.
> You *still* two bitmaps, because allocation spaces are far apart. So the
> "if" will still be there.
I actually read your patches and spent time to see the pros and cons.
If you dont need reviewers, please dont post your patches on lkml.
If I am not mistaken, you added a test in fget()/fget_light(), which is a known hot point for said huge processes.
fget() dont need to access the bitmap at all. Using fd_slots means less (50%) file pointers per cache line.
On my machines, there is a ratio of 100/1 in cpu time for fget(),fget_light() against get_unused_fd().
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [patch 1/2] ufd v1 - unsequential O(1) fdmap core
2007-06-04 14:28 ` Eric Dumazet
@ 2007-06-04 14:53 ` Davide Libenzi
0 siblings, 0 replies; 26+ messages in thread
From: Davide Libenzi @ 2007-06-04 14:53 UTC (permalink / raw)
To: Eric Dumazet
Cc: Andrew Morton, Ingo Molnar, Linux Kernel Mailing List,
Linus Torvalds, Ulrich Drepper
On Mon, 4 Jun 2007, Eric Dumazet wrote:
> On Mon, 4 Jun 2007 06:35:16 -0700 (PDT)
> Davide Libenzi <davidel@xmailserver.org> wrote:
>
> > On Mon, 4 Jun 2007, Eric Dumazet wrote:
> >
> > > You add conditional branches on very hot spots.
> >
> > Keep BS for the ones you argue usually, and that are not able to reply.
> > You *still* two bitmaps, because allocation spaces are far apart. So the
> > "if" will still be there.
>
> I actually read your patches and spent time to see the pros and cons.
>
> If you dont need reviewers, please dont post your patches on lkml.
I *do like* reviews, and if you go back in all the reviews my code had
during the last few months for example, you'll notice that I *almost always*
ACK the reviewers suggestions. I'm a believer that in computer science,
there are many ways to do the same thing, and many of those are very close
as far as pros&cons. So, if some reviewer suggests me something, I'm
inclined to ACK it, even if the solution I wrote (or that I had in mind)
was different.
In this case though, your solution and mine are *really* different. You
are trying to fit a structure to a problem that is not meant to. And it is
so clearly evident, that it blows my mind how this could even be argued.
As I said to Andrew, we can use fdmap for both allocations (fdmap already
supports all the allocation modes the current fdtable does - modulo adding
a bitmap), and cut a lot of code out.
This is the way I'm currently heading, so all the code from fs/file.c (256
lines) can go. locate_fd will go. set_close_on_exec and get_close_on_exec
will use the existing function in fdmap.c, etc...
> If I am not mistaken, you added a test in fget()/fget_light(), which is
> a known hot point for said huge processes.
There is one "if", that you cannot avoid, since allocation areas are far
apart (and one of them is even random). Unless you plan to have a single
contiguous bitmap with *a lot* of wasted space in the middle.
- Davide
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [patch 1/2] ufd v1 - unsequential O(1) fdmap core
2007-06-04 13:05 ` Davide Libenzi
2007-06-04 13:30 ` Davide Libenzi
@ 2007-06-04 16:56 ` Andrew Morton
2007-06-04 17:57 ` Davide Libenzi
1 sibling, 1 reply; 26+ messages in thread
From: Andrew Morton @ 2007-06-04 16:56 UTC (permalink / raw)
To: Davide Libenzi
Cc: Ingo Molnar, Eric Dumazet, Linux Kernel Mailing List,
Linus Torvalds, Ulrich Drepper
On Mon, 4 Jun 2007 06:05:22 -0700 (PDT) Davide Libenzi <davidel@xmailserver.org> wrote:
> On Mon, 4 Jun 2007, Andrew Morton wrote:
>
> > a) Were IDR trees evaluated and if so, why were they rejected?
> >
> > b) it's a bit disappointing that this new allocator is only usable for
> > one specific application. We have a *lot* of places in the kernel which
> > want allocators of this type. Many of them are open-coded and crappy.
> > Some use IDR trees.
> >
> > If we're going to go and add a complete new allocator, it would be
> > good to position it as a library thing if poss.
>
> Thank you for pointing me to that, Andrew. I didn't know about it (IDR
> trees).
> It does not fit AFAICS.
> Locking should be handled extarnally (the files
> struct),
Yeah, that's already a problem in IDR and I'm hoping sometime someone will
be inspired to redo it, move it to caller-provided locking.
> must be RCU friendly (proper barriers) since it's used in
> lockless code,
Haven't looked at that.
> and must have flags associated to an allocation.
Don't understand that.
> And I'm
> leaving out the O(1) part, that for something like this, is just silly not
> to have it. This is really an array.
Having to walk down a tree in fget_light() would kinda suck.
What about my b)?
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [patch 1/2] ufd v1 - unsequential O(1) fdmap core
2007-06-04 16:56 ` Andrew Morton
@ 2007-06-04 17:57 ` Davide Libenzi
0 siblings, 0 replies; 26+ messages in thread
From: Davide Libenzi @ 2007-06-04 17:57 UTC (permalink / raw)
To: Andrew Morton
Cc: Ingo Molnar, Eric Dumazet, Linux Kernel Mailing List,
Linus Torvalds, Ulrich Drepper
On Mon, 4 Jun 2007, Andrew Morton wrote:
> > must be RCU friendly (proper barriers) since it's used in
> > lockless code,
>
> Haven't looked at that.
>
> > and must have flags associated to an allocation.
>
> Don't understand that.
It needs to be able to store flags (like close-on-exec, and maybe more)
together with a file*.
> > And I'm
> > leaving out the O(1) part, that for something like this, is just silly not
> > to have it. This is really an array.
>
> Having to walk down a tree in fget_light() would kinda suck.
>
> What about my b)?
Right now, I'm working on a patch that uses fdmap even for legacy
(sequential) fd allocations. With a cleaner interface, w/out having code
all around the kernel that access fdtable members directly. This should
cut some code WRT using two different allocators (code in fs/file.c will
almost zip-away), and maybe it'll look even better and have some new
comments too ;)
It can be, in theory, completely abstracted and moved into lib/. I'll look
into it once I completed the first cleanup, assuming the whole thing won't
look worse than before :)
- Davide
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [patch 1/2] ufd v1 - unsequential O(1) fdmap core
2007-06-04 14:27 ` Eric Dumazet
@ 2007-06-05 20:37 ` Ingo Molnar
2007-06-05 20:50 ` Thomas Gleixner
` (2 more replies)
0 siblings, 3 replies; 26+ messages in thread
From: Ingo Molnar @ 2007-06-05 20:37 UTC (permalink / raw)
To: Eric Dumazet
Cc: Davide Libenzi, Andrew Morton, Linux Kernel Mailing List,
Linus Torvalds, Ulrich Drepper, Thomas Gleixner
* Eric Dumazet <dada1@cosmosbay.com> wrote:
> > For example, the recent futex.c changes you did in commit 34f01cc1
> > are, and unfortunately there's no better word i can find: plain
> > disgusting. You apparently have plopped the 'fshared' code into the
> > existing logic via conditionals and have blown up the complexity of
> > the functions for no good reason - instead of neatly separating them
> > out. You have added _33_ (thirty-three!) new 'if' branches to
> > futex.c! The feature you introduced is nice and useful, but for
> > heaven's sake please work on cleanliness of your code some more and
> > undo that colossal damage ... preferably before working on other
> > areas of the kernel.
>
> This code took the normal path for inclusion and discussion. If you
> find it so horrible, you should complained before. Fact is that you
> Acked it :)
yes, of course, i still think it's a good and nice patch, all things
considered =B-)
> If you wanted to make a joke, I find it quite misplaced.
no, i just wanted to make a demonstration that one can be pretty nasty
in on-lkml replies while being technically correct :-) I think you went
a bit overboard in your replies to Davide. Lets move this back into
constructive channels, ok? :)
Ingo
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [patch 1/2] ufd v1 - unsequential O(1) fdmap core
2007-06-05 20:37 ` Ingo Molnar
@ 2007-06-05 20:50 ` Thomas Gleixner
2007-06-05 20:57 ` Eric Dumazet
2007-06-05 22:29 ` Eric Dumazet
2 siblings, 0 replies; 26+ messages in thread
From: Thomas Gleixner @ 2007-06-05 20:50 UTC (permalink / raw)
To: Ingo Molnar
Cc: Eric Dumazet, Davide Libenzi, Andrew Morton,
Linux Kernel Mailing List, Linus Torvalds, Ulrich Drepper
On Tue, 2007-06-05 at 22:37 +0200, Ingo Molnar wrote:
> * Eric Dumazet <dada1@cosmosbay.com> wrote:
>
> > > For example, the recent futex.c changes you did in commit 34f01cc1
> > > are, and unfortunately there's no better word i can find: plain
> > > disgusting. You apparently have plopped the 'fshared' code into the
> > > existing logic via conditionals and have blown up the complexity of
> > > the functions for no good reason - instead of neatly separating them
> > > out. You have added _33_ (thirty-three!) new 'if' branches to
> > > futex.c! The feature you introduced is nice and useful, but for
> > > heaven's sake please work on cleanliness of your code some more and
> > > undo that colossal damage ... preferably before working on other
> > > areas of the kernel.
> >
> > This code took the normal path for inclusion and discussion. If you
> > find it so horrible, you should complained before. Fact is that you
> > Acked it :)
>
> yes, of course, i still think it's a good and nice patch, all things
> considered =B-)
>
> > If you wanted to make a joke, I find it quite misplaced.
>
> no, i just wanted to make a demonstration that one can be pretty nasty
> in on-lkml replies while being technically correct :-) I think you went
> a bit overboard in your replies to Davide. Lets move this back into
> constructive channels, ok? :)
I'm digging into the pending futex bugs anyway. I'm doing some cleanups
along the way to make the code look more like it used to look before :)
tglx
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [patch 1/2] ufd v1 - unsequential O(1) fdmap core
2007-06-05 20:37 ` Ingo Molnar
2007-06-05 20:50 ` Thomas Gleixner
@ 2007-06-05 20:57 ` Eric Dumazet
2007-06-05 22:29 ` Eric Dumazet
2 siblings, 0 replies; 26+ messages in thread
From: Eric Dumazet @ 2007-06-05 20:57 UTC (permalink / raw)
To: Ingo Molnar
Cc: Davide Libenzi, Andrew Morton, Linux Kernel Mailing List,
Linus Torvalds, Ulrich Drepper, Thomas Gleixner
Ingo Molnar a écrit :
> * Eric Dumazet <dada1@cosmosbay.com> wrote:
>
>>> For example, the recent futex.c changes you did in commit 34f01cc1
>>> are, and unfortunately there's no better word i can find: plain
>>> disgusting. You apparently have plopped the 'fshared' code into the
>>> existing logic via conditionals and have blown up the complexity of
>>> the functions for no good reason - instead of neatly separating them
>>> out. You have added _33_ (thirty-three!) new 'if' branches to
>>> futex.c! The feature you introduced is nice and useful, but for
>>> heaven's sake please work on cleanliness of your code some more and
>>> undo that colossal damage ... preferably before working on other
>>> areas of the kernel.
>> This code took the normal path for inclusion and discussion. If you
>> find it so horrible, you should complained before. Fact is that you
>> Acked it :)
>
> yes, of course, i still think it's a good and nice patch, all things
> considered =B-)
>
>> If you wanted to make a joke, I find it quite misplaced.
>
> no, i just wanted to make a demonstration that one can be pretty nasty
> in on-lkml replies while being technically correct :-) I think you went
> a bit overboard in your replies to Davide. Lets move this back into
> constructive channels, ok? :)
No problem Ingo. I am sorry you and Davide took my remarks so badly.
I tried to be constructive.
You know this stuff got my interest, since I even tested your file
open-many-fd benchmark :)
I have some machines around with 1.000.000 file descriptors opened by one
process. I even had to change NR_OPEN (1024*1024 was too small for me :) )
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [patch 1/2] ufd v1 - unsequential O(1) fdmap core
2007-06-05 20:37 ` Ingo Molnar
2007-06-05 20:50 ` Thomas Gleixner
2007-06-05 20:57 ` Eric Dumazet
@ 2007-06-05 22:29 ` Eric Dumazet
2 siblings, 0 replies; 26+ messages in thread
From: Eric Dumazet @ 2007-06-05 22:29 UTC (permalink / raw)
To: Ingo Molnar
Cc: Davide Libenzi, Andrew Morton, Linux Kernel Mailing List,
Linus Torvalds, Ulrich Drepper, Thomas Gleixner
[-- Attachment #1: Type: text/plain, Size: 1469 bytes --]
Ingo Molnar a écrit :
>
> no, i just wanted to make a demonstration that one can be pretty nasty
> in on-lkml replies while being technically correct :-) I think you went
> a bit overboard in your replies to Davide. Lets move this back into
> constructive channels, ok? :)
In any case, here is one preliminary patch to show what I had in mind.
I only had time to compile it (its very late here), not even boot tested, so
dont try it !
[PATCH] reduce max latency of get_unused_fd().
Goal is to scan at most 4096 bytes (or 32768 bits) in the open_fds bitmap.
Processes that have many file descriptors might have to scan 128 KB of ram to
find a zero bit. Thats about 100 us on modern machines.
This patch introduces an array of counters. Each counter gives the number of
'one' bits in a 4 KB section of the open_fds bitmap.
I chose to statically allocate this array of counters, being very small (64
bytes), so a dynamic allocation would only add complexity.
As a result, max latency is 4 us (same latency on x86 when vm gives you a new
page)
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
---
fs/fcntl.c | 18 +++++++++++++++---
fs/file.c | 6 +++---
fs/open.c | 13 +++++++++++++
include/linux/file.h | 11 +++++++++++
include/linux/fs.h | 2 --
include/linux/init_task.h | 1 +
kernel/fork.c | 1 +
7 files changed, 44 insertions(+), 8 deletions(-)
[-- Attachment #2: fds_counter.patch --]
[-- Type: text/plain, Size: 5356 bytes --]
diff --git a/include/linux/file.h b/include/linux/file.h
index a59001e..ec6b120 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -36,6 +36,16 @@ struct fdtable {
};
/*
+ * To avoid big latencies in get_unused_fd(),
+ * we maintain counters of "one" bits in bitmap pages
+ * we define a 'page' here to contain 32768 bits,
+ * so that each counter is an unsigned short
+ * with MAX_NR_OPENS = 2^20, we get 32 counters : 64 bytes
+ */
+#define FDSBITS 32768
+#define MAX_NR_OPEN (1024*1024) /* Absolute upper limit on fd num */
+
+/*
* Open file table structure
*/
struct files_struct {
@@ -50,6 +60,7 @@ struct files_struct {
*/
spinlock_t file_lock ____cacheline_aligned_in_smp;
int next_fd;
+ unsigned short fds_counter[(MAX_NR_OPEN + (FDSBITS - 1)) / FDSBITS];
struct embedded_fd_set close_on_exec_init;
struct embedded_fd_set open_fds_init;
struct file * fd_array[NR_OPEN_DEFAULT];
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 8e382a5..5257ba6 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -61,6 +61,7 @@ static int locate_fd(struct files_struct
unsigned int start;
int error;
struct fdtable *fdt;
+ unsigned int page_nr;
error = -EINVAL;
if (orig_start >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
@@ -77,11 +78,19 @@ repeat:
start = files->next_fd;
newfd = start;
- if (start < fdt->max_fds)
+
+ error = -EMFILE;
+ if (start < fdt->max_fds) {
+ page_nr = start / FDSBITS;
+ while (files->fds_counter[page_nr] == FDSBITS) {
+ page_nr++;
+ start = page_nr * FDSBITS;
+ if (start >= fdt->max_fds)
+ goto out;
+ }
newfd = find_next_zero_bit(fdt->open_fds->fds_bits,
fdt->max_fds, start);
-
- error = -EMFILE;
+ }
if (newfd >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
goto out;
@@ -122,6 +131,7 @@ static int dupfd(struct file *file, unsi
/* locate_fd() may have expanded fdtable, load the ptr */
fdt = files_fdtable(files);
FD_SET(fd, fdt->open_fds);
+ files->fds_counter[fd / FDSBITS]++;
FD_CLR(fd, fdt->close_on_exec);
spin_unlock(&files->file_lock);
fd_install(fd, file);
@@ -171,7 +181,9 @@ asmlinkage long sys_dup2(unsigned int ol
rcu_assign_pointer(fdt->fd[newfd], file);
FD_SET(newfd, fdt->open_fds);
+ files->fds_counter[newfd / FDSBITS]++;
FD_CLR(newfd, fdt->close_on_exec);
+
spin_unlock(&files->file_lock);
if (tofree)
diff --git a/fs/file.c b/fs/file.c
index c5575de..7dbd9c5 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -147,8 +147,8 @@ static struct fdtable * alloc_fdtable(un
nr /= (1024 / sizeof(struct file *));
nr = roundup_pow_of_two(nr + 1);
nr *= (1024 / sizeof(struct file *));
- if (nr > NR_OPEN)
- nr = NR_OPEN;
+ if (nr > MAX_NR_OPEN)
+ nr = MAX_NR_OPEN;
fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL);
if (!fdt)
@@ -233,7 +233,7 @@ int expand_files(struct files_struct *fi
if (nr < fdt->max_fds)
return 0;
/* Can we expand? */
- if (nr >= NR_OPEN)
+ if (nr >= MAX_NR_OPEN)
return -EMFILE;
/* All good, so we try */
diff --git a/fs/open.c b/fs/open.c
index 0d515d1..340e69b 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -860,12 +860,23 @@ int get_unused_fd(void)
struct files_struct * files = current->files;
int fd, error;
struct fdtable *fdt;
+ unsigned int page_nr;
error = -EMFILE;
spin_lock(&files->file_lock);
repeat:
fdt = files_fdtable(files);
+ page_nr = files->next_fd / FDSBITS;
+ /*
+ * We can avoid testing big chunks of memory if all bit are set
+ */
+ while (files->fds_counter[page_nr] == FDSBITS) {
+ page_nr++;
+ files->next_fd = page_nr * FDSBITS;
+ if (files->next_fd >= fdt->max_fds)
+ break;
+ }
fd = find_next_zero_bit(fdt->open_fds->fds_bits, fdt->max_fds,
files->next_fd);
@@ -891,6 +902,7 @@ repeat:
}
FD_SET(fd, fdt->open_fds);
+ files->fds_counter[fd / FDSBITS]++;
FD_CLR(fd, fdt->close_on_exec);
files->next_fd = fd + 1;
#if 1
@@ -913,6 +925,7 @@ static void __put_unused_fd(struct files
{
struct fdtable *fdt = files_fdtable(files);
__FD_CLR(fd, fdt->open_fds);
+ files->fds_counter[fd / FDSBITS]--;
if (fd < files->next_fd)
files->next_fd = fd;
}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b3ae77c..9db2799 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -20,8 +20,6 @@ #include <linux/ioctl.h>
*/
/* Fixed constants first: */
-#undef NR_OPEN
-#define NR_OPEN (1024*1024) /* Absolute upper limit on fd num */
#define INR_OPEN 1024 /* Initial setting for nfile rlimits */
#define BLOCK_SIZE_BITS 10
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 276ccaa..bd24190 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -26,6 +26,7 @@ #define INIT_FILES \
.fdtab = INIT_FDTABLE, \
.file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), \
.next_fd = 0, \
+ .fds_counter = {0}, \
.close_on_exec_init = { { 0, } }, \
.open_fds_init = { { 0, } }, \
.fd_array = { NULL, } \
diff --git a/kernel/fork.c b/kernel/fork.c
index 73ad5cd..f4341c5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -641,6 +641,7 @@ static struct files_struct *alloc_files(
spin_lock_init(&newf->file_lock);
newf->next_fd = 0;
+ memset(newf->fds_counter, 0, sizeof(newf->fds_counter));
fdt = &newf->fdtab;
fdt->max_fds = NR_OPEN_DEFAULT;
fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
^ permalink raw reply related [flat|nested] 26+ messages in thread
end of thread, other threads:[~2007-06-05 22:30 UTC | newest]
Thread overview: 26+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-06-02 22:59 [patch 1/2] ufd v1 - unsequential O(1) fdmap core Davide Libenzi
2007-06-03 21:19 ` Eric Dumazet
2007-06-03 22:51 ` Davide Libenzi
2007-06-04 6:08 ` Andrew Morton
2007-06-04 8:05 ` Ingo Molnar
2007-06-04 8:09 ` Ingo Molnar
2007-06-04 8:34 ` Andrew Morton
2007-06-04 8:42 ` Ingo Molnar
2007-06-04 8:47 ` Andrew Morton
2007-06-04 13:05 ` Davide Libenzi
2007-06-04 13:30 ` Davide Libenzi
2007-06-04 16:56 ` Andrew Morton
2007-06-04 17:57 ` Davide Libenzi
2007-06-04 10:28 ` Eric Dumazet
2007-06-04 12:55 ` Davide Libenzi
2007-06-04 13:25 ` Eric Dumazet
2007-06-04 13:33 ` Davide Libenzi
2007-06-04 13:35 ` Davide Libenzi
2007-06-04 14:28 ` Eric Dumazet
2007-06-04 14:53 ` Davide Libenzi
2007-06-04 14:12 ` Ingo Molnar
2007-06-04 14:27 ` Eric Dumazet
2007-06-05 20:37 ` Ingo Molnar
2007-06-05 20:50 ` Thomas Gleixner
2007-06-05 20:57 ` Eric Dumazet
2007-06-05 22:29 ` Eric Dumazet
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox