From: Eric Dumazet <dada1@cosmosbay.com>
To: Ingo Molnar <mingo@elte.hu>
Cc: Davide Libenzi <davidel@xmailserver.org>,
Andrew Morton <akpm@linux-foundation.org>,
Linux Kernel Mailing List <linux-kernel@vger.kernel.org>,
Linus Torvalds <torvalds@linux-foundation.org>,
Ulrich Drepper <drepper@redhat.com>,
Thomas Gleixner <tglx@linutronix.de>
Subject: Re: [patch 1/2] ufd v1 - unsequential O(1) fdmap core
Date: Wed, 06 Jun 2007 00:29:15 +0200 [thread overview]
Message-ID: <4665E3BB.2010401@cosmosbay.com> (raw)
In-Reply-To: <20070605203720.GA5519@elte.hu>
[-- Attachment #1: Type: text/plain, Size: 1469 bytes --]
Ingo Molnar a écrit :
>
> no, i just wanted to make a demonstration that one can be pretty nasty
> in on-lkml replies while being technically correct :-) I think you went
> a bit overboard in your replies to Davide. Lets move this back into
> constructive channels, ok? :)
In any case, here is one preliminary patch to show what I had in mind.
I only had time to compile it (its very late here), not even boot tested, so
dont try it !
[PATCH] reduce max latency of get_unused_fd().
Goal is to scan at most 4096 bytes (or 32768 bits) in the open_fds bitmap.
Processes that have many file descriptors might have to scan 128 KB of ram to
find a zero bit. Thats about 100 us on modern machines.
This patch introduces an array of counters. Each counter gives the number of
'one' bits in a 4 KB section of the open_fds bitmap.
I chose to statically allocate this array of counters, being very small (64
bytes), so a dynamic allocation would only add complexity.
As a result, max latency is 4 us (same latency on x86 when vm gives you a new
page)
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
---
fs/fcntl.c | 18 +++++++++++++++---
fs/file.c | 6 +++---
fs/open.c | 13 +++++++++++++
include/linux/file.h | 11 +++++++++++
include/linux/fs.h | 2 --
include/linux/init_task.h | 1 +
kernel/fork.c | 1 +
7 files changed, 44 insertions(+), 8 deletions(-)
[-- Attachment #2: fds_counter.patch --]
[-- Type: text/plain, Size: 5356 bytes --]
diff --git a/include/linux/file.h b/include/linux/file.h
index a59001e..ec6b120 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -36,6 +36,16 @@ struct fdtable {
};
/*
+ * To avoid big latencies in get_unused_fd(),
+ * we maintain counters of "one" bits in bitmap pages
+ * we define a 'page' here to contain 32768 bits,
+ * so that each counter is an unsigned short
+ * with MAX_NR_OPENS = 2^20, we get 32 counters : 64 bytes
+ */
+#define FDSBITS 32768
+#define MAX_NR_OPEN (1024*1024) /* Absolute upper limit on fd num */
+
+/*
* Open file table structure
*/
struct files_struct {
@@ -50,6 +60,7 @@ struct files_struct {
*/
spinlock_t file_lock ____cacheline_aligned_in_smp;
int next_fd;
+ unsigned short fds_counter[(MAX_NR_OPEN + (FDSBITS - 1)) / FDSBITS];
struct embedded_fd_set close_on_exec_init;
struct embedded_fd_set open_fds_init;
struct file * fd_array[NR_OPEN_DEFAULT];
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 8e382a5..5257ba6 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -61,6 +61,7 @@ static int locate_fd(struct files_struct
unsigned int start;
int error;
struct fdtable *fdt;
+ unsigned int page_nr;
error = -EINVAL;
if (orig_start >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
@@ -77,11 +78,19 @@ repeat:
start = files->next_fd;
newfd = start;
- if (start < fdt->max_fds)
+
+ error = -EMFILE;
+ if (start < fdt->max_fds) {
+ page_nr = start / FDSBITS;
+ while (files->fds_counter[page_nr] == FDSBITS) {
+ page_nr++;
+ start = page_nr * FDSBITS;
+ if (start >= fdt->max_fds)
+ goto out;
+ }
newfd = find_next_zero_bit(fdt->open_fds->fds_bits,
fdt->max_fds, start);
-
- error = -EMFILE;
+ }
if (newfd >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
goto out;
@@ -122,6 +131,7 @@ static int dupfd(struct file *file, unsi
/* locate_fd() may have expanded fdtable, load the ptr */
fdt = files_fdtable(files);
FD_SET(fd, fdt->open_fds);
+ files->fds_counter[fd / FDSBITS]++;
FD_CLR(fd, fdt->close_on_exec);
spin_unlock(&files->file_lock);
fd_install(fd, file);
@@ -171,7 +181,9 @@ asmlinkage long sys_dup2(unsigned int ol
rcu_assign_pointer(fdt->fd[newfd], file);
FD_SET(newfd, fdt->open_fds);
+ files->fds_counter[newfd / FDSBITS]++;
FD_CLR(newfd, fdt->close_on_exec);
+
spin_unlock(&files->file_lock);
if (tofree)
diff --git a/fs/file.c b/fs/file.c
index c5575de..7dbd9c5 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -147,8 +147,8 @@ static struct fdtable * alloc_fdtable(un
nr /= (1024 / sizeof(struct file *));
nr = roundup_pow_of_two(nr + 1);
nr *= (1024 / sizeof(struct file *));
- if (nr > NR_OPEN)
- nr = NR_OPEN;
+ if (nr > MAX_NR_OPEN)
+ nr = MAX_NR_OPEN;
fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL);
if (!fdt)
@@ -233,7 +233,7 @@ int expand_files(struct files_struct *fi
if (nr < fdt->max_fds)
return 0;
/* Can we expand? */
- if (nr >= NR_OPEN)
+ if (nr >= MAX_NR_OPEN)
return -EMFILE;
/* All good, so we try */
diff --git a/fs/open.c b/fs/open.c
index 0d515d1..340e69b 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -860,12 +860,23 @@ int get_unused_fd(void)
struct files_struct * files = current->files;
int fd, error;
struct fdtable *fdt;
+ unsigned int page_nr;
error = -EMFILE;
spin_lock(&files->file_lock);
repeat:
fdt = files_fdtable(files);
+ page_nr = files->next_fd / FDSBITS;
+ /*
+ * We can avoid testing big chunks of memory if all bit are set
+ */
+ while (files->fds_counter[page_nr] == FDSBITS) {
+ page_nr++;
+ files->next_fd = page_nr * FDSBITS;
+ if (files->next_fd >= fdt->max_fds)
+ break;
+ }
fd = find_next_zero_bit(fdt->open_fds->fds_bits, fdt->max_fds,
files->next_fd);
@@ -891,6 +902,7 @@ repeat:
}
FD_SET(fd, fdt->open_fds);
+ files->fds_counter[fd / FDSBITS]++;
FD_CLR(fd, fdt->close_on_exec);
files->next_fd = fd + 1;
#if 1
@@ -913,6 +925,7 @@ static void __put_unused_fd(struct files
{
struct fdtable *fdt = files_fdtable(files);
__FD_CLR(fd, fdt->open_fds);
+ files->fds_counter[fd / FDSBITS]--;
if (fd < files->next_fd)
files->next_fd = fd;
}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b3ae77c..9db2799 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -20,8 +20,6 @@ #include <linux/ioctl.h>
*/
/* Fixed constants first: */
-#undef NR_OPEN
-#define NR_OPEN (1024*1024) /* Absolute upper limit on fd num */
#define INR_OPEN 1024 /* Initial setting for nfile rlimits */
#define BLOCK_SIZE_BITS 10
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 276ccaa..bd24190 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -26,6 +26,7 @@ #define INIT_FILES \
.fdtab = INIT_FDTABLE, \
.file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), \
.next_fd = 0, \
+ .fds_counter = {0}, \
.close_on_exec_init = { { 0, } }, \
.open_fds_init = { { 0, } }, \
.fd_array = { NULL, } \
diff --git a/kernel/fork.c b/kernel/fork.c
index 73ad5cd..f4341c5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -641,6 +641,7 @@ static struct files_struct *alloc_files(
spin_lock_init(&newf->file_lock);
newf->next_fd = 0;
+ memset(newf->fds_counter, 0, sizeof(newf->fds_counter));
fdt = &newf->fdtab;
fdt->max_fds = NR_OPEN_DEFAULT;
fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
prev parent reply other threads:[~2007-06-05 22:30 UTC|newest]
Thread overview: 26+ messages / expand[flat|nested] mbox.gz Atom feed top
2007-06-02 22:59 [patch 1/2] ufd v1 - unsequential O(1) fdmap core Davide Libenzi
2007-06-03 21:19 ` Eric Dumazet
2007-06-03 22:51 ` Davide Libenzi
2007-06-04 6:08 ` Andrew Morton
2007-06-04 8:05 ` Ingo Molnar
2007-06-04 8:09 ` Ingo Molnar
2007-06-04 8:34 ` Andrew Morton
2007-06-04 8:42 ` Ingo Molnar
2007-06-04 8:47 ` Andrew Morton
2007-06-04 13:05 ` Davide Libenzi
2007-06-04 13:30 ` Davide Libenzi
2007-06-04 16:56 ` Andrew Morton
2007-06-04 17:57 ` Davide Libenzi
2007-06-04 10:28 ` Eric Dumazet
2007-06-04 12:55 ` Davide Libenzi
2007-06-04 13:25 ` Eric Dumazet
2007-06-04 13:33 ` Davide Libenzi
2007-06-04 13:35 ` Davide Libenzi
2007-06-04 14:28 ` Eric Dumazet
2007-06-04 14:53 ` Davide Libenzi
2007-06-04 14:12 ` Ingo Molnar
2007-06-04 14:27 ` Eric Dumazet
2007-06-05 20:37 ` Ingo Molnar
2007-06-05 20:50 ` Thomas Gleixner
2007-06-05 20:57 ` Eric Dumazet
2007-06-05 22:29 ` Eric Dumazet [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=4665E3BB.2010401@cosmosbay.com \
--to=dada1@cosmosbay.com \
--cc=akpm@linux-foundation.org \
--cc=davidel@xmailserver.org \
--cc=drepper@redhat.com \
--cc=linux-kernel@vger.kernel.org \
--cc=mingo@elte.hu \
--cc=tglx@linutronix.de \
--cc=torvalds@linux-foundation.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.