From: Eric Dumazet <dada1@cosmosbay.com>
To: Ingo Molnar <mingo@elte.hu>
Cc: Davide Libenzi <davidel@xmailserver.org>,
Andrew Morton <akpm@linux-foundation.org>,
Linux Kernel Mailing List <linux-kernel@vger.kernel.org>,
Linus Torvalds <torvalds@linux-foundation.org>,
Ulrich Drepper <drepper@redhat.com>,
Thomas Gleixner <tglx@linutronix.de>
Subject: Re: [patch 1/2] ufd v1 - unsequential O(1) fdmap core
Date: Wed, 06 Jun 2007 00:29:15 +0200 [thread overview]
Message-ID: <4665E3BB.2010401@cosmosbay.com> (raw)
In-Reply-To: <20070605203720.GA5519@elte.hu>
[-- Attachment #1: Type: text/plain, Size: 1469 bytes --]
Ingo Molnar a écrit :
>
> no, i just wanted to make a demonstration that one can be pretty nasty
> in on-lkml replies while being technically correct :-) I think you went
> a bit overboard in your replies to Davide. Lets move this back into
> constructive channels, ok? :)
In any case, here is one preliminary patch to show what I had in mind.
I only had time to compile it (its very late here), not even boot tested, so
dont try it !
[PATCH] reduce max latency of get_unused_fd().
Goal is to scan at most 4096 bytes (or 32768 bits) in the open_fds bitmap.
Processes that have many file descriptors might have to scan 128 KB of ram to
find a zero bit. Thats about 100 us on modern machines.
This patch introduces an array of counters. Each counter gives the number of
'one' bits in a 4 KB section of the open_fds bitmap.
I chose to statically allocate this array of counters, being very small (64
bytes), so a dynamic allocation would only add complexity.
As a result, max latency is 4 us (same latency on x86 when vm gives you a new
page)
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
---
fs/fcntl.c | 18 +++++++++++++++---
fs/file.c | 6 +++---
fs/open.c | 13 +++++++++++++
include/linux/file.h | 11 +++++++++++
include/linux/fs.h | 2 --
include/linux/init_task.h | 1 +
kernel/fork.c | 1 +
7 files changed, 44 insertions(+), 8 deletions(-)
[-- Attachment #2: fds_counter.patch --]
[-- Type: text/plain, Size: 5356 bytes --]
diff --git a/include/linux/file.h b/include/linux/file.h
index a59001e..ec6b120 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -36,6 +36,16 @@ struct fdtable {
};
/*
+ * To avoid big latencies in get_unused_fd(),
+ * we maintain counters of "one" bits in bitmap pages
+ * we define a 'page' here to contain 32768 bits,
+ * so that each counter is an unsigned short
+ * with MAX_NR_OPENS = 2^20, we get 32 counters : 64 bytes
+ */
+#define FDSBITS 32768
+#define MAX_NR_OPEN (1024*1024) /* Absolute upper limit on fd num */
+
+/*
* Open file table structure
*/
struct files_struct {
@@ -50,6 +60,7 @@ struct files_struct {
*/
spinlock_t file_lock ____cacheline_aligned_in_smp;
int next_fd;
+ unsigned short fds_counter[(MAX_NR_OPEN + (FDSBITS - 1)) / FDSBITS];
struct embedded_fd_set close_on_exec_init;
struct embedded_fd_set open_fds_init;
struct file * fd_array[NR_OPEN_DEFAULT];
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 8e382a5..5257ba6 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -61,6 +61,7 @@ static int locate_fd(struct files_struct
unsigned int start;
int error;
struct fdtable *fdt;
+ unsigned int page_nr;
error = -EINVAL;
if (orig_start >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
@@ -77,11 +78,19 @@ repeat:
start = files->next_fd;
newfd = start;
- if (start < fdt->max_fds)
+
+ error = -EMFILE;
+ if (start < fdt->max_fds) {
+ page_nr = start / FDSBITS;
+ while (files->fds_counter[page_nr] == FDSBITS) {
+ page_nr++;
+ start = page_nr * FDSBITS;
+ if (start >= fdt->max_fds)
+ goto out;
+ }
newfd = find_next_zero_bit(fdt->open_fds->fds_bits,
fdt->max_fds, start);
-
- error = -EMFILE;
+ }
if (newfd >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
goto out;
@@ -122,6 +131,7 @@ static int dupfd(struct file *file, unsi
/* locate_fd() may have expanded fdtable, load the ptr */
fdt = files_fdtable(files);
FD_SET(fd, fdt->open_fds);
+ files->fds_counter[fd / FDSBITS]++;
FD_CLR(fd, fdt->close_on_exec);
spin_unlock(&files->file_lock);
fd_install(fd, file);
@@ -171,7 +181,9 @@ asmlinkage long sys_dup2(unsigned int ol
rcu_assign_pointer(fdt->fd[newfd], file);
FD_SET(newfd, fdt->open_fds);
+ files->fds_counter[newfd / FDSBITS]++;
FD_CLR(newfd, fdt->close_on_exec);
+
spin_unlock(&files->file_lock);
if (tofree)
diff --git a/fs/file.c b/fs/file.c
index c5575de..7dbd9c5 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -147,8 +147,8 @@ static struct fdtable * alloc_fdtable(un
nr /= (1024 / sizeof(struct file *));
nr = roundup_pow_of_two(nr + 1);
nr *= (1024 / sizeof(struct file *));
- if (nr > NR_OPEN)
- nr = NR_OPEN;
+ if (nr > MAX_NR_OPEN)
+ nr = MAX_NR_OPEN;
fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL);
if (!fdt)
@@ -233,7 +233,7 @@ int expand_files(struct files_struct *fi
if (nr < fdt->max_fds)
return 0;
/* Can we expand? */
- if (nr >= NR_OPEN)
+ if (nr >= MAX_NR_OPEN)
return -EMFILE;
/* All good, so we try */
diff --git a/fs/open.c b/fs/open.c
index 0d515d1..340e69b 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -860,12 +860,23 @@ int get_unused_fd(void)
struct files_struct * files = current->files;
int fd, error;
struct fdtable *fdt;
+ unsigned int page_nr;
error = -EMFILE;
spin_lock(&files->file_lock);
repeat:
fdt = files_fdtable(files);
+ page_nr = files->next_fd / FDSBITS;
+ /*
+ * We can avoid testing big chunks of memory if all bit are set
+ */
+ while (files->fds_counter[page_nr] == FDSBITS) {
+ page_nr++;
+ files->next_fd = page_nr * FDSBITS;
+ if (files->next_fd >= fdt->max_fds)
+ break;
+ }
fd = find_next_zero_bit(fdt->open_fds->fds_bits, fdt->max_fds,
files->next_fd);
@@ -891,6 +902,7 @@ repeat:
}
FD_SET(fd, fdt->open_fds);
+ files->fds_counter[fd / FDSBITS]++;
FD_CLR(fd, fdt->close_on_exec);
files->next_fd = fd + 1;
#if 1
@@ -913,6 +925,7 @@ static void __put_unused_fd(struct files
{
struct fdtable *fdt = files_fdtable(files);
__FD_CLR(fd, fdt->open_fds);
+ files->fds_counter[fd / FDSBITS]--;
if (fd < files->next_fd)
files->next_fd = fd;
}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b3ae77c..9db2799 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -20,8 +20,6 @@ #include <linux/ioctl.h>
*/
/* Fixed constants first: */
-#undef NR_OPEN
-#define NR_OPEN (1024*1024) /* Absolute upper limit on fd num */
#define INR_OPEN 1024 /* Initial setting for nfile rlimits */
#define BLOCK_SIZE_BITS 10
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 276ccaa..bd24190 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -26,6 +26,7 @@ #define INIT_FILES \
.fdtab = INIT_FDTABLE, \
.file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), \
.next_fd = 0, \
+ .fds_counter = {0}, \
.close_on_exec_init = { { 0, } }, \
.open_fds_init = { { 0, } }, \
.fd_array = { NULL, } \
diff --git a/kernel/fork.c b/kernel/fork.c
index 73ad5cd..f4341c5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -641,6 +641,7 @@ static struct files_struct *alloc_files(
spin_lock_init(&newf->file_lock);
newf->next_fd = 0;
+ memset(newf->fds_counter, 0, sizeof(newf->fds_counter));
fdt = &newf->fdtab;
fdt->max_fds = NR_OPEN_DEFAULT;
fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
prev parent reply other threads:[~2007-06-05 22:30 UTC|newest]
Thread overview: 26+ messages / expand[flat|nested] mbox.gz Atom feed top
2007-06-02 22:59 [patch 1/2] ufd v1 - unsequential O(1) fdmap core Davide Libenzi
2007-06-03 21:19 ` Eric Dumazet
2007-06-03 22:51 ` Davide Libenzi
2007-06-04 6:08 ` Andrew Morton
2007-06-04 8:05 ` Ingo Molnar
2007-06-04 8:09 ` Ingo Molnar
2007-06-04 8:34 ` Andrew Morton
2007-06-04 8:42 ` Ingo Molnar
2007-06-04 8:47 ` Andrew Morton
2007-06-04 13:05 ` Davide Libenzi
2007-06-04 13:30 ` Davide Libenzi
2007-06-04 16:56 ` Andrew Morton
2007-06-04 17:57 ` Davide Libenzi
2007-06-04 10:28 ` Eric Dumazet
2007-06-04 12:55 ` Davide Libenzi
2007-06-04 13:25 ` Eric Dumazet
2007-06-04 13:33 ` Davide Libenzi
2007-06-04 13:35 ` Davide Libenzi
2007-06-04 14:28 ` Eric Dumazet
2007-06-04 14:53 ` Davide Libenzi
2007-06-04 14:12 ` Ingo Molnar
2007-06-04 14:27 ` Eric Dumazet
2007-06-05 20:37 ` Ingo Molnar
2007-06-05 20:50 ` Thomas Gleixner
2007-06-05 20:57 ` Eric Dumazet
2007-06-05 22:29 ` Eric Dumazet [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=4665E3BB.2010401@cosmosbay.com \
--to=dada1@cosmosbay.com \
--cc=akpm@linux-foundation.org \
--cc=davidel@xmailserver.org \
--cc=drepper@redhat.com \
--cc=linux-kernel@vger.kernel.org \
--cc=mingo@elte.hu \
--cc=tglx@linutronix.de \
--cc=torvalds@linux-foundation.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox