From: Eric Dumazet <dada1@cosmosbay.com>
To: Linus Torvalds <torvalds@osdl.org>
Cc: Benjamin LaHaise <bcrl@kvack.org>, Sonny Rao <sonny@burdell.org>,
Linux Kernel Mailing List <linux-kernel@vger.kernel.org>
Subject: Re: "Read my lips: no more merges" - aka Linux 2.6.14-rc1
Date: Thu, 15 Sep 2005 23:08:22 +0200 [thread overview]
Message-ID: <4329E2C6.4030106@cosmosbay.com> (raw)
In-Reply-To: <Pine.LNX.4.58.0509151328260.26803@g5.osdl.org>
[-- Attachment #1: Type: text/plain, Size: 1477 bytes --]
Linus Torvalds a écrit :
>
> On Thu, 15 Sep 2005, Benjamin LaHaise wrote:
>
>>Alternatively, the kernel could track available file descriptors using a
>>tree to efficiently insert freed slots into an ordered list of free
>>regions (something similar to the avl tree used in vmas). Is it worth
>>doing?
>
>
> For file descriptors, even a few hundred is considered a _lot_ in almost
> all settings. Yes, you can certainly have more, but it's unusual.
>
> And we keep track of the fd reservations with a bitmap _and_ a "lowest
> possible" count. So we can check 32 fd's in one go (64 on modern setups),
> starting from the last one we allocated.
>
> In other words, no. It's not worth doing anything more than we already do.
>
> I bet all the expense in this area tends under heavy load to be the
> cacheline bouncing of the updates. Keeping the lock close to the bitmap is
> probably advantageous, since the bitmap tends to be looked at only when we
> need to change them (and we hold the lock).
Absolutely :)
Here is the patch I use to :
- place modified bits (file_lock & next_fd) in a separate cache line, reducing
cacheline bouncing for multi threaded apps.
- Reduce the size of struct (files_struct), using small embedded fd_sets
matching fd_array[NR_OPEN_DEFAULT] (ie one long per 'fd_set' instead of 128 bytes)
- 10 % gain on a benchmark ran on a dual HT Xeon 2GHz, one thread per logical CPU.
Eric
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
[-- Attachment #2: patch_reorder_1 --]
[-- Type: text/plain, Size: 9818 bytes --]
--- linux-2.6.14-rc1/include/linux/file.h 2005-09-13 05:12:09.000000000 +0200
+++ linux-2.6.14-rc1-ed/include/linux/file.h 2005-09-16 00:54:41.000000000 +0200
@@ -16,11 +16,17 @@
* as this is the granularity returned by copy_fdset().
*/
#define NR_OPEN_DEFAULT BITS_PER_LONG
+/*
+ * The embedded_fd_set is a small subset of a fd_set.
+ * One long is enough for most tasks.
+ */
+typedef struct {
+ unsigned long fds_bits[NR_OPEN_DEFAULT/BITS_PER_LONG];
+} embedded_fd_set;
struct fdtable {
unsigned int max_fds;
- int max_fdset;
- int next_fd;
+ unsigned int max_fdset;
struct file ** fd; /* current fd array */
fd_set *close_on_exec;
fd_set *open_fds;
@@ -33,13 +39,21 @@
* Open file table structure
*/
struct files_struct {
- atomic_t count;
- spinlock_t file_lock; /* Protects all the below members. Nests inside tsk->alloc_lock */
+/*
+ * Mostly read part
+ */
+ atomic_t count;
struct fdtable *fdt;
struct fdtable fdtab;
- fd_set close_on_exec_init;
- fd_set open_fds_init;
- struct file * fd_array[NR_OPEN_DEFAULT];
+
+/*
+ * Modified part (open()/close()) in a separate cache line
+ */
+ spinlock_t file_lock ____cacheline_aligned_in_smp; /* Writers take this lock. Nests inside tsk->alloc_lock */
+ int next_fd;
+ embedded_fd_set close_on_exec_init;
+ embedded_fd_set open_fds_init;
+ struct file * fd_array[NR_OPEN_DEFAULT];
};
#define files_fdtable(files) (rcu_dereference((files)->fdt))
@@ -63,13 +77,11 @@
extern void filp_ctor(void * objp, struct kmem_cache_s *cachep, unsigned long cflags);
extern void filp_dtor(void * objp, struct kmem_cache_s *cachep, unsigned long dflags);
-extern struct file ** alloc_fd_array(int);
-extern void free_fd_array(struct file **, int);
+extern void free_fd_array(struct file **, unsigned int);
-extern fd_set *alloc_fdset(int);
-extern void free_fdset(fd_set *, int);
+extern void free_fdset(fd_set *, unsigned int);
-extern int expand_files(struct files_struct *, int nr);
+extern int expand_files(struct files_struct *, unsigned int nr);
extern void free_fdtable(struct fdtable *fdt);
extern void __init files_defer_init(void);
--- linux-2.6.14-rc1/include/linux/init_task.h 2005-09-13 05:12:09.000000000 +0200
+++ linux-2.6.14-rc1-ed/include/linux/init_task.h 2005-09-16 00:09:19.000000000 +0200
@@ -8,10 +8,9 @@
{ \
.max_fds = NR_OPEN_DEFAULT, \
.max_fdset = __FD_SETSIZE, \
- .next_fd = 0, \
.fd = &init_files.fd_array[0], \
- .close_on_exec = &init_files.close_on_exec_init, \
- .open_fds = &init_files.open_fds_init, \
+ .close_on_exec = (fd_set *)&init_files.close_on_exec_init, \
+ .open_fds = (fd_set *)&init_files.open_fds_init, \
.rcu = RCU_HEAD_INIT, \
.free_files = NULL, \
.next = NULL, \
@@ -20,9 +19,10 @@
#define INIT_FILES \
{ \
.count = ATOMIC_INIT(1), \
- .file_lock = SPIN_LOCK_UNLOCKED, \
.fdt = &init_files.fdtab, \
.fdtab = INIT_FDTABLE, \
+ .file_lock = SPIN_LOCK_UNLOCKED, \
+ .next_fd = 0, \
.close_on_exec_init = { { 0, } }, \
.open_fds_init = { { 0, } }, \
.fd_array = { NULL, } \
--- linux-2.6.14-rc1/fs/file.c 2005-09-13 05:12:09.000000000 +0200
+++ linux-2.6.14-rc1-ed/fs/file.c 2005-09-16 00:53:15.000000000 +0200
@@ -38,10 +38,10 @@
* Allocate an fd array, using kmalloc or vmalloc.
* Note: the array isn't cleared at allocation time.
*/
-struct file ** alloc_fd_array(int num)
+static struct file ** alloc_fd_array(unsigned int num)
{
struct file **new_fds;
- int size = num * sizeof(struct file *);
+ unsigned int size = num * sizeof(struct file *);
if (size <= PAGE_SIZE)
new_fds = (struct file **) kmalloc(size, GFP_KERNEL);
@@ -50,12 +50,12 @@
return new_fds;
}
-void free_fd_array(struct file **array, int num)
+void free_fd_array(struct file **array, unsigned int num)
{
- int size = num * sizeof(struct file *);
+ unsigned int size = num * sizeof(struct file *);
- if (!array) {
- printk (KERN_ERR "free_fd_array: array = 0 (num = %d)\n", num);
+ if (unlikely(!array)) {
+ printk (KERN_ERR "free_fd_array: array = 0 (num = %u)\n", num);
return;
}
@@ -69,7 +69,7 @@
static void __free_fdtable(struct fdtable *fdt)
{
- int fdset_size, fdarray_size;
+ unsigned int fdset_size, fdarray_size;
fdset_size = fdt->max_fdset / 8;
fdarray_size = fdt->max_fds * sizeof(struct file *);
@@ -129,7 +129,8 @@
kmem_cache_free(files_cachep, fdt->free_files);
return;
}
- if (fdt->max_fdset <= __FD_SETSIZE && fdt->max_fds <= NR_OPEN_DEFAULT) {
+ if (fdt->max_fdset <= 8 * sizeof(embedded_fd_set) &&
+ fdt->max_fds <= NR_OPEN_DEFAULT) {
/*
* The fdtable was embedded
*/
@@ -159,8 +160,9 @@
void free_fdtable(struct fdtable *fdt)
{
- if (fdt->free_files || fdt->max_fdset > __FD_SETSIZE ||
- fdt->max_fds > NR_OPEN_DEFAULT)
+ if (fdt->free_files ||
+ fdt->max_fdset > 8 * sizeof(embedded_fd_set) ||
+ fdt->max_fds > NR_OPEN_DEFAULT)
call_rcu(&fdt->rcu, free_fdtable_rcu);
}
@@ -170,7 +172,7 @@
*/
static void copy_fdtable(struct fdtable *nfdt, struct fdtable *fdt)
{
- int i;
+ unsigned int i;
int count;
BUG_ON(nfdt->max_fdset < fdt->max_fdset);
@@ -203,17 +205,17 @@
(nfdt->max_fds - fdt->max_fds) *
sizeof(struct file *));
}
- nfdt->next_fd = fdt->next_fd;
+// nfdt->next_fd = fdt->next_fd;
}
/*
* Allocate an fdset array, using kmalloc or vmalloc.
* Note: the array isn't cleared at allocation time.
*/
-fd_set * alloc_fdset(int num)
+static fd_set * alloc_fdset(unsigned int num)
{
fd_set *new_fdset;
- int size = num / 8;
+ unsigned int size = num / 8;
if (size <= PAGE_SIZE)
new_fdset = (fd_set *) kmalloc(size, GFP_KERNEL);
@@ -222,11 +224,11 @@
return new_fdset;
}
-void free_fdset(fd_set *array, int num)
+void free_fdset(fd_set *array, unsigned int num)
{
- int size = num / 8;
+ unsigned int size = num / 8;
- if (num <= __FD_SETSIZE) /* Don't free an embedded fdset */
+ if (size <= sizeof(embedded_fd_set)) /* Don't free an embedded fdset */
return;
else if (size <= PAGE_SIZE)
kfree(array);
@@ -234,10 +236,10 @@
vfree(array);
}
-static struct fdtable *alloc_fdtable(int nr)
+static struct fdtable *alloc_fdtable(unsigned int nr)
{
struct fdtable *fdt = NULL;
- int nfds = 0;
+ unsigned int nfds = 0;
fd_set *new_openset = NULL, *new_execset = NULL;
struct file **new_fds;
@@ -246,16 +248,12 @@
goto out;
memset(fdt, 0, sizeof(*fdt));
- nfds = __FD_SETSIZE;
+ nfds = 8 * L1_CACHE_BYTES;
/* Expand to the max in easy steps */
do {
- if (nfds < (PAGE_SIZE * 8))
- nfds = PAGE_SIZE * 8;
- else {
- nfds = nfds * 2;
- if (nfds > NR_OPEN)
- nfds = NR_OPEN;
- }
+ nfds = nfds * 2;
+ if (nfds > NR_OPEN)
+ nfds = NR_OPEN;
} while (nfds <= nr);
new_openset = alloc_fdset(nfds);
@@ -306,7 +304,7 @@
* both fd array and fdset. It is expected to be called with the
* files_lock held.
*/
-static int expand_fdtable(struct files_struct *files, int nr)
+static int expand_fdtable(struct files_struct *files, unsigned int nr)
__releases(files->file_lock)
__acquires(files->file_lock)
{
@@ -348,7 +346,7 @@
* Return <0 on error; 0 nothing done; 1 files expanded, we may have blocked.
* Should be called with the files->file_lock spinlock held for write.
*/
-int expand_files(struct files_struct *files, int nr)
+int expand_files(struct files_struct *files, unsigned int nr)
{
int err, expand = 0;
struct fdtable *fdt;
--- linux-2.6.14-rc1/fs/fcntl.c 2005-09-13 05:12:09.000000000 +0200
+++ linux-2.6.14-rc1-ed/fs/fcntl.c 2005-09-16 00:09:19.000000000 +0200
@@ -69,11 +69,11 @@
fdt = files_fdtable(files);
/*
* Someone might have closed fd's in the range
- * orig_start..fdt->next_fd
+ * orig_start..files->next_fd
*/
start = orig_start;
- if (start < fdt->next_fd)
- start = fdt->next_fd;
+ if (start < files->next_fd)
+ start = files->next_fd;
newfd = start;
if (start < fdt->max_fdset) {
@@ -101,9 +101,8 @@
* we reacquire the fdtable pointer and use it while holding
* the lock, no one can free it during that time.
*/
- fdt = files_fdtable(files);
- if (start <= fdt->next_fd)
- fdt->next_fd = newfd + 1;
+ if (start <= files->next_fd)
+ files->next_fd = newfd + 1;
error = newfd;
--- linux-2.6.14-rc1/fs/open.c 2005-09-13 05:12:09.000000000 +0200
+++ linux-2.6.14-rc1-ed/fs/open.c 2005-09-16 00:11:23.000000000 +0200
@@ -852,7 +852,7 @@
fdt = files_fdtable(files);
fd = find_next_zero_bit(fdt->open_fds->fds_bits,
fdt->max_fdset,
- fdt->next_fd);
+ files->next_fd);
/*
* N.B. For clone tasks sharing a files structure, this test
@@ -877,7 +877,7 @@
FD_SET(fd, fdt->open_fds);
FD_CLR(fd, fdt->close_on_exec);
- fdt->next_fd = fd + 1;
+ files->next_fd = fd + 1;
#if 1
/* Sanity check */
if (fdt->fd[fd] != NULL) {
@@ -898,8 +898,8 @@
{
struct fdtable *fdt = files_fdtable(files);
__FD_CLR(fd, fdt->open_fds);
- if (fd < fdt->next_fd)
- fdt->next_fd = fd;
+ if (fd < files->next_fd)
+ files->next_fd = fd;
}
void fastcall put_unused_fd(unsigned int fd)
--- linux-2.6.14-rc1/kernel/fork.c 2005-09-13 05:12:09.000000000 +0200
+++ linux-2.6.14-rc1-ed/kernel/fork.c 2005-09-16 00:21:18.000000000 +0200
@@ -592,12 +592,12 @@
atomic_set(&newf->count, 1);
spin_lock_init(&newf->file_lock);
+ newf->next_fd = 0;
fdt = &newf->fdtab;
- fdt->next_fd = 0;
fdt->max_fds = NR_OPEN_DEFAULT;
- fdt->max_fdset = __FD_SETSIZE;
- fdt->close_on_exec = &newf->close_on_exec_init;
- fdt->open_fds = &newf->open_fds_init;
+ fdt->max_fdset = 8 * sizeof(embedded_fd_set);
+ fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
+ fdt->open_fds = (fd_set *)&newf->open_fds_init;
fdt->fd = &newf->fd_array[0];
INIT_RCU_HEAD(&fdt->rcu);
fdt->free_files = NULL;
next prev parent reply other threads:[~2005-09-15 21:08 UTC|newest]
Thread overview: 54+ messages / expand[flat|nested] mbox.gz Atom feed top
2005-09-13 3:34 "Read my lips: no more merges" - aka Linux 2.6.14-rc1 Linus Torvalds
2005-09-13 3:54 ` Alejandro Bonilla Beeche
2005-09-13 3:59 ` Keith Owens
2005-09-13 4:03 ` Alejandro Bonilla Beeche
2005-09-14 5:16 ` Alejandro Bonilla Beeche
2005-09-14 16:28 ` Jeff Garzik
2005-09-14 16:40 ` Alejandro Bonilla
2005-09-14 16:43 ` Linus Torvalds
2005-09-14 16:52 ` Alejandro Bonilla
2005-09-15 0:48 ` Alejandro Bonilla Beeche
2005-09-13 14:27 ` Linus Torvalds
2005-09-13 6:28 ` more fallout from ATI Xpress timer workaround (was: Linux 2.6.14-rc1) Cal Peake
2005-09-13 20:04 ` Jean Delvare
2005-09-13 6:33 ` "Read my lips: no more merges" - aka Linux 2.6.14-rc1 Sonny Rao
2005-09-13 7:04 ` Eric Dumazet
2005-09-15 4:06 ` David S. Miller
2005-09-15 4:22 ` Linus Torvalds
2005-09-15 20:13 ` Benjamin LaHaise
2005-09-15 20:32 ` Linus Torvalds
2005-09-15 21:08 ` Eric Dumazet [this message]
2005-09-15 20:41 ` Eric Dumazet
2005-09-13 7:34 ` Udo A. Steinberg
2005-09-13 10:40 ` Mathieu Fluhr
2005-09-13 11:15 ` Helge Hafting
2005-09-13 15:14 ` Linus Torvalds
2005-09-13 17:01 ` Mathieu Fluhr
2005-09-13 17:15 ` Linus Torvalds
2005-09-13 18:12 ` Mathieu Fluhr
2005-09-13 19:11 ` Linus Torvalds
2005-09-14 8:11 ` 2.6.13 brings buffer underruns when recording DVDs in 16x (was Re: "Read my lips: no more merges" - aka Linux 2.6.14-rc1) Mathieu Fluhr
2005-09-14 8:30 ` Andrew Morton
2005-09-14 10:32 ` Mathieu Fluhr
2005-09-14 10:58 ` Andrew Morton
2005-09-14 11:12 ` Alessandro Suardi
2005-09-14 15:04 ` "Read my lips: no more merges" - aka Linux 2.6.14-rc1 Bill Davidsen
2005-09-14 23:38 ` Redeeman
2005-09-13 18:34 ` Roland Dreier
2005-09-13 18:46 ` Linus Torvalds
2005-09-13 21:32 ` Horst von Brand
2005-09-13 19:57 ` Rafael J. Wysocki
2005-09-14 15:31 ` Bill Davidsen
2005-09-14 22:56 ` Matthew Garrett
2005-09-14 17:33 ` Bill Davidsen
2005-09-14 17:45 ` Bill Davidsen
2005-09-14 21:47 ` Henrik Persson
2005-09-14 23:20 ` Jesper Juhl
2005-09-16 19:51 ` Henrik Persson
2005-09-14 22:11 ` 2.6.14-rc1 on ATI hangs when executing _STA and _INI methods Peter Osterlund
2005-09-14 22:27 ` Linus Torvalds
2005-09-14 22:41 ` Peter Osterlund
2005-09-14 23:27 ` "Read my lips: no more merges" - aka Linux 2.6.14-rc1 Redeeman
2005-09-16 7:44 ` Tomasz Torcz
-- strict thread matches above, loose matches on Subject: below --
2005-09-13 6:07 Voluspa
2005-09-14 17:04 Steve Lee
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=4329E2C6.4030106@cosmosbay.com \
--to=dada1@cosmosbay.com \
--cc=bcrl@kvack.org \
--cc=linux-kernel@vger.kernel.org \
--cc=sonny@burdell.org \
--cc=torvalds@osdl.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox