* [patch v2] epoll use a single inode ...
@ 2007-03-05 23:41 Davide Libenzi
2007-03-06 0:00 ` Linus Torvalds
0 siblings, 1 reply; 29+ messages in thread
From: Davide Libenzi @ 2007-03-05 23:41 UTC (permalink / raw)
To: Linux Kernel Mailing List; +Cc: Linus Torvalds, Andrew Morton, Eric Dumazet
ChangeLog:
- v2) Properly size the buffer. Avoid extra strlen() (Eric Dumazet)
Epoll does not keep any private data attached to its inode, so there'd be
no need to allocate one inode per fd. For epoll, the inode is just a
placeholder for the file operations and could be shared by all instances.
I'd like to use the same optimization even for the upcoming file-based
objects, so if you see problems let me know.
One that Al was pointing out was that an fstat(2) over an epoll fd would
show the same st_ino. IMO that should be fine since an fstat(2) over an
epoll fd is not something you want to do in any case and expecting
meaningfull results.
This patch also avoids the dentry associated with the file* to be hashed
inside the global dentry hash.
Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
- Davide
eventpoll.c | 39 +++++++++++++++++++++++++++++++++------
1 file changed, 33 insertions(+), 6 deletions(-)
Index: linux-2.6.20.ep2/fs/eventpoll.c
===================================================================
--- linux-2.6.20.ep2.orig/fs/eventpoll.c 2007-03-04 14:40:01.000000000 -0800
+++ linux-2.6.20.ep2/fs/eventpoll.c 2007-03-05 15:26:16.000000000 -0800
@@ -258,6 +258,7 @@
int maxevents, long timeout);
static int eventpollfs_delete_dentry(struct dentry *dentry);
static struct inode *ep_eventpoll_inode(void);
+static struct inode *ep_create_inode(void);
static int eventpollfs_get_sb(struct file_system_type *fs_type,
int flags, const char *dev_name,
void *data, struct vfsmount *mnt);
@@ -279,6 +280,9 @@
/* Virtual fs used to allocate inodes for eventpoll files */
static struct vfsmount *eventpoll_mnt __read_mostly;
+/* Placeholder inode for eventpoll fds */
+static struct inode *eventpoll_inode;
+
/* File callbacks that implement the eventpoll file behaviour */
static const struct file_operations eventpoll_fops = {
.release = ep_eventpoll_close,
@@ -733,7 +737,7 @@
struct eventpoll *ep)
{
struct qstr this;
- char name[32];
+ char name[2 * sizeof(void *) + 8];
struct dentry *dentry;
struct inode *inode;
struct file *file;
@@ -763,15 +767,17 @@
* using the inode number.
*/
error = -ENOMEM;
- sprintf(name, "[%lu]", inode->i_ino);
this.name = name;
- this.len = strlen(name);
- this.hash = inode->i_ino;
+ this.len = sprintf(name, "[%p]", ep);
+ this.hash = 0;
dentry = d_alloc(eventpoll_mnt->mnt_sb->s_root, &this);
if (!dentry)
goto eexit_4;
dentry->d_op = &eventpollfs_dentry_operations;
- d_add(dentry, inode);
+ /* Do not publish this dentry inside the global dentry hash table */
+ dentry->d_flags &= ~DCACHE_UNHASHED;
+ d_instantiate(dentry, inode);
+
file->f_path.mnt = mntget(eventpoll_mnt);
file->f_path.dentry = dentry;
file->f_mapping = inode->i_mapping;
@@ -1555,6 +1561,11 @@
static int eventpollfs_delete_dentry(struct dentry *dentry)
{
+ /*
+ * We faked vfs to believe the dentry was hashed when we created it.
+ * Now we restore the flag so that dput() will work correctly.
+ */
+ dentry->d_flags |= DCACHE_UNHASHED;
return 1;
}
@@ -1562,6 +1573,17 @@
static struct inode *ep_eventpoll_inode(void)
{
+
+ return igrab(eventpoll_inode);
+}
+
+/*
+ * A single inode exist for all eventpoll files. On the contrary of pipes,
+ * eventpoll inodes has no per-instance data associated, so we can avoid
+ * the allocation of multiple of them.
+ */
+static struct inode *ep_create_inode(void)
+{
int error = -ENOMEM;
struct inode *inode = new_inode(eventpoll_mnt->mnt_sb);
@@ -1626,10 +1648,14 @@
/* Mount the above commented virtual file system */
eventpoll_mnt = kern_mount(&eventpoll_fs_type);
- error = PTR_ERR(eventpoll_mnt);
if (IS_ERR(eventpoll_mnt))
goto epanic;
+ /* Create the single instance of inode for all eventpoll fds */
+ eventpoll_inode = ep_create_inode();
+ if (IS_ERR(eventpoll_inode))
+ goto epanic;
+
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: successfully initialized.\n",
current));
return 0;
@@ -1642,6 +1668,7 @@
static void __exit eventpoll_exit(void)
{
/* Undo all operations done inside eventpoll_init() */
+ iput(eventpoll_inode);
unregister_filesystem(&eventpoll_fs_type);
mntput(eventpoll_mnt);
kmem_cache_destroy(pwq_cache);
^ permalink raw reply [flat|nested] 29+ messages in thread* Re: [patch v2] epoll use a single inode ... 2007-03-05 23:41 [patch v2] epoll use a single inode Davide Libenzi @ 2007-03-06 0:00 ` Linus Torvalds 2007-03-06 0:12 ` Davide Libenzi 0 siblings, 1 reply; 29+ messages in thread From: Linus Torvalds @ 2007-03-06 0:00 UTC (permalink / raw) To: Davide Libenzi; +Cc: Linux Kernel Mailing List, Andrew Morton, Eric Dumazet On Mon, 5 Mar 2007, Davide Libenzi wrote: > @@ -763,15 +767,17 @@ > * using the inode number. > */ > error = -ENOMEM; > - sprintf(name, "[%lu]", inode->i_ino); > this.name = name; > - this.len = strlen(name); > - this.hash = inode->i_ino; > + this.len = sprintf(name, "[%p]", ep); > + this.hash = 0; Please don't expose kernel pointers to user space. It's much better to do something like static unsigned int epoll_inode; this.len = sprintf(name, "[%u]", ++epoll_inode); if you just need some pseudo-unique name to distinguish two epoll things from each other (vs from a dup'ed fd). Linus ^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [patch v2] epoll use a single inode ... 2007-03-06 0:00 ` Linus Torvalds @ 2007-03-06 0:12 ` Davide Libenzi 2007-03-06 0:20 ` Davide Libenzi ` (2 more replies) 0 siblings, 3 replies; 29+ messages in thread From: Davide Libenzi @ 2007-03-06 0:12 UTC (permalink / raw) To: Linus Torvalds; +Cc: Linux Kernel Mailing List, Andrew Morton, Eric Dumazet On Mon, 5 Mar 2007, Linus Torvalds wrote: > On Mon, 5 Mar 2007, Davide Libenzi wrote: > > @@ -763,15 +767,17 @@ > > * using the inode number. > > */ > > error = -ENOMEM; > > - sprintf(name, "[%lu]", inode->i_ino); > > this.name = name; > > - this.len = strlen(name); > > - this.hash = inode->i_ino; > > + this.len = sprintf(name, "[%p]", ep); > > + this.hash = 0; > > Please don't expose kernel pointers to user space. > > It's much better to do something like > > static unsigned int epoll_inode; > > this.len = sprintf(name, "[%u]", ++epoll_inode); > > if you just need some pseudo-unique name to distinguish two epoll things > from each other (vs from a dup'ed fd). Heh, this is what Al was saying ;) I'm fine with that, but how about counter cycles (going back to zero)? Should we care to handle them correctly? - Davide ^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [patch v2] epoll use a single inode ... 2007-03-06 0:12 ` Davide Libenzi @ 2007-03-06 0:20 ` Davide Libenzi 2007-03-06 0:25 ` Linus Torvalds 2007-03-10 8:06 ` Pavel Machek 2 siblings, 0 replies; 29+ messages in thread From: Davide Libenzi @ 2007-03-06 0:20 UTC (permalink / raw) To: Linus Torvalds; +Cc: Linux Kernel Mailing List, Andrew Morton, Eric Dumazet On Mon, 5 Mar 2007, Davide Libenzi wrote: > On Mon, 5 Mar 2007, Linus Torvalds wrote: > > > On Mon, 5 Mar 2007, Davide Libenzi wrote: > > > @@ -763,15 +767,17 @@ > > > * using the inode number. > > > */ > > > error = -ENOMEM; > > > - sprintf(name, "[%lu]", inode->i_ino); > > > this.name = name; > > > - this.len = strlen(name); > > > - this.hash = inode->i_ino; > > > + this.len = sprintf(name, "[%p]", ep); > > > + this.hash = 0; > > > > Please don't expose kernel pointers to user space. > > > > It's much better to do something like > > > > static unsigned int epoll_inode; > > > > this.len = sprintf(name, "[%u]", ++epoll_inode); > > > > if you just need some pseudo-unique name to distinguish two epoll things > > from each other (vs from a dup'ed fd). > > Heh, this is what Al was saying ;) > I'm fine with that, but how about counter cycles (going back to zero)? > Should we care to handle them correctly? Maybe a [TID.FD] format? - Davide ^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [patch v2] epoll use a single inode ... 2007-03-06 0:12 ` Davide Libenzi 2007-03-06 0:20 ` Davide Libenzi @ 2007-03-06 0:25 ` Linus Torvalds 2007-03-06 2:25 ` H. Peter Anvin 2007-03-10 8:06 ` Pavel Machek 2 siblings, 1 reply; 29+ messages in thread From: Linus Torvalds @ 2007-03-06 0:25 UTC (permalink / raw) To: Davide Libenzi; +Cc: Linux Kernel Mailing List, Andrew Morton, Eric Dumazet On Mon, 5 Mar 2007, Davide Libenzi wrote: > On Mon, 5 Mar 2007, Linus Torvalds wrote: > > > > It's much better to do something like > > > > static unsigned int epoll_inode; > > > > this.len = sprintf(name, "[%u]", ++epoll_inode); > > > > if you just need some pseudo-unique name to distinguish two epoll things > > from each other (vs from a dup'ed fd). > > Heh, this is what Al was saying ;) > I'm fine with that, but how about counter cycles (going back to zero)? > Should we care to handle them correctly? Since this is not actually *used* for anything but showing the fd's in /proc/<pid>/fd/ etc, no. In fact, an integer will wrap a *lot* less than a kernel data structure will be re-used, so even with the simple "wraps every 4G uses", you're still better off. IOW, if the thing actually _mattered_ we should use some bitmap allocator or similar (eg pidmaps etc), but with something where the only reason really is as a visible indicator of difference for a user that happens to look, simple-and-stupid is better. Linus ^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [patch v2] epoll use a single inode ... 2007-03-06 0:25 ` Linus Torvalds @ 2007-03-06 2:25 ` H. Peter Anvin 2007-03-06 2:34 ` Davide Libenzi 0 siblings, 1 reply; 29+ messages in thread From: H. Peter Anvin @ 2007-03-06 2:25 UTC (permalink / raw) To: Linus Torvalds Cc: Davide Libenzi, Linux Kernel Mailing List, Andrew Morton, Eric Dumazet Linus Torvalds wrote: > > Since this is not actually *used* for anything but showing the fd's in > /proc/<pid>/fd/ etc, no. In fact, an integer will wrap a *lot* less than a > kernel data structure will be re-used, so even with the simple "wraps > every 4G uses", you're still better off. > ... and if that worries you, use a 64-bit counter. They're cheap (compared to an sprintf), and even if they advance once a nanosecond they won't wrap around for over 584 years. > IOW, if the thing actually _mattered_ we should use some bitmap allocator > or similar (eg pidmaps etc), but with something where the only reason > really is as a visible indicator of difference for a user that happens to > look, simple-and-stupid is better. That only makes sense if it matters that the numbers are kept small. This is in fact a highly suboptimal constraint, as it is almost guaranteed to create wraparounds much sooner, but there are situations in which that's the only option, e.g. for pty number allocations due to glibc braindamage. -hpa ^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [patch v2] epoll use a single inode ... 2007-03-06 2:25 ` H. Peter Anvin @ 2007-03-06 2:34 ` Davide Libenzi 2007-03-06 2:37 ` H. Peter Anvin 0 siblings, 1 reply; 29+ messages in thread From: Davide Libenzi @ 2007-03-06 2:34 UTC (permalink / raw) To: H. Peter Anvin Cc: Linus Torvalds, Linux Kernel Mailing List, Andrew Morton, Eric Dumazet On Mon, 5 Mar 2007, H. Peter Anvin wrote: > Linus Torvalds wrote: > > > > Since this is not actually *used* for anything but showing the fd's in > > /proc/<pid>/fd/ etc, no. In fact, an integer will wrap a *lot* less than a > > kernel data structure will be re-used, so even with the simple "wraps every > > 4G uses", you're still better off. > > > > ... and if that worries you, use a 64-bit counter. They're cheap (compared to > an sprintf), and even if they advance once a nanosecond they won't wrap around > for over 584 years. Right now is using: this.len = sprintf(name, "[%u.%d]", current->pid, fd); That should be unique and not have the wraparound problem. Ack? - Davide ^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [patch v2] epoll use a single inode ... 2007-03-06 2:34 ` Davide Libenzi @ 2007-03-06 2:37 ` H. Peter Anvin 2007-03-06 2:43 ` Davide Libenzi 0 siblings, 1 reply; 29+ messages in thread From: H. Peter Anvin @ 2007-03-06 2:37 UTC (permalink / raw) To: Davide Libenzi Cc: Linus Torvalds, Linux Kernel Mailing List, Andrew Morton, Eric Dumazet Davide Libenzi wrote: > > Right now is using: > > this.len = sprintf(name, "[%u.%d]", current->pid, fd); > > That should be unique and not have the wraparound problem. Ack? > NAK, very much NAK. File descriptors aren't file structures, they're *pointers* to file structures. It's perfectly possible -- downright common -- for a file descriptor to be inherited by another process, and then the pid is recycled -- collision. -hpa ^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [patch v2] epoll use a single inode ... 2007-03-06 2:37 ` H. Peter Anvin @ 2007-03-06 2:43 ` Davide Libenzi 2007-03-06 6:22 ` Eric Dumazet 0 siblings, 1 reply; 29+ messages in thread From: Davide Libenzi @ 2007-03-06 2:43 UTC (permalink / raw) To: H. Peter Anvin Cc: Linus Torvalds, Linux Kernel Mailing List, Andrew Morton, Eric Dumazet On Mon, 5 Mar 2007, H. Peter Anvin wrote: > Davide Libenzi wrote: > > > > Right now is using: > > > > this.len = sprintf(name, "[%u.%d]", current->pid, fd); > > > > That should be unique and not have the wraparound problem. Ack? > > > > NAK, very much NAK. > > File descriptors aren't file structures, they're *pointers* to file > structures. > > It's perfectly possible -- downright common -- for a file descriptor to be > inherited by another process, and then the pid is recycled -- collision. Ugh! Right! 64 bit counter it is ... :) - Davide ^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [patch v2] epoll use a single inode ... 2007-03-06 2:43 ` Davide Libenzi @ 2007-03-06 6:22 ` Eric Dumazet 2007-03-06 6:31 ` Davide Libenzi 2007-03-06 16:28 ` H. Peter Anvin 0 siblings, 2 replies; 29+ messages in thread From: Eric Dumazet @ 2007-03-06 6:22 UTC (permalink / raw) To: Davide Libenzi Cc: H. Peter Anvin, Linus Torvalds, Linux Kernel Mailing List, Andrew Morton Davide Libenzi a écrit : > On Mon, 5 Mar 2007, H. Peter Anvin wrote: > >> Davide Libenzi wrote: >>> Right now is using: >>> >>> this.len = sprintf(name, "[%u.%d]", current->pid, fd); >>> >>> That should be unique and not have the wraparound problem. Ack? >>> >> NAK, very much NAK. >> >> File descriptors aren't file structures, they're *pointers* to file >> structures. >> >> It's perfectly possible -- downright common -- for a file descriptor to be >> inherited by another process, and then the pid is recycled -- collision. > > Ugh! Right! 64 bit counter it is ... :) For epoll, I suspect this is harmless : Programs dont allocate epolls fd every milli second, but at startup only. For pipes/sockets, using a 64 bits would be problematic, because sprintf() uses a divide for each digit. And a divide is slow. Ten divides are *very* slow. ^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [patch v2] epoll use a single inode ... 2007-03-06 6:22 ` Eric Dumazet @ 2007-03-06 6:31 ` Davide Libenzi 2007-03-06 6:37 ` Davide Libenzi 2007-03-06 16:28 ` H. Peter Anvin 1 sibling, 1 reply; 29+ messages in thread From: Davide Libenzi @ 2007-03-06 6:31 UTC (permalink / raw) To: Eric Dumazet Cc: H. Peter Anvin, Linus Torvalds, Linux Kernel Mailing List, Andrew Morton [-- Attachment #1: Type: TEXT/PLAIN, Size: 1101 bytes --] On Tue, 6 Mar 2007, Eric Dumazet wrote: > Davide Libenzi a écrit : > > On Mon, 5 Mar 2007, H. Peter Anvin wrote: > > > > > Davide Libenzi wrote: > > > > Right now is using: > > > > > > > > this.len = sprintf(name, "[%u.%d]", current->pid, fd); > > > > > > > > That should be unique and not have the wraparound problem. Ack? > > > > > > > NAK, very much NAK. > > > > > > File descriptors aren't file structures, they're *pointers* to file > > > structures. > > > > > > It's perfectly possible -- downright common -- for a file descriptor to be > > > inherited by another process, and then the pid is recycled -- collision. > > > > Ugh! Right! 64 bit counter it is ... :) > > For epoll, I suspect this is harmless : Programs dont allocate epolls fd every > milli second, but at startup only. > > For pipes/sockets, using a 64 bits would be problematic, because sprintf() > uses a divide for each digit. And a divide is slow. Ten divides are *very* > slow. This would be used for epoll, signalfd and timerfd. And the printf format is %llu ;) - Davide ^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [patch v2] epoll use a single inode ... 2007-03-06 6:31 ` Davide Libenzi @ 2007-03-06 6:37 ` Davide Libenzi 0 siblings, 0 replies; 29+ messages in thread From: Davide Libenzi @ 2007-03-06 6:37 UTC (permalink / raw) To: Eric Dumazet Cc: H. Peter Anvin, Linus Torvalds, Linux Kernel Mailing List, Andrew Morton On Mon, 5 Mar 2007, Davide Libenzi wrote: > This would be used for epoll, signalfd and timerfd. And the printf format > is %llu ;) Today is really one on those days ... %llx - Davide ^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [patch v2] epoll use a single inode ... 2007-03-06 6:22 ` Eric Dumazet 2007-03-06 6:31 ` Davide Libenzi @ 2007-03-06 16:28 ` H. Peter Anvin 2007-03-06 17:09 ` Linus Torvalds ` (2 more replies) 1 sibling, 3 replies; 29+ messages in thread From: H. Peter Anvin @ 2007-03-06 16:28 UTC (permalink / raw) To: Eric Dumazet Cc: Davide Libenzi, Linus Torvalds, Linux Kernel Mailing List, Andrew Morton Eric Dumazet wrote: > > For epoll, I suspect this is harmless : Programs dont allocate epolls fd > every milli second, but at startup only. > > For pipes/sockets, using a 64 bits would be problematic, because > sprintf() uses a divide for each digit. And a divide is slow. Ten > divides are *very* slow. > That's true for *any* sprintf(), though. sprintf() converts all its arguments to 64 bits. However, this could be optimized. I think right now sprintf() uses a generic divide-by-base, but a divide by 8 and 16 can of course be handled with a shift, and divide by 10 can be replaced with a multiplication by 0x1999999999999999ULL on most architectures. -hpa ^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [patch v2] epoll use a single inode ... 2007-03-06 16:28 ` H. Peter Anvin @ 2007-03-06 17:09 ` Linus Torvalds 2007-03-06 17:14 ` H. Peter Anvin 2007-03-06 17:12 ` Eric Dumazet 2007-03-06 18:10 ` Davide Libenzi 2 siblings, 1 reply; 29+ messages in thread From: Linus Torvalds @ 2007-03-06 17:09 UTC (permalink / raw) To: H. Peter Anvin Cc: Eric Dumazet, Davide Libenzi, Linux Kernel Mailing List, Andrew Morton On Tue, 6 Mar 2007, H. Peter Anvin wrote: > > That's true for *any* sprintf(), though. sprintf() converts all its arguments > to 64 bits. Well, it very much uses "do_div()", so that it can do a 64 / 32 -> (div64,mod32) divide, which is often faster than a full 64:64 divide. > However, this could be optimized. I think right now sprintf() uses a generic > divide-by-base, but a divide by 8 and 16 can of course be handled with a > shift, and divide by 10 can be replaced with a multiplication by > 0x1999999999999999ULL on most architectures. Nope. You need both the result of the division *and* the remainder, and you can't do that with a single multiply. Also, with modern hardware, divides are usually fairly cheap, with early exit logic, so that the common case of small integers is fairly cheap. Yeah, generating a full 64-bit number printout is still expensive, of course (both because you need to do many divides *and* because only the last few divides will be able to do any appreciable early exit logic. Anyway, I think a full integer divide on Core 2 is something like 22 cycles. Yes, the multiply is much fasster (at 4 cycles), but I think that 22 cycles is actually worst-case. Somebody who has a benchmark could try. Linus ^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [patch v2] epoll use a single inode ... 2007-03-06 17:09 ` Linus Torvalds @ 2007-03-06 17:14 ` H. Peter Anvin 0 siblings, 0 replies; 29+ messages in thread From: H. Peter Anvin @ 2007-03-06 17:14 UTC (permalink / raw) To: Linus Torvalds Cc: Eric Dumazet, Davide Libenzi, Linux Kernel Mailing List, Andrew Morton Linus Torvalds wrote: > >> However, this could be optimized. I think right now sprintf() uses a generic >> divide-by-base, but a divide by 8 and 16 can of course be handled with a >> shift, and divide by 10 can be replaced with a multiplication by >> 0x1999999999999999ULL on most architectures. > > Nope. You need both the result of the division *and* the remainder, and > you can't do that with a single multiply. Oh, right. *D'oh*. -hpa ^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [patch v2] epoll use a single inode ... 2007-03-06 16:28 ` H. Peter Anvin 2007-03-06 17:09 ` Linus Torvalds @ 2007-03-06 17:12 ` Eric Dumazet 2007-03-06 17:19 ` Linus Torvalds 2007-03-06 18:10 ` Davide Libenzi 2 siblings, 1 reply; 29+ messages in thread From: Eric Dumazet @ 2007-03-06 17:12 UTC (permalink / raw) To: H. Peter Anvin Cc: Davide Libenzi, Linus Torvalds, Linux Kernel Mailing List, Andrew Morton [-- Attachment #1: Type: text/plain, Size: 1214 bytes --] On Tuesday 06 March 2007 17:28, H. Peter Anvin wrote: > Eric Dumazet wrote: > > For epoll, I suspect this is harmless : Programs dont allocate epolls fd > > every milli second, but at startup only. > > > > For pipes/sockets, using a 64 bits would be problematic, because > > sprintf() uses a divide for each digit. And a divide is slow. Ten > > divides are *very* slow. > > That's true for *any* sprintf(), though. sprintf() converts all its > arguments to 64 bits. > > However, this could be optimized. I think right now sprintf() uses a > generic divide-by-base, but a divide by 8 and 16 can of course be > handled with a shift, and divide by 10 can be replaced with a > multiplication by 0x1999999999999999ULL on most architectures. Or maybe just use reciprocal division, to keep the 35 bases available in number() Something like : [PATCH] : Use reciprocal divides in sprintf() pipe() syscalls spend a noticeable amount of time in sprintf() to format their dentry name. One name may want to print 9 or 10 digits, using one divide per digit. Using reciprocal divides permits to change each divide by two multiplies, less expensive on current CPUS. Signed-off-by: Eric Dumazet <dada1@cosmosbay.com> [-- Attachment #2: reciprocal_divide_in_sprintf.patch --] [-- Type: text/plain, Size: 2863 bytes --] diff --git a/include/linux/reciprocal_div.h b/include/linux/reciprocal_div.h index f9c90b3..55a79e0 100644 --- a/include/linux/reciprocal_div.h +++ b/include/linux/reciprocal_div.h @@ -23,7 +23,10 @@ #include <linux/types.h> * or else the performance is slower than a normal divide. */ extern u32 reciprocal_value(u32 B); - +/* + * precomputed reciprocal values of integers from 0 to 36 + */ +extern const u32 reciprocal_values[36 + 1]; static inline u32 reciprocal_divide(u32 A, u32 R) { diff --git a/lib/reciprocal_div.c b/lib/reciprocal_div.c index 6a3bd48..2dcea45 100644 --- a/lib/reciprocal_div.c +++ b/lib/reciprocal_div.c @@ -1,6 +1,31 @@ #include <asm/div64.h> #include <linux/reciprocal_div.h> +#define CONSTANT_RECIPROCAL_VALUE(k) \ + (u32)(((1LL << 32) + (k - 1)) / k) + +const u32 reciprocal_values[36 + 1] = { + 0, + CONSTANT_RECIPROCAL_VALUE(1), CONSTANT_RECIPROCAL_VALUE(2), + CONSTANT_RECIPROCAL_VALUE(3), CONSTANT_RECIPROCAL_VALUE(4), + CONSTANT_RECIPROCAL_VALUE(5), CONSTANT_RECIPROCAL_VALUE(6), + CONSTANT_RECIPROCAL_VALUE(7), CONSTANT_RECIPROCAL_VALUE(8), + CONSTANT_RECIPROCAL_VALUE(9), CONSTANT_RECIPROCAL_VALUE(10), + CONSTANT_RECIPROCAL_VALUE(11), CONSTANT_RECIPROCAL_VALUE(12), + CONSTANT_RECIPROCAL_VALUE(13), CONSTANT_RECIPROCAL_VALUE(14), + CONSTANT_RECIPROCAL_VALUE(15), CONSTANT_RECIPROCAL_VALUE(16), + CONSTANT_RECIPROCAL_VALUE(17), CONSTANT_RECIPROCAL_VALUE(18), + CONSTANT_RECIPROCAL_VALUE(19), CONSTANT_RECIPROCAL_VALUE(20), + CONSTANT_RECIPROCAL_VALUE(21), CONSTANT_RECIPROCAL_VALUE(22), + CONSTANT_RECIPROCAL_VALUE(23), CONSTANT_RECIPROCAL_VALUE(24), + CONSTANT_RECIPROCAL_VALUE(25), CONSTANT_RECIPROCAL_VALUE(26), + CONSTANT_RECIPROCAL_VALUE(27), CONSTANT_RECIPROCAL_VALUE(28), + CONSTANT_RECIPROCAL_VALUE(29), CONSTANT_RECIPROCAL_VALUE(30), + CONSTANT_RECIPROCAL_VALUE(31), CONSTANT_RECIPROCAL_VALUE(32), + CONSTANT_RECIPROCAL_VALUE(33), CONSTANT_RECIPROCAL_VALUE(34), + CONSTANT_RECIPROCAL_VALUE(35), CONSTANT_RECIPROCAL_VALUE(36), +}; + u32 reciprocal_value(u32 k) { u64 val = (1LL << 32) + (k - 1); diff --git a/lib/vsprintf.c b/lib/vsprintf.c index b025864..9c98cc4 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -22,6 +22,7 @@ #include <linux/types.h> #include <linux/string.h> #include <linux/ctype.h> #include <linux/kernel.h> +#include <linux/reciprocal_div.h> #include <asm/page.h> /* for PAGE_SIZE */ #include <asm/div64.h> @@ -180,8 +181,16 @@ static char * number(char * buf, char * i = 0; if (num == 0) tmp[i++]='0'; - else while (num != 0) - tmp[i++] = digits[do_div(num,base)]; + else { + while (num >> 32) + tmp[i++] = digits[do_div(num,base)]; + while (num != 0) { + u32 next = reciprocal_divide((u32)num, + reciprocal_values[base]); + tmp[i++] = digits[num - next*base]; + num = next; + } + } if (i > precision) precision = i; size -= precision; ^ permalink raw reply related [flat|nested] 29+ messages in thread
* Re: [patch v2] epoll use a single inode ... 2007-03-06 17:12 ` Eric Dumazet @ 2007-03-06 17:19 ` Linus Torvalds 2007-03-06 17:27 ` H. Peter Anvin 2007-03-06 17:28 ` Eric Dumazet 0 siblings, 2 replies; 29+ messages in thread From: Linus Torvalds @ 2007-03-06 17:19 UTC (permalink / raw) To: Eric Dumazet Cc: H. Peter Anvin, Davide Libenzi, Linux Kernel Mailing List, Andrew Morton On Tue, 6 Mar 2007, Eric Dumazet wrote: > > Something like : > > [PATCH] : Use reciprocal divides in sprintf() Try this on Core 2, and I suspect that you'll find that the hardware is actually *faster* than doing the shift/test, function call and the two multiplies. > Using reciprocal divides permits to change each divide by two multiplies, less > expensive on current CPUS. Are you sure? Linus ^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [patch v2] epoll use a single inode ... 2007-03-06 17:19 ` Linus Torvalds @ 2007-03-06 17:27 ` H. Peter Anvin 2007-03-06 17:28 ` Eric Dumazet 1 sibling, 0 replies; 29+ messages in thread From: H. Peter Anvin @ 2007-03-06 17:27 UTC (permalink / raw) To: Linus Torvalds Cc: Eric Dumazet, Davide Libenzi, Linux Kernel Mailing List, Andrew Morton Linus Torvalds wrote: > > On Tue, 6 Mar 2007, Eric Dumazet wrote: >> Something like : >> >> [PATCH] : Use reciprocal divides in sprintf() > > Try this on Core 2, and I suspect that you'll find that the hardware is > actually *faster* than doing the shift/test, function call and the > two multiplies. > >> Using reciprocal divides permits to change each divide by two multiplies, less >> expensive on current CPUS. > > Are you sure? > For base 8 and 16, this is shift and mask, respectively, so it's bound to be faster (although modern hardware can often optimize this, embedded hardware definitely can't.) Base 10, which even in the Linux kernel is almost certainly the most common case, is a lot iffier. -hpa ^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [patch v2] epoll use a single inode ... 2007-03-06 17:19 ` Linus Torvalds 2007-03-06 17:27 ` H. Peter Anvin @ 2007-03-06 17:28 ` Eric Dumazet 2007-03-06 18:10 ` Eric Dumazet 1 sibling, 1 reply; 29+ messages in thread From: Eric Dumazet @ 2007-03-06 17:28 UTC (permalink / raw) To: Linus Torvalds Cc: H. Peter Anvin, Davide Libenzi, Linux Kernel Mailing List, Andrew Morton On Tuesday 06 March 2007 18:19, Linus Torvalds wrote: > On Tue, 6 Mar 2007, Eric Dumazet wrote: > > Something like : > > > > [PATCH] : Use reciprocal divides in sprintf() > > Try this on Core 2, and I suspect that you'll find that the hardware is > actually *faster* than doing the shift/test, function call and the > two multiplies. > Where do you see a function call ? 448: 44 89 d0 mov %r10d,%eax 44b: 44 89 ea mov %r13d,%edx 44e: 48 0f af c1 imul %rcx,%rax 452: 48 c1 e8 20 shr $0x20,%rax 456: 0f af d0 imul %eax,%edx 459: 49 29 d2 sub %rdx,%r10 45c: 43 0f b6 14 16 movzbl (%r14,%r10,1),%edx 461: 41 89 c2 mov %eax,%r10d 464: 41 88 13 mov %dl,(%r11) 467: 49 ff c3 inc %r11 46a: 85 c0 test %eax,%eax 46c: 75 da jne 448 <number+0x138> > > Using reciprocal divides permits to change each divide by two multiplies, > > less expensive on current CPUS. > > Are you sure? I am going to test this, but at least on Opterons, the reciprocal divide I added into mm/slab.c gave me a nice speedup. I am going to bench some stupid loop : for (i = 0 ; i < 1000*1000 ; i++) { pipe(fds); close(fds[0]); close(fds[1]); } ^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [patch v2] epoll use a single inode ... 2007-03-06 17:28 ` Eric Dumazet @ 2007-03-06 18:10 ` Eric Dumazet 2007-03-06 20:20 ` Eric Dumazet 0 siblings, 1 reply; 29+ messages in thread From: Eric Dumazet @ 2007-03-06 18:10 UTC (permalink / raw) To: Linus Torvalds Cc: H. Peter Anvin, Davide Libenzi, Linux Kernel Mailing List, Andrew Morton [-- Attachment #1: Type: text/plain, Size: 2099 bytes --] On Tuesday 06 March 2007 18:28, Eric Dumazet wrote: > On Tuesday 06 March 2007 18:19, Linus Torvalds wrote: > > > > Using reciprocal divides permits to change each divide by two > > > multiplies, less expensive on current CPUS. > > > > Are you sure? > > I am going to test this, but at least on Opterons, the reciprocal divide I > added into mm/slab.c gave me a nice speedup. > With attached test program (one million calls to pipe()), I got about 0.1 s of speedup on my 1.6 GHz Pentium(M) dell D610 machine (3.350 s instead of 3.450 s, on many runs) Thats about 100 ns saved per number() call But then I realized that on ia32, gcc compilers is not very smart : static inline u32 reciprocal_divide(u32 A, u32 R) { return (u32)(((u64)A * R) >> 32); } It generates two multiplies... arg... //begin of reciprocal_divide() 4b0: 8b 4c 24 28 mov 0x28(%esp),%ecx 4b4: 89 f0 mov %esi,%eax 4b6: f7 64 24 24 mull 0x24(%esp) 4ba: 0f af ce imul %esi,%ecx 4bd: 8d 14 11 lea (%ecx,%edx,1),%edx // end of reciprocal_divide() 4c0: 8b 8c 24 8c 00 00 00 mov 0x8c(%esp),%ecx 4c7: 89 d0 mov %edx,%eax 4c9: 0f af c8 imul %eax,%ecx 4cc: 29 ce sub %ecx,%esi 4ce: 8b 4c 24 1c mov 0x1c(%esp),%ecx 4d2: 0f b6 34 31 movzbl (%ecx,%esi,1),%esi 4d6: 89 f1 mov %esi,%ecx 4d8: 89 c6 mov %eax,%esi 4da: 88 0f mov %cl,(%edi) 4dc: 47 inc %edi 4dd: ff 44 24 20 incl 0x20(%esp) 4e1: 85 c0 test %eax,%eax 4e3: 75 cb jne 4b0 <number+0x160> So even with a total of 3 multiplies per digit, we win... Maybe some bit of x86 asm is needed to make gcc be smarter (using only one multiply for reciprocal_divide()) [-- Attachment #2: pipes.c --] [-- Type: text/plain, Size: 241 bytes --] /* * micro benchmark to time calls to pipe()/close() */ main() { int fd[100*2]; unsigned int l, i; for (l = 0 ; l < 10000 ; l++) { for (i = 0 ; i < 100*2 ; i+=2) pipe(fd + i); for (i = 0 ; i < 100*2 ; i++) close(fd[i]); } } ^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [patch v2] epoll use a single inode ... 2007-03-06 18:10 ` Eric Dumazet @ 2007-03-06 20:20 ` Eric Dumazet 2007-03-07 3:47 ` Linus Torvalds 2007-03-07 23:46 ` Sami Farin 0 siblings, 2 replies; 29+ messages in thread From: Eric Dumazet @ 2007-03-06 20:20 UTC (permalink / raw) To: Linus Torvalds Cc: H. Peter Anvin, Davide Libenzi, Linux Kernel Mailing List, Andrew Morton [-- Attachment #1: Type: text/plain, Size: 3299 bytes --] Eric Dumazet a écrit : > On Tuesday 06 March 2007 18:28, Eric Dumazet wrote: >> On Tuesday 06 March 2007 18:19, Linus Torvalds wrote: >> >>>> Using reciprocal divides permits to change each divide by two >>>> multiplies, less expensive on current CPUS. >>> Are you sure? >> I am going to test this, but at least on Opterons, the reciprocal divide I >> added into mm/slab.c gave me a nice speedup. >> > Linus, I did a user space program, attached to this mail. I rewrote the reciprocal_div() for i386 so that one multiply is used. static inline u32 reciprocal_divide(u32 A, u32 R) { #if __i386 unsigned int edx, eax; asm("mul %2":"=a" (eax), "=d" (edx):"rm" (R), "0" (A)); return edx; #else return (u32)(((u64)A * R) >> 32); #endif } Results are really good on 32bit. On 64bit/Opteron they are impressive. $ gcc -O2 -o divide_bench divide_bench.c First result gives the number of cycles to perform old number() routine using plain do_div() Second result gives the number of cycles using reciprocal_div trick results on a Intel Pentium III 866 MHz $ ./divide_bench 413.453 cycles per call, last res=99999901 132.746 cycles per call, last res=99999901 $ ./divide_bench 411.833 cycles per call, last res=99999901 129.652 cycles per call, last res=99999901 $ ./divide_bench 480.645 cycles per call, last res=99999901 158.642 cycles per call, last res=99999901 $ ./divide_bench 412.769 cycles per call, last res=99999901 129.643 cycles per call, last res=99999901 $ ./divide_bench 410.809 cycles per call, last res=99999901 129.609 cycles per call, last res=99999901 results on AMD 246 (2GHz) Sorry this machine is quite loaded... I dont have a dev x86_64 machine. $ gcc -O2 -m32 -o divide_bench32 divide_bench.c $ ./divide_bench32 412.181 cycles per call, last res=99999901 112.314 cycles per call, last res=99999901 $ ./divide_bench32 444.008 cycles per call, last res=99999901 114.314 cycles per call, last res=99999901 $ ./divide_bench32 423.168 cycles per call, last res=99999901 112.318 cycles per call, last res=99999901 $ ./divide_bench32 427.73 cycles per call, last res=99999901 110.712 cycles per call, last res=99999901 $ ./divide_bench32 410.529 cycles per call, last res=99999901 114.068 cycles per call, last res=99999901 $ ./divide_bench32 489.856 cycles per call, last res=99999901 124.889 cycles per call, last res=99999901 $ ./divide_bench32 389.278 cycles per call, last res=99999901 104.697 cycles per call, last res=99999901 With a 64bit prog : $ gcc -O2 -m64 -o divide_bench64 divide_bench.c $ ./divide_bench64 826.136 cycles per call, last res=99999901 105.912 cycles per call, last res=99999901 $ ./divide_bench64 627.096 cycles per call, last res=99999901 76.2473 cycles per call, last res=99999901 $ ./divide_bench64 604.524 cycles per call, last res=99999901 76.1405 cycles per call, last res=99999901 $ ./divide_bench64 621.013 cycles per call, last res=99999901 76.0963 cycles per call, last res=99999901 $ ./divide_bench64 836.799 cycles per call, last res=99999901 103.967 cycles per call, last res=99999901 $ ./divide_bench64 982.718 cycles per call, last res=99999901 127.945 cycles per call, last res=99999901 $ ./divide_bench64 609.346 cycles per call, last res=99999901 76.0768 cycles per call, last res=99999901 [-- Attachment #2: divide_bench.c --] [-- Type: text/plain, Size: 3616 bytes --] #include <stdio.h> #include <stdlib.h> #include <stdint.h> #include <math.h> #ifdef __x86_64 #define rdtscll(val) do { \ unsigned int __a,__d; \ asm volatile("rdtsc" : "=a" (__a), "=d" (__d)); \ (val) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \ } while(0) # define do_div(n,base) ({ \ uint32_t __base = (base); \ uint32_t __rem; \ __rem = ((uint64_t)(n)) % __base; \ (n) = ((uint64_t)(n)) / __base; \ __rem; \ }) #elif __i386 #define rdtscll(val) \ __asm__ __volatile__("rdtsc" : "=A" (val)) #define do_div(n,base) ({ \ unsigned long __upper, __low, __high, __mod, __base; \ __base = (base); \ asm("":"=a" (__low), "=d" (__high):"A" (n)); \ __upper = __high; \ if (__high) { \ __upper = __high % (__base); \ __high = __high / (__base); \ } \ asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (__base), "0" (__low), "1" (__upper)); \ asm("":"=A" (n):"a" (__low),"d" (__high)); \ __mod; \ }) #endif static const char digits[] = "0123456789abcdefghijklmnopqrstuvwxyz"; char *number_divides(unsigned long long num, int base, char *result) { if (num == 0) *result++ = '0'; else while (num) *result++ = digits[do_div(num,base)]; *result = 0; return result; } typedef unsigned int u32; typedef unsigned long long u64; #define CONSTANT_RECIPROCAL_VALUE(k) \ (u32)(((1LL << 32) + (k - 1)) / k) const u32 reciprocal_values[36 + 1] = { 0, CONSTANT_RECIPROCAL_VALUE(1), CONSTANT_RECIPROCAL_VALUE(2), CONSTANT_RECIPROCAL_VALUE(3), CONSTANT_RECIPROCAL_VALUE(4), CONSTANT_RECIPROCAL_VALUE(5), CONSTANT_RECIPROCAL_VALUE(6), CONSTANT_RECIPROCAL_VALUE(7), CONSTANT_RECIPROCAL_VALUE(8), CONSTANT_RECIPROCAL_VALUE(9), CONSTANT_RECIPROCAL_VALUE(10), CONSTANT_RECIPROCAL_VALUE(11), CONSTANT_RECIPROCAL_VALUE(12), CONSTANT_RECIPROCAL_VALUE(13), CONSTANT_RECIPROCAL_VALUE(14), CONSTANT_RECIPROCAL_VALUE(15), CONSTANT_RECIPROCAL_VALUE(16), CONSTANT_RECIPROCAL_VALUE(17), CONSTANT_RECIPROCAL_VALUE(18), CONSTANT_RECIPROCAL_VALUE(19), CONSTANT_RECIPROCAL_VALUE(20), CONSTANT_RECIPROCAL_VALUE(21), CONSTANT_RECIPROCAL_VALUE(22), CONSTANT_RECIPROCAL_VALUE(23), CONSTANT_RECIPROCAL_VALUE(24), CONSTANT_RECIPROCAL_VALUE(25), CONSTANT_RECIPROCAL_VALUE(26), CONSTANT_RECIPROCAL_VALUE(27), CONSTANT_RECIPROCAL_VALUE(28), CONSTANT_RECIPROCAL_VALUE(29), CONSTANT_RECIPROCAL_VALUE(30), CONSTANT_RECIPROCAL_VALUE(31), CONSTANT_RECIPROCAL_VALUE(32), CONSTANT_RECIPROCAL_VALUE(33), CONSTANT_RECIPROCAL_VALUE(34), CONSTANT_RECIPROCAL_VALUE(35), CONSTANT_RECIPROCAL_VALUE(36) }; static inline u32 reciprocal_divide(u32 A, u32 R) { #if __i386 unsigned int edx, eax; asm("mul %2":"=a" (eax), "=d" (edx):"rm" (R), "0" (A)); return edx; #else return (u32)(((u64)A * R) >> 32); #endif } char *number_reciprocal(unsigned long long num, int base, char *result) { if (num == 0) *result++ = '0'; else { while (num >>32) *result++ = digits[do_div(num,base)]; while (num) { u32 next = reciprocal_divide((u32)num, reciprocal_values[base]); *result++ = digits[num - next*base]; num = next; } } *result = 0; return result; } #define START 10000000 int base = 10; main() { unsigned long long start, end; char result[64]; unsigned long ul; rdtscll(start); for (ul = START; ul < START + 1000000;ul++) number_divides(ul, base, result); rdtscll(end); printf("%g cycles per call, last res=%s\n", (double)(end - start)/1000000.0, result); rdtscll(start); for (ul = START; ul < START + 1000000;ul++) number_reciprocal(ul, base, result); rdtscll(end); printf("%g cycles per call, last res=%s\n", (double)(end - start)/1000000.0, result); } ^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [patch v2] epoll use a single inode ... 2007-03-06 20:20 ` Eric Dumazet @ 2007-03-07 3:47 ` Linus Torvalds 2007-03-07 5:40 ` H. Peter Anvin 2007-03-07 6:57 ` Eric Dumazet 2007-03-07 23:46 ` Sami Farin 1 sibling, 2 replies; 29+ messages in thread From: Linus Torvalds @ 2007-03-07 3:47 UTC (permalink / raw) To: Eric Dumazet Cc: H. Peter Anvin, Davide Libenzi, Linux Kernel Mailing List, Andrew Morton On Tue, 6 Mar 2007, Eric Dumazet wrote: > > I did a user space program, attached to this mail. > > I rewrote the reciprocal_div() for i386 so that one multiply is used. Ok, this is definitely faster on Core 2 as well, so "numbers talk, bullshit walks". No more objections. (That said, I bet you could do even better for octal and hex numbers, so if you *really* want to speed things up, you should just make a special-case routine for each base (there's just three of them), and you can then also optimize the base-10 thing much better (you can do two digits at a time by dividing by 100, etc) Linus ^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [patch v2] epoll use a single inode ... 2007-03-07 3:47 ` Linus Torvalds @ 2007-03-07 5:40 ` H. Peter Anvin 2007-03-07 6:57 ` Eric Dumazet 1 sibling, 0 replies; 29+ messages in thread From: H. Peter Anvin @ 2007-03-07 5:40 UTC (permalink / raw) To: Linus Torvalds Cc: Eric Dumazet, Davide Libenzi, Linux Kernel Mailing List, Andrew Morton Linus Torvalds wrote: > > On Tue, 6 Mar 2007, Eric Dumazet wrote: >> I did a user space program, attached to this mail. >> >> I rewrote the reciprocal_div() for i386 so that one multiply is used. > > Ok, this is definitely faster on Core 2 as well, so "numbers talk, > bullshit walks". No more objections. > > (That said, I bet you could do even better for octal and hex numbers, so > if you *really* want to speed things up, you should just make a > special-case routine for each base (there's just three of them), and you > can then also optimize the base-10 thing much better (you can do two > digits at a time by dividing by 100, etc) > Of course you can do better for octal and hex -- it's just shift and mask. Decimal is trickier; however, at least on i386 it might make sense to divide by 100 and then use the AAM instruction, or a table lookup, to split it into individual digits. -hpa ^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [patch v2] epoll use a single inode ... 2007-03-07 3:47 ` Linus Torvalds 2007-03-07 5:40 ` H. Peter Anvin @ 2007-03-07 6:57 ` Eric Dumazet 2007-03-07 7:13 ` H. Peter Anvin 1 sibling, 1 reply; 29+ messages in thread From: Eric Dumazet @ 2007-03-07 6:57 UTC (permalink / raw) To: Linus Torvalds Cc: H. Peter Anvin, Davide Libenzi, Linux Kernel Mailing List, Andrew Morton Linus Torvalds a écrit : > > On Tue, 6 Mar 2007, Eric Dumazet wrote: >> I did a user space program, attached to this mail. >> >> I rewrote the reciprocal_div() for i386 so that one multiply is used. > > Ok, this is definitely faster on Core 2 as well, so "numbers talk, > bullshit walks". No more objections. And the numbers were ? :) > > (That said, I bet you could do even better for octal and hex numbers, so > if you *really* want to speed things up, you should just make a > special-case routine for each base (there's just three of them), and you > can then also optimize the base-10 thing much better (you can do two > digits at a time by dividing by 100, etc) Well, given that sprintf() is frequently called only for pipe/sockets creation, we probably better : 1) wait a very clever idea to suppress individual dentry per pipe/sockets (no more sprintf() at pipe/socket setup) 2) delay the sprintf() only if needed as you mentioned in a previous mail (when someone wants ls -l /proc/pid/fd/....), since their dentries are not anymore inserted in the global dcache hash, they could stay with a (nul) dname. ^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [patch v2] epoll use a single inode ... 2007-03-07 6:57 ` Eric Dumazet @ 2007-03-07 7:13 ` H. Peter Anvin 0 siblings, 0 replies; 29+ messages in thread From: H. Peter Anvin @ 2007-03-07 7:13 UTC (permalink / raw) To: Eric Dumazet Cc: Linus Torvalds, Davide Libenzi, Linux Kernel Mailing List, Andrew Morton Eric Dumazet wrote: > Linus Torvalds a écrit : >> >> On Tue, 6 Mar 2007, Eric Dumazet wrote: >>> I did a user space program, attached to this mail. >>> >>> I rewrote the reciprocal_div() for i386 so that one multiply is used. >> >> Ok, this is definitely faster on Core 2 as well, so "numbers talk, >> bullshit walks". No more objections. > > And the numbers were ? :) > >> >> (That said, I bet you could do even better for octal and hex numbers, >> so if you *really* want to speed things up, you should just make a >> special-case routine for each base (there's just three of them), and >> you can then also optimize the base-10 thing much better (you can do >> two digits at a time by dividing by 100, etc) > > Well, given that sprintf() is frequently called only for pipe/sockets > creation, we probably better : > > 1) wait a very clever idea to suppress individual dentry per > pipe/sockets (no more sprintf() at pipe/socket setup) > > 2) delay the sprintf() only if needed as you mentioned in a previous > mail (when someone wants ls -l /proc/pid/fd/....), since their dentries > are not anymore inserted in the global dcache hash, they could stay with > a (nul) dname. Yes, the right thing to do is probably to only generate these strings when someone tries to list them, not on every socket/pipe/epoll creation. One can assign a counter and keep it as a binary value at the start, but create the strings when necessary. -hpa ^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [patch v2] epoll use a single inode ... 2007-03-06 20:20 ` Eric Dumazet 2007-03-07 3:47 ` Linus Torvalds @ 2007-03-07 23:46 ` Sami Farin 1 sibling, 0 replies; 29+ messages in thread From: Sami Farin @ 2007-03-07 23:46 UTC (permalink / raw) To: Linux Kernel Mailing List On Tue, Mar 06, 2007 at 21:20:33 +0100, Eric Dumazet wrote: ... > I rewrote the reciprocal_div() for i386 so that one multiply is used. > > static inline u32 reciprocal_divide(u32 A, u32 R) > { > #if __i386 > unsigned int edx, eax; > asm("mul %2":"=a" (eax), "=d" (edx):"rm" (R), "0" (A)); ^^^ mul does not work if R is memory operand. mull should be used instead. -- ^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [patch v2] epoll use a single inode ... 2007-03-06 16:28 ` H. Peter Anvin 2007-03-06 17:09 ` Linus Torvalds 2007-03-06 17:12 ` Eric Dumazet @ 2007-03-06 18:10 ` Davide Libenzi 2 siblings, 0 replies; 29+ messages in thread From: Davide Libenzi @ 2007-03-06 18:10 UTC (permalink / raw) To: H. Peter Anvin Cc: Eric Dumazet, Linus Torvalds, Linux Kernel Mailing List, Andrew Morton On Tue, 6 Mar 2007, H. Peter Anvin wrote: > Eric Dumazet wrote: > > > > For epoll, I suspect this is harmless : Programs dont allocate epolls fd > > every milli second, but at startup only. > > > > For pipes/sockets, using a 64 bits would be problematic, because sprintf() > > uses a divide for each digit. And a divide is slow. Ten divides are *very* > > slow. > > > > That's true for *any* sprintf(), though. sprintf() converts all its arguments > to 64 bits. Sigh! We *always* do do_div(), even for the more trivial %x case. Anyway, it does not really matter for the type of dfs will be applied to. - Davide ^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [patch v2] epoll use a single inode ... 2007-03-06 0:12 ` Davide Libenzi 2007-03-06 0:20 ` Davide Libenzi 2007-03-06 0:25 ` Linus Torvalds @ 2007-03-10 8:06 ` Pavel Machek 2007-03-10 8:24 ` Davide Libenzi 2 siblings, 1 reply; 29+ messages in thread From: Pavel Machek @ 2007-03-10 8:06 UTC (permalink / raw) To: Davide Libenzi Cc: Linus Torvalds, Linux Kernel Mailing List, Andrew Morton, Eric Dumazet Hi! > > > @@ -763,15 +767,17 @@ > > > * using the inode number. > > > */ > > > error = -ENOMEM; > > > - sprintf(name, "[%lu]", inode->i_ino); > > > this.name = name; > > > - this.len = strlen(name); > > > - this.hash = inode->i_ino; > > > + this.len = sprintf(name, "[%p]", ep); > > > + this.hash = 0; > > > > Please don't expose kernel pointers to user space. > > > > It's much better to do something like > > > > static unsigned int epoll_inode; > > > > this.len = sprintf(name, "[%u]", ++epoll_inode); > > > > if you just need some pseudo-unique name to distinguish two epoll things > > from each other (vs from a dup'ed fd). > > Heh, this is what Al was saying ;) > I'm fine with that, but how about counter cycles (going back to zero)? Just use u64? Pavel -- (english) http://www.livejournal.com/~pavelmachek (cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html ^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [patch v2] epoll use a single inode ... 2007-03-10 8:06 ` Pavel Machek @ 2007-03-10 8:24 ` Davide Libenzi 0 siblings, 0 replies; 29+ messages in thread From: Davide Libenzi @ 2007-03-10 8:24 UTC (permalink / raw) To: Pavel Machek Cc: Linus Torvalds, Linux Kernel Mailing List, Andrew Morton, Eric Dumazet On Sat, 10 Mar 2007, Pavel Machek wrote: > > Heh, this is what Al was saying ;) > > I'm fine with that, but how about counter cycles (going back to zero)? > > Just use u64? Yeah, the second patch was using an u64. I ended up using a "class" name (signalfd, timerfd, asyncfd) as dname entry. An incremental counter would not add any useful information, and noone will ever care about dentry name of those objects. Prolly the class name is the only thing you might want to know, or we can drop even that and use a shared dentry for everything. - Davide ^ permalink raw reply [flat|nested] 29+ messages in thread
end of thread, other threads:[~2007-03-10 8:24 UTC | newest] Thread overview: 29+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2007-03-05 23:41 [patch v2] epoll use a single inode Davide Libenzi 2007-03-06 0:00 ` Linus Torvalds 2007-03-06 0:12 ` Davide Libenzi 2007-03-06 0:20 ` Davide Libenzi 2007-03-06 0:25 ` Linus Torvalds 2007-03-06 2:25 ` H. Peter Anvin 2007-03-06 2:34 ` Davide Libenzi 2007-03-06 2:37 ` H. Peter Anvin 2007-03-06 2:43 ` Davide Libenzi 2007-03-06 6:22 ` Eric Dumazet 2007-03-06 6:31 ` Davide Libenzi 2007-03-06 6:37 ` Davide Libenzi 2007-03-06 16:28 ` H. Peter Anvin 2007-03-06 17:09 ` Linus Torvalds 2007-03-06 17:14 ` H. Peter Anvin 2007-03-06 17:12 ` Eric Dumazet 2007-03-06 17:19 ` Linus Torvalds 2007-03-06 17:27 ` H. Peter Anvin 2007-03-06 17:28 ` Eric Dumazet 2007-03-06 18:10 ` Eric Dumazet 2007-03-06 20:20 ` Eric Dumazet 2007-03-07 3:47 ` Linus Torvalds 2007-03-07 5:40 ` H. Peter Anvin 2007-03-07 6:57 ` Eric Dumazet 2007-03-07 7:13 ` H. Peter Anvin 2007-03-07 23:46 ` Sami Farin 2007-03-06 18:10 ` Davide Libenzi 2007-03-10 8:06 ` Pavel Machek 2007-03-10 8:24 ` Davide Libenzi
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox