[patch v2] epoll use a single inode ...

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* [patch v2] epoll use a single inode ...
@ 2007-03-05 23:41 Davide Libenzi
  2007-03-06  0:00 ` Linus Torvalds
  0 siblings, 1 reply; 29+ messages in thread
From: Davide Libenzi @ 2007-03-05 23:41 UTC (permalink / raw)
  To: Linux Kernel Mailing List; +Cc: Linus Torvalds, Andrew Morton, Eric Dumazet


ChangeLog:

- v2) Properly size the buffer. Avoid extra strlen() (Eric Dumazet)

Epoll does not keep any private data attached to its inode, so there'd be 
no need to allocate one inode per fd. For epoll, the inode is just a 
placeholder for the file operations and could be shared by all instances.
I'd like to use the same optimization even for the upcoming file-based 
objects, so if you see problems let me know.
One that Al was pointing out was that an fstat(2) over an epoll fd would 
show the same st_ino. IMO that should be fine since an fstat(2) over an 
epoll fd is not something you want to do in any case and expecting 
meaningfull results.
This patch also avoids the dentry associated with the file* to be hashed
inside the global dentry hash.



Signed-off-by: Davide Libenzi <davidel@xmailserver.org>



- Davide



eventpoll.c |   39 +++++++++++++++++++++++++++++++++------
1 file changed, 33 insertions(+), 6 deletions(-)



Index: linux-2.6.20.ep2/fs/eventpoll.c
===================================================================
--- linux-2.6.20.ep2.orig/fs/eventpoll.c	2007-03-04 14:40:01.000000000 -0800
+++ linux-2.6.20.ep2/fs/eventpoll.c	2007-03-05 15:26:16.000000000 -0800
@@ -258,6 +258,7 @@
 		   int maxevents, long timeout);
 static int eventpollfs_delete_dentry(struct dentry *dentry);
 static struct inode *ep_eventpoll_inode(void);
+static struct inode *ep_create_inode(void);
 static int eventpollfs_get_sb(struct file_system_type *fs_type,
 			      int flags, const char *dev_name,
 			      void *data, struct vfsmount *mnt);
@@ -279,6 +280,9 @@
 /* Virtual fs used to allocate inodes for eventpoll files */
 static struct vfsmount *eventpoll_mnt __read_mostly;
 
+/* Placeholder inode for eventpoll fds */
+static struct inode *eventpoll_inode;
+
 /* File callbacks that implement the eventpoll file behaviour */
 static const struct file_operations eventpoll_fops = {
 	.release	= ep_eventpoll_close,
@@ -733,7 +737,7 @@
 		    struct eventpoll *ep)
 {
 	struct qstr this;
-	char name[32];
+	char name[2 * sizeof(void *) + 8];
 	struct dentry *dentry;
 	struct inode *inode;
 	struct file *file;
@@ -763,15 +767,17 @@
 	 * using the inode number.
 	 */
 	error = -ENOMEM;
-	sprintf(name, "[%lu]", inode->i_ino);
 	this.name = name;
-	this.len = strlen(name);
-	this.hash = inode->i_ino;
+	this.len = sprintf(name, "[%p]", ep);
+	this.hash = 0;
 	dentry = d_alloc(eventpoll_mnt->mnt_sb->s_root, &this);
 	if (!dentry)
 		goto eexit_4;
 	dentry->d_op = &eventpollfs_dentry_operations;
-	d_add(dentry, inode);
+	/* Do not publish this dentry inside the global dentry hash table */
+	dentry->d_flags &= ~DCACHE_UNHASHED;
+	d_instantiate(dentry, inode);
+
 	file->f_path.mnt = mntget(eventpoll_mnt);
 	file->f_path.dentry = dentry;
 	file->f_mapping = inode->i_mapping;
@@ -1555,6 +1561,11 @@
 
 static int eventpollfs_delete_dentry(struct dentry *dentry)
 {
+	/*
+	 * We faked vfs to believe the dentry was hashed when we created it.
+	 * Now we restore the flag so that dput() will work correctly.
+	 */
+	dentry->d_flags |= DCACHE_UNHASHED;
 
 	return 1;
 }
@@ -1562,6 +1573,17 @@
 
 static struct inode *ep_eventpoll_inode(void)
 {
+
+	return igrab(eventpoll_inode);
+}
+
+/*
+ * A single inode exist for all eventpoll files. On the contrary of pipes,
+ * eventpoll inodes has no per-instance data associated, so we can avoid
+ * the allocation of multiple of them.
+ */
+static struct inode *ep_create_inode(void)
+{
 	int error = -ENOMEM;
 	struct inode *inode = new_inode(eventpoll_mnt->mnt_sb);
 
@@ -1626,10 +1648,14 @@
 
 	/* Mount the above commented virtual file system */
 	eventpoll_mnt = kern_mount(&eventpoll_fs_type);
-	error = PTR_ERR(eventpoll_mnt);
 	if (IS_ERR(eventpoll_mnt))
 		goto epanic;
 
+	/* Create the single instance of inode for all eventpoll fds */
+	eventpoll_inode = ep_create_inode();
+	if (IS_ERR(eventpoll_inode))
+		goto epanic;
+
 	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: successfully initialized.\n",
 			current));
 	return 0;
@@ -1642,6 +1668,7 @@
 static void __exit eventpoll_exit(void)
 {
 	/* Undo all operations done inside eventpoll_init() */
+	iput(eventpoll_inode);
 	unregister_filesystem(&eventpoll_fs_type);
 	mntput(eventpoll_mnt);
 	kmem_cache_destroy(pwq_cache);

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch v2] epoll use a single inode ...
  2007-03-05 23:41 [patch v2] epoll use a single inode Davide Libenzi
@ 2007-03-06  0:00 ` Linus Torvalds
  2007-03-06  0:12   ` Davide Libenzi
  0 siblings, 1 reply; 29+ messages in thread
From: Linus Torvalds @ 2007-03-06  0:00 UTC (permalink / raw)
  To: Davide Libenzi; +Cc: Linux Kernel Mailing List, Andrew Morton, Eric Dumazet



On Mon, 5 Mar 2007, Davide Libenzi wrote:
> @@ -763,15 +767,17 @@
>  	 * using the inode number.
>  	 */
>  	error = -ENOMEM;
> -	sprintf(name, "[%lu]", inode->i_ino);
>  	this.name = name;
> -	this.len = strlen(name);
> -	this.hash = inode->i_ino;
> +	this.len = sprintf(name, "[%p]", ep);
> +	this.hash = 0;

Please don't expose kernel pointers to user space.

It's much better to do something like

	static unsigned int epoll_inode;

	this.len = sprintf(name, "[%u]", ++epoll_inode);

if you just need some pseudo-unique name to distinguish two epoll things 
from each other (vs from a dup'ed fd).

		Linus

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch v2] epoll use a single inode ...
  2007-03-06  0:00 ` Linus Torvalds
@ 2007-03-06  0:12   ` Davide Libenzi
  2007-03-06  0:20     ` Davide Libenzi
                       ` (2 more replies)
  0 siblings, 3 replies; 29+ messages in thread
From: Davide Libenzi @ 2007-03-06  0:12 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Linux Kernel Mailing List, Andrew Morton, Eric Dumazet

On Mon, 5 Mar 2007, Linus Torvalds wrote:

> On Mon, 5 Mar 2007, Davide Libenzi wrote:
> > @@ -763,15 +767,17 @@
> >  	 * using the inode number.
> >  	 */
> >  	error = -ENOMEM;
> > -	sprintf(name, "[%lu]", inode->i_ino);
> >  	this.name = name;
> > -	this.len = strlen(name);
> > -	this.hash = inode->i_ino;
> > +	this.len = sprintf(name, "[%p]", ep);
> > +	this.hash = 0;
> 
> Please don't expose kernel pointers to user space.
> 
> It's much better to do something like
> 
> 	static unsigned int epoll_inode;
> 
> 	this.len = sprintf(name, "[%u]", ++epoll_inode);
> 
> if you just need some pseudo-unique name to distinguish two epoll things 
> from each other (vs from a dup'ed fd).

Heh, this is what Al was saying ;)
I'm fine with that, but how about counter cycles (going back to zero)? 
Should we care to handle them correctly? 



- Davide



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch v2] epoll use a single inode ...
  2007-03-06  0:12   ` Davide Libenzi
@ 2007-03-06  0:20     ` Davide Libenzi
  2007-03-06  0:25     ` Linus Torvalds
  2007-03-10  8:06     ` Pavel Machek
  2 siblings, 0 replies; 29+ messages in thread
From: Davide Libenzi @ 2007-03-06  0:20 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Linux Kernel Mailing List, Andrew Morton, Eric Dumazet

On Mon, 5 Mar 2007, Davide Libenzi wrote:

> On Mon, 5 Mar 2007, Linus Torvalds wrote:
> 
> > On Mon, 5 Mar 2007, Davide Libenzi wrote:
> > > @@ -763,15 +767,17 @@
> > >  	 * using the inode number.
> > >  	 */
> > >  	error = -ENOMEM;
> > > -	sprintf(name, "[%lu]", inode->i_ino);
> > >  	this.name = name;
> > > -	this.len = strlen(name);
> > > -	this.hash = inode->i_ino;
> > > +	this.len = sprintf(name, "[%p]", ep);
> > > +	this.hash = 0;
> > 
> > Please don't expose kernel pointers to user space.
> > 
> > It's much better to do something like
> > 
> > 	static unsigned int epoll_inode;
> > 
> > 	this.len = sprintf(name, "[%u]", ++epoll_inode);
> > 
> > if you just need some pseudo-unique name to distinguish two epoll things 
> > from each other (vs from a dup'ed fd).
> 
> Heh, this is what Al was saying ;)
> I'm fine with that, but how about counter cycles (going back to zero)? 
> Should we care to handle them correctly? 

Maybe a [TID.FD] format?



- Davide



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch v2] epoll use a single inode ...
  2007-03-06  0:12   ` Davide Libenzi
  2007-03-06  0:20     ` Davide Libenzi
@ 2007-03-06  0:25     ` Linus Torvalds
  2007-03-06  2:25       ` H. Peter Anvin
  2007-03-10  8:06     ` Pavel Machek
  2 siblings, 1 reply; 29+ messages in thread
From: Linus Torvalds @ 2007-03-06  0:25 UTC (permalink / raw)
  To: Davide Libenzi; +Cc: Linux Kernel Mailing List, Andrew Morton, Eric Dumazet

On Mon, 5 Mar 2007, Davide Libenzi wrote:

> On Mon, 5 Mar 2007, Linus Torvalds wrote:
> > 
> > It's much better to do something like
> > 
> > 	static unsigned int epoll_inode;
> > 
> > 	this.len = sprintf(name, "[%u]", ++epoll_inode);
> > 
> > if you just need some pseudo-unique name to distinguish two epoll things 
> > from each other (vs from a dup'ed fd).
> 
> Heh, this is what Al was saying ;)
> I'm fine with that, but how about counter cycles (going back to zero)? 
> Should we care to handle them correctly? 

Since this is not actually *used* for anything but showing the fd's in 
/proc/<pid>/fd/ etc, no. In fact, an integer will wrap a *lot* less than a 
kernel data structure will be re-used, so even with the simple "wraps 
every 4G uses", you're still better off.

IOW, if the thing actually _mattered_ we should use some bitmap allocator 
or similar (eg pidmaps etc), but with something where the only reason 
really is as a visible indicator of difference for a user that happens to 
look, simple-and-stupid is better.

		Linus

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch v2] epoll use a single inode ...
  2007-03-06  0:25     ` Linus Torvalds
@ 2007-03-06  2:25       ` H. Peter Anvin
  2007-03-06  2:34         ` Davide Libenzi
  0 siblings, 1 reply; 29+ messages in thread
From: H. Peter Anvin @ 2007-03-06  2:25 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Davide Libenzi, Linux Kernel Mailing List, Andrew Morton,
	Eric Dumazet

Linus Torvalds wrote:
> 
> Since this is not actually *used* for anything but showing the fd's in 
> /proc/<pid>/fd/ etc, no. In fact, an integer will wrap a *lot* less than a 
> kernel data structure will be re-used, so even with the simple "wraps 
> every 4G uses", you're still better off.
> 

... and if that worries you, use a 64-bit counter.  They're cheap 
(compared to an sprintf), and even if they advance once a nanosecond 
they won't wrap around for over 584 years.

> IOW, if the thing actually _mattered_ we should use some bitmap allocator 
> or similar (eg pidmaps etc), but with something where the only reason 
> really is as a visible indicator of difference for a user that happens to 
> look, simple-and-stupid is better.

That only makes sense if it matters that the numbers are kept small. 
This is in fact a highly suboptimal constraint, as it is almost 
guaranteed to create wraparounds much sooner, but there are situations 
in which that's the only option, e.g. for pty number allocations due to 
glibc braindamage.

	-hpa

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch v2] epoll use a single inode ...
  2007-03-06  2:25       ` H. Peter Anvin
@ 2007-03-06  2:34         ` Davide Libenzi
  2007-03-06  2:37           ` H. Peter Anvin
  0 siblings, 1 reply; 29+ messages in thread
From: Davide Libenzi @ 2007-03-06  2:34 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Linus Torvalds, Linux Kernel Mailing List, Andrew Morton,
	Eric Dumazet

On Mon, 5 Mar 2007, H. Peter Anvin wrote:

> Linus Torvalds wrote:
> > 
> > Since this is not actually *used* for anything but showing the fd's in
> > /proc/<pid>/fd/ etc, no. In fact, an integer will wrap a *lot* less than a
> > kernel data structure will be re-used, so even with the simple "wraps every
> > 4G uses", you're still better off.
> > 
> 
> ... and if that worries you, use a 64-bit counter.  They're cheap (compared to
> an sprintf), and even if they advance once a nanosecond they won't wrap around
> for over 584 years.

Right now is using:

	this.len = sprintf(name, "[%u.%d]", current->pid, fd);

That should be unique and not have the wraparound problem. Ack?



- Davide



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch v2] epoll use a single inode ...
  2007-03-06  2:34         ` Davide Libenzi
@ 2007-03-06  2:37           ` H. Peter Anvin
  2007-03-06  2:43             ` Davide Libenzi
  0 siblings, 1 reply; 29+ messages in thread
From: H. Peter Anvin @ 2007-03-06  2:37 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Linus Torvalds, Linux Kernel Mailing List, Andrew Morton,
	Eric Dumazet

Davide Libenzi wrote:
> 
> Right now is using:
> 
> 	this.len = sprintf(name, "[%u.%d]", current->pid, fd);
> 
> That should be unique and not have the wraparound problem. Ack?
> 

NAK, very much NAK.

File descriptors aren't file structures, they're *pointers* to file 
structures.

It's perfectly possible -- downright common -- for a file descriptor to 
be inherited by another process, and then the pid is recycled -- collision.

	-hpa

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch v2] epoll use a single inode ...
  2007-03-06  2:37           ` H. Peter Anvin
@ 2007-03-06  2:43             ` Davide Libenzi
  2007-03-06  6:22               ` Eric Dumazet
  0 siblings, 1 reply; 29+ messages in thread
From: Davide Libenzi @ 2007-03-06  2:43 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Linus Torvalds, Linux Kernel Mailing List, Andrew Morton,
	Eric Dumazet

On Mon, 5 Mar 2007, H. Peter Anvin wrote:

> Davide Libenzi wrote:
> > 
> > Right now is using:
> > 
> > 	this.len = sprintf(name, "[%u.%d]", current->pid, fd);
> > 
> > That should be unique and not have the wraparound problem. Ack?
> > 
> 
> NAK, very much NAK.
> 
> File descriptors aren't file structures, they're *pointers* to file
> structures.
> 
> It's perfectly possible -- downright common -- for a file descriptor to be
> inherited by another process, and then the pid is recycled -- collision.

Ugh! Right! 64 bit counter it is ... :)



- Davide



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch v2] epoll use a single inode ...
  2007-03-06  2:43             ` Davide Libenzi
@ 2007-03-06  6:22               ` Eric Dumazet
  2007-03-06  6:31                 ` Davide Libenzi
  2007-03-06 16:28                 ` H. Peter Anvin
  0 siblings, 2 replies; 29+ messages in thread
From: Eric Dumazet @ 2007-03-06  6:22 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: H. Peter Anvin, Linus Torvalds, Linux Kernel Mailing List,
	Andrew Morton

Davide Libenzi a écrit :
> On Mon, 5 Mar 2007, H. Peter Anvin wrote:
> 
>> Davide Libenzi wrote:
>>> Right now is using:
>>>
>>> 	this.len = sprintf(name, "[%u.%d]", current->pid, fd);
>>>
>>> That should be unique and not have the wraparound problem. Ack?
>>>
>> NAK, very much NAK.
>>
>> File descriptors aren't file structures, they're *pointers* to file
>> structures.
>>
>> It's perfectly possible -- downright common -- for a file descriptor to be
>> inherited by another process, and then the pid is recycled -- collision.
> 
> Ugh! Right! 64 bit counter it is ... :)

For epoll, I suspect this is harmless : Programs dont allocate epolls fd every 
milli second, but at startup only.

For pipes/sockets, using a 64 bits would be problematic, because sprintf() 
uses a divide for each digit. And a divide is slow. Ten divides are *very* slow.



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch v2] epoll use a single inode ...
  2007-03-06  6:22               ` Eric Dumazet
@ 2007-03-06  6:31                 ` Davide Libenzi
  2007-03-06  6:37                   ` Davide Libenzi
  2007-03-06 16:28                 ` H. Peter Anvin
  1 sibling, 1 reply; 29+ messages in thread
From: Davide Libenzi @ 2007-03-06  6:31 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: H. Peter Anvin, Linus Torvalds, Linux Kernel Mailing List,
	Andrew Morton

[-- Attachment #1: Type: TEXT/PLAIN, Size: 1101 bytes --]

On Tue, 6 Mar 2007, Eric Dumazet wrote:

> Davide Libenzi a écrit :
> > On Mon, 5 Mar 2007, H. Peter Anvin wrote:
> > 
> > > Davide Libenzi wrote:
> > > > Right now is using:
> > > > 
> > > > 	this.len = sprintf(name, "[%u.%d]", current->pid, fd);
> > > > 
> > > > That should be unique and not have the wraparound problem. Ack?
> > > > 
> > > NAK, very much NAK.
> > > 
> > > File descriptors aren't file structures, they're *pointers* to file
> > > structures.
> > > 
> > > It's perfectly possible -- downright common -- for a file descriptor to be
> > > inherited by another process, and then the pid is recycled -- collision.
> > 
> > Ugh! Right! 64 bit counter it is ... :)
> 
> For epoll, I suspect this is harmless : Programs dont allocate epolls fd every
> milli second, but at startup only.
> 
> For pipes/sockets, using a 64 bits would be problematic, because sprintf()
> uses a divide for each digit. And a divide is slow. Ten divides are *very*
> slow.

This would be used for epoll, signalfd and timerfd. And the printf format 
is %llu ;)



- Davide


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch v2] epoll use a single inode ...
  2007-03-06  6:31                 ` Davide Libenzi
@ 2007-03-06  6:37                   ` Davide Libenzi
  0 siblings, 0 replies; 29+ messages in thread
From: Davide Libenzi @ 2007-03-06  6:37 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: H. Peter Anvin, Linus Torvalds, Linux Kernel Mailing List,
	Andrew Morton

On Mon, 5 Mar 2007, Davide Libenzi wrote:

> This would be used for epoll, signalfd and timerfd. And the printf format 
> is %llu ;)

Today is really one on those days ... %llx



- Davide



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch v2] epoll use a single inode ...
  2007-03-06  6:22               ` Eric Dumazet
  2007-03-06  6:31                 ` Davide Libenzi
@ 2007-03-06 16:28                 ` H. Peter Anvin
  2007-03-06 17:09                   ` Linus Torvalds
                                     ` (2 more replies)
  1 sibling, 3 replies; 29+ messages in thread
From: H. Peter Anvin @ 2007-03-06 16:28 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Davide Libenzi, Linus Torvalds, Linux Kernel Mailing List,
	Andrew Morton

Eric Dumazet wrote:
> 
> For epoll, I suspect this is harmless : Programs dont allocate epolls fd 
> every milli second, but at startup only.
> 
> For pipes/sockets, using a 64 bits would be problematic, because 
> sprintf() uses a divide for each digit. And a divide is slow. Ten 
> divides are *very* slow.
> 

That's true for *any* sprintf(), though.  sprintf() converts all its 
arguments to 64 bits.

However, this could be optimized.  I think right now sprintf() uses a 
generic divide-by-base, but a divide by 8 and 16 can of course be 
handled with a shift, and divide by 10 can be replaced with a 
multiplication by 0x1999999999999999ULL on most architectures.

	-hpa

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch v2] epoll use a single inode ...
  2007-03-06 16:28                 ` H. Peter Anvin
@ 2007-03-06 17:09                   ` Linus Torvalds
  2007-03-06 17:14                     ` H. Peter Anvin
  2007-03-06 17:12                   ` Eric Dumazet
  2007-03-06 18:10                   ` Davide Libenzi
  2 siblings, 1 reply; 29+ messages in thread
From: Linus Torvalds @ 2007-03-06 17:09 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Eric Dumazet, Davide Libenzi, Linux Kernel Mailing List,
	Andrew Morton

On Tue, 6 Mar 2007, H. Peter Anvin wrote:
> 
> That's true for *any* sprintf(), though.  sprintf() converts all its arguments
> to 64 bits.

Well, it very much uses "do_div()", so that it can do a

	64 / 32 -> (div64,mod32) 

divide, which is often faster than a full 64:64 divide.

> However, this could be optimized.  I think right now sprintf() uses a generic
> divide-by-base, but a divide by 8 and 16 can of course be handled with a
> shift, and divide by 10 can be replaced with a multiplication by
> 0x1999999999999999ULL on most architectures.

Nope. You need both the result of the division *and* the remainder, and 
you can't do that with a single multiply.

Also, with modern hardware, divides are usually fairly cheap, with early 
exit logic, so that the common case of small integers is fairly cheap. 
Yeah, generating a full 64-bit number printout is still expensive, of 
course (both because you need to do many divides *and* because only the 
last few divides will be able to do any appreciable early exit logic.

Anyway, I think a full integer divide on Core 2 is something like 22 
cycles. Yes, the multiply is much fasster (at 4 cycles), but I think that 
22 cycles is actually worst-case.

Somebody who has a benchmark could try.

		Linus

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch v2] epoll use a single inode ...
  2007-03-06 16:28                 ` H. Peter Anvin
  2007-03-06 17:09                   ` Linus Torvalds
@ 2007-03-06 17:12                   ` Eric Dumazet
  2007-03-06 17:19                     ` Linus Torvalds
  2007-03-06 18:10                   ` Davide Libenzi
  2 siblings, 1 reply; 29+ messages in thread
From: Eric Dumazet @ 2007-03-06 17:12 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Davide Libenzi, Linus Torvalds, Linux Kernel Mailing List,
	Andrew Morton

[-- Attachment #1: Type: text/plain, Size: 1214 bytes --]

On Tuesday 06 March 2007 17:28, H. Peter Anvin wrote:
> Eric Dumazet wrote:
> > For epoll, I suspect this is harmless : Programs dont allocate epolls fd
> > every milli second, but at startup only.
> >
> > For pipes/sockets, using a 64 bits would be problematic, because
> > sprintf() uses a divide for each digit. And a divide is slow. Ten
> > divides are *very* slow.
>
> That's true for *any* sprintf(), though.  sprintf() converts all its
> arguments to 64 bits.
>
> However, this could be optimized.  I think right now sprintf() uses a
> generic divide-by-base, but a divide by 8 and 16 can of course be
> handled with a shift, and divide by 10 can be replaced with a
> multiplication by 0x1999999999999999ULL on most architectures.

Or maybe just use reciprocal division, to keep the  35 bases available in 
number() 

Something like :

[PATCH] : Use reciprocal divides in sprintf()

pipe() syscalls spend a noticeable amount of time in sprintf() to format their 
dentry name. One name may want to print 9 or 10 digits, using one divide per 
digit.

Using reciprocal divides permits to change each divide by two multiplies, less 
expensive on current CPUS.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>

[-- Attachment #2: reciprocal_divide_in_sprintf.patch --]
[-- Type: text/plain, Size: 2863 bytes --]

diff --git a/include/linux/reciprocal_div.h b/include/linux/reciprocal_div.h
index f9c90b3..55a79e0 100644
--- a/include/linux/reciprocal_div.h
+++ b/include/linux/reciprocal_div.h
@@ -23,7 +23,10 @@ #include <linux/types.h>
  * or else the performance is slower than a normal divide.
  */
 extern u32 reciprocal_value(u32 B);
-
+/*
+ * precomputed reciprocal values of integers from 0 to 36
+ */
+extern const u32 reciprocal_values[36 + 1];
 
 static inline u32 reciprocal_divide(u32 A, u32 R)
 {
diff --git a/lib/reciprocal_div.c b/lib/reciprocal_div.c
index 6a3bd48..2dcea45 100644
--- a/lib/reciprocal_div.c
+++ b/lib/reciprocal_div.c
@@ -1,6 +1,31 @@
 #include <asm/div64.h>
 #include <linux/reciprocal_div.h>
 
+#define CONSTANT_RECIPROCAL_VALUE(k) \
+	(u32)(((1LL << 32) + (k - 1)) / k)
+
+const u32 reciprocal_values[36 + 1] = {
+	0,
+	CONSTANT_RECIPROCAL_VALUE(1),	CONSTANT_RECIPROCAL_VALUE(2),
+	CONSTANT_RECIPROCAL_VALUE(3),	CONSTANT_RECIPROCAL_VALUE(4),
+	CONSTANT_RECIPROCAL_VALUE(5),	CONSTANT_RECIPROCAL_VALUE(6),
+	CONSTANT_RECIPROCAL_VALUE(7),	CONSTANT_RECIPROCAL_VALUE(8),
+	CONSTANT_RECIPROCAL_VALUE(9),	CONSTANT_RECIPROCAL_VALUE(10),
+	CONSTANT_RECIPROCAL_VALUE(11),	CONSTANT_RECIPROCAL_VALUE(12),
+	CONSTANT_RECIPROCAL_VALUE(13),	CONSTANT_RECIPROCAL_VALUE(14),
+	CONSTANT_RECIPROCAL_VALUE(15),	CONSTANT_RECIPROCAL_VALUE(16),
+	CONSTANT_RECIPROCAL_VALUE(17),	CONSTANT_RECIPROCAL_VALUE(18),
+	CONSTANT_RECIPROCAL_VALUE(19),	CONSTANT_RECIPROCAL_VALUE(20),
+	CONSTANT_RECIPROCAL_VALUE(21),	CONSTANT_RECIPROCAL_VALUE(22),
+	CONSTANT_RECIPROCAL_VALUE(23),	CONSTANT_RECIPROCAL_VALUE(24),
+	CONSTANT_RECIPROCAL_VALUE(25),	CONSTANT_RECIPROCAL_VALUE(26),
+	CONSTANT_RECIPROCAL_VALUE(27),	CONSTANT_RECIPROCAL_VALUE(28),
+	CONSTANT_RECIPROCAL_VALUE(29),	CONSTANT_RECIPROCAL_VALUE(30),
+	CONSTANT_RECIPROCAL_VALUE(31),	CONSTANT_RECIPROCAL_VALUE(32),
+	CONSTANT_RECIPROCAL_VALUE(33),	CONSTANT_RECIPROCAL_VALUE(34),
+	CONSTANT_RECIPROCAL_VALUE(35),	CONSTANT_RECIPROCAL_VALUE(36),
+};
+
 u32 reciprocal_value(u32 k)
 {
 	u64 val = (1LL << 32) + (k - 1);
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index b025864..9c98cc4 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -22,6 +22,7 @@ #include <linux/types.h>
 #include <linux/string.h>
 #include <linux/ctype.h>
 #include <linux/kernel.h>
+#include <linux/reciprocal_div.h>
 
 #include <asm/page.h>		/* for PAGE_SIZE */
 #include <asm/div64.h>
@@ -180,8 +181,16 @@ static char * number(char * buf, char * 
 	i = 0;
 	if (num == 0)
 		tmp[i++]='0';
-	else while (num != 0)
-		tmp[i++] = digits[do_div(num,base)];
+	else {
+		while (num >> 32)
+			tmp[i++] = digits[do_div(num,base)];
+		while (num != 0) {
+			u32 next = reciprocal_divide((u32)num,
+				reciprocal_values[base]);
+			tmp[i++] = digits[num - next*base];
+			num = next;
+		}
+	}
 	if (i > precision)
 		precision = i;
 	size -= precision;

^ permalink raw reply related	[flat|nested] 29+ messages in thread

* Re: [patch v2] epoll use a single inode ...
  2007-03-06 17:09                   ` Linus Torvalds
@ 2007-03-06 17:14                     ` H. Peter Anvin
  0 siblings, 0 replies; 29+ messages in thread
From: H. Peter Anvin @ 2007-03-06 17:14 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Eric Dumazet, Davide Libenzi, Linux Kernel Mailing List,
	Andrew Morton

Linus Torvalds wrote:
> 
>> However, this could be optimized.  I think right now sprintf() uses a generic
>> divide-by-base, but a divide by 8 and 16 can of course be handled with a
>> shift, and divide by 10 can be replaced with a multiplication by
>> 0x1999999999999999ULL on most architectures.
> 
> Nope. You need both the result of the division *and* the remainder, and 
> you can't do that with a single multiply.

Oh, right.  *D'oh*.

	-hpa

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch v2] epoll use a single inode ...
  2007-03-06 17:12                   ` Eric Dumazet
@ 2007-03-06 17:19                     ` Linus Torvalds
  2007-03-06 17:27                       ` H. Peter Anvin
  2007-03-06 17:28                       ` Eric Dumazet
  0 siblings, 2 replies; 29+ messages in thread
From: Linus Torvalds @ 2007-03-06 17:19 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: H. Peter Anvin, Davide Libenzi, Linux Kernel Mailing List,
	Andrew Morton



On Tue, 6 Mar 2007, Eric Dumazet wrote:
> 
> Something like :
> 
> [PATCH] : Use reciprocal divides in sprintf()

Try this on Core 2, and I suspect that you'll find that the hardware is 
actually *faster* than doing the shift/test, function call and the 
two multiplies.

> Using reciprocal divides permits to change each divide by two multiplies, less 
> expensive on current CPUS.

Are you sure?

		Linus

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch v2] epoll use a single inode ...
  2007-03-06 17:19                     ` Linus Torvalds
@ 2007-03-06 17:27                       ` H. Peter Anvin
  2007-03-06 17:28                       ` Eric Dumazet
  1 sibling, 0 replies; 29+ messages in thread
From: H. Peter Anvin @ 2007-03-06 17:27 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Eric Dumazet, Davide Libenzi, Linux Kernel Mailing List,
	Andrew Morton

Linus Torvalds wrote:
> 
> On Tue, 6 Mar 2007, Eric Dumazet wrote:
>> Something like :
>>
>> [PATCH] : Use reciprocal divides in sprintf()
> 
> Try this on Core 2, and I suspect that you'll find that the hardware is 
> actually *faster* than doing the shift/test, function call and the 
> two multiplies.
> 
>> Using reciprocal divides permits to change each divide by two multiplies, less 
>> expensive on current CPUS.
> 
> Are you sure?
> 

For base 8 and 16, this is shift and mask, respectively, so it's bound 
to be faster (although modern hardware can often optimize this, embedded 
hardware definitely can't.)  Base 10, which even in the Linux kernel is 
almost certainly the most common case, is a lot iffier.

	-hpa

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch v2] epoll use a single inode ...
  2007-03-06 17:19                     ` Linus Torvalds
  2007-03-06 17:27                       ` H. Peter Anvin
@ 2007-03-06 17:28                       ` Eric Dumazet
  2007-03-06 18:10                         ` Eric Dumazet
  1 sibling, 1 reply; 29+ messages in thread
From: Eric Dumazet @ 2007-03-06 17:28 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: H. Peter Anvin, Davide Libenzi, Linux Kernel Mailing List,
	Andrew Morton

On Tuesday 06 March 2007 18:19, Linus Torvalds wrote:
> On Tue, 6 Mar 2007, Eric Dumazet wrote:
> > Something like :
> >
> > [PATCH] : Use reciprocal divides in sprintf()
>
> Try this on Core 2, and I suspect that you'll find that the hardware is
> actually *faster* than doing the shift/test, function call and the
> two multiplies.
>

Where do you see a function call ?

     448:       44 89 d0                mov    %r10d,%eax
     44b:       44 89 ea                mov    %r13d,%edx
     44e:       48 0f af c1             imul   %rcx,%rax
     452:       48 c1 e8 20             shr    $0x20,%rax
     456:       0f af d0                imul   %eax,%edx
     459:       49 29 d2                sub    %rdx,%r10
     45c:       43 0f b6 14 16          movzbl (%r14,%r10,1),%edx
     461:       41 89 c2                mov    %eax,%r10d
     464:       41 88 13                mov    %dl,(%r11)
     467:       49 ff c3                inc    %r11
     46a:       85 c0                   test   %eax,%eax
     46c:       75 da                   jne    448 <number+0x138>



> > Using reciprocal divides permits to change each divide by two multiplies,
> > less expensive on current CPUS.
>
> Are you sure?

I am going to test this, but at least on Opterons, the reciprocal divide I 
added into mm/slab.c gave me a nice speedup.

I am going to bench some stupid loop :

for (i = 0 ; i < 1000*1000 ; i++) {
	pipe(fds); close(fds[0]); close(fds[1]);
}


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch v2] epoll use a single inode ...
  2007-03-06 16:28                 ` H. Peter Anvin
  2007-03-06 17:09                   ` Linus Torvalds
  2007-03-06 17:12                   ` Eric Dumazet
@ 2007-03-06 18:10                   ` Davide Libenzi
  2 siblings, 0 replies; 29+ messages in thread
From: Davide Libenzi @ 2007-03-06 18:10 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Eric Dumazet, Linus Torvalds, Linux Kernel Mailing List,
	Andrew Morton

On Tue, 6 Mar 2007, H. Peter Anvin wrote:

> Eric Dumazet wrote:
> > 
> > For epoll, I suspect this is harmless : Programs dont allocate epolls fd
> > every milli second, but at startup only.
> > 
> > For pipes/sockets, using a 64 bits would be problematic, because sprintf()
> > uses a divide for each digit. And a divide is slow. Ten divides are *very*
> > slow.
> > 
> 
> That's true for *any* sprintf(), though.  sprintf() converts all its arguments
> to 64 bits.

Sigh! We *always* do do_div(), even for the more trivial %x case.
Anyway, it does not really matter for the type of dfs will be applied to.


- Davide



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch v2] epoll use a single inode ...
  2007-03-06 17:28                       ` Eric Dumazet
@ 2007-03-06 18:10                         ` Eric Dumazet
  2007-03-06 20:20                           ` Eric Dumazet
  0 siblings, 1 reply; 29+ messages in thread
From: Eric Dumazet @ 2007-03-06 18:10 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: H. Peter Anvin, Davide Libenzi, Linux Kernel Mailing List,
	Andrew Morton

[-- Attachment #1: Type: text/plain, Size: 2099 bytes --]

On Tuesday 06 March 2007 18:28, Eric Dumazet wrote:
> On Tuesday 06 March 2007 18:19, Linus Torvalds wrote:
>
> > > Using reciprocal divides permits to change each divide by two
> > > multiplies, less expensive on current CPUS.
> >
> > Are you sure?
>
> I am going to test this, but at least on Opterons, the reciprocal divide I
> added into mm/slab.c gave me a nice speedup.
>

With attached test program (one million calls to pipe()), I got about 0.1 s of 
speedup on my 1.6 GHz Pentium(M) dell D610 machine
(3.350 s instead of 3.450 s, on many runs)

Thats about 100 ns saved per number() call

But then I realized that on ia32, gcc compilers is not very smart :

static inline u32 reciprocal_divide(u32 A, u32 R)
{
	return (u32)(((u64)A * R) >> 32); 
}

It generates two multiplies... arg...

//begin of reciprocal_divide()
     4b0:       8b 4c 24 28             mov    0x28(%esp),%ecx
     4b4:       89 f0                   mov    %esi,%eax
     4b6:       f7 64 24 24             mull   0x24(%esp)
     4ba:       0f af ce                imul   %esi,%ecx
     4bd:       8d 14 11                lea    (%ecx,%edx,1),%edx
// end of reciprocal_divide()
     4c0:       8b 8c 24 8c 00 00 00    mov    0x8c(%esp),%ecx
     4c7:       89 d0                   mov    %edx,%eax
     4c9:       0f af c8                imul   %eax,%ecx
     4cc:       29 ce                   sub    %ecx,%esi
     4ce:       8b 4c 24 1c             mov    0x1c(%esp),%ecx
     4d2:       0f b6 34 31             movzbl (%ecx,%esi,1),%esi
     4d6:       89 f1                   mov    %esi,%ecx
     4d8:       89 c6                   mov    %eax,%esi
     4da:       88 0f                   mov    %cl,(%edi)
     4dc:       47                      inc    %edi
     4dd:       ff 44 24 20             incl   0x20(%esp)
     4e1:       85 c0                   test   %eax,%eax
     4e3:       75 cb                   jne    4b0 <number+0x160>

So even with a total of 3 multiplies per digit, we win...

Maybe some bit of x86 asm is needed to make gcc be smarter (using only one 
multiply for  reciprocal_divide())


[-- Attachment #2: pipes.c --]
[-- Type: text/plain, Size: 241 bytes --]

/*
 * micro benchmark to time calls to pipe()/close()
 */
main()
{
	int fd[100*2];
	unsigned int l, i;

	for (l = 0 ; l < 10000 ; l++) {
		for (i = 0 ; i < 100*2 ; i+=2)
			pipe(fd + i);
		for (i = 0 ; i < 100*2 ; i++)
			close(fd[i]);
	}
}

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch v2] epoll use a single inode ...
  2007-03-06 18:10                         ` Eric Dumazet
@ 2007-03-06 20:20                           ` Eric Dumazet
  2007-03-07  3:47                             ` Linus Torvalds
  2007-03-07 23:46                             ` Sami Farin
  0 siblings, 2 replies; 29+ messages in thread
From: Eric Dumazet @ 2007-03-06 20:20 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: H. Peter Anvin, Davide Libenzi, Linux Kernel Mailing List,
	Andrew Morton

[-- Attachment #1: Type: text/plain, Size: 3299 bytes --]

Eric Dumazet a écrit :
> On Tuesday 06 March 2007 18:28, Eric Dumazet wrote:
>> On Tuesday 06 March 2007 18:19, Linus Torvalds wrote:
>>
>>>> Using reciprocal divides permits to change each divide by two
>>>> multiplies, less expensive on current CPUS.
>>> Are you sure?
>> I am going to test this, but at least on Opterons, the reciprocal divide I
>> added into mm/slab.c gave me a nice speedup.
>>
> 

Linus,

I did a user space program, attached to this mail.

I rewrote the reciprocal_div() for i386 so that one multiply is used.

static inline u32 reciprocal_divide(u32 A, u32 R)
{
#if __i386
         unsigned int edx, eax;
         asm("mul %2":"=a" (eax), "=d" (edx):"rm" (R), "0" (A));
         return edx;
#else
         return (u32)(((u64)A * R) >> 32);
#endif
}


Results are really good on 32bit. On 64bit/Opteron they are impressive.

$ gcc -O2 -o divide_bench divide_bench.c

First result gives the number of cycles to perform old number() routine using 
plain do_div()
Second result gives the number of cycles using reciprocal_div trick


results on a Intel Pentium III 866 MHz

$ ./divide_bench
413.453 cycles per call, last res=99999901
132.746 cycles per call, last res=99999901
$ ./divide_bench
411.833 cycles per call, last res=99999901
129.652 cycles per call, last res=99999901
$ ./divide_bench
480.645 cycles per call, last res=99999901
158.642 cycles per call, last res=99999901
$ ./divide_bench
412.769 cycles per call, last res=99999901
129.643 cycles per call, last res=99999901
$ ./divide_bench
410.809 cycles per call, last res=99999901
129.609 cycles per call, last res=99999901


results on AMD 246 (2GHz)
Sorry this machine is quite loaded... I dont have a dev x86_64 machine.

$ gcc -O2 -m32 -o divide_bench32 divide_bench.c
$ ./divide_bench32
412.181 cycles per call, last res=99999901
112.314 cycles per call, last res=99999901
$ ./divide_bench32
444.008 cycles per call, last res=99999901
114.314 cycles per call, last res=99999901
$ ./divide_bench32
423.168 cycles per call, last res=99999901
112.318 cycles per call, last res=99999901
$ ./divide_bench32
427.73 cycles per call, last res=99999901
110.712 cycles per call, last res=99999901
$ ./divide_bench32
410.529 cycles per call, last res=99999901
114.068 cycles per call, last res=99999901
$ ./divide_bench32
489.856 cycles per call, last res=99999901
124.889 cycles per call, last res=99999901
$ ./divide_bench32
389.278 cycles per call, last res=99999901
104.697 cycles per call, last res=99999901

With a 64bit prog :
$ gcc -O2 -m64 -o divide_bench64 divide_bench.c
$ ./divide_bench64
826.136 cycles per call, last res=99999901
105.912 cycles per call, last res=99999901
$ ./divide_bench64
627.096 cycles per call, last res=99999901
76.2473 cycles per call, last res=99999901
$ ./divide_bench64
604.524 cycles per call, last res=99999901
76.1405 cycles per call, last res=99999901
$ ./divide_bench64
621.013 cycles per call, last res=99999901
76.0963 cycles per call, last res=99999901
$ ./divide_bench64
836.799 cycles per call, last res=99999901
103.967 cycles per call, last res=99999901
$ ./divide_bench64
982.718 cycles per call, last res=99999901
127.945 cycles per call, last res=99999901
$ ./divide_bench64
609.346 cycles per call, last res=99999901
76.0768 cycles per call, last res=99999901



[-- Attachment #2: divide_bench.c --]
[-- Type: text/plain, Size: 3616 bytes --]

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>

#ifdef __x86_64

#define rdtscll(val) do { \
     unsigned int __a,__d; \
     asm volatile("rdtsc" : "=a" (__a), "=d" (__d)); \
     (val) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \
} while(0)

# define do_div(n,base) ({					\
	uint32_t __base = (base);				\
	uint32_t __rem;						\
	__rem = ((uint64_t)(n)) % __base;			\
	(n) = ((uint64_t)(n)) / __base;				\
	__rem;							\
 })
#elif  __i386

#define rdtscll(val) \
     __asm__ __volatile__("rdtsc" : "=A" (val))

#define do_div(n,base) ({ \
	unsigned long __upper, __low, __high, __mod, __base; \
	__base = (base); \
	asm("":"=a" (__low), "=d" (__high):"A" (n)); \
	__upper = __high; \
	if (__high) { \
		__upper = __high % (__base); \
		__high = __high / (__base); \
	} \
	asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (__base), "0" (__low), "1" (__upper)); \
	asm("":"=A" (n):"a" (__low),"d" (__high)); \
	__mod; \
})

#endif

static const char digits[] = "0123456789abcdefghijklmnopqrstuvwxyz";

char *number_divides(unsigned long long num, int base, char *result)
{
if (num == 0)
	*result++ = '0';
else while (num) 
	*result++ = digits[do_div(num,base)];
*result = 0;
return result;
}
typedef unsigned int u32;
typedef unsigned long long u64;

#define CONSTANT_RECIPROCAL_VALUE(k) \
	(u32)(((1LL << 32) + (k - 1)) / k)

const u32 reciprocal_values[36 + 1] = {
	0,
	CONSTANT_RECIPROCAL_VALUE(1),	CONSTANT_RECIPROCAL_VALUE(2),
	CONSTANT_RECIPROCAL_VALUE(3),	CONSTANT_RECIPROCAL_VALUE(4),
	CONSTANT_RECIPROCAL_VALUE(5),	CONSTANT_RECIPROCAL_VALUE(6),
	CONSTANT_RECIPROCAL_VALUE(7),	CONSTANT_RECIPROCAL_VALUE(8),
	CONSTANT_RECIPROCAL_VALUE(9),	CONSTANT_RECIPROCAL_VALUE(10),
	CONSTANT_RECIPROCAL_VALUE(11),	CONSTANT_RECIPROCAL_VALUE(12),
	CONSTANT_RECIPROCAL_VALUE(13),	CONSTANT_RECIPROCAL_VALUE(14),
	CONSTANT_RECIPROCAL_VALUE(15),	CONSTANT_RECIPROCAL_VALUE(16),
	CONSTANT_RECIPROCAL_VALUE(17),	CONSTANT_RECIPROCAL_VALUE(18),
	CONSTANT_RECIPROCAL_VALUE(19),	CONSTANT_RECIPROCAL_VALUE(20),
	CONSTANT_RECIPROCAL_VALUE(21),	CONSTANT_RECIPROCAL_VALUE(22),
	CONSTANT_RECIPROCAL_VALUE(23),	CONSTANT_RECIPROCAL_VALUE(24),
	CONSTANT_RECIPROCAL_VALUE(25),	CONSTANT_RECIPROCAL_VALUE(26),
	CONSTANT_RECIPROCAL_VALUE(27),	CONSTANT_RECIPROCAL_VALUE(28),
	CONSTANT_RECIPROCAL_VALUE(29),	CONSTANT_RECIPROCAL_VALUE(30),
	CONSTANT_RECIPROCAL_VALUE(31),	CONSTANT_RECIPROCAL_VALUE(32),
	CONSTANT_RECIPROCAL_VALUE(33),	CONSTANT_RECIPROCAL_VALUE(34),
	CONSTANT_RECIPROCAL_VALUE(35),	CONSTANT_RECIPROCAL_VALUE(36)
};

static inline u32 reciprocal_divide(u32 A, u32 R)
{
#if __i386
	unsigned int edx, eax;
	asm("mul %2":"=a" (eax), "=d" (edx):"rm" (R), "0" (A));
	return edx;
#else
	return (u32)(((u64)A * R) >> 32);
#endif
}

char *number_reciprocal(unsigned long long num, int base, char *result)
{
if (num == 0)
	*result++ = '0';
else {
	while (num >>32)
		*result++ = digits[do_div(num,base)];
	while (num) {
		u32 next = reciprocal_divide((u32)num,
				reciprocal_values[base]);
			*result++ = digits[num - next*base];
			num = next;

		}
	}
*result = 0;
return result;
}

#define START 10000000
int base = 10;
main()
{
unsigned long long start, end;
char result[64];
unsigned long ul;
	rdtscll(start);
	for (ul = START; ul < START + 1000000;ul++)
		number_divides(ul, base, result);
	rdtscll(end);
printf("%g cycles per call, last res=%s\n", (double)(end - start)/1000000.0, result);
	rdtscll(start);
	for (ul = START; ul < START + 1000000;ul++)
		number_reciprocal(ul, base, result);
	rdtscll(end);
printf("%g cycles per call, last res=%s\n", (double)(end - start)/1000000.0, result);
}

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch v2] epoll use a single inode ...
  2007-03-06 20:20                           ` Eric Dumazet
@ 2007-03-07  3:47                             ` Linus Torvalds
  2007-03-07  5:40                               ` H. Peter Anvin
  2007-03-07  6:57                               ` Eric Dumazet
  2007-03-07 23:46                             ` Sami Farin
  1 sibling, 2 replies; 29+ messages in thread
From: Linus Torvalds @ 2007-03-07  3:47 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: H. Peter Anvin, Davide Libenzi, Linux Kernel Mailing List,
	Andrew Morton



On Tue, 6 Mar 2007, Eric Dumazet wrote:
> 
> I did a user space program, attached to this mail.
> 
> I rewrote the reciprocal_div() for i386 so that one multiply is used.

Ok, this is definitely faster on Core 2 as well, so "numbers talk, 
bullshit walks". No more objections.

(That said, I bet you could do even better for octal and hex numbers, so 
if you *really* want to speed things up, you should just make a 
special-case routine for each base (there's just three of them), and you 
can then also optimize the base-10 thing much better (you can do two 
digits at a time by dividing by 100, etc)

		Linus

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch v2] epoll use a single inode ...
  2007-03-07  3:47                             ` Linus Torvalds
@ 2007-03-07  5:40                               ` H. Peter Anvin
  2007-03-07  6:57                               ` Eric Dumazet
  1 sibling, 0 replies; 29+ messages in thread
From: H. Peter Anvin @ 2007-03-07  5:40 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Eric Dumazet, Davide Libenzi, Linux Kernel Mailing List,
	Andrew Morton

Linus Torvalds wrote:
> 
> On Tue, 6 Mar 2007, Eric Dumazet wrote:
>> I did a user space program, attached to this mail.
>>
>> I rewrote the reciprocal_div() for i386 so that one multiply is used.
> 
> Ok, this is definitely faster on Core 2 as well, so "numbers talk, 
> bullshit walks". No more objections.
> 
> (That said, I bet you could do even better for octal and hex numbers, so 
> if you *really* want to speed things up, you should just make a 
> special-case routine for each base (there's just three of them), and you 
> can then also optimize the base-10 thing much better (you can do two 
> digits at a time by dividing by 100, etc)
> 

Of course you can do better for octal and hex -- it's just shift and mask.

Decimal is trickier; however, at least on i386 it might make sense to 
divide by 100 and then use the AAM instruction, or a table lookup, to 
split it into individual digits.

	-hpa

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch v2] epoll use a single inode ...
  2007-03-07  3:47                             ` Linus Torvalds
  2007-03-07  5:40                               ` H. Peter Anvin
@ 2007-03-07  6:57                               ` Eric Dumazet
  2007-03-07  7:13                                 ` H. Peter Anvin
  1 sibling, 1 reply; 29+ messages in thread
From: Eric Dumazet @ 2007-03-07  6:57 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: H. Peter Anvin, Davide Libenzi, Linux Kernel Mailing List,
	Andrew Morton

Linus Torvalds a écrit :
> 
> On Tue, 6 Mar 2007, Eric Dumazet wrote:
>> I did a user space program, attached to this mail.
>>
>> I rewrote the reciprocal_div() for i386 so that one multiply is used.
> 
> Ok, this is definitely faster on Core 2 as well, so "numbers talk, 
> bullshit walks". No more objections.

And the numbers were ? :)

> 
> (That said, I bet you could do even better for octal and hex numbers, so 
> if you *really* want to speed things up, you should just make a 
> special-case routine for each base (there's just three of them), and you 
> can then also optimize the base-10 thing much better (you can do two 
> digits at a time by dividing by 100, etc)

Well, given that sprintf() is frequently called only for pipe/sockets 
creation, we probably better :

1) wait a very clever idea to suppress individual dentry per pipe/sockets (no 
more sprintf() at pipe/socket setup)

2) delay the sprintf() only if needed as you mentioned in a previous mail 
(when someone wants ls -l /proc/pid/fd/....), since their dentries are not 
anymore inserted in the global dcache hash, they could stay with a (nul) dname.



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch v2] epoll use a single inode ...
  2007-03-07  6:57                               ` Eric Dumazet
@ 2007-03-07  7:13                                 ` H. Peter Anvin
  0 siblings, 0 replies; 29+ messages in thread
From: H. Peter Anvin @ 2007-03-07  7:13 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Linus Torvalds, Davide Libenzi, Linux Kernel Mailing List,
	Andrew Morton

Eric Dumazet wrote:
> Linus Torvalds a écrit :
>>
>> On Tue, 6 Mar 2007, Eric Dumazet wrote:
>>> I did a user space program, attached to this mail.
>>>
>>> I rewrote the reciprocal_div() for i386 so that one multiply is used.
>>
>> Ok, this is definitely faster on Core 2 as well, so "numbers talk, 
>> bullshit walks". No more objections.
> 
> And the numbers were ? :)
> 
>>
>> (That said, I bet you could do even better for octal and hex numbers, 
>> so if you *really* want to speed things up, you should just make a 
>> special-case routine for each base (there's just three of them), and 
>> you can then also optimize the base-10 thing much better (you can do 
>> two digits at a time by dividing by 100, etc)
> 
> Well, given that sprintf() is frequently called only for pipe/sockets 
> creation, we probably better :
> 
> 1) wait a very clever idea to suppress individual dentry per 
> pipe/sockets (no more sprintf() at pipe/socket setup)
> 
> 2) delay the sprintf() only if needed as you mentioned in a previous 
> mail (when someone wants ls -l /proc/pid/fd/....), since their dentries 
> are not anymore inserted in the global dcache hash, they could stay with 
> a (nul) dname.

Yes, the right thing to do is probably to only generate these strings 
when someone tries to list them, not on every socket/pipe/epoll 
creation.  One can assign a counter and keep it as a binary value at the 
start, but create the strings when necessary.

	-hpa


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch v2] epoll use a single inode ...
  2007-03-06 20:20                           ` Eric Dumazet
  2007-03-07  3:47                             ` Linus Torvalds
@ 2007-03-07 23:46                             ` Sami Farin
  1 sibling, 0 replies; 29+ messages in thread
From: Sami Farin @ 2007-03-07 23:46 UTC (permalink / raw)
  To: Linux Kernel Mailing List

On Tue, Mar 06, 2007 at 21:20:33 +0100, Eric Dumazet wrote:
...
> I rewrote the reciprocal_div() for i386 so that one multiply is used.
> 
> static inline u32 reciprocal_divide(u32 A, u32 R)
> {
> #if __i386
>         unsigned int edx, eax;
>         asm("mul %2":"=a" (eax), "=d" (edx):"rm" (R), "0" (A));
               ^^^

mul does not work if R is memory operand.
mull should be used instead.

-- 

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch v2] epoll use a single inode ...
  2007-03-06  0:12   ` Davide Libenzi
  2007-03-06  0:20     ` Davide Libenzi
  2007-03-06  0:25     ` Linus Torvalds
@ 2007-03-10  8:06     ` Pavel Machek
  2007-03-10  8:24       ` Davide Libenzi
  2 siblings, 1 reply; 29+ messages in thread
From: Pavel Machek @ 2007-03-10  8:06 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Linus Torvalds, Linux Kernel Mailing List, Andrew Morton,
	Eric Dumazet

Hi!

> > > @@ -763,15 +767,17 @@
> > >  	 * using the inode number.
> > >  	 */
> > >  	error = -ENOMEM;
> > > -	sprintf(name, "[%lu]", inode->i_ino);
> > >  	this.name = name;
> > > -	this.len = strlen(name);
> > > -	this.hash = inode->i_ino;
> > > +	this.len = sprintf(name, "[%p]", ep);
> > > +	this.hash = 0;
> > 
> > Please don't expose kernel pointers to user space.
> > 
> > It's much better to do something like
> > 
> > 	static unsigned int epoll_inode;
> > 
> > 	this.len = sprintf(name, "[%u]", ++epoll_inode);
> > 
> > if you just need some pseudo-unique name to distinguish two epoll things 
> > from each other (vs from a dup'ed fd).
> 
> Heh, this is what Al was saying ;)
> I'm fine with that, but how about counter cycles (going back to zero)? 

Just use u64?
							Pavel

-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [patch v2] epoll use a single inode ...
  2007-03-10  8:06     ` Pavel Machek
@ 2007-03-10  8:24       ` Davide Libenzi
  0 siblings, 0 replies; 29+ messages in thread
From: Davide Libenzi @ 2007-03-10  8:24 UTC (permalink / raw)
  To: Pavel Machek
  Cc: Linus Torvalds, Linux Kernel Mailing List, Andrew Morton,
	Eric Dumazet

On Sat, 10 Mar 2007, Pavel Machek wrote:

> > Heh, this is what Al was saying ;)
> > I'm fine with that, but how about counter cycles (going back to zero)? 
> 
> Just use u64?

Yeah, the second patch was using an u64.
I ended up using a "class" name (signalfd, timerfd, asyncfd) as dname 
entry. An incremental counter would not add any useful information, and 
noone will ever care about dentry name of those objects. Prolly the class 
name is the only thing you might want to know, or we can drop even that 
and use a shared dentry for everything.

- Davide

^ permalink raw reply	[flat|nested] 29+ messages in thread

end of thread, other threads:[~2007-03-10  8:24 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-03-05 23:41 [patch v2] epoll use a single inode Davide Libenzi
2007-03-06  0:00 ` Linus Torvalds
2007-03-06  0:12   ` Davide Libenzi
2007-03-06  0:20     ` Davide Libenzi
2007-03-06  0:25     ` Linus Torvalds
2007-03-06  2:25       ` H. Peter Anvin
2007-03-06  2:34         ` Davide Libenzi
2007-03-06  2:37           ` H. Peter Anvin
2007-03-06  2:43             ` Davide Libenzi
2007-03-06  6:22               ` Eric Dumazet
2007-03-06  6:31                 ` Davide Libenzi
2007-03-06  6:37                   ` Davide Libenzi
2007-03-06 16:28                 ` H. Peter Anvin
2007-03-06 17:09                   ` Linus Torvalds
2007-03-06 17:14                     ` H. Peter Anvin
2007-03-06 17:12                   ` Eric Dumazet
2007-03-06 17:19                     ` Linus Torvalds
2007-03-06 17:27                       ` H. Peter Anvin
2007-03-06 17:28                       ` Eric Dumazet
2007-03-06 18:10                         ` Eric Dumazet
2007-03-06 20:20                           ` Eric Dumazet
2007-03-07  3:47                             ` Linus Torvalds
2007-03-07  5:40                               ` H. Peter Anvin
2007-03-07  6:57                               ` Eric Dumazet
2007-03-07  7:13                                 ` H. Peter Anvin
2007-03-07 23:46                             ` Sami Farin
2007-03-06 18:10                   ` Davide Libenzi
2007-03-10  8:06     ` Pavel Machek
2007-03-10  8:24       ` Davide Libenzi

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox