All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] Add getdents32t syscall
@ 2004-02-26 19:38 Jakub Jelinek
  2004-02-26 22:03 ` Randy.Dunlap
                   ` (2 more replies)
  0 siblings, 3 replies; 16+ messages in thread
From: Jakub Jelinek @ 2004-02-26 19:38 UTC (permalink / raw)
  To: torvalds; +Cc: linux-kernel, drepper

Hi!

glibc struct dirent has d_type field (similarly to struct dirent64).
Because no 32-bit getdents syscall provides this field to userland,
glibc needs to use getdents64 syscall even for 32-bit getdents
(and readdir etc.) and convert dirent entries from struct dirent64
to struct dirent.  The code is quite complicated and as the former
is bigger and the size of 64-bit dirents cannot be predicted accurately,
it can happen that glibc reads too many entries and has to seek back
on the dir etc.

The following patch introduces a new syscall (on 32-bit architectures),
which fills in 32-bit struct dirent with d_type member.
With this syscall glibc can simply call this syscall in 32-bit getdents
and be done with it, no seeking, issues with NFS zero extended d_ino values,
buffer translation etc.  sys_getdents32t (the t in there is for type,
to differentiate it from compatibility sys_getdents32 which don't provide
d_type) function should be usable both on 32-bit arches and in 32-bit
compatibility layers on 64-bit arches (on most arches directly, if
the arguments are zero extended in assembly).

--- linux-2.6.3/fs/readdir.c.jj	2004-02-18 04:57:52.000000000 +0100
+++ linux-2.6.3/fs/readdir.c	2004-02-26 09:49:20.073123212 +0100
@@ -207,6 +207,88 @@ out:
 	return error;
 }
 
+struct getdents_callback32t {
+	struct linux_dirent32t __user * current_dir;
+	struct linux_dirent32t __user * previous;
+	int count;
+	int error;
+};
+
+int filldir32t(void * __buf, const char * name, int namlen, loff_t offset,
+	       ino_t ino, unsigned int d_type)
+{
+	struct linux_dirent32t __user * dirent;
+	struct getdents_callback32t * buf = (struct getdents_callback32t *) __buf;
+	int reclen = ROUND_UP(NAME_OFFSET(dirent) + namlen + 1);
+
+	buf->error = -EINVAL;	/* only used if we fail.. */
+	if (reclen > buf->count)
+		return -EINVAL;
+	dirent = buf->previous;
+	if (dirent) {
+		if (__put_user(offset, &dirent->d_off))
+			goto efault;
+	}
+	dirent = buf->current_dir;
+	if (__put_user(ino, &dirent->d_ino))
+		goto efault;
+	if (__put_user(reclen, &dirent->d_reclen))
+		goto efault;
+	if (__put_user(d_type, &dirent->d_type))
+		goto efault;
+	if (copy_to_user(dirent->d_name, name, namlen))
+		goto efault;
+	if (__put_user(0, dirent->d_name + namlen))
+		goto efault;
+	buf->previous = dirent;
+	dirent = (void *)dirent + reclen;
+	buf->current_dir = dirent;
+	buf->count -= reclen;
+	return 0;
+efault:
+	buf->error = -EFAULT;
+	return -EFAULT;
+}
+
+asmlinkage long sys_getdents32t(unsigned int fd, struct linux_dirent32t __user * dirent, unsigned int count)
+{
+	struct file * file;
+	struct linux_dirent32t __user * lastdirent;
+	struct getdents_callback32t buf;
+	int error;
+
+	error = -EFAULT;
+	if (!access_ok(VERIFY_WRITE, dirent, count))
+		goto out;
+
+	error = -EBADF;
+	file = fget(fd);
+	if (!file)
+		goto out;
+
+	buf.current_dir = dirent;
+	buf.previous = NULL;
+	buf.count = count;
+	buf.error = 0;
+
+	error = vfs_readdir(file, filldir32t, &buf);
+	if (error < 0)
+		goto out_putf;
+	error = buf.error;
+	lastdirent = buf.previous;
+	if (lastdirent) {
+		if (put_user(file->f_pos, &lastdirent->d_off))
+			error = -EFAULT;
+		else
+			error = count - buf.count;
+	}
+
+out_putf:
+	fput(file);
+out:
+	return error;
+}
+
 #define ROUND_UP64(x) (((x)+sizeof(u64)-1) & ~(sizeof(u64)-1))
 
 struct getdents_callback64 {
--- linux-2.6.3/include/asm-x86_64/ia32_unistd.h.jj	2004-02-18 04:58:34.000000000 +0100
+++ linux-2.6.3/include/asm-x86_64/ia32_unistd.h	2004-02-26 15:29:06.138150177 +0100
@@ -278,6 +278,8 @@
 #define __NR_ia32_tgkill		270
 #define __NR_ia32_utimes		271
 #define __NR_ia32_fadvise64_64		272
+#define __NR_ia32_vserver		273
+#define __NR_ia32_getdents32t		274
 
 #define IA32_NR_syscalls 275	/* must be > than biggest syscall! */	
 
--- linux-2.6.3/include/asm-i386/unistd.h.jj	2004-02-24 16:19:19.000000000 +0100
+++ linux-2.6.3/include/asm-i386/unistd.h	2004-02-26 09:50:50.400877877 +0100
@@ -279,8 +279,9 @@
 #define __NR_utimes		271
 #define __NR_fadvise64_64	272
 #define __NR_vserver		273
+#define __NR_getdents32t	274
 
-#define NR_syscalls 274
+#define NR_syscalls 275
 
 #ifndef __KERNEL_SYSCALLS_NO_ERRNO__
 /* user-visible error numbers are in the range -1 - -124: see <asm-i386/errno.h> */
--- linux-2.6.3/include/linux/dirent.h.jj	2004-02-18 04:59:07.000000000 +0100
+++ linux-2.6.3/include/linux/dirent.h	2004-02-26 09:47:16.694315247 +0100
@@ -18,6 +18,14 @@ struct dirent64 {
 
 #ifdef __KERNEL__
 
+struct linux_dirent32t {
+	u32		d_ino;
+	s32		d_off;
+	unsigned short	d_reclen;
+	unsigned char	d_type;
+	char		d_name[0];
+};
+
 struct linux_dirent64 {
 	u64		d_ino;
 	s64		d_off;
--- linux-2.6.3/arch/i386/kernel/entry.S.jj	2004-02-24 16:19:19.000000000 +0100
+++ linux-2.6.3/arch/i386/kernel/entry.S	2004-02-26 15:26:20.086930379 +0100
@@ -1031,5 +1031,6 @@ ENTRY(sys_call_table)
 	.long sys_utimes
  	.long sys_fadvise64_64
 	.long sys_ni_syscall	/* sys_vserver */
+	.long sys_getdents32t
 
 syscall_table_size=(.-sys_call_table)
--- linux-2.6.3/arch/x86_64/ia32/ia32entry.S.jj	2004-02-26 00:08:16.000000000 +0100
+++ linux-2.6.3/arch/x86_64/ia32/ia32entry.S	2004-02-26 15:28:14.555401079 +0100
@@ -486,6 +486,8 @@ ia32_sys_call_table:
 	.quad sys_tgkill
 	.quad compat_sys_utimes
 	.quad sys32_fadvise64_64
+	.quad quiet_ni_syscall
+	.quad sys_getdents32t
 	/* don't forget to change IA32_NR_syscalls */
 ia32_syscall_end:		
 	.rept IA32_NR_syscalls-(ia32_syscall_end-ia32_sys_call_table)/8


	Jakub

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Add getdents32t syscall
  2004-02-26 19:38 [PATCH] Add getdents32t syscall Jakub Jelinek
@ 2004-02-26 22:03 ` Randy.Dunlap
  2004-02-26 22:15 ` Linus Torvalds
  2004-02-27 19:28 ` Linus Torvalds
  2 siblings, 0 replies; 16+ messages in thread
From: Randy.Dunlap @ 2004-02-26 22:03 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: torvalds, linux-kernel, drepper

On Thu, 26 Feb 2004 20:38:19 +0100 Jakub Jelinek wrote:

| glibc struct dirent has d_type field (similarly to struct dirent64).
| Because no 32-bit getdents syscall provides this field to userland,
| glibc needs to use getdents64 syscall even for 32-bit getdents
| (and readdir etc.) and convert dirent entries from struct dirent64
| to struct dirent.  The code is quite complicated and as the former
| is bigger and the size of 64-bit dirents cannot be predicted accurately,
| it can happen that glibc reads too many entries and has to seek back
| on the dir etc.
| 
| The following patch introduces a new syscall (on 32-bit architectures),
| which fills in 32-bit struct dirent with d_type member.
| With this syscall glibc can simply call this syscall in 32-bit getdents
| and be done with it, no seeking, issues with NFS zero extended d_ino values,
| buffer translation etc.  sys_getdents32t (the t in there is for type,
| to differentiate it from compatibility sys_getdents32 which don't provide
| d_type) function should be usable both on 32-bit arches and in 32-bit
| compatibility layers on 64-bit arches (on most arches directly, if
| the arguments are zero extended in assembly).
| 

| +asmlinkage long sys_getdents32t(unsigned int fd, struct linux_dirent32t __user * dirent, unsigned int count)
| +{

Please add function prototype for that to include/linux/syscalls.h.

Thanks,
--
~Randy

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Add getdents32t syscall
  2004-02-26 19:38 [PATCH] Add getdents32t syscall Jakub Jelinek
  2004-02-26 22:03 ` Randy.Dunlap
@ 2004-02-26 22:15 ` Linus Torvalds
  2004-02-26 22:25   ` Linus Torvalds
  2004-02-29  0:25   ` Jamie Lokier
  2004-02-27 19:28 ` Linus Torvalds
  2 siblings, 2 replies; 16+ messages in thread
From: Linus Torvalds @ 2004-02-26 22:15 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: linux-kernel, drepper



On Thu, 26 Feb 2004, Jakub Jelinek wrote:
> 
> glibc struct dirent has d_type field (similarly to struct dirent64).
> Because no 32-bit getdents syscall provides this field to userland,
> glibc needs to use getdents64 syscall even for 32-bit getdents
> (and readdir etc.) and convert dirent entries from struct dirent64
> to struct dirent.  The code is quite complicated and as the former
> is bigger and the size of 64-bit dirents cannot be predicted accurately,
> it can happen that glibc reads too many entries and has to seek back
> on the dir etc.

Nooo..

Please just use the old "getdents()", and if you really really need this, 
add the "type" char after the end of the name.

This is a two-liner change (yeah, and we'd need to add a flag saying we do 
this). 

In other words, what's wrong with this much simpler "extended getdents" 
instead?

		Linus

--- 1.23/fs/readdir.c	Tue Feb  3 21:29:14 2004
+++ edited/fs/readdir.c	Thu Feb 26 14:12:57 2004
@@ -139,7 +139,7 @@
 {
 	struct linux_dirent __user * dirent;
 	struct getdents_callback * buf = (struct getdents_callback *) __buf;
-	int reclen = ROUND_UP(NAME_OFFSET(dirent) + namlen + 1);
+	int reclen = ROUND_UP(NAME_OFFSET(dirent) + namlen + 2);
 
 	buf->error = -EINVAL;	/* only used if we fail.. */
 	if (reclen > buf->count)
@@ -157,6 +157,8 @@
 	if (copy_to_user(dirent->d_name, name, namlen))
 		goto efault;
 	if (__put_user(0, dirent->d_name + namlen))
+		goto efault;
+	if (__put_user(d_type, dirent->d_name + namlen + 1))
 		goto efault;
 	buf->previous = dirent;
 	dirent = (void *)dirent + reclen;

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Add getdents32t syscall
  2004-02-26 22:15 ` Linus Torvalds
@ 2004-02-26 22:25   ` Linus Torvalds
  2004-02-26 22:29     ` Ulrich Drepper
  2004-02-26 22:32     ` Jakub Jelinek
  2004-02-29  0:25   ` Jamie Lokier
  1 sibling, 2 replies; 16+ messages in thread
From: Linus Torvalds @ 2004-02-26 22:25 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: linux-kernel, drepper



On Thu, 26 Feb 2004, Linus Torvalds wrote:
> 
> In other words, what's wrong with this much simpler "extended getdents" 
> instead?

Actually, let's put the "d_type" always at the last character, so that you 
don't have to search for it. Ie like the appended.

Then you just get

	d_type = ((unsigned char *)dirent)[dirent->d_reclen-1];

inside glibc. Instead of having a new system call.

You can even trivially check whether the system call fills in the d_type 
field or not:

 - pre-fill the dirent area with 0xff or something
 - do a small old-style "readdir()"
 - check the first entry: the above gives a d_type of 0xff, then you have 
   an old-style readdir. If it gives 0, then you have to test whether it 
   is an old-style readdir (and the zero is the end-of-name marker) or a 
   new-style readdir (and the zero is DT_UNKNOWN). You can trivially do 
   that by checking the length of the name, and comparing it with the 
   reclen.

See? No new system call, and trivial detection of whether the new code is 
there or not.

		Linus

--
--- 1.23/fs/readdir.c	Tue Feb  3 21:29:14 2004
+++ edited/fs/readdir.c	Thu Feb 26 14:17:05 2004
@@ -139,7 +139,7 @@
 {
 	struct linux_dirent __user * dirent;
 	struct getdents_callback * buf = (struct getdents_callback *) __buf;
-	int reclen = ROUND_UP(NAME_OFFSET(dirent) + namlen + 1);
+	int reclen = ROUND_UP(NAME_OFFSET(dirent) + namlen + 2);
 
 	buf->error = -EINVAL;	/* only used if we fail.. */
 	if (reclen > buf->count)
@@ -157,6 +157,8 @@
 	if (copy_to_user(dirent->d_name, name, namlen))
 		goto efault;
 	if (__put_user(0, dirent->d_name + namlen))
+		goto efault;
+	if (__put_user(d_type, (char *) dirent + reclen - 1))
 		goto efault;
 	buf->previous = dirent;
 	dirent = (void *)dirent + reclen;

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Add getdents32t syscall
  2004-02-26 22:25   ` Linus Torvalds
@ 2004-02-26 22:29     ` Ulrich Drepper
  2004-02-26 23:00       ` Linus Torvalds
  2004-02-26 22:32     ` Jakub Jelinek
  1 sibling, 1 reply; 16+ messages in thread
From: Ulrich Drepper @ 2004-02-26 22:29 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Jakub Jelinek, linux-kernel

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Linus Torvalds wrote:

>  - pre-fill the dirent area with 0xff or something

fill whole temporary buffer allocated by opendir() for every call to
getdents(2)?


You are swapping a bit of additional kernel code for quite a bit of
overhead at every program's runtime.

Yes, you want to keep your code small and tidy, but getdents calls are
frequent and the wasted cycles spent on the memory operations far
outweigh the extra code.  In many cases we have to clear a 8k+ buffer
just because getdents fills in 200 bytes.


Plus, the old code, like all compatibility interface, can over time be
grouped together and moved to one side of the kernel so that they don't
disturb the icache.

- -- 
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.2.3 (GNU/Linux)

iD8DBQFAPnNI2ijCOnn/RHQRAnynAKDH4GOGnF3ai/7UHdfc7siArOF7fwCePZZP
huZUMCSGIhAM5uw1zgpLSmM=
=YtUt
-----END PGP SIGNATURE-----

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Add getdents32t syscall
  2004-02-26 22:25   ` Linus Torvalds
  2004-02-26 22:29     ` Ulrich Drepper
@ 2004-02-26 22:32     ` Jakub Jelinek
  2004-02-26 23:15       ` Linus Torvalds
  2004-02-27  3:36       ` Theodore Ts'o
  1 sibling, 2 replies; 16+ messages in thread
From: Jakub Jelinek @ 2004-02-26 22:32 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel, drepper

On Thu, Feb 26, 2004 at 02:25:01PM -0800, Linus Torvalds wrote:
> 
> 
> On Thu, 26 Feb 2004, Linus Torvalds wrote:
> > 
> > In other words, what's wrong with this much simpler "extended getdents" 
> > instead?
> 
> Actually, let's put the "d_type" always at the last character, so that you 
> don't have to search for it. Ie like the appended.
> 
> Then you just get
> 
> 	d_type = ((unsigned char *)dirent)[dirent->d_reclen-1];
> 
> inside glibc. Instead of having a new system call.

Userland struct dirent is:

struct dirent
  {
#ifndef __USE_FILE_OFFSET64
    __ino_t d_ino;
    __off_t d_off;
#else
    __ino64_t d_ino;
    __off64_t d_off;
#endif
    unsigned short int d_reclen;
    unsigned char d_type;
    char d_name[256];           /* We must not include limits.h! */
  };

(since 1997 or so), so with the extended getdents syscall glibc would need
to memmove every name by 1 byte.

> You can even trivially check whether the system call fills in the d_type 
> field or not:
> 
>  - pre-fill the dirent area with 0xff or something
>  - do a small old-style "readdir()"
>  - check the first entry: the above gives a d_type of 0xff, then you have 
>    an old-style readdir. If it gives 0, then you have to test whether it 
>    is an old-style readdir (and the zero is the end-of-name marker) or a 
>    new-style readdir (and the zero is DT_UNKNOWN). You can trivially do 
>    that by checking the length of the name, and comparing it with the 
>    reclen.

This has a few problems:
- unless glibc is configured to assume 2.6.4+ kernel, it would need to
  do on first readdir one small getdents (instead of as big getdents as
  needed)
- how to find what is the right small size for the test (it would need to
  retry if it did not fit any entries)
- and if on the old kernel, it would need to seek back so that getdents64
  can be used
Yes, this would only happen the first time getdents is called, but still,
aren't syscalls quite cheap?

	Jakub

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Add getdents32t syscall
  2004-02-26 22:29     ` Ulrich Drepper
@ 2004-02-26 23:00       ` Linus Torvalds
  0 siblings, 0 replies; 16+ messages in thread
From: Linus Torvalds @ 2004-02-26 23:00 UTC (permalink / raw)
  To: Ulrich Drepper; +Cc: Jakub Jelinek, linux-kernel



On Thu, 26 Feb 2004, Ulrich Drepper wrote:
> Linus Torvalds wrote:
> 
> >  - pre-fill the dirent area with 0xff or something
> 
> fill whole temporary buffer allocated by opendir() for every call to
> getdents(2)?

No no.

You only need to do this _once_. Once you know that the kernel is ok, you 
never ever need to do it again.

It's not even "once per file descriptor" or anything like that. It's 
literally _once_. 

And you don't need to fill the whole buffer even that first time. You 
only need to fill enough to guarantee that the _first_ entry is filled 
up.

In fact, if you're willing to have an algorithm that always works, but
might under some circumstances be a bit conservative, you can avoid
filling entirely, and just have a static flag that says "newformat", you
can do the following:

 - assume old format
 - if you ever see a reclen that is "too big" for the name length, you 
   know you have a new-format case (this will happen with any name that is 
   of length 1 modulo 4 on a 32-bit architecture).

For old-format stuff, you return DT_UNKNOWN, or you do your old existing 
song and dance. For new-format stuff you do the trivial thing.

And guess what? The entry "." will give you the information abotu whether 
it is old-format or not. On an old-format thing, "." will look like this:

	offset
	 0:	32-bit d_ino
	 4:	32-bit d_offset = 12
	 8:     16-bit d_namelen = 1
	10:	string ".\0"

	12:	32-bit d_ino for the next entry
	...

while with a new-format readdir you will get

	offset
	 0:	32-bit d_ino
	 4:	32-bit d_offset = 16
	 8:	16-bit d_namelen = 1
	10:	string ".\0"
	12-14:	three bytes of garbage
	15:	8-bit d_type

	16:	32-bit d_ino for the next entry..
	...

Notice? You are guaranteed to find out really quickly whether it's old- or 
new-format unless the user is doing something really really strange, and 
even if the user is doing something strange, returning D_UNKNOWN is always 
"correct".

So not only is my solution simple in kernel space, it allows you to 
simplify glibc too, if you are willing to make the old case go slower.

			Linus

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Add getdents32t syscall
  2004-02-26 22:32     ` Jakub Jelinek
@ 2004-02-26 23:15       ` Linus Torvalds
  2004-02-27  1:33         ` Ulrich Drepper
  2004-02-27  1:46         ` Andreas Dilger
  2004-02-27  3:36       ` Theodore Ts'o
  1 sibling, 2 replies; 16+ messages in thread
From: Linus Torvalds @ 2004-02-26 23:15 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: linux-kernel, drepper



On Thu, 26 Feb 2004, Jakub Jelinek wrote:
> 
> Userland struct dirent is:

Ahh. So with the new thing, you'd need no conversion at all.

> (since 1997 or so), so with the extended getdents syscall glibc would need
> to memmove every name by 1 byte.

The thing is, I hate encouraging glibc's behaviour of "we'll make up our
own structures", and then ask the kernel to fix it later when it was done
wrong in glibc. This is a totally new format that is totally unnecessary,
and the RIGHT thing to do is to have glibc just use the proper 64-bit
format.

In other words, why doesn't glibc ever just make a new major number and
make its "struct dirent" be the 64-bit version? It is _ridiculous_ to
carry this baggage around, and then complain and add MORE baggage to the
kernel because of having done things wrong the first time around.

			Linus

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Add getdents32t syscall
  2004-02-26 23:15       ` Linus Torvalds
@ 2004-02-27  1:33         ` Ulrich Drepper
  2004-02-27  6:16           ` Linus Torvalds
  2004-02-28 23:21           ` Jamie Lokier
  2004-02-27  1:46         ` Andreas Dilger
  1 sibling, 2 replies; 16+ messages in thread
From: Ulrich Drepper @ 2004-02-27  1:33 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Jakub Jelinek, linux-kernel

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Linus Torvalds wrote:

> The thing is, I hate encouraging glibc's behaviour of "we'll make up our
> own structures", and then ask the kernel to fix it later when it was done
> wrong in glibc. This is a totally new format that is totally unnecessary,
> and the RIGHT thing to do is to have glibc just use the proper 64-bit
> format.

Good idea.  Let's go back in time to 1994/5 and add d_type then.  May I
remember you that I always wanted large data types but whenever it was
proposed the kernel people (including you) said: we don't need it this
big now therefore it won't be changed.


> In other words, why doesn't glibc ever just make a new major number and
> make its "struct dirent" be the 64-bit version?

You can't be serious.  Can you even imagine the pain this would cause?

- -- 
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.2.3 (GNU/Linux)

iD8DBQFAPp5N2ijCOnn/RHQRAuuOAJ4rY41j0ifWbK3R8Qv5elA+RULCqwCeP+RC
pIC2Re4DR6tUZSPmqsijjGw=
=PfkG
-----END PGP SIGNATURE-----

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Add getdents32t syscall
  2004-02-26 23:15       ` Linus Torvalds
  2004-02-27  1:33         ` Ulrich Drepper
@ 2004-02-27  1:46         ` Andreas Dilger
  1 sibling, 0 replies; 16+ messages in thread
From: Andreas Dilger @ 2004-02-27  1:46 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Jakub Jelinek, linux-kernel, drepper

On Thu, 26 Feb 2004, Jakub Jelinek wrote:
> Userland struct dirent is:
> (since 1997 or so), so with the extended getdents syscall glibc would need
> to memmove every name by 1 byte.

Actually, before you go through contortions trying to return this stuff to
user space, you should fix the GNU tools to use the information properly.

For example, rm stats the file each time before trying to unlink it even
if it has the file type information from getdents64().  It also tries to
do silly things like unlink() on a directory instead of rmdir, even after
the stat (RH9 coreutils 4.5.3-19 and 2.4.24):

    $ mkdir /tmp/foo
    $ touch /tmp/foo/f{1,2,3,4,5}
    $ strace rm -r /tmp/foo
    :
    :
    lstat64("/tmp/foo", {st_mode=S_IFDIR|0775, st_size=4096, ...}) = 0
    access("/tmp/foo", W_OK)                 = 0
    unlink("/tmp/foo")                       = -1 EISDIR (Is a directory)

Um, hello - we just saw that it was a directory so why not try rmdir()?
It does the same thing when the directory is empty too.

    open(".", O_RDONLY|O_LARGEFILE|O_DIRECTORY) = 3
    lstat64("/tmp/foo", {st_mode=S_IFDIR|0775, st_size=4096, ...}) = 0
    chdir("/tmp/foo")                        = 0
    lstat64(".", {st_mode=S_IFDIR|0775, st_size=4096, ...}) = 0
    open(".", O_RDONLY|O_NONBLOCK|O_LARGEFILE|O_DIRECTORY) = 4
    fstat64(4, {st_mode=S_IFDIR|0775, st_size=4096, ...}) = 0
    fcntl64(4, F_SETFD, FD_CLOEXEC)         = 0

    getdents64(4, /* 7 entries */, 4096)    = 168
    lstat64("f1", {st_mode=S_IFREG|0664, st_size=0, ...}) = 0
    access("f1", W_OK)                      = 0
    unlink("f1")                            = 0
    lstat64("f2", {st_mode=S_IFREG|0664, st_size=0, ...}) = 0
    access("f2", W_OK)                      = 0
    unlink("f2")                            = 0
    :
    :

So, for each file, we just got the file type info from getdents64(),
we still stat the file (not sure why), we check the permissions even
though we just statted it (could check perms from stat info or just
try to unlink and handle the error return code), and finally an unlink.

    getdents64(4, /* 0 entries */, 4096)    = 0
    close(4)                                = 0
    fchdir(3)                               = 0
    lstat64(".", {st_mode=S_IFDIR|0775, st_size=4096, ...}) = 0
    lstat64("/tmp/foo", {st_mode=S_IFDIR|0775, st_size=4096, ...}) = 0
    access("/tmp/foo", W_OK)                 = 0
    rmdir("/tmp/foo")                        = 0

Cheers, Andreas
--
Andreas Dilger
http://sourceforge.net/projects/ext2resize/
http://www-mddsp.enel.ucalgary.ca/People/adilger/


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Add getdents32t syscall
  2004-02-26 22:32     ` Jakub Jelinek
  2004-02-26 23:15       ` Linus Torvalds
@ 2004-02-27  3:36       ` Theodore Ts'o
  1 sibling, 0 replies; 16+ messages in thread
From: Theodore Ts'o @ 2004-02-27  3:36 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Linus Torvalds, linux-kernel, drepper

On Thu, Feb 26, 2004 at 05:32:26PM -0500, Jakub Jelinek wrote:
> (since 1997 or so), so with the extended getdents syscall glibc would need
> to memmove every name by 1 byte.

Glibc will have to have the code to play the memmove game for
compatibility with existing kernels.  So the question is whether or
not the memmove by one byte will actually be noticeable for most
application.  Has anyone checked to see whether or not it would even
be noticeable on a profiling run?

						- Ted

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Add getdents32t syscall
  2004-02-27  1:33         ` Ulrich Drepper
@ 2004-02-27  6:16           ` Linus Torvalds
  2004-02-27  7:05             ` Ulrich Drepper
  2004-02-28 23:21           ` Jamie Lokier
  1 sibling, 1 reply; 16+ messages in thread
From: Linus Torvalds @ 2004-02-27  6:16 UTC (permalink / raw)
  To: Ulrich Drepper; +Cc: Jakub Jelinek, linux-kernel



On Thu, 26 Feb 2004, Ulrich Drepper wrote:
> 
> > In other words, why doesn't glibc ever just make a new major number and
> > make its "struct dirent" be the 64-bit version?
> 
> You can't be serious.  Can you even imagine the pain this would cause?

Do you know how much pain it causes when you make changes and you do _not_ 
change the major number?

At some point you need to clean up. 

This issue has been with us for years and years. I don't understand why
you'd want to add a totally new system call now. Instead, I'd suggest you 
just put this on the long list of things to fix for when a new major 
number is to be used.

And yes, you should use a new major number at some point. It is
_senseless_ to never update the majors. Just make a clean break, go
through every little nagging ugly thing (and there are a _lot_ of them 
that have accumulated over the years), and make ready for "libc-3.0".

No, I'm not suggesting you do it for this one thing, obviously. But 
there's bound to be _thousands_ of these stupid things where glibc has 
compatibility crap that makes no sense.

		Linus

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Add getdents32t syscall
  2004-02-27  6:16           ` Linus Torvalds
@ 2004-02-27  7:05             ` Ulrich Drepper
  0 siblings, 0 replies; 16+ messages in thread
From: Ulrich Drepper @ 2004-02-27  7:05 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Jakub Jelinek, linux-kernel

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Linus Torvalds wrote:

> No, I'm not suggesting you do it for this one thing, obviously. But 
> there's bound to be _thousands_ of these stupid things where glibc has 
> compatibility crap that makes no sense.

No there are not, at least not for the libc which everybody uses.  FC
comes with three versions.  They each have increasing kernel
requirements.  The normally used version assumes a recent kernel (2.4.20
in FC1, hopefully 2.6.x in FC2) which allows to eliminate all
compatibility with older kernels.  This cuts out almost all of the
cruft.  The other libcs are for various degrees of backward
compatibility for apps which do not follow the API rules and depend on
non-guaranteed behavior of one sort or another.


Anyway, if you add the d_type field after the string and we do the
memmove that already much better than what we have to do today.

- -- 
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.2.3 (GNU/Linux)

iD8DBQFAPuwl2ijCOnn/RHQRAtKkAKCjyzGzwEOgH8MFTWYkXhJOrhHHZACeML4p
wBKxnHhbQNQaycRwocS4Bjc=
=69Uo
-----END PGP SIGNATURE-----

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Add getdents32t syscall
  2004-02-26 19:38 [PATCH] Add getdents32t syscall Jakub Jelinek
  2004-02-26 22:03 ` Randy.Dunlap
  2004-02-26 22:15 ` Linus Torvalds
@ 2004-02-27 19:28 ` Linus Torvalds
  2 siblings, 0 replies; 16+ messages in thread
From: Linus Torvalds @ 2004-02-27 19:28 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: linux-kernel, drepper



On Thu, 26 Feb 2004, Jakub Jelinek wrote:
> 
> Because no 32-bit getdents syscall provides this field to userland,
> glibc needs to use getdents64 syscall even for 32-bit getdents
> (and readdir etc.) and convert dirent entries from struct dirent64
> to struct dirent.  The code is quite complicated and as the former
> is bigger and the size of 64-bit dirents cannot be predicted accurately,
> it can happen that glibc reads too many entries and has to seek back
> on the dir etc.

Ok, I just committed the "add hidden d_type to the 32-bit getdents" thing, 
and I'd like to have people verify that it works (I just verified that it 
didn't break anything, but hey, nothing is using the data, so..)

However, the more I look at the above, the more confused I get. Your 
explanation simplty doesn't make any sense.

The thing is, it's _trivial_ to convert from the bigger format into the 
smaller format. It would be much harder to convert the other way. What's 
the problem with just using he getdents64 format and converting the data 
in-place?

			Linus

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Add getdents32t syscall
  2004-02-27  1:33         ` Ulrich Drepper
  2004-02-27  6:16           ` Linus Torvalds
@ 2004-02-28 23:21           ` Jamie Lokier
  1 sibling, 0 replies; 16+ messages in thread
From: Jamie Lokier @ 2004-02-28 23:21 UTC (permalink / raw)
  To: Ulrich Drepper; +Cc: Linus Torvalds, Jakub Jelinek, linux-kernel

Ulrich Drepper wrote:
> > In other words, why doesn't glibc ever just make a new major number and
> > make its "struct dirent" be the 64-bit version?
> 
> You can't be serious.  Can you even imagine the pain this would cause?

Why wouldn't this work?

Change the 32-bit struct dirent to this:

    struct direct
      {
#if __BYTE_ORDER == __LITTLE_ENDIAN
        __ino_t d_ino;
        __u32   __padding1;
        __off_t d_off;
        __u32   __padding2;
#elif __BYTE_ORDER == __BIG_ENDIAN
        __u32   __padding1;
        __ino_t d_ino;
        __u32   __padding2;
        __off_t d_off;
#else
#error Help!
#endif
        unsigned short int d_reclen;
        unsigned char d_type;
        char d_name[256];
      };

And also change getdirentries() and readdir() to call the kernel's getdents64.

Use symbol versioning, so that old binaries will link to the old
(compatible and slower) functions, and newly compiled code, using the
new definition of struct dirent, uses the fast new versions of those
functions?

I presume that, if ELF symbol versioning doesn't allow you to do this,
then the same trick you do with stat() can be used.  From Glibc's
<sys/stat.h>:

    /* To allow the `struct stat' structure and the file type `mode_t'
       bits to vary without changing shared library major version number,
       the `stat' family of functions and `mknod' are in fact inline
       wrappers around calls to `xstat', `fxstat', `lxstat', and `xmknod',
       which all take a leading version-number argument designating the
       data structure and bits used.

-- Jamie

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Add getdents32t syscall
  2004-02-26 22:15 ` Linus Torvalds
  2004-02-26 22:25   ` Linus Torvalds
@ 2004-02-29  0:25   ` Jamie Lokier
  1 sibling, 0 replies; 16+ messages in thread
From: Jamie Lokier @ 2004-02-29  0:25 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Jakub Jelinek, linux-kernel, drepper

Linus Torvalds wrote:
> Ok, I just committed the "add hidden d_type to the 32-bit getdents" thing, 
> and I'd like to have people verify that it works (I just verified that it 
> didn't break anything, but hey, nothing is using the data, so..)

I'd like to draw your attention to a patch posted earlier last century:

>> From: Jamie Lokier <lkd@tantalophile.demon.co.uk>
>> Date: Wed, 17 Nov 1999 22:56:56 +0100
>> Subject: [PATCH] dirent->d_type support
>>
>> I rejected adding a new system call.  Instead, I extended getdents() to
>> return data in a backward compatible way.  The d_type information as
>> placed at the end of each dirent record, and can be recognised
>> unambiguously.

:)

It got flamed by Alex Viro for being a useless feature so was lost in
the dismay.

(Btw, that patch also added support for reporting a useful d_type from
a large number of filesystems, some of which I think still don't have it
in the main kernel tree.)

I added d_type to dirent slightly differently, like this:

 	copy_to_user(dirent->d_name, name, namlen);
 	put_user(0, dirent->d_name + namlen);
+	/*
+	 * This, plus adding sizeof(long) to reclen, allows d_type to be
+	 * included in a backward & forward compatible way.
+	 */
+	if (type != DT_UNKNOWN) {
+		put_user(0, (unsigned char *) dirent + reclen - (1 + sizeof (long)));
+		put_user(type, (unsigned char *) dirent + reclen - 2);
+		put_user(0, (unsigned char *) dirent + reclen - 1);
+	}
+
 	((char *) dirent) += reclen;
 	buf->current_dir = dirent;

The idea is that you can unambiguously determine if a d_type byte is
included in the record without calling strlen(d_name), and in a way
that allows more data to be appended to the record in future, like
this:

#define D_TYPE(dent)							      \
  (((dent)->d_reclen > ((dent)->d_name - (char *) (dent) + sizeof (long))     \
    && !*((unsigned char *) (dent) + (dent)->d_reclen - (1 + sizeof (long)))) \
   ? *((unsigned char *) (dent) + (dent)->d_reclen - 2) : DT_UNKNOWN)

However, as there are no plans to ever add any more information to the
records from getdents(), your method is fine and of course it's much
simpler to use.

> On Thu, 26 Feb 2004, Jakub Jelinek wrote:
> > Because no 32-bit getdents syscall provides this field to userland,
> > glibc needs to use getdents64 syscall even for 32-bit getdents
> > (and readdir etc.) and convert dirent entries from struct dirent64
> > to struct dirent.  The code is quite complicated and as the former
> > is bigger and the size of 64-bit dirents cannot be predicted accurately,
> > it can happen that glibc reads too many entries and has to seek back
> > on the dir etc.
> 
> However, the more I look at the above, the more confused I get. Your 
> explanation simplty doesn't make any sense.
> 
> The thing is, it's _trivial_ to convert from the bigger format into the 
> smaller format. It would be much harder to convert the other way. What's 
> the problem with just using he getdents64 format and converting the data 
> in-place?

You can convert dirent64 to dirent in-place, and it gets smaller so
that isn't a cause for "read too many entries".  What can happen is
that Glibc reads too _few_ entries.  That doesn't cause lseek()
problems.

In Glibc 2.3.1's code, there's a more complicate heuristic that
sometimes reads more than can be returned to userspace, and thus has
to lseek(), but that's only used if it's running on an old kernel that
_doesn't_ have getdents64().

The problem with seeking back on the directory that Jakub mentions can
happen when Glibc uses getdents64() -- but it has nothing to do with
the structure sizes or running out of space.

What happens is that if getdents64() returns a d_ino or d_off value
which overflows the 32-bit fields returned to userspace, then Glibc
2.3.1 calls lseek() to go back just before that entry, and returns as
many entries as it could.

This is so that userspace sees all the entries that could be read
whose inodes and offsets fitted into 32 bits, and then sees EOVERFLOW
when it tries to read the next, overflowing entry.

-- Jamie

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2004-02-29  0:26 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2004-02-26 19:38 [PATCH] Add getdents32t syscall Jakub Jelinek
2004-02-26 22:03 ` Randy.Dunlap
2004-02-26 22:15 ` Linus Torvalds
2004-02-26 22:25   ` Linus Torvalds
2004-02-26 22:29     ` Ulrich Drepper
2004-02-26 23:00       ` Linus Torvalds
2004-02-26 22:32     ` Jakub Jelinek
2004-02-26 23:15       ` Linus Torvalds
2004-02-27  1:33         ` Ulrich Drepper
2004-02-27  6:16           ` Linus Torvalds
2004-02-27  7:05             ` Ulrich Drepper
2004-02-28 23:21           ` Jamie Lokier
2004-02-27  1:46         ` Andreas Dilger
2004-02-27  3:36       ` Theodore Ts'o
2004-02-29  0:25   ` Jamie Lokier
2004-02-27 19:28 ` Linus Torvalds

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.