* [PATCH] Add getdents32t syscall
@ 2004-02-26 19:38 Jakub Jelinek
2004-02-26 22:03 ` Randy.Dunlap
` (2 more replies)
0 siblings, 3 replies; 16+ messages in thread
From: Jakub Jelinek @ 2004-02-26 19:38 UTC (permalink / raw)
To: torvalds; +Cc: linux-kernel, drepper
Hi!
glibc struct dirent has d_type field (similarly to struct dirent64).
Because no 32-bit getdents syscall provides this field to userland,
glibc needs to use getdents64 syscall even for 32-bit getdents
(and readdir etc.) and convert dirent entries from struct dirent64
to struct dirent. The code is quite complicated and as the former
is bigger and the size of 64-bit dirents cannot be predicted accurately,
it can happen that glibc reads too many entries and has to seek back
on the dir etc.
The following patch introduces a new syscall (on 32-bit architectures),
which fills in 32-bit struct dirent with d_type member.
With this syscall glibc can simply call this syscall in 32-bit getdents
and be done with it, no seeking, issues with NFS zero extended d_ino values,
buffer translation etc. sys_getdents32t (the t in there is for type,
to differentiate it from compatibility sys_getdents32 which don't provide
d_type) function should be usable both on 32-bit arches and in 32-bit
compatibility layers on 64-bit arches (on most arches directly, if
the arguments are zero extended in assembly).
--- linux-2.6.3/fs/readdir.c.jj 2004-02-18 04:57:52.000000000 +0100
+++ linux-2.6.3/fs/readdir.c 2004-02-26 09:49:20.073123212 +0100
@@ -207,6 +207,88 @@ out:
return error;
}
+struct getdents_callback32t {
+ struct linux_dirent32t __user * current_dir;
+ struct linux_dirent32t __user * previous;
+ int count;
+ int error;
+};
+
+int filldir32t(void * __buf, const char * name, int namlen, loff_t offset,
+ ino_t ino, unsigned int d_type)
+{
+ struct linux_dirent32t __user * dirent;
+ struct getdents_callback32t * buf = (struct getdents_callback32t *) __buf;
+ int reclen = ROUND_UP(NAME_OFFSET(dirent) + namlen + 1);
+
+ buf->error = -EINVAL; /* only used if we fail.. */
+ if (reclen > buf->count)
+ return -EINVAL;
+ dirent = buf->previous;
+ if (dirent) {
+ if (__put_user(offset, &dirent->d_off))
+ goto efault;
+ }
+ dirent = buf->current_dir;
+ if (__put_user(ino, &dirent->d_ino))
+ goto efault;
+ if (__put_user(reclen, &dirent->d_reclen))
+ goto efault;
+ if (__put_user(d_type, &dirent->d_type))
+ goto efault;
+ if (copy_to_user(dirent->d_name, name, namlen))
+ goto efault;
+ if (__put_user(0, dirent->d_name + namlen))
+ goto efault;
+ buf->previous = dirent;
+ dirent = (void *)dirent + reclen;
+ buf->current_dir = dirent;
+ buf->count -= reclen;
+ return 0;
+efault:
+ buf->error = -EFAULT;
+ return -EFAULT;
+}
+
+asmlinkage long sys_getdents32t(unsigned int fd, struct linux_dirent32t __user * dirent, unsigned int count)
+{
+ struct file * file;
+ struct linux_dirent32t __user * lastdirent;
+ struct getdents_callback32t buf;
+ int error;
+
+ error = -EFAULT;
+ if (!access_ok(VERIFY_WRITE, dirent, count))
+ goto out;
+
+ error = -EBADF;
+ file = fget(fd);
+ if (!file)
+ goto out;
+
+ buf.current_dir = dirent;
+ buf.previous = NULL;
+ buf.count = count;
+ buf.error = 0;
+
+ error = vfs_readdir(file, filldir32t, &buf);
+ if (error < 0)
+ goto out_putf;
+ error = buf.error;
+ lastdirent = buf.previous;
+ if (lastdirent) {
+ if (put_user(file->f_pos, &lastdirent->d_off))
+ error = -EFAULT;
+ else
+ error = count - buf.count;
+ }
+
+out_putf:
+ fput(file);
+out:
+ return error;
+}
+
#define ROUND_UP64(x) (((x)+sizeof(u64)-1) & ~(sizeof(u64)-1))
struct getdents_callback64 {
--- linux-2.6.3/include/asm-x86_64/ia32_unistd.h.jj 2004-02-18 04:58:34.000000000 +0100
+++ linux-2.6.3/include/asm-x86_64/ia32_unistd.h 2004-02-26 15:29:06.138150177 +0100
@@ -278,6 +278,8 @@
#define __NR_ia32_tgkill 270
#define __NR_ia32_utimes 271
#define __NR_ia32_fadvise64_64 272
+#define __NR_ia32_vserver 273
+#define __NR_ia32_getdents32t 274
#define IA32_NR_syscalls 275 /* must be > than biggest syscall! */
--- linux-2.6.3/include/asm-i386/unistd.h.jj 2004-02-24 16:19:19.000000000 +0100
+++ linux-2.6.3/include/asm-i386/unistd.h 2004-02-26 09:50:50.400877877 +0100
@@ -279,8 +279,9 @@
#define __NR_utimes 271
#define __NR_fadvise64_64 272
#define __NR_vserver 273
+#define __NR_getdents32t 274
-#define NR_syscalls 274
+#define NR_syscalls 275
#ifndef __KERNEL_SYSCALLS_NO_ERRNO__
/* user-visible error numbers are in the range -1 - -124: see <asm-i386/errno.h> */
--- linux-2.6.3/include/linux/dirent.h.jj 2004-02-18 04:59:07.000000000 +0100
+++ linux-2.6.3/include/linux/dirent.h 2004-02-26 09:47:16.694315247 +0100
@@ -18,6 +18,14 @@ struct dirent64 {
#ifdef __KERNEL__
+struct linux_dirent32t {
+ u32 d_ino;
+ s32 d_off;
+ unsigned short d_reclen;
+ unsigned char d_type;
+ char d_name[0];
+};
+
struct linux_dirent64 {
u64 d_ino;
s64 d_off;
--- linux-2.6.3/arch/i386/kernel/entry.S.jj 2004-02-24 16:19:19.000000000 +0100
+++ linux-2.6.3/arch/i386/kernel/entry.S 2004-02-26 15:26:20.086930379 +0100
@@ -1031,5 +1031,6 @@ ENTRY(sys_call_table)
.long sys_utimes
.long sys_fadvise64_64
.long sys_ni_syscall /* sys_vserver */
+ .long sys_getdents32t
syscall_table_size=(.-sys_call_table)
--- linux-2.6.3/arch/x86_64/ia32/ia32entry.S.jj 2004-02-26 00:08:16.000000000 +0100
+++ linux-2.6.3/arch/x86_64/ia32/ia32entry.S 2004-02-26 15:28:14.555401079 +0100
@@ -486,6 +486,8 @@ ia32_sys_call_table:
.quad sys_tgkill
.quad compat_sys_utimes
.quad sys32_fadvise64_64
+ .quad quiet_ni_syscall
+ .quad sys_getdents32t
/* don't forget to change IA32_NR_syscalls */
ia32_syscall_end:
.rept IA32_NR_syscalls-(ia32_syscall_end-ia32_sys_call_table)/8
Jakub
^ permalink raw reply [flat|nested] 16+ messages in thread* Re: [PATCH] Add getdents32t syscall 2004-02-26 19:38 [PATCH] Add getdents32t syscall Jakub Jelinek @ 2004-02-26 22:03 ` Randy.Dunlap 2004-02-26 22:15 ` Linus Torvalds 2004-02-27 19:28 ` Linus Torvalds 2 siblings, 0 replies; 16+ messages in thread From: Randy.Dunlap @ 2004-02-26 22:03 UTC (permalink / raw) To: Jakub Jelinek; +Cc: torvalds, linux-kernel, drepper On Thu, 26 Feb 2004 20:38:19 +0100 Jakub Jelinek wrote: | glibc struct dirent has d_type field (similarly to struct dirent64). | Because no 32-bit getdents syscall provides this field to userland, | glibc needs to use getdents64 syscall even for 32-bit getdents | (and readdir etc.) and convert dirent entries from struct dirent64 | to struct dirent. The code is quite complicated and as the former | is bigger and the size of 64-bit dirents cannot be predicted accurately, | it can happen that glibc reads too many entries and has to seek back | on the dir etc. | | The following patch introduces a new syscall (on 32-bit architectures), | which fills in 32-bit struct dirent with d_type member. | With this syscall glibc can simply call this syscall in 32-bit getdents | and be done with it, no seeking, issues with NFS zero extended d_ino values, | buffer translation etc. sys_getdents32t (the t in there is for type, | to differentiate it from compatibility sys_getdents32 which don't provide | d_type) function should be usable both on 32-bit arches and in 32-bit | compatibility layers on 64-bit arches (on most arches directly, if | the arguments are zero extended in assembly). | | +asmlinkage long sys_getdents32t(unsigned int fd, struct linux_dirent32t __user * dirent, unsigned int count) | +{ Please add function prototype for that to include/linux/syscalls.h. Thanks, -- ~Randy ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH] Add getdents32t syscall 2004-02-26 19:38 [PATCH] Add getdents32t syscall Jakub Jelinek 2004-02-26 22:03 ` Randy.Dunlap @ 2004-02-26 22:15 ` Linus Torvalds 2004-02-26 22:25 ` Linus Torvalds 2004-02-29 0:25 ` Jamie Lokier 2004-02-27 19:28 ` Linus Torvalds 2 siblings, 2 replies; 16+ messages in thread From: Linus Torvalds @ 2004-02-26 22:15 UTC (permalink / raw) To: Jakub Jelinek; +Cc: linux-kernel, drepper On Thu, 26 Feb 2004, Jakub Jelinek wrote: > > glibc struct dirent has d_type field (similarly to struct dirent64). > Because no 32-bit getdents syscall provides this field to userland, > glibc needs to use getdents64 syscall even for 32-bit getdents > (and readdir etc.) and convert dirent entries from struct dirent64 > to struct dirent. The code is quite complicated and as the former > is bigger and the size of 64-bit dirents cannot be predicted accurately, > it can happen that glibc reads too many entries and has to seek back > on the dir etc. Nooo.. Please just use the old "getdents()", and if you really really need this, add the "type" char after the end of the name. This is a two-liner change (yeah, and we'd need to add a flag saying we do this). In other words, what's wrong with this much simpler "extended getdents" instead? Linus --- 1.23/fs/readdir.c Tue Feb 3 21:29:14 2004 +++ edited/fs/readdir.c Thu Feb 26 14:12:57 2004 @@ -139,7 +139,7 @@ { struct linux_dirent __user * dirent; struct getdents_callback * buf = (struct getdents_callback *) __buf; - int reclen = ROUND_UP(NAME_OFFSET(dirent) + namlen + 1); + int reclen = ROUND_UP(NAME_OFFSET(dirent) + namlen + 2); buf->error = -EINVAL; /* only used if we fail.. */ if (reclen > buf->count) @@ -157,6 +157,8 @@ if (copy_to_user(dirent->d_name, name, namlen)) goto efault; if (__put_user(0, dirent->d_name + namlen)) + goto efault; + if (__put_user(d_type, dirent->d_name + namlen + 1)) goto efault; buf->previous = dirent; dirent = (void *)dirent + reclen; ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH] Add getdents32t syscall 2004-02-26 22:15 ` Linus Torvalds @ 2004-02-26 22:25 ` Linus Torvalds 2004-02-26 22:29 ` Ulrich Drepper 2004-02-26 22:32 ` Jakub Jelinek 2004-02-29 0:25 ` Jamie Lokier 1 sibling, 2 replies; 16+ messages in thread From: Linus Torvalds @ 2004-02-26 22:25 UTC (permalink / raw) To: Jakub Jelinek; +Cc: linux-kernel, drepper On Thu, 26 Feb 2004, Linus Torvalds wrote: > > In other words, what's wrong with this much simpler "extended getdents" > instead? Actually, let's put the "d_type" always at the last character, so that you don't have to search for it. Ie like the appended. Then you just get d_type = ((unsigned char *)dirent)[dirent->d_reclen-1]; inside glibc. Instead of having a new system call. You can even trivially check whether the system call fills in the d_type field or not: - pre-fill the dirent area with 0xff or something - do a small old-style "readdir()" - check the first entry: the above gives a d_type of 0xff, then you have an old-style readdir. If it gives 0, then you have to test whether it is an old-style readdir (and the zero is the end-of-name marker) or a new-style readdir (and the zero is DT_UNKNOWN). You can trivially do that by checking the length of the name, and comparing it with the reclen. See? No new system call, and trivial detection of whether the new code is there or not. Linus -- --- 1.23/fs/readdir.c Tue Feb 3 21:29:14 2004 +++ edited/fs/readdir.c Thu Feb 26 14:17:05 2004 @@ -139,7 +139,7 @@ { struct linux_dirent __user * dirent; struct getdents_callback * buf = (struct getdents_callback *) __buf; - int reclen = ROUND_UP(NAME_OFFSET(dirent) + namlen + 1); + int reclen = ROUND_UP(NAME_OFFSET(dirent) + namlen + 2); buf->error = -EINVAL; /* only used if we fail.. */ if (reclen > buf->count) @@ -157,6 +157,8 @@ if (copy_to_user(dirent->d_name, name, namlen)) goto efault; if (__put_user(0, dirent->d_name + namlen)) + goto efault; + if (__put_user(d_type, (char *) dirent + reclen - 1)) goto efault; buf->previous = dirent; dirent = (void *)dirent + reclen; ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH] Add getdents32t syscall 2004-02-26 22:25 ` Linus Torvalds @ 2004-02-26 22:29 ` Ulrich Drepper 2004-02-26 23:00 ` Linus Torvalds 2004-02-26 22:32 ` Jakub Jelinek 1 sibling, 1 reply; 16+ messages in thread From: Ulrich Drepper @ 2004-02-26 22:29 UTC (permalink / raw) To: Linus Torvalds; +Cc: Jakub Jelinek, linux-kernel -----BEGIN PGP SIGNED MESSAGE----- Hash: SHA1 Linus Torvalds wrote: > - pre-fill the dirent area with 0xff or something fill whole temporary buffer allocated by opendir() for every call to getdents(2)? You are swapping a bit of additional kernel code for quite a bit of overhead at every program's runtime. Yes, you want to keep your code small and tidy, but getdents calls are frequent and the wasted cycles spent on the memory operations far outweigh the extra code. In many cases we have to clear a 8k+ buffer just because getdents fills in 200 bytes. Plus, the old code, like all compatibility interface, can over time be grouped together and moved to one side of the kernel so that they don't disturb the icache. - -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ -----BEGIN PGP SIGNATURE----- Version: GnuPG v1.2.3 (GNU/Linux) iD8DBQFAPnNI2ijCOnn/RHQRAnynAKDH4GOGnF3ai/7UHdfc7siArOF7fwCePZZP huZUMCSGIhAM5uw1zgpLSmM= =YtUt -----END PGP SIGNATURE----- ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH] Add getdents32t syscall 2004-02-26 22:29 ` Ulrich Drepper @ 2004-02-26 23:00 ` Linus Torvalds 0 siblings, 0 replies; 16+ messages in thread From: Linus Torvalds @ 2004-02-26 23:00 UTC (permalink / raw) To: Ulrich Drepper; +Cc: Jakub Jelinek, linux-kernel On Thu, 26 Feb 2004, Ulrich Drepper wrote: > Linus Torvalds wrote: > > > - pre-fill the dirent area with 0xff or something > > fill whole temporary buffer allocated by opendir() for every call to > getdents(2)? No no. You only need to do this _once_. Once you know that the kernel is ok, you never ever need to do it again. It's not even "once per file descriptor" or anything like that. It's literally _once_. And you don't need to fill the whole buffer even that first time. You only need to fill enough to guarantee that the _first_ entry is filled up. In fact, if you're willing to have an algorithm that always works, but might under some circumstances be a bit conservative, you can avoid filling entirely, and just have a static flag that says "newformat", you can do the following: - assume old format - if you ever see a reclen that is "too big" for the name length, you know you have a new-format case (this will happen with any name that is of length 1 modulo 4 on a 32-bit architecture). For old-format stuff, you return DT_UNKNOWN, or you do your old existing song and dance. For new-format stuff you do the trivial thing. And guess what? The entry "." will give you the information abotu whether it is old-format or not. On an old-format thing, "." will look like this: offset 0: 32-bit d_ino 4: 32-bit d_offset = 12 8: 16-bit d_namelen = 1 10: string ".\0" 12: 32-bit d_ino for the next entry ... while with a new-format readdir you will get offset 0: 32-bit d_ino 4: 32-bit d_offset = 16 8: 16-bit d_namelen = 1 10: string ".\0" 12-14: three bytes of garbage 15: 8-bit d_type 16: 32-bit d_ino for the next entry.. ... Notice? You are guaranteed to find out really quickly whether it's old- or new-format unless the user is doing something really really strange, and even if the user is doing something strange, returning D_UNKNOWN is always "correct". So not only is my solution simple in kernel space, it allows you to simplify glibc too, if you are willing to make the old case go slower. Linus ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH] Add getdents32t syscall 2004-02-26 22:25 ` Linus Torvalds 2004-02-26 22:29 ` Ulrich Drepper @ 2004-02-26 22:32 ` Jakub Jelinek 2004-02-26 23:15 ` Linus Torvalds 2004-02-27 3:36 ` Theodore Ts'o 1 sibling, 2 replies; 16+ messages in thread From: Jakub Jelinek @ 2004-02-26 22:32 UTC (permalink / raw) To: Linus Torvalds; +Cc: linux-kernel, drepper On Thu, Feb 26, 2004 at 02:25:01PM -0800, Linus Torvalds wrote: > > > On Thu, 26 Feb 2004, Linus Torvalds wrote: > > > > In other words, what's wrong with this much simpler "extended getdents" > > instead? > > Actually, let's put the "d_type" always at the last character, so that you > don't have to search for it. Ie like the appended. > > Then you just get > > d_type = ((unsigned char *)dirent)[dirent->d_reclen-1]; > > inside glibc. Instead of having a new system call. Userland struct dirent is: struct dirent { #ifndef __USE_FILE_OFFSET64 __ino_t d_ino; __off_t d_off; #else __ino64_t d_ino; __off64_t d_off; #endif unsigned short int d_reclen; unsigned char d_type; char d_name[256]; /* We must not include limits.h! */ }; (since 1997 or so), so with the extended getdents syscall glibc would need to memmove every name by 1 byte. > You can even trivially check whether the system call fills in the d_type > field or not: > > - pre-fill the dirent area with 0xff or something > - do a small old-style "readdir()" > - check the first entry: the above gives a d_type of 0xff, then you have > an old-style readdir. If it gives 0, then you have to test whether it > is an old-style readdir (and the zero is the end-of-name marker) or a > new-style readdir (and the zero is DT_UNKNOWN). You can trivially do > that by checking the length of the name, and comparing it with the > reclen. This has a few problems: - unless glibc is configured to assume 2.6.4+ kernel, it would need to do on first readdir one small getdents (instead of as big getdents as needed) - how to find what is the right small size for the test (it would need to retry if it did not fit any entries) - and if on the old kernel, it would need to seek back so that getdents64 can be used Yes, this would only happen the first time getdents is called, but still, aren't syscalls quite cheap? Jakub ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH] Add getdents32t syscall 2004-02-26 22:32 ` Jakub Jelinek @ 2004-02-26 23:15 ` Linus Torvalds 2004-02-27 1:33 ` Ulrich Drepper 2004-02-27 1:46 ` Andreas Dilger 2004-02-27 3:36 ` Theodore Ts'o 1 sibling, 2 replies; 16+ messages in thread From: Linus Torvalds @ 2004-02-26 23:15 UTC (permalink / raw) To: Jakub Jelinek; +Cc: linux-kernel, drepper On Thu, 26 Feb 2004, Jakub Jelinek wrote: > > Userland struct dirent is: Ahh. So with the new thing, you'd need no conversion at all. > (since 1997 or so), so with the extended getdents syscall glibc would need > to memmove every name by 1 byte. The thing is, I hate encouraging glibc's behaviour of "we'll make up our own structures", and then ask the kernel to fix it later when it was done wrong in glibc. This is a totally new format that is totally unnecessary, and the RIGHT thing to do is to have glibc just use the proper 64-bit format. In other words, why doesn't glibc ever just make a new major number and make its "struct dirent" be the 64-bit version? It is _ridiculous_ to carry this baggage around, and then complain and add MORE baggage to the kernel because of having done things wrong the first time around. Linus ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH] Add getdents32t syscall 2004-02-26 23:15 ` Linus Torvalds @ 2004-02-27 1:33 ` Ulrich Drepper 2004-02-27 6:16 ` Linus Torvalds 2004-02-28 23:21 ` Jamie Lokier 2004-02-27 1:46 ` Andreas Dilger 1 sibling, 2 replies; 16+ messages in thread From: Ulrich Drepper @ 2004-02-27 1:33 UTC (permalink / raw) To: Linus Torvalds; +Cc: Jakub Jelinek, linux-kernel -----BEGIN PGP SIGNED MESSAGE----- Hash: SHA1 Linus Torvalds wrote: > The thing is, I hate encouraging glibc's behaviour of "we'll make up our > own structures", and then ask the kernel to fix it later when it was done > wrong in glibc. This is a totally new format that is totally unnecessary, > and the RIGHT thing to do is to have glibc just use the proper 64-bit > format. Good idea. Let's go back in time to 1994/5 and add d_type then. May I remember you that I always wanted large data types but whenever it was proposed the kernel people (including you) said: we don't need it this big now therefore it won't be changed. > In other words, why doesn't glibc ever just make a new major number and > make its "struct dirent" be the 64-bit version? You can't be serious. Can you even imagine the pain this would cause? - -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ -----BEGIN PGP SIGNATURE----- Version: GnuPG v1.2.3 (GNU/Linux) iD8DBQFAPp5N2ijCOnn/RHQRAuuOAJ4rY41j0ifWbK3R8Qv5elA+RULCqwCeP+RC pIC2Re4DR6tUZSPmqsijjGw= =PfkG -----END PGP SIGNATURE----- ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH] Add getdents32t syscall 2004-02-27 1:33 ` Ulrich Drepper @ 2004-02-27 6:16 ` Linus Torvalds 2004-02-27 7:05 ` Ulrich Drepper 2004-02-28 23:21 ` Jamie Lokier 1 sibling, 1 reply; 16+ messages in thread From: Linus Torvalds @ 2004-02-27 6:16 UTC (permalink / raw) To: Ulrich Drepper; +Cc: Jakub Jelinek, linux-kernel On Thu, 26 Feb 2004, Ulrich Drepper wrote: > > > In other words, why doesn't glibc ever just make a new major number and > > make its "struct dirent" be the 64-bit version? > > You can't be serious. Can you even imagine the pain this would cause? Do you know how much pain it causes when you make changes and you do _not_ change the major number? At some point you need to clean up. This issue has been with us for years and years. I don't understand why you'd want to add a totally new system call now. Instead, I'd suggest you just put this on the long list of things to fix for when a new major number is to be used. And yes, you should use a new major number at some point. It is _senseless_ to never update the majors. Just make a clean break, go through every little nagging ugly thing (and there are a _lot_ of them that have accumulated over the years), and make ready for "libc-3.0". No, I'm not suggesting you do it for this one thing, obviously. But there's bound to be _thousands_ of these stupid things where glibc has compatibility crap that makes no sense. Linus ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH] Add getdents32t syscall 2004-02-27 6:16 ` Linus Torvalds @ 2004-02-27 7:05 ` Ulrich Drepper 0 siblings, 0 replies; 16+ messages in thread From: Ulrich Drepper @ 2004-02-27 7:05 UTC (permalink / raw) To: Linus Torvalds; +Cc: Jakub Jelinek, linux-kernel -----BEGIN PGP SIGNED MESSAGE----- Hash: SHA1 Linus Torvalds wrote: > No, I'm not suggesting you do it for this one thing, obviously. But > there's bound to be _thousands_ of these stupid things where glibc has > compatibility crap that makes no sense. No there are not, at least not for the libc which everybody uses. FC comes with three versions. They each have increasing kernel requirements. The normally used version assumes a recent kernel (2.4.20 in FC1, hopefully 2.6.x in FC2) which allows to eliminate all compatibility with older kernels. This cuts out almost all of the cruft. The other libcs are for various degrees of backward compatibility for apps which do not follow the API rules and depend on non-guaranteed behavior of one sort or another. Anyway, if you add the d_type field after the string and we do the memmove that already much better than what we have to do today. - -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ -----BEGIN PGP SIGNATURE----- Version: GnuPG v1.2.3 (GNU/Linux) iD8DBQFAPuwl2ijCOnn/RHQRAtKkAKCjyzGzwEOgH8MFTWYkXhJOrhHHZACeML4p wBKxnHhbQNQaycRwocS4Bjc= =69Uo -----END PGP SIGNATURE----- ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH] Add getdents32t syscall 2004-02-27 1:33 ` Ulrich Drepper 2004-02-27 6:16 ` Linus Torvalds @ 2004-02-28 23:21 ` Jamie Lokier 1 sibling, 0 replies; 16+ messages in thread From: Jamie Lokier @ 2004-02-28 23:21 UTC (permalink / raw) To: Ulrich Drepper; +Cc: Linus Torvalds, Jakub Jelinek, linux-kernel Ulrich Drepper wrote: > > In other words, why doesn't glibc ever just make a new major number and > > make its "struct dirent" be the 64-bit version? > > You can't be serious. Can you even imagine the pain this would cause? Why wouldn't this work? Change the 32-bit struct dirent to this: struct direct { #if __BYTE_ORDER == __LITTLE_ENDIAN __ino_t d_ino; __u32 __padding1; __off_t d_off; __u32 __padding2; #elif __BYTE_ORDER == __BIG_ENDIAN __u32 __padding1; __ino_t d_ino; __u32 __padding2; __off_t d_off; #else #error Help! #endif unsigned short int d_reclen; unsigned char d_type; char d_name[256]; }; And also change getdirentries() and readdir() to call the kernel's getdents64. Use symbol versioning, so that old binaries will link to the old (compatible and slower) functions, and newly compiled code, using the new definition of struct dirent, uses the fast new versions of those functions? I presume that, if ELF symbol versioning doesn't allow you to do this, then the same trick you do with stat() can be used. From Glibc's <sys/stat.h>: /* To allow the `struct stat' structure and the file type `mode_t' bits to vary without changing shared library major version number, the `stat' family of functions and `mknod' are in fact inline wrappers around calls to `xstat', `fxstat', `lxstat', and `xmknod', which all take a leading version-number argument designating the data structure and bits used. -- Jamie ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH] Add getdents32t syscall 2004-02-26 23:15 ` Linus Torvalds 2004-02-27 1:33 ` Ulrich Drepper @ 2004-02-27 1:46 ` Andreas Dilger 1 sibling, 0 replies; 16+ messages in thread From: Andreas Dilger @ 2004-02-27 1:46 UTC (permalink / raw) To: Linus Torvalds; +Cc: Jakub Jelinek, linux-kernel, drepper On Thu, 26 Feb 2004, Jakub Jelinek wrote: > Userland struct dirent is: > (since 1997 or so), so with the extended getdents syscall glibc would need > to memmove every name by 1 byte. Actually, before you go through contortions trying to return this stuff to user space, you should fix the GNU tools to use the information properly. For example, rm stats the file each time before trying to unlink it even if it has the file type information from getdents64(). It also tries to do silly things like unlink() on a directory instead of rmdir, even after the stat (RH9 coreutils 4.5.3-19 and 2.4.24): $ mkdir /tmp/foo $ touch /tmp/foo/f{1,2,3,4,5} $ strace rm -r /tmp/foo : : lstat64("/tmp/foo", {st_mode=S_IFDIR|0775, st_size=4096, ...}) = 0 access("/tmp/foo", W_OK) = 0 unlink("/tmp/foo") = -1 EISDIR (Is a directory) Um, hello - we just saw that it was a directory so why not try rmdir()? It does the same thing when the directory is empty too. open(".", O_RDONLY|O_LARGEFILE|O_DIRECTORY) = 3 lstat64("/tmp/foo", {st_mode=S_IFDIR|0775, st_size=4096, ...}) = 0 chdir("/tmp/foo") = 0 lstat64(".", {st_mode=S_IFDIR|0775, st_size=4096, ...}) = 0 open(".", O_RDONLY|O_NONBLOCK|O_LARGEFILE|O_DIRECTORY) = 4 fstat64(4, {st_mode=S_IFDIR|0775, st_size=4096, ...}) = 0 fcntl64(4, F_SETFD, FD_CLOEXEC) = 0 getdents64(4, /* 7 entries */, 4096) = 168 lstat64("f1", {st_mode=S_IFREG|0664, st_size=0, ...}) = 0 access("f1", W_OK) = 0 unlink("f1") = 0 lstat64("f2", {st_mode=S_IFREG|0664, st_size=0, ...}) = 0 access("f2", W_OK) = 0 unlink("f2") = 0 : : So, for each file, we just got the file type info from getdents64(), we still stat the file (not sure why), we check the permissions even though we just statted it (could check perms from stat info or just try to unlink and handle the error return code), and finally an unlink. getdents64(4, /* 0 entries */, 4096) = 0 close(4) = 0 fchdir(3) = 0 lstat64(".", {st_mode=S_IFDIR|0775, st_size=4096, ...}) = 0 lstat64("/tmp/foo", {st_mode=S_IFDIR|0775, st_size=4096, ...}) = 0 access("/tmp/foo", W_OK) = 0 rmdir("/tmp/foo") = 0 Cheers, Andreas -- Andreas Dilger http://sourceforge.net/projects/ext2resize/ http://www-mddsp.enel.ucalgary.ca/People/adilger/ ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH] Add getdents32t syscall 2004-02-26 22:32 ` Jakub Jelinek 2004-02-26 23:15 ` Linus Torvalds @ 2004-02-27 3:36 ` Theodore Ts'o 1 sibling, 0 replies; 16+ messages in thread From: Theodore Ts'o @ 2004-02-27 3:36 UTC (permalink / raw) To: Jakub Jelinek; +Cc: Linus Torvalds, linux-kernel, drepper On Thu, Feb 26, 2004 at 05:32:26PM -0500, Jakub Jelinek wrote: > (since 1997 or so), so with the extended getdents syscall glibc would need > to memmove every name by 1 byte. Glibc will have to have the code to play the memmove game for compatibility with existing kernels. So the question is whether or not the memmove by one byte will actually be noticeable for most application. Has anyone checked to see whether or not it would even be noticeable on a profiling run? - Ted ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH] Add getdents32t syscall 2004-02-26 22:15 ` Linus Torvalds 2004-02-26 22:25 ` Linus Torvalds @ 2004-02-29 0:25 ` Jamie Lokier 1 sibling, 0 replies; 16+ messages in thread From: Jamie Lokier @ 2004-02-29 0:25 UTC (permalink / raw) To: Linus Torvalds; +Cc: Jakub Jelinek, linux-kernel, drepper Linus Torvalds wrote: > Ok, I just committed the "add hidden d_type to the 32-bit getdents" thing, > and I'd like to have people verify that it works (I just verified that it > didn't break anything, but hey, nothing is using the data, so..) I'd like to draw your attention to a patch posted earlier last century: >> From: Jamie Lokier <lkd@tantalophile.demon.co.uk> >> Date: Wed, 17 Nov 1999 22:56:56 +0100 >> Subject: [PATCH] dirent->d_type support >> >> I rejected adding a new system call. Instead, I extended getdents() to >> return data in a backward compatible way. The d_type information as >> placed at the end of each dirent record, and can be recognised >> unambiguously. :) It got flamed by Alex Viro for being a useless feature so was lost in the dismay. (Btw, that patch also added support for reporting a useful d_type from a large number of filesystems, some of which I think still don't have it in the main kernel tree.) I added d_type to dirent slightly differently, like this: copy_to_user(dirent->d_name, name, namlen); put_user(0, dirent->d_name + namlen); + /* + * This, plus adding sizeof(long) to reclen, allows d_type to be + * included in a backward & forward compatible way. + */ + if (type != DT_UNKNOWN) { + put_user(0, (unsigned char *) dirent + reclen - (1 + sizeof (long))); + put_user(type, (unsigned char *) dirent + reclen - 2); + put_user(0, (unsigned char *) dirent + reclen - 1); + } + ((char *) dirent) += reclen; buf->current_dir = dirent; The idea is that you can unambiguously determine if a d_type byte is included in the record without calling strlen(d_name), and in a way that allows more data to be appended to the record in future, like this: #define D_TYPE(dent) \ (((dent)->d_reclen > ((dent)->d_name - (char *) (dent) + sizeof (long)) \ && !*((unsigned char *) (dent) + (dent)->d_reclen - (1 + sizeof (long)))) \ ? *((unsigned char *) (dent) + (dent)->d_reclen - 2) : DT_UNKNOWN) However, as there are no plans to ever add any more information to the records from getdents(), your method is fine and of course it's much simpler to use. > On Thu, 26 Feb 2004, Jakub Jelinek wrote: > > Because no 32-bit getdents syscall provides this field to userland, > > glibc needs to use getdents64 syscall even for 32-bit getdents > > (and readdir etc.) and convert dirent entries from struct dirent64 > > to struct dirent. The code is quite complicated and as the former > > is bigger and the size of 64-bit dirents cannot be predicted accurately, > > it can happen that glibc reads too many entries and has to seek back > > on the dir etc. > > However, the more I look at the above, the more confused I get. Your > explanation simplty doesn't make any sense. > > The thing is, it's _trivial_ to convert from the bigger format into the > smaller format. It would be much harder to convert the other way. What's > the problem with just using he getdents64 format and converting the data > in-place? You can convert dirent64 to dirent in-place, and it gets smaller so that isn't a cause for "read too many entries". What can happen is that Glibc reads too _few_ entries. That doesn't cause lseek() problems. In Glibc 2.3.1's code, there's a more complicate heuristic that sometimes reads more than can be returned to userspace, and thus has to lseek(), but that's only used if it's running on an old kernel that _doesn't_ have getdents64(). The problem with seeking back on the directory that Jakub mentions can happen when Glibc uses getdents64() -- but it has nothing to do with the structure sizes or running out of space. What happens is that if getdents64() returns a d_ino or d_off value which overflows the 32-bit fields returned to userspace, then Glibc 2.3.1 calls lseek() to go back just before that entry, and returns as many entries as it could. This is so that userspace sees all the entries that could be read whose inodes and offsets fitted into 32 bits, and then sees EOVERFLOW when it tries to read the next, overflowing entry. -- Jamie ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH] Add getdents32t syscall 2004-02-26 19:38 [PATCH] Add getdents32t syscall Jakub Jelinek 2004-02-26 22:03 ` Randy.Dunlap 2004-02-26 22:15 ` Linus Torvalds @ 2004-02-27 19:28 ` Linus Torvalds 2 siblings, 0 replies; 16+ messages in thread From: Linus Torvalds @ 2004-02-27 19:28 UTC (permalink / raw) To: Jakub Jelinek; +Cc: linux-kernel, drepper On Thu, 26 Feb 2004, Jakub Jelinek wrote: > > Because no 32-bit getdents syscall provides this field to userland, > glibc needs to use getdents64 syscall even for 32-bit getdents > (and readdir etc.) and convert dirent entries from struct dirent64 > to struct dirent. The code is quite complicated and as the former > is bigger and the size of 64-bit dirents cannot be predicted accurately, > it can happen that glibc reads too many entries and has to seek back > on the dir etc. Ok, I just committed the "add hidden d_type to the 32-bit getdents" thing, and I'd like to have people verify that it works (I just verified that it didn't break anything, but hey, nothing is using the data, so..) However, the more I look at the above, the more confused I get. Your explanation simplty doesn't make any sense. The thing is, it's _trivial_ to convert from the bigger format into the smaller format. It would be much harder to convert the other way. What's the problem with just using he getdents64 format and converting the data in-place? Linus ^ permalink raw reply [flat|nested] 16+ messages in thread
end of thread, other threads:[~2004-02-29 0:26 UTC | newest] Thread overview: 16+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2004-02-26 19:38 [PATCH] Add getdents32t syscall Jakub Jelinek 2004-02-26 22:03 ` Randy.Dunlap 2004-02-26 22:15 ` Linus Torvalds 2004-02-26 22:25 ` Linus Torvalds 2004-02-26 22:29 ` Ulrich Drepper 2004-02-26 23:00 ` Linus Torvalds 2004-02-26 22:32 ` Jakub Jelinek 2004-02-26 23:15 ` Linus Torvalds 2004-02-27 1:33 ` Ulrich Drepper 2004-02-27 6:16 ` Linus Torvalds 2004-02-27 7:05 ` Ulrich Drepper 2004-02-28 23:21 ` Jamie Lokier 2004-02-27 1:46 ` Andreas Dilger 2004-02-27 3:36 ` Theodore Ts'o 2004-02-29 0:25 ` Jamie Lokier 2004-02-27 19:28 ` Linus Torvalds
This is an external index of several public inboxes, see mirroring instructions on how to clone and mirror all data and code used by this external index.