* [PATCH] struct file cleanup : the very large file_ra_state is now allocated only on demand. [not found] ` <20050817215357.GU3996@wotan.suse.de> @ 2005-08-18 0:40 ` Eric Dumazet 2005-08-18 1:05 ` Andi Kleen ` (2 more replies) 0 siblings, 3 replies; 11+ messages in thread From: Eric Dumazet @ 2005-08-18 0:40 UTC (permalink / raw) To: Andi Kleen; +Cc: Benjamin LaHaise, David S. Miller, netdev, linux-kernel [-- Attachment #1: Type: text/plain, Size: 977 bytes --] Andi Kleen a écrit : > >>(because of the insane struct file_ra_state f_ra. I wish this structure >>were dynamically allocated only for files that really use it) > > > How about you submit a patch for that instead? > > -Andi OK, could you please comment this patch ? The problem of dynamically allocating the readahead state data is that the allocation can fail and should not be fatal. I made some choices that might be not good. I also chose not to align "file_ra" slab on SLAB_HWCACHE_ALIGN because the object size is 10*sizeof(long), so alignment would loose 6*sizeof(long) bytes for each object. [PATCH] * struct file cleanup : the very large file_ra_state is now allocated only on demand, using a dedicated "file_ra" slab. 64bits machines handling lot of sockets can save about 72 bytes per file. * private_data : The field is moved close to f_count and f_op fields to speedup sockfd_lookups Thank you Signed-off-by: Eric Dumazet <dada1@cosmosbay.com> [-- Attachment #2: readahead.patch --] [-- Type: text/plain, Size: 7652 bytes --] --- linux-2.6.13-rc6/include/linux/fs.h 2005-08-07 20:18:56.000000000 +0200 +++ linux-2.6.13-rc6-ed/include/linux/fs.h 2005-08-18 01:33:04.000000000 +0200 @@ -586,20 +586,19 @@ struct dentry *f_dentry; struct vfsmount *f_vfsmnt; struct file_operations *f_op; + void *private_data; atomic_t f_count; unsigned int f_flags; mode_t f_mode; loff_t f_pos; struct fown_struct f_owner; unsigned int f_uid, f_gid; - struct file_ra_state f_ra; + struct file_ra_state *f_rap; size_t f_maxcount; unsigned long f_version; void *f_security; - /* needed for tty driver, and maybe others */ - void *private_data; #ifdef CONFIG_EPOLL /* Used by fs/eventpoll.c to link all the hooks to this file */ @@ -1514,8 +1513,7 @@ extern void do_generic_mapping_read(struct address_space *mapping, struct file_ra_state *, struct file *, loff_t *, read_descriptor_t *, read_actor_t); -extern void -file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); +extern struct file_ra_state *file_ra_state_init(struct file *); extern ssize_t generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs); extern ssize_t generic_file_readv(struct file *filp, const struct iovec *iov, @@ -1549,8 +1547,10 @@ read_descriptor_t * desc, read_actor_t actor) { + if (filp->f_rap == NULL) + file_ra_state_init(filp); do_generic_mapping_read(filp->f_mapping, - &filp->f_ra, + filp->f_rap, filp, ppos, desc, --- linux-2.6.13-rc6/include/linux/slab.h 2005-08-07 20:18:56.000000000 +0200 +++ linux-2.6.13-rc6-ed/include/linux/slab.h 2005-08-18 00:37:59.000000000 +0200 @@ -125,6 +125,7 @@ extern kmem_cache_t *names_cachep; extern kmem_cache_t *files_cachep; extern kmem_cache_t *filp_cachep; +extern kmem_cache_t *ra_cachep; extern kmem_cache_t *fs_cachep; extern kmem_cache_t *signal_cachep; extern kmem_cache_t *sighand_cachep; --- linux-2.6.13-rc6/mm/readahead.c 2005-08-07 20:18:56.000000000 +0200 +++ linux-2.6.13-rc6-ed/mm/readahead.c 2005-08-18 01:14:11.000000000 +0200 @@ -29,14 +29,20 @@ EXPORT_SYMBOL_GPL(default_backing_dev_info); /* - * Initialise a struct file's readahead state. Assumes that the caller has - * memset *ra to zero. + * Initialise a struct file's readahead state. */ -void -file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) +struct file_ra_state * +file_ra_state_init(struct file *file) { - ra->ra_pages = mapping->backing_dev_info->ra_pages; - ra->prev_page = -1; + struct file_ra_state *ra = kmem_cache_alloc(ra_cachep, GFP_KERNEL); + + if (ra != NULL) { + file->f_rap = ra; + memset(ra, 0, sizeof(*ra)); + ra->ra_pages = file->f_mapping->host->i_mapping->backing_dev_info->ra_pages; + ra->prev_page = -1; + } + return ra; } /* --- linux-2.6.13-rc6/mm/filemap.c 2005-08-07 20:18:56.000000000 +0200 +++ linux-2.6.13-rc6-ed/mm/filemap.c 2005-08-18 01:33:58.000000000 +0200 @@ -711,7 +711,7 @@ * NULL. */ void do_generic_mapping_read(struct address_space *mapping, - struct file_ra_state *_ra, + struct file_ra_state *_rap, struct file *filp, loff_t *ppos, read_descriptor_t *desc, @@ -727,8 +727,15 @@ loff_t isize; struct page *cached_page; int error; - struct file_ra_state ra = *_ra; + struct file_ra_state ra; + if (likely(_rap != NULL)) + ra = *_rap; + else { + memset(&ra, 0, sizeof(ra)); + ra.prev_page = -1; + ra.ra_pages = default_backing_dev_info.ra_pages; + } cached_page = NULL; index = *ppos >> PAGE_CACHE_SHIFT; next_index = index; @@ -908,7 +915,8 @@ } out: - *_ra = ra; + if (_rap != NULL) + *_rap = ra; *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; if (cached_page) @@ -1187,12 +1195,15 @@ int error; struct file *file = area->vm_file; struct address_space *mapping = file->f_mapping; - struct file_ra_state *ra = &file->f_ra; + struct file_ra_state *ra = file->f_rap; struct inode *inode = mapping->host; struct page *page; unsigned long size, pgoff; int did_readaround = 0, majmin = VM_FAULT_MINOR; + if (ra == NULL) + ra = file_ra_state_init(file); + pgoff = ((address-area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; retry_all: @@ -1225,14 +1236,16 @@ handle_ra_miss(mapping, ra, pgoff); goto no_cached_page; } - ra->mmap_miss++; + if (ra != NULL) { + ra->mmap_miss++; - /* - * Do we miss much more than hit in this file? If so, - * stop bothering with read-ahead. It will only hurt. - */ - if (ra->mmap_miss > ra->mmap_hit + MMAP_LOTSAMISS) - goto no_cached_page; + /* + * Do we miss much more than hit in this file? If so, + * stop bothering with read-ahead. It will only hurt. + */ + if (ra->mmap_miss > ra->mmap_hit + MMAP_LOTSAMISS) + goto no_cached_page; + } /* * To keep the pgmajfault counter straight, we need to @@ -1243,7 +1256,7 @@ inc_page_state(pgmajfault); } did_readaround = 1; - ra_pages = max_sane_readahead(file->f_ra.ra_pages); + ra_pages = max_sane_readahead(ra != NULL ? ra->ra_pages : default_backing_dev_info.ra_pages); if (ra_pages) { pgoff_t start = 0; @@ -1256,7 +1269,7 @@ goto no_cached_page; } - if (!did_readaround) + if (!did_readaround && ra != NULL) ra->mmap_hit++; /* --- linux-2.6.13-rc6/mm/fadvise.c 2005-08-07 20:18:56.000000000 +0200 +++ linux-2.6.13-rc6-ed/mm/fadvise.c 2005-08-18 01:05:15.000000000 +0200 @@ -28,6 +28,7 @@ struct file *file = fget(fd); struct address_space *mapping; struct backing_dev_info *bdi; + struct file_ra_state *ra; loff_t endbyte; pgoff_t start_index; pgoff_t end_index; @@ -54,15 +55,20 @@ bdi = mapping->backing_dev_info; + if ((ra = file->f_rap) == NULL) + ra = file_ra_state_init(file); switch (advice) { case POSIX_FADV_NORMAL: - file->f_ra.ra_pages = bdi->ra_pages; + if (ra != NULL) + ra->ra_pages = bdi->ra_pages; break; case POSIX_FADV_RANDOM: - file->f_ra.ra_pages = 0; + if (ra != NULL) + ra->ra_pages = 0; break; case POSIX_FADV_SEQUENTIAL: - file->f_ra.ra_pages = bdi->ra_pages * 2; + if (ra != NULL) + ra->ra_pages = bdi->ra_pages * 2; break; case POSIX_FADV_WILLNEED: case POSIX_FADV_NOREUSE: --- linux-2.6.13-rc6/fs/open.c 2005-08-07 20:18:56.000000000 +0200 +++ linux-2.6.13-rc6-ed/fs/open.c 2005-08-18 01:05:15.000000000 +0200 @@ -804,7 +804,7 @@ } f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); - file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); + f->f_rap = NULL; /* NB: we're sure to have correct a_ops only after f_op->open */ if (f->f_flags & O_DIRECT) { --- linux-2.6.13-rc6/fs/file_table.c 2005-08-07 20:18:56.000000000 +0200 +++ linux-2.6.13-rc6-ed/fs/file_table.c 2005-08-18 00:37:59.000000000 +0200 @@ -55,6 +55,8 @@ static inline void file_free(struct file *f) { + if (f->f_rap != NULL) + kmem_cache_free(ra_cachep, f->f_rap); kmem_cache_free(filp_cachep, f); } --- linux-2.6.13-rc6/fs/dcache.c 2005-08-07 20:18:56.000000000 +0200 +++ linux-2.6.13-rc6-ed/fs/dcache.c 2005-08-18 02:22:56.000000000 +0200 @@ -1705,6 +1705,8 @@ /* SLAB cache for file structures */ kmem_cache_t *filp_cachep; +/* SLAB cache for ra structures */ +kmem_cache_t *ra_cachep; EXPORT_SYMBOL(d_genocide); @@ -1733,6 +1735,9 @@ filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, filp_ctor, filp_dtor); + ra_cachep = kmem_cache_create("file_ra", sizeof(struct file_ra_state), 0, + SLAB_PANIC, NULL, NULL); + dcache_init(mempages); inode_init(mempages); files_init(mempages); ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] struct file cleanup : the very large file_ra_state is now allocated only on demand. 2005-08-18 0:40 ` [PATCH] struct file cleanup : the very large file_ra_state is now allocated only on demand Eric Dumazet @ 2005-08-18 1:05 ` Andi Kleen 2005-08-18 2:43 ` David S. Miller 2005-08-18 2:52 ` Nick Piggin 2005-08-18 1:39 ` Coywolf Qi Hunt 2005-08-18 9:05 ` [PATCH] Put the very large file_ra_state outside of 'struct file' Eric Dumazet 2 siblings, 2 replies; 11+ messages in thread From: Andi Kleen @ 2005-08-18 1:05 UTC (permalink / raw) To: Eric Dumazet Cc: Andi Kleen, Benjamin LaHaise, David S. Miller, netdev, linux-kernel On Thu, Aug 18, 2005 at 02:40:46AM +0200, Eric Dumazet wrote: > Andi Kleen a ?crit : > > > > >>(because of the insane struct file_ra_state f_ra. I wish this structure > >>were dynamically allocated only for files that really use it) > > > > > >How about you submit a patch for that instead? > > > >-Andi > > OK, could you please comment this patch ? I would just set the ra pointer to a single global structure if the allocation fails. Then you can avoid all the other checks. It will slow down things and trash some state, but not fail and nobody should expect good performance after out of memory anyways. The only check still needed would be on freeing. -Andi ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] struct file cleanup : the very large file_ra_state is now allocated only on demand. 2005-08-18 1:05 ` Andi Kleen @ 2005-08-18 2:43 ` David S. Miller 2005-08-18 7:14 ` Eric Dumazet 2005-08-18 2:52 ` Nick Piggin 1 sibling, 1 reply; 11+ messages in thread From: David S. Miller @ 2005-08-18 2:43 UTC (permalink / raw) To: ak; +Cc: dada1, bcrl, netdev, linux-kernel From: Andi Kleen <ak@suse.de> Date: Thu, 18 Aug 2005 03:05:25 +0200 > I would just set the ra pointer to a single global structure if the > allocation fails. Then you can avoid all the other checks. It will > slow down things and trash some state, but not fail and nobody > should expect good performance after out of memory anyways. The only > check still needed would be on freeing. I would think twice about that due to repeatability concerns. Yes, we should care less when memory is so low, but if we can avoid this kind of scenerio easily we should. Having said that, I would like to recommend looking into a scheme where the path leading to the filp allocation states whether the read-ahead bits are needed or not. This has two benefits: 1) Repeatability, and error signalling at the correct place should the memory allocation fail. 2) We can avoid the pointer dereference overhead. The read-ahead state is always at (filp + 1). Macro'ized or static inline function'ized interfaces for this access can make it look clean and perhaps even implement debugging of the case where we try to get at the read-ahead state for a non-read-ahead filp. I do really think that would be a better approach. A quick glance shows that it should be easy to propagate the "need_read_ahead" state, just by passing a boolean to get_unused_fd() via sock_map_fd(). ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] struct file cleanup : the very large file_ra_state is now allocated only on demand. 2005-08-18 2:43 ` David S. Miller @ 2005-08-18 7:14 ` Eric Dumazet 2005-08-18 7:18 ` David S. Miller 0 siblings, 1 reply; 11+ messages in thread From: Eric Dumazet @ 2005-08-18 7:14 UTC (permalink / raw) To: David S. Miller; +Cc: ak, bcrl, netdev, linux-kernel David S. Miller a écrit : > From: Andi Kleen <ak@suse.de> > Date: Thu, 18 Aug 2005 03:05:25 +0200 > > >>I would just set the ra pointer to a single global structure if the >>allocation fails. Then you can avoid all the other checks. It will >>slow down things and trash some state, but not fail and nobody >>should expect good performance after out of memory anyways. The only >>check still needed would be on freeing. > > > I would think twice about that due to repeatability concerns. Yes, we > should care less when memory is so low, but if we can avoid this kind > of scenerio easily we should. > > Having said that, I would like to recommend looking into a scheme > where the path leading to the filp allocation states whether the > read-ahead bits are needed or not. This has two benefits: > > 1) Repeatability, and error signalling at the correct place > should the memory allocation fail. > > 2) We can avoid the pointer dereference overhead. The read-ahead > state is always at (filp + 1). Macro'ized or static inline > function'ized interfaces for this access can make it look > clean and perhaps even implement debugging of the case where > we try to get at the read-ahead state for a non-read-ahead > filp. > > I do really think that would be a better approach. A quick glance > shows that it should be easy to propagate the "need_read_ahead" > state, just by passing a boolean to get_unused_fd() via > sock_map_fd(). Thanks David and Andi Hum... get_unused_fd() allocates a file descriptor, not a 'struct file *' Maybe you meant get_empty_filp(void), that we might change to get_empty_filp(int need_read_ahead) After reading your suggestions, I understand we still need two slabs. One (filp_cachep) without the readahead data, the other one (filp_ra_cachep) with it. static inline struct file_ra_state *get_ra_state(struct file *f) { #ifdef CONFIG_DEBUG_READAHEAD BUG_ON(filp_ra_cachep != GET_PAGE_CACHE(virt_to_page(f))); #endif return (struct file_ra_state *)(f+1); } struct file *get_empty_filp(int need_read_ahead) { struct file *f; ... f = kmem_cache_alloc(need_read_ahead ? filp_ra_cachep : filp_cachep, GFP_KERNEL); if (f == NULL) goto fail; memset(f, 0, need_read_ahead ? sizeof(struct file) : sizeof(struct file) + sizeof(struct file_ra_state)); ... } file_free() then call kfree() that should find the right kmem_cache_t * static inline void file_free(struct file *f) { kfree(f); } I will provide a new version of the patch in a later mail Eric ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] struct file cleanup : the very large file_ra_state is now allocated only on demand. 2005-08-18 7:14 ` Eric Dumazet @ 2005-08-18 7:18 ` David S. Miller 0 siblings, 0 replies; 11+ messages in thread From: David S. Miller @ 2005-08-18 7:18 UTC (permalink / raw) To: dada1; +Cc: ak, bcrl, netdev, linux-kernel From: Eric Dumazet <dada1@cosmosbay.com> Date: Thu, 18 Aug 2005 09:14:47 +0200 > After reading your suggestions, I understand we still need two slabs. > One (filp_cachep) without the readahead data, the other one > (filp_ra_cachep) with it. Correct. > static inline struct file_ra_state *get_ra_state(struct file *f) > { > #ifdef CONFIG_DEBUG_READAHEAD > BUG_ON(filp_ra_cachep != GET_PAGE_CACHE(virt_to_page(f))); > #endif > return (struct file_ra_state *)(f+1); > } Or, instead of mucking with SLAB internals, just put something like a "filp_ra_present" debugging binary state into filp while it gets debugged. ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] struct file cleanup : the very large file_ra_state is now allocated only on demand. 2005-08-18 1:05 ` Andi Kleen 2005-08-18 2:43 ` David S. Miller @ 2005-08-18 2:52 ` Nick Piggin 2005-08-18 2:57 ` Andi Kleen 1 sibling, 1 reply; 11+ messages in thread From: Nick Piggin @ 2005-08-18 2:52 UTC (permalink / raw) To: Andi Kleen Cc: Eric Dumazet, Benjamin LaHaise, David S. Miller, netdev, linux-kernel Andi Kleen wrote: > >I would just set the ra pointer to a single global structure if the allocation >fails. Then you can avoid all the other checks. It will slow down >things and trash some state, but not fail and nobody should expect good >performance after out of memory anyways. The only check still >needed would be on freeing. > > You don't want to always have bad performance though, so you could attempt to allocate if either the pointer is null _or_ it points to the global structure? Send instant messages to your online friends http://au.messenger.yahoo.com ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] struct file cleanup : the very large file_ra_state is now allocated only on demand. 2005-08-18 2:52 ` Nick Piggin @ 2005-08-18 2:57 ` Andi Kleen 2005-08-18 3:00 ` Nick Piggin 0 siblings, 1 reply; 11+ messages in thread From: Andi Kleen @ 2005-08-18 2:57 UTC (permalink / raw) To: Nick Piggin Cc: Andi Kleen, Eric Dumazet, Benjamin LaHaise, David S. Miller, netdev, linux-kernel > You don't want to always have bad performance though, so you > could attempt to allocate if either the pointer is null _or_ it > points to the global structure? Remember it's after a GFP_KERNEL OOM. If that fails most likely you have deadlocked somewhere else already because Linux's handling of that is so bad. Suboptimal readahead somewhere is the smallest of your problems. -Andi ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] struct file cleanup : the very large file_ra_state is now allocated only on demand. 2005-08-18 2:57 ` Andi Kleen @ 2005-08-18 3:00 ` Nick Piggin 0 siblings, 0 replies; 11+ messages in thread From: Nick Piggin @ 2005-08-18 3:00 UTC (permalink / raw) To: Andi Kleen Cc: Eric Dumazet, Benjamin LaHaise, David S. Miller, netdev, linux-kernel Andi Kleen wrote: >>You don't want to always have bad performance though, so you >>could attempt to allocate if either the pointer is null _or_ it >>points to the global structure? >> > >Remember it's after a GFP_KERNEL OOM. If that fails most likely >you have deadlocked somewhere else already because Linux's handling >of that is so bad. Suboptimal readahead somewhere is the smallest >of your problems. > > True. And in practice it may not even be able to happen at the moment if the page allocator still doesn't fail small order allocs. But I guess the dream one day is to robustly handle any OOM :\ Send instant messages to your online friends http://au.messenger.yahoo.com ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] struct file cleanup : the very large file_ra_state is now allocated only on demand. 2005-08-18 0:40 ` [PATCH] struct file cleanup : the very large file_ra_state is now allocated only on demand Eric Dumazet 2005-08-18 1:05 ` Andi Kleen @ 2005-08-18 1:39 ` Coywolf Qi Hunt 2005-08-18 6:51 ` Eric Dumazet 2005-08-18 9:05 ` [PATCH] Put the very large file_ra_state outside of 'struct file' Eric Dumazet 2 siblings, 1 reply; 11+ messages in thread From: Coywolf Qi Hunt @ 2005-08-18 1:39 UTC (permalink / raw) To: Eric Dumazet Cc: Andi Kleen, Benjamin LaHaise, David S. Miller, netdev, linux-kernel On 8/18/05, Eric Dumazet <dada1@cosmosbay.com> wrote: > Andi Kleen a écrit : > > > > >>(because of the insane struct file_ra_state f_ra. I wish this structure > >>were dynamically allocated only for files that really use it) > > > > > > How about you submit a patch for that instead? > > > > -Andi > > OK, could you please comment this patch ? > > The problem of dynamically allocating the readahead state data is that the allocation can fail and should not be fatal. > I made some choices that might be not good. > > I also chose not to align "file_ra" slab on SLAB_HWCACHE_ALIGN because the object size is 10*sizeof(long), so alignment would loose > 6*sizeof(long) bytes for each object. > > > [PATCH] > > * struct file cleanup : the very large file_ra_state is now allocated only on demand, using a dedicated "file_ra" slab. > 64bits machines handling lot of sockets can save about 72 bytes per file. > * private_data : The field is moved close to f_count and f_op fields to speedup sockfd_lookups Why not keep the comment or fix it? -- Coywolf Qi Hunt http://ahbl.org/~coywolf/ ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] struct file cleanup : the very large file_ra_state is now allocated only on demand. 2005-08-18 1:39 ` Coywolf Qi Hunt @ 2005-08-18 6:51 ` Eric Dumazet 0 siblings, 0 replies; 11+ messages in thread From: Eric Dumazet @ 2005-08-18 6:51 UTC (permalink / raw) To: Coywolf Qi Hunt Cc: Andi Kleen, Benjamin LaHaise, David S. Miller, netdev, linux-kernel Coywolf Qi Hunt a écrit : > On 8/18/05, Eric Dumazet <dada1@cosmosbay.com> wrote: > >>Andi Kleen a écrit : >> >> >>>>(because of the insane struct file_ra_state f_ra. I wish this structure >>>>were dynamically allocated only for files that really use it) >>> >>> >>>How about you submit a patch for that instead? >>> >>>-Andi >> >>OK, could you please comment this patch ? >> >>The problem of dynamically allocating the readahead state data is that the allocation can fail and should not be fatal. >>I made some choices that might be not good. >> >>I also chose not to align "file_ra" slab on SLAB_HWCACHE_ALIGN because the object size is 10*sizeof(long), so alignment would loose >>6*sizeof(long) bytes for each object. >> >> >>[PATCH] >> >>* struct file cleanup : the very large file_ra_state is now allocated only on demand, using a dedicated "file_ra" slab. >> 64bits machines handling lot of sockets can save about 72 bytes per file. >>* private_data : The field is moved close to f_count and f_op fields to speedup sockfd_lookups > > > Why not keep the comment or fix it? > You mean the comment in include/linux/fs.h : /* needed for tty driver, and maybe others */ ? I think this comment is outdated, since nearly every 'struct file' user store something of his own in this place, not only 'tty drivers' As no other fields has comment, why should we keep this outdated comment ? Eric ^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH] Put the very large file_ra_state outside of 'struct file' 2005-08-18 0:40 ` [PATCH] struct file cleanup : the very large file_ra_state is now allocated only on demand Eric Dumazet 2005-08-18 1:05 ` Andi Kleen 2005-08-18 1:39 ` Coywolf Qi Hunt @ 2005-08-18 9:05 ` Eric Dumazet 2 siblings, 0 replies; 11+ messages in thread From: Eric Dumazet @ 2005-08-18 9:05 UTC (permalink / raw) To: Eric Dumazet Cc: Andi Kleen, Benjamin LaHaise, David S. Miller, netdev, linux-kernel [-- Attachment #1: Type: text/plain, Size: 409 bytes --] [PATCH] * struct file cleanup : the very large file_ra_state is now allocated only on demand, using a dedicated "filp_ra_cache" slab. machines handling lot of sockets or pipes can save about 80 bytes (or 40 bytes on 32bits platforms) per file. * private_data : The field is moved close to f_count and f_op fields to speedup sockfd_lookups Thank you Signed-off-by: Eric Dumazet <dada1@cosmosbay.com> [-- Attachment #2: readahead.patch --] [-- Type: text/plain, Size: 10565 bytes --] --- linux-2.6.13-rc6/include/linux/fs.h 2005-08-07 20:18:56.000000000 +0200 +++ linux-2.6.13-rc6-ed/include/linux/fs.h 2005-08-18 10:30:35.000000000 +0200 @@ -586,20 +586,18 @@ struct dentry *f_dentry; struct vfsmount *f_vfsmnt; struct file_operations *f_op; + void *private_data; atomic_t f_count; unsigned int f_flags; mode_t f_mode; loff_t f_pos; struct fown_struct f_owner; unsigned int f_uid, f_gid; - struct file_ra_state f_ra; size_t f_maxcount; unsigned long f_version; void *f_security; - /* needed for tty driver, and maybe others */ - void *private_data; #ifdef CONFIG_EPOLL /* Used by fs/eventpoll.c to link all the hooks to this file */ @@ -1480,7 +1478,10 @@ __insert_inode_hash(inode, inode->i_ino); } -extern struct file * get_empty_filp(void); +#define FILP_NO_READ_AHEAD 0 +#define FILP_WITH_READ_AHEAD 1 +extern struct file * get_empty_filp(int need_read_ahead); + extern void file_move(struct file *f, struct list_head *list); extern void file_kill(struct file *f); struct bio; @@ -1514,8 +1515,7 @@ extern void do_generic_mapping_read(struct address_space *mapping, struct file_ra_state *, struct file *, loff_t *, read_descriptor_t *, read_actor_t); -extern void -file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); +extern void file_ra_state_init(struct file *); extern ssize_t generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs); extern ssize_t generic_file_readv(struct file *filp, const struct iovec *iov, @@ -1545,12 +1545,16 @@ } #endif +static inline struct file_ra_state *get_ra_state(struct file *f) +{ + return (struct file_ra_state *)(f+1); +} static inline void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor) { do_generic_mapping_read(filp->f_mapping, - &filp->f_ra, + get_ra_state(filp), filp, ppos, desc, --- linux-2.6.13-rc6/include/linux/slab.h 2005-08-07 20:18:56.000000000 +0200 +++ linux-2.6.13-rc6-ed/include/linux/slab.h 2005-08-18 09:46:40.000000000 +0200 @@ -125,6 +125,7 @@ extern kmem_cache_t *names_cachep; extern kmem_cache_t *files_cachep; extern kmem_cache_t *filp_cachep; +extern kmem_cache_t *filp_ra_cachep; extern kmem_cache_t *fs_cachep; extern kmem_cache_t *signal_cachep; extern kmem_cache_t *sighand_cachep; --- linux-2.6.13-rc6/mm/readahead.c 2005-08-07 20:18:56.000000000 +0200 +++ linux-2.6.13-rc6-ed/mm/readahead.c 2005-08-18 10:41:13.000000000 +0200 @@ -33,9 +33,10 @@ * memset *ra to zero. */ void -file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) +file_ra_state_init(struct file *file) { - ra->ra_pages = mapping->backing_dev_info->ra_pages; + struct file_ra_state *ra = get_ra_state(file); + ra->ra_pages = file->f_mapping->host->i_mapping->backing_dev_info->ra_pages; ra->prev_page = -1; } --- linux-2.6.13-rc6/mm/filemap.c 2005-08-07 20:18:56.000000000 +0200 +++ linux-2.6.13-rc6-ed/mm/filemap.c 2005-08-18 10:46:53.000000000 +0200 @@ -1187,7 +1187,7 @@ int error; struct file *file = area->vm_file; struct address_space *mapping = file->f_mapping; - struct file_ra_state *ra = &file->f_ra; + struct file_ra_state *ra = get_ra_state(file); struct inode *inode = mapping->host; struct page *page; unsigned long size, pgoff; @@ -1243,7 +1243,7 @@ inc_page_state(pgmajfault); } did_readaround = 1; - ra_pages = max_sane_readahead(file->f_ra.ra_pages); + ra_pages = max_sane_readahead(ra->ra_pages); if (ra_pages) { pgoff_t start = 0; --- linux-2.6.13-rc6/mm/fadvise.c 2005-08-07 20:18:56.000000000 +0200 +++ linux-2.6.13-rc6-ed/mm/fadvise.c 2005-08-18 10:44:16.000000000 +0200 @@ -28,6 +28,7 @@ struct file *file = fget(fd); struct address_space *mapping; struct backing_dev_info *bdi; + struct file_ra_state *ra; loff_t endbyte; pgoff_t start_index; pgoff_t end_index; @@ -54,15 +55,20 @@ bdi = mapping->backing_dev_info; + /* + * FIXME : Are we sure here this 'file' has ra_state available ? + */ + ra = get_ra_state(file); + switch (advice) { case POSIX_FADV_NORMAL: - file->f_ra.ra_pages = bdi->ra_pages; + ra->ra_pages = bdi->ra_pages; break; case POSIX_FADV_RANDOM: - file->f_ra.ra_pages = 0; + ra->ra_pages = 0; break; case POSIX_FADV_SEQUENTIAL: - file->f_ra.ra_pages = bdi->ra_pages * 2; + ra->ra_pages = bdi->ra_pages * 2; break; case POSIX_FADV_WILLNEED: case POSIX_FADV_NOREUSE: --- linux-2.6.13-rc6/fs/file_table.c 2005-08-07 20:18:56.000000000 +0200 +++ linux-2.6.13-rc6-ed/fs/file_table.c 2005-08-18 10:46:53.000000000 +0200 @@ -53,16 +53,20 @@ spin_unlock_irqrestore(&filp_count_lock, flags); } +/* + * a file can belong to different kmem_cache (filp_cache or filp_ra_cache), + * so let kfree() find it + */ static inline void file_free(struct file *f) { - kmem_cache_free(filp_cachep, f); + kfree(f); } /* Find an unused file structure and return a pointer to it. * Returns NULL, if there are no more free file structures or * we run out of memory. */ -struct file *get_empty_filp(void) +struct file *get_empty_filp(int need_read_ahead) { static int old_max; struct file * f; @@ -74,11 +78,11 @@ !capable(CAP_SYS_ADMIN)) goto over; - f = kmem_cache_alloc(filp_cachep, GFP_KERNEL); + f = kmem_cache_alloc(need_read_ahead ? filp_ra_cachep : filp_cachep, GFP_KERNEL); if (f == NULL) goto fail; - memset(f, 0, sizeof(*f)); + memset(f, 0, need_read_ahead ? sizeof(struct file) + sizeof(struct file_ra_state) : sizeof(struct file)); if (security_file_alloc(f)) goto fail_sec; --- linux-2.6.13-rc6/fs/open.c 2005-08-07 20:18:56.000000000 +0200 +++ linux-2.6.13-rc6-ed/fs/open.c 2005-08-18 10:30:35.000000000 +0200 @@ -778,7 +778,7 @@ int error; error = -ENFILE; - f = get_empty_filp(); + f = get_empty_filp(FILP_WITH_READ_AHEAD); if (!f) goto cleanup_dentry; f->f_flags = flags; @@ -804,7 +804,7 @@ } f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); - file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); + file_ra_state_init(f); /* NB: we're sure to have correct a_ops only after f_op->open */ if (f->f_flags & O_DIRECT) { --- linux-2.6.13-rc6/fs/dcache.c 2005-08-07 20:18:56.000000000 +0200 +++ linux-2.6.13-rc6-ed/fs/dcache.c 2005-08-18 09:48:41.000000000 +0200 @@ -1703,8 +1703,10 @@ /* SLAB cache for __getname() consumers */ kmem_cache_t *names_cachep; -/* SLAB cache for file structures */ +/* SLAB cache for file structures without read ahead state data */ kmem_cache_t *filp_cachep; +/* SLAB cache for file structures with read ahead state data */ +kmem_cache_t *filp_ra_cachep; EXPORT_SYMBOL(d_genocide); @@ -1733,6 +1735,9 @@ filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, filp_ctor, filp_dtor); + filp_ra_cachep = kmem_cache_create("filp_ra", sizeof(struct file) + sizeof(struct file_ra_state), 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, filp_ctor, filp_dtor); + dcache_init(mempages); inode_init(mempages); files_init(mempages); --- linux-2.6.13-rc6/net/socket.c 2005-08-07 20:18:56.000000000 +0200 +++ linux-2.6.13-rc6-ed/net/socket.c 2005-08-18 10:30:35.000000000 +0200 @@ -375,7 +375,7 @@ fd = get_unused_fd(); if (fd >= 0) { - struct file *file = get_empty_filp(); + struct file *file = get_empty_filp(FILP_NO_READ_AHEAD); if (!file) { put_unused_fd(fd); --- linux-2.6.13-rc6/fs/eventpoll.c 2005-08-07 20:18:56.000000000 +0200 +++ linux-2.6.13-rc6-ed/fs/eventpoll.c 2005-08-18 10:30:35.000000000 +0200 @@ -717,7 +717,7 @@ /* Get an ready to use file */ error = -ENFILE; - file = get_empty_filp(); + file = get_empty_filp(FILP_NO_READ_AHEAD); if (!file) goto eexit_1; --- linux-2.6.13-rc6/kernel/futex.c 2005-08-07 20:18:56.000000000 +0200 +++ linux-2.6.13-rc6-ed/kernel/futex.c 2005-08-18 10:30:35.000000000 +0200 @@ -661,7 +661,7 @@ ret = get_unused_fd(); if (ret < 0) goto out; - filp = get_empty_filp(); + filp = get_empty_filp(FILP_NO_READ_AHEAD); if (!filp) { put_unused_fd(ret); ret = -ENFILE; --- linux-2.6.13-rc6/mm/shmem.c 2005-08-07 20:18:56.000000000 +0200 +++ linux-2.6.13-rc6-ed/mm/shmem.c 2005-08-18 10:30:35.000000000 +0200 @@ -2272,7 +2272,7 @@ goto put_memory; error = -ENFILE; - file = get_empty_filp(); + file = get_empty_filp(FILP_WITH_READ_AHEAD); if (!file) goto put_dentry; --- linux-2.6.13-rc6/fs/pipe.c 2005-08-07 20:18:56.000000000 +0200 +++ linux-2.6.13-rc6-ed/fs/pipe.c 2005-08-18 10:30:35.000000000 +0200 @@ -723,11 +723,11 @@ int i,j; error = -ENFILE; - f1 = get_empty_filp(); + f1 = get_empty_filp(FILP_NO_READ_AHEAD); if (!f1) goto no_files; - f2 = get_empty_filp(); + f2 = get_empty_filp(FILP_NO_READ_AHEAD); if (!f2) goto close_f1; --- linux-2.6.13-rc6/fs/inotify.c 2005-08-07 20:18:56.000000000 +0200 +++ linux-2.6.13-rc6-ed/fs/inotify.c 2005-08-18 10:30:35.000000000 +0200 @@ -865,7 +865,7 @@ if (fd < 0) return fd; - filp = get_empty_filp(); + filp = get_empty_filp(FILP_NO_READ_AHEAD); if (!filp) { ret = -ENFILE; goto out_put_fd; --- linux-2.6.13-rc6/fs/hugetlbfs/inode.c 2005-08-07 20:18:56.000000000 +0200 +++ linux-2.6.13-rc6-ed/fs/hugetlbfs/inode.c 2005-08-18 10:30:35.000000000 +0200 @@ -785,7 +785,7 @@ goto out_shm_unlock; error = -ENFILE; - file = get_empty_filp(); + file = get_empty_filp(FILP_WITH_READ_AHEAD); if (!file) goto out_dentry; --- linux-2.6.13-rc6/arch/ia64/kernel/perfmon.c 2005-08-07 20:18:56.000000000 +0200 +++ linux-2.6.13-rc6-ed/arch/ia64/kernel/perfmon.c 2005-08-18 10:36:28.000000000 +0200 @@ -2157,7 +2157,7 @@ ret = -ENFILE; - file = get_empty_filp(); + file = get_empty_filp(FILP_WITH_READ_AHEAD); if (!file) goto out; /* --- linux-2.6.13-rc6/drivers/infiniband/core/uverbs_main.c 2005-08-07 20:18:56.000000000 +0200 +++ linux-2.6.13-rc6-ed/drivers/infiniband/core/uverbs_main.c 2005-08-18 10:30:35.000000000 +0200 @@ -367,7 +367,7 @@ if (file->fd < 0) return file->fd; - filp = get_empty_filp(); + filp = get_empty_filp(FILP_NO_READ_AHEAD); if (!filp) { put_unused_fd(file->fd); return -ENFILE; --- linux-2.6.13-rc6/mm/tiny-shmem.c 2005-08-07 20:18:56.000000000 +0200 +++ linux-2.6.13-rc6-ed/mm/tiny-shmem.c 2005-08-18 10:30:35.000000000 +0200 @@ -68,7 +68,7 @@ goto put_memory; error = -ENFILE; - file = get_empty_filp(); + file = get_empty_filp(FILP_WITH_READ_AHEAD); if (!file) goto put_dentry; ^ permalink raw reply [flat|nested] 11+ messages in thread
end of thread, other threads:[~2005-08-18 9:05 UTC | newest]
Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
[not found] <20050810164655.GB4162@linux.intel.com>
[not found] ` <20050810.135306.79296985.davem@davemloft.net>
[not found] ` <20050810211737.GA21581@linux.intel.com>
[not found] ` <430391F1.9080900@cosmosbay.com>
[not found] ` <20050817211829.GK27628@wotan.suse.de>
[not found] ` <4303AEC4.3060901@cosmosbay.com>
[not found] ` <20050817215357.GU3996@wotan.suse.de>
2005-08-18 0:40 ` [PATCH] struct file cleanup : the very large file_ra_state is now allocated only on demand Eric Dumazet
2005-08-18 1:05 ` Andi Kleen
2005-08-18 2:43 ` David S. Miller
2005-08-18 7:14 ` Eric Dumazet
2005-08-18 7:18 ` David S. Miller
2005-08-18 2:52 ` Nick Piggin
2005-08-18 2:57 ` Andi Kleen
2005-08-18 3:00 ` Nick Piggin
2005-08-18 1:39 ` Coywolf Qi Hunt
2005-08-18 6:51 ` Eric Dumazet
2005-08-18 9:05 ` [PATCH] Put the very large file_ra_state outside of 'struct file' Eric Dumazet
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).