From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S262381AbUFNKsk (ORCPT ); Mon, 14 Jun 2004 06:48:40 -0400 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S262380AbUFNKsk (ORCPT ); Mon, 14 Jun 2004 06:48:40 -0400 Received: from mx1.redhat.com ([66.187.233.31]:65183 "EHLO mx1.redhat.com") by vger.kernel.org with ESMTP id S262381AbUFNKrz (ORCPT ); Mon, 14 Jun 2004 06:47:55 -0400 From: David Howells In-Reply-To: <20040611150110.73fadefb.akpm@osdl.org> References: <20040611150110.73fadefb.akpm@osdl.org> <567.1086950642@redhat.com> <6567.1086963705@redhat.com> To: Andrew Morton Cc: torvalds@osdl.org, linux-kernel@vger.kernel.org Subject: Re: [PATCH] Permit inode & dentry hash tables to be allocated > MAX_ORDER size [try #3] User-Agent: EMH/1.14.1 SEMI/1.14.5 (Awara-Onsen) FLIM/1.14.5 (Demachiyanagi) APEL/10.6 Emacs/21.3 (i386-redhat-linux-gnu) MULE/5.0 (SAKAKI) MIME-Version: 1.0 (generated by SEMI 1.14.5 - "Awara-Onsen") Content-Type: multipart/mixed; boundary="Multipart_Mon_Jun_14_11:47:47_2004-1" Date: Mon, 14 Jun 2004 11:47:47 +0100 Message-ID: <18241.1087210067@redhat.com> Sender: linux-kernel-owner@vger.kernel.org X-Mailing-List: linux-kernel@vger.kernel.org --Multipart_Mon_Jun_14_11:47:47_2004-1 Content-Type: text/plain; charset=US-ASCII Here's version 3 of the patch, incorporating comments from Andrew Morton. Andrew Morton wrote: > We have an opportunity to make this loop less baroque. That seems reasonable. gcc's optimiser should be able to render them the same. > whereas in other places you _have_ made the definition of __init functions > include the "__init". I'm not sure which is best, really, but it seems > better to include the __init in the declaration. You're probably right. This should probably apply to vfs_caches_init() and inode_init() too. > What's the attribute((pure)) for? It tells gcc that the function's result only depends on its arguments... it makes no references at all to external data. It's like __attribute__((const)) but stronger. This means that gcc can generate code that caches the result. See the gcc manual. > It generates a warning on older gcc - please use __attribute_pure__. We still support gcc's that old? > The other four or five implementations of log2() use ffx(~n). Yes... but ffs() and ffz() take int args, not long args. I suspect that shouldn't matter (that would require the hash table to be calculated at 4Gig buckets in size or greater to be a problem), but why take the chance when we can avoid it easily? David --Multipart_Mon_Jun_14_11:47:47_2004-1 Content-Type: text/plain; type=patch; charset=US-ASCII Content-Disposition: attachment; filename="bootmem-hash-267rc3-3.diff" Content-Transfer-Encoding: 7bit Signed-off-by: Paul Mackerras Signed-off-by: David Howells diff -uNrp linux-2.6.7-rc3/fs/dcache.c linux-2.6.7-rc3-hash/fs/dcache.c --- linux-2.6.7-rc3/fs/dcache.c 2004-06-11 09:36:25.000000000 +0100 +++ linux-2.6.7-rc3-hash/fs/dcache.c 2004-06-14 10:53:11.297499326 +0100 @@ -30,6 +30,7 @@ #include #include #include +#include #define DCACHE_PARANOIA 1 /* #define DCACHE_DEBUG 1 */ @@ -1561,13 +1562,25 @@ static int __init set_dhash_entries(char } __setup("dhash_entries=", set_dhash_entries); -static void __init dcache_init(unsigned long mempages) +static void __init dcache_init_early(void) { - struct hlist_head *d; - unsigned long order; - unsigned int nr_hash; - int i; + int loop; + + dentry_hashtable = + alloc_large_system_hash("Dentry cache", + sizeof(struct hlist_head), + dhash_entries, + 13, + 0, + &d_hash_shift, + &d_hash_mask); + for (loop = 0; loop < (1 << d_hash_shift); loop++) + INIT_HLIST_HEAD(&dentry_hashtable[loop]); +} + +static void __init dcache_init(unsigned long mempages) +{ /* * A constructor could be added for stable state like the lists, * but it is probably not worth it because of the cache nature @@ -1580,45 +1593,6 @@ static void __init dcache_init(unsigned NULL, NULL); set_shrinker(DEFAULT_SEEKS, shrink_dcache_memory); - - if (!dhash_entries) - dhash_entries = PAGE_SHIFT < 13 ? - mempages >> (13 - PAGE_SHIFT) : - mempages << (PAGE_SHIFT - 13); - - dhash_entries *= sizeof(struct hlist_head); - for (order = 0; ((1UL << order) << PAGE_SHIFT) < dhash_entries; order++) - ; - - do { - unsigned long tmp; - - nr_hash = (1UL << order) * PAGE_SIZE / - sizeof(struct hlist_head); - d_hash_mask = (nr_hash - 1); - - tmp = nr_hash; - d_hash_shift = 0; - while ((tmp >>= 1UL) != 0UL) - d_hash_shift++; - - dentry_hashtable = (struct hlist_head *) - __get_free_pages(GFP_ATOMIC, order); - } while (dentry_hashtable == NULL && --order >= 0); - - printk(KERN_INFO "Dentry cache hash table entries: %d (order: %ld, %ld bytes)\n", - nr_hash, order, (PAGE_SIZE << order)); - - if (!dentry_hashtable) - panic("Failed to allocate dcache hash table\n"); - - d = dentry_hashtable; - i = nr_hash; - do { - INIT_HLIST_HEAD(d); - d++; - i--; - } while (i); } /* SLAB cache for __getname() consumers */ @@ -1632,6 +1606,12 @@ EXPORT_SYMBOL(d_genocide); extern void bdev_cache_init(void); extern void chrdev_init(void); +void __init vfs_caches_init_early(void) +{ + dcache_init_early(); + inode_init_early(); +} + void __init vfs_caches_init(unsigned long mempages) { unsigned long reserve; diff -uNrp linux-2.6.7-rc3/fs/inode.c linux-2.6.7-rc3-hash/fs/inode.c --- linux-2.6.7-rc3/fs/inode.c 2004-06-11 09:36:27.000000000 +0100 +++ linux-2.6.7-rc3-hash/fs/inode.c 2004-06-14 10:53:16.278082113 +0100 @@ -20,6 +20,7 @@ #include #include #include +#include /* * This is needed for the following functions: @@ -1345,55 +1346,30 @@ __setup("ihash_entries=", set_ihash_entr /* * Initialize the waitqueues and inode hash table. */ +void __init inode_init_early(void) +{ + int loop; + + inode_hashtable = + alloc_large_system_hash("Inode-cache", + sizeof(struct hlist_head), + ihash_entries, + 14, + 0, + &i_hash_shift, + &i_hash_mask); + + for (loop = 0; loop < (1 << i_hash_shift); loop++) + INIT_HLIST_HEAD(&inode_hashtable[loop]); +} + void __init inode_init(unsigned long mempages) { - struct hlist_head *head; - unsigned long order; - unsigned int nr_hash; int i; for (i = 0; i < ARRAY_SIZE(i_wait_queue_heads); i++) init_waitqueue_head(&i_wait_queue_heads[i].wqh); - if (!ihash_entries) - ihash_entries = PAGE_SHIFT < 14 ? - mempages >> (14 - PAGE_SHIFT) : - mempages << (PAGE_SHIFT - 14); - - ihash_entries *= sizeof(struct hlist_head); - for (order = 0; ((1UL << order) << PAGE_SHIFT) < ihash_entries; order++) - ; - - do { - unsigned long tmp; - - nr_hash = (1UL << order) * PAGE_SIZE / - sizeof(struct hlist_head); - i_hash_mask = (nr_hash - 1); - - tmp = nr_hash; - i_hash_shift = 0; - while ((tmp >>= 1UL) != 0UL) - i_hash_shift++; - - inode_hashtable = (struct hlist_head *) - __get_free_pages(GFP_ATOMIC, order); - } while (inode_hashtable == NULL && --order >= 0); - - printk("Inode-cache hash table entries: %d (order: %ld, %ld bytes)\n", - nr_hash, order, (PAGE_SIZE << order)); - - if (!inode_hashtable) - panic("Failed to allocate inode hash table\n"); - - head = inode_hashtable; - i = nr_hash; - do { - INIT_HLIST_HEAD(head); - head++; - i--; - } while (i); - /* inode slab cache */ inode_cachep = kmem_cache_create("inode_cache", sizeof(struct inode), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, init_once, diff -uNrp linux-2.6.7-rc3/include/linux/bootmem.h linux-2.6.7-rc3-hash/include/linux/bootmem.h --- linux-2.6.7-rc3/include/linux/bootmem.h 2004-06-11 09:25:03.000000000 +0100 +++ linux-2.6.7-rc3-hash/include/linux/bootmem.h 2004-06-11 10:38:23.000000000 +0100 @@ -67,4 +67,12 @@ extern void * __init __alloc_bootmem_nod __alloc_bootmem_node((pgdat), (x), PAGE_SIZE, 0) #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ +extern void *__init alloc_large_system_hash(const char *tablename, + unsigned long bucketsize, + unsigned long numentries, + int scale, + int consider_highmem, + unsigned int *_hash_shift, + unsigned int *_hash_mask); + #endif /* _LINUX_BOOTMEM_H */ diff -uNrp linux-2.6.7-rc3/include/linux/fs.h linux-2.6.7-rc3-hash/include/linux/fs.h --- linux-2.6.7-rc3/include/linux/fs.h 2004-06-11 09:36:36.000000000 +0100 +++ linux-2.6.7-rc3-hash/include/linux/fs.h 2004-06-14 10:50:52.147156861 +0100 @@ -214,15 +214,17 @@ extern int leases_enable, dir_notify_ena #include #include #include +#include #include #include /* Used to be a macro which just called the function, now just a function */ extern void update_atime (struct inode *); -extern void inode_init(unsigned long); -extern void mnt_init(unsigned long); -extern void files_init(unsigned long); +extern void __init inode_init(unsigned long); +extern void __init inode_init_early(void); +extern void __init mnt_init(unsigned long); +extern void __init files_init(unsigned long); struct buffer_head; typedef int (get_block_t)(struct inode *inode, sector_t iblock, @@ -1199,7 +1201,8 @@ extern int filp_close(struct file *, fl_ extern char * getname(const char __user *); /* fs/dcache.c */ -extern void vfs_caches_init(unsigned long); +extern void __init vfs_caches_init_early(void); +extern void __init vfs_caches_init(unsigned long); #define __getname() kmem_cache_alloc(names_cachep, SLAB_KERNEL) #define __putname(name) kmem_cache_free(names_cachep, (void *)(name)) diff -uNrp linux-2.6.7-rc3/include/linux/kernel.h linux-2.6.7-rc3-hash/include/linux/kernel.h --- linux-2.6.7-rc3/include/linux/kernel.h 2004-06-11 09:36:36.000000000 +0100 +++ linux-2.6.7-rc3-hash/include/linux/kernel.h 2004-06-14 10:48:45.130799964 +0100 @@ -92,6 +92,15 @@ asmlinkage int printk(const char * fmt, unsigned long int_sqrt(unsigned long); +static inline int __attribute_pure__ long_log2(unsigned long x) +{ + int r = 0; + for (x >>= 1; x > 0; x >>= 1) + r++; + return r; +} + + extern int printk_ratelimit(void); extern int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst); diff -uNrp linux-2.6.7-rc3/include/linux/mmzone.h linux-2.6.7-rc3-hash/include/linux/mmzone.h --- linux-2.6.7-rc3/include/linux/mmzone.h 2004-06-11 09:36:36.000000000 +0100 +++ linux-2.6.7-rc3-hash/include/linux/mmzone.h 2004-06-11 09:44:41.000000000 +0100 @@ -20,6 +20,18 @@ #define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER #endif +/* + * system hash table size limits + * - on large memory machines, we may want to allocate a bigger hash than that + * permitted by MAX_ORDER, so we allocate with the bootmem allocator, and are + * limited to this size + */ +#if MAX_ORDER > 14 +#define MAX_SYS_HASH_TABLE_ORDER MAX_ORDER +#else +#define MAX_SYS_HASH_TABLE_ORDER 14 +#endif + struct free_area { struct list_head free_list; unsigned long *map; diff -uNrp linux-2.6.7-rc3/init/main.c linux-2.6.7-rc3-hash/init/main.c --- linux-2.6.7-rc3/init/main.c 2004-06-11 09:36:36.000000000 +0100 +++ linux-2.6.7-rc3-hash/init/main.c 2004-06-11 10:48:37.000000000 +0100 @@ -454,6 +454,7 @@ asmlinkage void __init start_kernel(void initrd_start = 0; } #endif + vfs_caches_init_early(); mem_init(); kmem_cache_init(); if (late_time_init) diff -uNrp linux-2.6.7-rc3/mm/page_alloc.c linux-2.6.7-rc3-hash/mm/page_alloc.c --- linux-2.6.7-rc3/mm/page_alloc.c 2004-06-11 09:36:36.000000000 +0100 +++ linux-2.6.7-rc3-hash/mm/page_alloc.c 2004-06-14 10:48:58.902645878 +0100 @@ -55,6 +55,9 @@ EXPORT_SYMBOL(zone_table); static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; int min_free_kbytes = 1024; +static unsigned long __initdata nr_kernel_pages; +static unsigned long __initdata nr_all_pages; + /* * Temporary debugging check for pages not lying within a given zone. */ @@ -1454,6 +1457,10 @@ static void __init free_area_init_core(s if (zholes_size) realsize -= zholes_size[j]; + if (j == ZONE_DMA || j == ZONE_NORMAL) + nr_kernel_pages += realsize; + nr_all_pages += realsize; + zone->spanned_pages = size; zone->present_pages = realsize; zone->name = zone_names[j]; @@ -1994,3 +2001,69 @@ int lower_zone_protection_sysctl_handler setup_per_zone_protection(); return 0; } + +/* + * allocate a large system hash table from bootmem + * - it is assumed that the hash table must contain an exact power-of-2 + * quantity of entries + */ +void *__init alloc_large_system_hash(const char *tablename, + unsigned long bucketsize, + unsigned long numentries, + int scale, + int consider_highmem, + unsigned int *_hash_shift, + unsigned int *_hash_mask) +{ + unsigned long mem, max, log2qty, size; + void *table; + + /* round applicable memory size up to nearest megabyte */ + mem = consider_highmem ? nr_all_pages : nr_kernel_pages; + mem += (1UL << (20 - PAGE_SHIFT)) - 1; + mem >>= 20 - PAGE_SHIFT; + mem <<= 20 - PAGE_SHIFT; + + /* limit to 1 bucket per 2^scale bytes of low memory (rounded up to + * nearest power of 2 in size) */ + if (scale > PAGE_SHIFT) + mem >>= (scale - PAGE_SHIFT); + else + mem <<= (PAGE_SHIFT - scale); + + mem = 1UL << (long_log2(mem) + 1); + + /* limit allocation size */ + max = (1UL << (PAGE_SHIFT + MAX_SYS_HASH_TABLE_ORDER)) / bucketsize; + if (max > mem) + max = mem; + + /* allow the kernel cmdline to have a say */ + if (!numentries || numentries > max) + numentries = max; + + log2qty = long_log2(numentries); + + do { + size = bucketsize << log2qty; + + table = (void *) alloc_bootmem(size); + + } while (!table && size > PAGE_SIZE); + + if (!table) + panic("Failed to allocate %s hash table\n", tablename); + + printk("%s hash table entries: %d (order: %d, %lu bytes)\n", + tablename, + (1U << log2qty), + long_log2(size) - PAGE_SHIFT, + size); + + if (_hash_shift) + *_hash_shift = log2qty; + if (_hash_mask) + *_hash_mask = (1 << log2qty) - 1; + + return table; +} --Multipart_Mon_Jun_14_11:47:47_2004-1--