public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] Permit inode & dentry hash tables to be allocated > MAX_ORDER size
@ 2004-06-11 10:44 David Howells
  2004-06-11 10:48 ` Andrew Morton
  0 siblings, 1 reply; 16+ messages in thread
From: David Howells @ 2004-06-11 10:44 UTC (permalink / raw)
  To: torvalds, akpm; +Cc: linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1300 bytes --]


Hi Linus, Andrew,

Here's a patch to allocate memory for big system hash tables with the bootmem
allocator rather than with main page allocator.

I've coalesced the meat of the inode and dentry allocation routines into one
such routine in mm/page_alloc.c that the respective initialisation
functions now call before mem_init() is called.

This routine gets its approximation of memory size by counting up the
ZONE_NORMAL and ZONE_DMA pages (and ZONE_HIGHMEM if requested) in all the
nodes passed to the main allocator by paging_init() (or wherever the arch does
it). It does not use max_low_pfn as that doesn't seem to be available on all
archs, and it doesn't use num_physpages since that includes highmem pages not
available to the kernel for allocating data structures upon - which may not be
appropriate when calculating hash table size.

On the off chance that the size of each hash bucket may not be exactly a power
of two, the routine will only allocate as many pages as is necessary to ensure
that the number of buckets is exactly a power of two, rather than allocating
the smallest power-of-two sized chunk of memory that will hold the same array
of buckets.

The maximum size of any single hash table is given by MAX_SYS_HASH_TABLE_ORDER,
as is now defined in linux/mmzone.h.

David


[-- Attachment #2: bootmem-hash-267rc3.diff --]
[-- Type: text/plain, Size: 10511 bytes --]

diff -uNrp linux-2.6.7-rc3/fs/dcache.c linux-2.6.7-rc3-hash/fs/dcache.c
--- linux-2.6.7-rc3/fs/dcache.c	2004-06-11 09:36:25.000000000 +0100
+++ linux-2.6.7-rc3-hash/fs/dcache.c	2004-06-11 10:52:23.000000000 +0100
@@ -30,6 +30,7 @@
 #include <linux/security.h>
 #include <linux/seqlock.h>
 #include <linux/swap.h>
+#include <linux/bootmem.h>
 
 #define DCACHE_PARANOIA 1
 /* #define DCACHE_DEBUG 1 */
@@ -1561,13 +1562,31 @@ static int __init set_dhash_entries(char
 }
 __setup("dhash_entries=", set_dhash_entries);
 
-static void __init dcache_init(unsigned long mempages)
+static void __init dcache_init_early(void)
 {
-	struct hlist_head *d;
-	unsigned long order;
-	unsigned int nr_hash;
-	int i;
+	struct hlist_head *p;
+	int loop;
+
+	dentry_hashtable =
+		alloc_large_system_hash("Dentry cache",
+					sizeof(struct hlist_head),
+					dhash_entries,
+					13,
+					0,
+					&d_hash_shift,
+					&d_hash_mask);
+
+	p = dentry_hashtable;
+	loop = 1 << d_hash_shift;
+	do {
+		INIT_HLIST_HEAD(p);
+		p++;
+		loop--;
+	} while (loop);
+}
 
+static void __init dcache_init(unsigned long mempages)
+{
 	/* 
 	 * A constructor could be added for stable state like the lists,
 	 * but it is probably not worth it because of the cache nature
@@ -1580,45 +1599,6 @@ static void __init dcache_init(unsigned 
 					 NULL, NULL);
 	
 	set_shrinker(DEFAULT_SEEKS, shrink_dcache_memory);
-
-	if (!dhash_entries)
-		dhash_entries = PAGE_SHIFT < 13 ?
-				mempages >> (13 - PAGE_SHIFT) :
-				mempages << (PAGE_SHIFT - 13);
-
-	dhash_entries *= sizeof(struct hlist_head);
-	for (order = 0; ((1UL << order) << PAGE_SHIFT) < dhash_entries; order++)
-		;
-
-	do {
-		unsigned long tmp;
-
-		nr_hash = (1UL << order) * PAGE_SIZE /
-			sizeof(struct hlist_head);
-		d_hash_mask = (nr_hash - 1);
-
-		tmp = nr_hash;
-		d_hash_shift = 0;
-		while ((tmp >>= 1UL) != 0UL)
-			d_hash_shift++;
-
-		dentry_hashtable = (struct hlist_head *)
-			__get_free_pages(GFP_ATOMIC, order);
-	} while (dentry_hashtable == NULL && --order >= 0);
-
-	printk(KERN_INFO "Dentry cache hash table entries: %d (order: %ld, %ld bytes)\n",
-			nr_hash, order, (PAGE_SIZE << order));
-
-	if (!dentry_hashtable)
-		panic("Failed to allocate dcache hash table\n");
-
-	d = dentry_hashtable;
-	i = nr_hash;
-	do {
-		INIT_HLIST_HEAD(d);
-		d++;
-		i--;
-	} while (i);
 }
 
 /* SLAB cache for __getname() consumers */
@@ -1632,6 +1612,12 @@ EXPORT_SYMBOL(d_genocide);
 extern void bdev_cache_init(void);
 extern void chrdev_init(void);
 
+void __init vfs_caches_init_early(void)
+{
+	dcache_init_early();
+	inode_init_early();
+}
+
 void __init vfs_caches_init(unsigned long mempages)
 {
 	unsigned long reserve;
diff -uNrp linux-2.6.7-rc3/fs/inode.c linux-2.6.7-rc3-hash/fs/inode.c
--- linux-2.6.7-rc3/fs/inode.c	2004-06-11 09:36:27.000000000 +0100
+++ linux-2.6.7-rc3-hash/fs/inode.c	2004-06-11 10:52:48.744705411 +0100
@@ -20,6 +20,7 @@
 #include <linux/security.h>
 #include <linux/pagemap.h>
 #include <linux/cdev.h>
+#include <linux/bootmem.h>
 
 /*
  * This is needed for the following functions:
@@ -1345,54 +1346,35 @@ __setup("ihash_entries=", set_ihash_entr
 /*
  * Initialize the waitqueues and inode hash table.
  */
-void __init inode_init(unsigned long mempages)
+void __init inode_init_early(void)
 {
-	struct hlist_head *head;
-	unsigned long order;
-	unsigned int nr_hash;
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(i_wait_queue_heads); i++)
-		init_waitqueue_head(&i_wait_queue_heads[i].wqh);
+	struct hlist_head *p;
+	int loop;
 
-	if (!ihash_entries)
-		ihash_entries = PAGE_SHIFT < 14 ?
-				mempages >> (14 - PAGE_SHIFT) :
-				mempages << (PAGE_SHIFT - 14);
-
-	ihash_entries *= sizeof(struct hlist_head);
-	for (order = 0; ((1UL << order) << PAGE_SHIFT) < ihash_entries; order++)
-		;
+	inode_hashtable =
+		alloc_large_system_hash("Inode-cache",
+					sizeof(struct hlist_head),
+					ihash_entries,
+					14,
+					0,
+					&i_hash_shift,
+					&i_hash_mask);
 
+	p = inode_hashtable;
+	loop = 1 << i_hash_shift;
 	do {
-		unsigned long tmp;
-
-		nr_hash = (1UL << order) * PAGE_SIZE /
-			sizeof(struct hlist_head);
-		i_hash_mask = (nr_hash - 1);
-
-		tmp = nr_hash;
-		i_hash_shift = 0;
-		while ((tmp >>= 1UL) != 0UL)
-			i_hash_shift++;
-
-		inode_hashtable = (struct hlist_head *)
-			__get_free_pages(GFP_ATOMIC, order);
-	} while (inode_hashtable == NULL && --order >= 0);
-
-	printk("Inode-cache hash table entries: %d (order: %ld, %ld bytes)\n",
-			nr_hash, order, (PAGE_SIZE << order));
+		INIT_HLIST_HEAD(p);
+		p++;
+		loop--;
+	} while (loop);
+}
 
-	if (!inode_hashtable)
-		panic("Failed to allocate inode hash table\n");
+void __init inode_init(unsigned long mempages)
+{
+	int i;
 
-	head = inode_hashtable;
-	i = nr_hash;
-	do {
-		INIT_HLIST_HEAD(head);
-		head++;
-		i--;
-	} while (i);
+	for (i = 0; i < ARRAY_SIZE(i_wait_queue_heads); i++)
+		init_waitqueue_head(&i_wait_queue_heads[i].wqh);
 
 	/* inode slab cache */
 	inode_cachep = kmem_cache_create("inode_cache", sizeof(struct inode),
diff -uNrp linux-2.6.7-rc3/include/linux/bootmem.h linux-2.6.7-rc3-hash/include/linux/bootmem.h
--- linux-2.6.7-rc3/include/linux/bootmem.h	2004-06-11 09:25:03.000000000 +0100
+++ linux-2.6.7-rc3-hash/include/linux/bootmem.h	2004-06-11 10:38:23.000000000 +0100
@@ -67,4 +67,12 @@ extern void * __init __alloc_bootmem_nod
 	__alloc_bootmem_node((pgdat), (x), PAGE_SIZE, 0)
 #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
 
+extern void *__init alloc_large_system_hash(const char *tablename,
+					    unsigned long bucketsize,
+					    unsigned long numentries,
+					    int scale,
+					    int consider_highmem,
+					    unsigned int *_hash_shift,
+					    unsigned int *_hash_mask);
+
 #endif /* _LINUX_BOOTMEM_H */
diff -uNrp linux-2.6.7-rc3/include/linux/fs.h linux-2.6.7-rc3-hash/include/linux/fs.h
--- linux-2.6.7-rc3/include/linux/fs.h	2004-06-11 09:36:36.000000000 +0100
+++ linux-2.6.7-rc3-hash/include/linux/fs.h	2004-06-11 10:46:44.290698753 +0100
@@ -221,6 +221,7 @@ extern int leases_enable, dir_notify_ena
 extern void update_atime (struct inode *);
 
 extern void inode_init(unsigned long);
+extern void inode_init_early(void);
 extern void mnt_init(unsigned long);
 extern void files_init(unsigned long);
 
@@ -1199,6 +1200,7 @@ extern int filp_close(struct file *, fl_
 extern char * getname(const char __user *);
 
 /* fs/dcache.c */
+extern void vfs_caches_init_early(void);
 extern void vfs_caches_init(unsigned long);
 
 #define __getname()	kmem_cache_alloc(names_cachep, SLAB_KERNEL)
diff -uNrp linux-2.6.7-rc3/include/linux/mmzone.h linux-2.6.7-rc3-hash/include/linux/mmzone.h
--- linux-2.6.7-rc3/include/linux/mmzone.h	2004-06-11 09:36:36.000000000 +0100
+++ linux-2.6.7-rc3-hash/include/linux/mmzone.h	2004-06-11 09:44:41.531812981 +0100
@@ -20,6 +20,18 @@
 #define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER
 #endif
 
+/*
+ * system hash table size limits
+ * - on large memory machines, we may want to allocate a bigger hash than that
+ *   permitted by MAX_ORDER, so we allocate with the bootmem allocator, and are
+ *   limited to this size
+ */
+#if MAX_ORDER > 14
+#define MAX_SYS_HASH_TABLE_ORDER MAX_ORDER
+#else
+#define MAX_SYS_HASH_TABLE_ORDER 14
+#endif
+
 struct free_area {
 	struct list_head	free_list;
 	unsigned long		*map;
diff -uNrp linux-2.6.7-rc3/init/main.c linux-2.6.7-rc3-hash/init/main.c
--- linux-2.6.7-rc3/init/main.c	2004-06-11 09:36:36.565153337 +0100
+++ linux-2.6.7-rc3-hash/init/main.c	2004-06-11 10:48:37.028742509 +0100
@@ -454,6 +454,7 @@ asmlinkage void __init start_kernel(void
 		initrd_start = 0;
 	}
 #endif
+	vfs_caches_init_early();
 	mem_init();
 	kmem_cache_init();
 	if (late_time_init)
diff -uNrp linux-2.6.7-rc3/mm/page_alloc.c linux-2.6.7-rc3-hash/mm/page_alloc.c
--- linux-2.6.7-rc3/mm/page_alloc.c	2004-06-11 09:36:36.000000000 +0100
+++ linux-2.6.7-rc3-hash/mm/page_alloc.c	2004-06-11 11:21:59.761918397 +0100
@@ -55,6 +55,9 @@ EXPORT_SYMBOL(zone_table);
 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
 int min_free_kbytes = 1024;
 
+static unsigned long __initdata nr_kernel_pages;
+static unsigned long __initdata nr_all_pages;
+
 /*
  * Temporary debugging check for pages not lying within a given zone.
  */
@@ -1454,6 +1457,10 @@ static void __init free_area_init_core(s
 		if (zholes_size)
 			realsize -= zholes_size[j];
 
+		if (j == ZONE_DMA || j == ZONE_NORMAL)
+			nr_kernel_pages += realsize;
+		nr_all_pages += realsize;
+
 		zone->spanned_pages = size;
 		zone->present_pages = realsize;
 		zone->name = zone_names[j];
@@ -1994,3 +2001,78 @@ int lower_zone_protection_sysctl_handler
 	setup_per_zone_protection();
 	return 0;
 }
+
+/*
+ * allocate a large system hash table from bootmem
+ * - it is assumed that the hash table must contain an exact power-of-2
+ *   quantity of entries
+ */
+static inline int log2(unsigned long x) __attribute__((pure));
+static inline int log2(unsigned long x)
+{
+	int r = 0;
+	for (x >>= 1; x > 0; x >>= 1)
+		r++;
+	return r;
+}
+
+void *__init alloc_large_system_hash(const char *tablename,
+				     unsigned long bucketsize,
+				     unsigned long numentries,
+				     int scale,
+				     int consider_highmem,
+				     unsigned int *_hash_shift,
+				     unsigned int *_hash_mask)
+{
+	unsigned long mem, max, log2qty, size;
+	void *table;
+
+	/* round applicable memory size up to nearest megabyte */
+	mem = consider_highmem ? nr_all_pages : nr_kernel_pages;
+	mem += (1UL << (20 - PAGE_SHIFT)) - 1;
+	mem >>= 20 - PAGE_SHIFT;
+	mem <<= 20 - PAGE_SHIFT;
+
+	/* limit to 1 bucket per 2^scale bytes of low memory (rounded up to
+	 * nearest power of 2 in size) */
+	if (scale > PAGE_SHIFT)
+		mem >>= (scale - PAGE_SHIFT);
+	else
+		mem <<= (PAGE_SHIFT - scale);
+
+	mem = 1UL << (log2(mem) + 1);
+
+	/* limit allocation size */
+	max = (1UL << (PAGE_SHIFT + MAX_SYS_HASH_TABLE_ORDER)) / bucketsize;
+	if (max > mem)
+		max = mem;
+
+	/* allow the kernel cmdline to have a say */
+	if (!numentries || numentries > max)
+		numentries = max;
+
+	log2qty = log2(numentries);
+
+	do {
+		size = bucketsize << log2qty;
+
+		table = (void *) alloc_bootmem(size);
+
+	} while (!table && size > PAGE_SIZE);
+
+	if (!table)
+		panic("Failed to allocate %s hash table\n", tablename);
+
+	printk("%s hash table entries: %d (order: %d, %lu bytes)\n",
+	       tablename,
+	       (1U << log2qty),
+	       log2(size) - PAGE_SHIFT,
+	       size);
+
+	if (_hash_shift)
+		*_hash_shift = log2qty;
+	if (_hash_mask)
+		*_hash_mask = (1 << log2qty) - 1;
+
+	return table;
+}

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Permit inode & dentry hash tables to be allocated > MAX_ORDER size
  2004-06-11 10:44 David Howells
@ 2004-06-11 10:48 ` Andrew Morton
  2004-06-11 11:12   ` David Howells
  0 siblings, 1 reply; 16+ messages in thread
From: Andrew Morton @ 2004-06-11 10:48 UTC (permalink / raw)
  To: David Howells; +Cc: torvalds, linux-kernel

David Howells <dhowells@redhat.com> wrote:
>
>  Here's a patch to allocate memory for big system hash tables with the bootmem
>  allocator rather than with main page allocator.

umm, why?

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Permit inode & dentry hash tables to be allocated > MAX_ORDER size
  2004-06-11 10:48 ` Andrew Morton
@ 2004-06-11 11:12   ` David Howells
  2004-06-11 22:04     ` Andrew Morton
  0 siblings, 1 reply; 16+ messages in thread
From: David Howells @ 2004-06-11 11:12 UTC (permalink / raw)
  To: Andrew Morton; +Cc: torvalds, linux-kernel


> >  Here's a patch to allocate memory for big system hash tables with the bootmem
> >  allocator rather than with main page allocator.
> 
> umm, why?

Three reasons:

 (1) So that the size can be bigger than MAX_ORDER. IBM have done some testing
     on their big PPC64 systems (64GB of RAM) with linux-2.4 and found that
     they get better performance if the sizes of the inode cache hash, dentry
     cache hash, buffer head hash and page cache hash are increased beyond
     MAX_ORDER (order 11).

     Now the main allocator can't allocate anything larger than MAX_ORDER, but
     the bootmem allocator can.

     In 2.6 it appears that only the inode and dentry hashes remain of those
     four, but there are other hash tables that could use this service.

 (2) Changing MAX_ORDER appears to have a number of effects beyond just
     limiting the maximum size that can be allocated in one go.

 (3) Should someone want a hash table in which each bucket isn't a power of two
     in size, memory will be wasted as the chunk of memory allocated will be a
     power of two in size (to hold a power of two number of buckets).

     On the other hand, using the bootmem allocator means the allocation will
     only take up sufficient pages to hold it, rather than the next power of
     two up.

     Admittedly, this point doesn't apply to the dentry and inode hashes, but
     it might to another hash table that might want to use this service.

David

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Permit inode & dentry hash tables to be allocated > MAX_ORDER size
  2004-06-11 11:12   ` David Howells
@ 2004-06-11 22:04     ` Andrew Morton
  2004-06-11 23:03       ` Martin J. Bligh
  0 siblings, 1 reply; 16+ messages in thread
From: Andrew Morton @ 2004-06-11 22:04 UTC (permalink / raw)
  To: David Howells; +Cc: torvalds, linux-kernel

David Howells <dhowells@redhat.com> wrote:
>
>  (2) Changing MAX_ORDER appears to have a number of effects beyond just
>      limiting the maximum size that can be allocated in one go.

Several architectures implement CONFIG_FORCE_MAX_ZONEORDER and I haven't
heard of larger MAX_ORDERs causing problems.

Certainly, increasing MAX_ORDER is the simplest solution to the problems
which you identify so we need to substantiate these "number of effects"
much more than this please.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Permit inode & dentry hash tables to be allocated > MAX_ORDER size
  2004-06-11 22:04     ` Andrew Morton
@ 2004-06-11 23:03       ` Martin J. Bligh
  2004-06-11 23:19         ` Andrew Morton
  0 siblings, 1 reply; 16+ messages in thread
From: Martin J. Bligh @ 2004-06-11 23:03 UTC (permalink / raw)
  To: Andrew Morton, David Howells; +Cc: torvalds, linux-kernel, Andy Whitcroft

>>  (2) Changing MAX_ORDER appears to have a number of effects beyond just
>>      limiting the maximum size that can be allocated in one go.
> 
> Several architectures implement CONFIG_FORCE_MAX_ZONEORDER and I haven't
> heard of larger MAX_ORDERs causing problems.
> 
> Certainly, increasing MAX_ORDER is the simplest solution to the problems
> which you identify so we need to substantiate these "number of effects"
> much more than this please.

We've hit a problem with alignment issues where the start of the zone is
aligned to 16MB, for instance, and the max grouping is now 256MB. That
generates a "warning: wrong zone alignment: it will crash" error (or
something similar). Andy sent me a patch this morning to throw away
the lower section, which is much nicer than crashing ... but I'd prefer
not to throw that RAM away unless we have to. 

Allocating the big-assed hashes out of bootmem seems much cleaner to me,
at least ...

M.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Permit inode & dentry hash tables to be allocated > MAX_ORDER size
  2004-06-11 23:19         ` Andrew Morton
@ 2004-06-11 23:18           ` Martin J. Bligh
  2004-06-11 23:30             ` Andrew Morton
  2004-06-13 16:09           ` Linus Torvalds
  1 sibling, 1 reply; 16+ messages in thread
From: Martin J. Bligh @ 2004-06-11 23:18 UTC (permalink / raw)
  To: Andrew Morton; +Cc: dhowells, torvalds, linux-kernel, apw

--On Friday, June 11, 2004 16:19:20 -0700 Andrew Morton <akpm@osdl.org> wrote:

> "Martin J. Bligh" <mbligh@aracnet.com> wrote:
>> 
>> We've hit a problem with alignment issues where the start of the zone is
>> aligned to 16MB, for instance, and the max grouping is now 256MB. That
>> generatates a "warning: wrong zone alignment: it will crash" error (or
>> something similar). Andy sent me a patch this morning to throw away
>> the lower section, which is much nicer than crashing ... but I'd prefer
>> not to throw that RAM away unless we have to. 
> 
> Confused.  Why do we have that test in there at all?  We should just toss
> the pages one at a time into the buddy list and let the normal coalescing
> work it out.  That way we'd end up with a single 16MB "page" followed by N
> 256MB "pages".

That's what I thought ... Andy looked at it more than I did, but I think
he's asleep, unfortunately. IIRC, he said the buddy stuff keys off 
zone_start_pfn though. Maybe that's fixable ...

M.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Permit inode & dentry hash tables to be allocated > MAX_ORDER size
  2004-06-11 23:03       ` Martin J. Bligh
@ 2004-06-11 23:19         ` Andrew Morton
  2004-06-11 23:18           ` Martin J. Bligh
  2004-06-13 16:09           ` Linus Torvalds
  0 siblings, 2 replies; 16+ messages in thread
From: Andrew Morton @ 2004-06-11 23:19 UTC (permalink / raw)
  To: Martin J. Bligh; +Cc: dhowells, torvalds, linux-kernel, apw

"Martin J. Bligh" <mbligh@aracnet.com> wrote:
>
> We've hit a problem with alignment issues where the start of the zone is
> aligned to 16MB, for instance, and the max grouping is now 256MB. That
> generatates a "warning: wrong zone alignment: it will crash" error (or
> something similar). Andy sent me a patch this morning to throw away
> the lower section, which is much nicer than crashing ... but I'd prefer
> not to throw that RAM away unless we have to. 

Confused.  Why do we have that test in there at all?  We should just toss
the pages one at a time into the buddy list and let the normal coalescing
work it out.  That way we'd end up with a single 16MB "page" followed by N
256MB "pages".

> Allocating the big-assed hashes out of bootmem seems much cleaner to me,
> at least ...

Maybe.  That code seems fragile and I have premonitions of unhappy arch
maintainers.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Permit inode & dentry hash tables to be allocated > MAX_ORDER size
  2004-06-11 23:18           ` Martin J. Bligh
@ 2004-06-11 23:30             ` Andrew Morton
  2004-06-12 12:45               ` Andy Whitcroft
  0 siblings, 1 reply; 16+ messages in thread
From: Andrew Morton @ 2004-06-11 23:30 UTC (permalink / raw)
  To: Martin J. Bligh; +Cc: dhowells, torvalds, linux-kernel, apw

"Martin J. Bligh" <mbligh@aracnet.com> wrote:
>
> --On Friday, June 11, 2004 16:19:20 -0700 Andrew Morton <akpm@osdl.org> wrote:
> 
> > "Martin J. Bligh" <mbligh@aracnet.com> wrote:
> >> 
> >> We've hit a problem with alignment issues where the start of the zone is
> >> aligned to 16MB, for instance, and the max grouping is now 256MB. That
> >> generatates a "warning: wrong zone alignment: it will crash" error (or
> >> something similar). Andy sent me a patch this morning to throw away
> >> the lower section, which is much nicer than crashing ... but I'd prefer
> >> not to throw that RAM away unless we have to. 
> > 
> > Confused.  Why do we have that test in there at all?  We should just toss
> > the pages one at a time into the buddy list and let the normal coalescing
> > work it out.  That way we'd end up with a single 16MB "page" followed by N
> > 256MB "pages".
> 
> That's what I thought ... Andy looked at it more than I did, but I think
> he's asleep, unfortunately. IIRC, he said the buddy stuff keys off 
> zone_start_pfn though. Maybe that's fixable ...

Doesn't look that way.  It uses

	(zone->zone_mem_map - page) >> (1 + order)


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Permit inode & dentry hash tables to be allocated > MAX_ORDER size
       [not found]   ` <263jX-5RZ-17@gated-at.bofh.it>
@ 2004-06-12  0:21     ` Andi Kleen
  2004-06-12  6:00       ` Martin J. Bligh
  2004-06-12  7:36       ` Dave Hansen
  0 siblings, 2 replies; 16+ messages in thread
From: Andi Kleen @ 2004-06-12  0:21 UTC (permalink / raw)
  To: Martin J. Bligh; +Cc: torvalds, linux-kernel, Andy Whitcroft, akpm

"Martin J. Bligh" <mbligh@aracnet.com> writes:
>
> Allocating the big-assed hashes out of bootmem seems much cleaner to me,
> at least ...

Machines big enough that such big hashes make sense are probably NUMA.
And on NUMA systems you imho should rather use node interleaving vmalloc(),
not a big physical allocation on a specific node for these hashes. 
This will avoid memory controller hot spots and avoid the problem completely.
Likely it will perform better too.

-Andi


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Permit inode & dentry hash tables to be allocated > MAX_ORDER size
  2004-06-12  0:21     ` [PATCH] Permit inode & dentry hash tables to be allocated > MAX_ORDER size Andi Kleen
@ 2004-06-12  6:00       ` Martin J. Bligh
  2004-06-12  7:36       ` Dave Hansen
  1 sibling, 0 replies; 16+ messages in thread
From: Martin J. Bligh @ 2004-06-12  6:00 UTC (permalink / raw)
  To: Andi Kleen; +Cc: torvalds, linux-kernel, Andy Whitcroft, akpm

>> Allocating the big-assed hashes out of bootmem seems much cleaner to me,
>> at least ...
> 
> Machines big enough that such big hashes make sense are probably NUMA.
> And on NUMA systems you imho should rather use node interleaving vmalloc(),
> not a bit physical allocation on a specific node for these hashes. 
> This will avoid memory controller hot spots and avoid the problem completely.
> Likely it will perform better too.

I thought Manfred already fixed all that up, didn't he?

M.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Permit inode & dentry hash tables to be allocated > MAX_ORDER size
  2004-06-12  0:21     ` [PATCH] Permit inode & dentry hash tables to be allocated > MAX_ORDER size Andi Kleen
  2004-06-12  6:00       ` Martin J. Bligh
@ 2004-06-12  7:36       ` Dave Hansen
  2004-06-12 13:11         ` Andi Kleen
  1 sibling, 1 reply; 16+ messages in thread
From: Dave Hansen @ 2004-06-12  7:36 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Martin J. Bligh, torvalds, Linux Kernel Mailing List,
	Andy Whitcroft, Andrew Morton

On Fri, 2004-06-11 at 17:21, Andi Kleen wrote:
> "Martin J. Bligh" <mbligh@aracnet.com> writes:
> >
> > Allocating the big-assed hashes out of bootmem seems much cleaner to me,
> > at least ...
> 
> Machines big enough that such big hashes make sense are probably NUMA.
> And on NUMA systems you imho should rather use node interleaving vmalloc(),
> not a bit physical allocation on a specific node for these hashes. 
> This will avoid memory controller hot spots and avoid the problem completely.
> Likely it will perform better too.

Since vmalloc() maps the pages with small pagetable entries (unlike most
of the rest of the kernel address space), do you think the interleaving
will outweigh any negative TLB effects?  

-- Dave


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Permit inode & dentry hash tables to be allocated > MAX_ORDER size
  2004-06-11 23:30             ` Andrew Morton
@ 2004-06-12 12:45               ` Andy Whitcroft
  0 siblings, 0 replies; 16+ messages in thread
From: Andy Whitcroft @ 2004-06-12 12:45 UTC (permalink / raw)
  To: Andrew Morton, Martin J. Bligh; +Cc: dhowells, torvalds, linux-kernel

--On 11 June 2004 16:30 -0700 Andrew Morton <akpm@osdl.org> wrote:

> Doesn't look that way.  It uses
>
> 	(zone->zone_mem_map - page) >> (1 + order)

Hmmm, yes.  Does this not mean that we will violate the invariant that an object of size N will be aligned at size N?  Presumably that's why the error says 'it'll crash'.  If we are two pages offset we will allocate 4 page objects 2 page aligned.  I'll have a closer look and see if we can just 'round down' the zone_mem_map pointer here to give the correct alignment.  As long as we don't try and free them into the allocator originally we should be ok, as they are marked allocated in the bitmaps at the start.

-apw

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Permit inode & dentry hash tables to be allocated > MAX_ORDER size
  2004-06-12  7:36       ` Dave Hansen
@ 2004-06-12 13:11         ` Andi Kleen
  2004-06-12 15:00           ` Martin J. Bligh
  0 siblings, 1 reply; 16+ messages in thread
From: Andi Kleen @ 2004-06-12 13:11 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Andi Kleen, Martin J. Bligh, torvalds, Linux Kernel Mailing List,
	Andy Whitcroft, Andrew Morton

> Since vmalloc() maps the pages with small pagetable entries (unlike most
> of the rest of the kernel address space), do you think the interleaving
> will outweigh any negative TLB effects?  

I think so, yes (assuming you run the benchmark on all CPUs)

-Andi

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Permit inode & dentry hash tables to be allocated > MAX_ORDER size
  2004-06-12 13:11         ` Andi Kleen
@ 2004-06-12 15:00           ` Martin J. Bligh
  2004-06-15 16:40             ` Dave Hansen
  0 siblings, 1 reply; 16+ messages in thread
From: Martin J. Bligh @ 2004-06-12 15:00 UTC (permalink / raw)
  To: Andi Kleen, Dave Hansen
  Cc: torvalds, Linux Kernel Mailing List, Andy Whitcroft,
	Andrew Morton

>> Since vmalloc() maps the pages with small pagetable entries (unlike most
>> of the rest of the kernel address space), do you think the interleaving
>> will outweigh any negative TLB effects?  
> 
> I think so, yes (assuming you run the benchmark on all CPUs)

On the other hand, there's no reason we can't hack up a version of vmalloc
to use large pages, and interleave only based on that. 

M.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Permit inode & dentry hash tables to be allocated > MAX_ORDER size
  2004-06-11 23:19         ` Andrew Morton
  2004-06-11 23:18           ` Martin J. Bligh
@ 2004-06-13 16:09           ` Linus Torvalds
  1 sibling, 0 replies; 16+ messages in thread
From: Linus Torvalds @ 2004-06-13 16:09 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Martin J. Bligh, dhowells, linux-kernel, apw



On Fri, 11 Jun 2004, Andrew Morton wrote:
> 
> Confused.  Why do we have that test in there at all?  We should just toss
> the pages one at a time into the buddy list and let the normal coalescing
> work it out.  That way we'd end up with a single 16MB "page" followed by N
> 256MB "pages".

Doesn't work that way. We use the base of the memory area as the "zero
point", and while the buddy allocator itself shouldn't really care where
that zero-point is, anybody who expects physical alignment would be really
upset if it doesn't get it.

So the base address has to be as aligned as anybody could ever want. And
"anybody" wants quite a bit of alignment. Largepages usually want
alignment guarantees, and on most architectures that means a minimum
_physical_ address alignment of at least a few megabytes.

So the rule really should be: make sure that the buddy system base address 
is maximally aligned, and if your memory doesn't actually _start_ at that 
alignment point, just add the pages and let the buddy allocator build up 
all the bitmaps etc for you.

		Linus

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Permit inode & dentry hash tables to be allocated > MAX_ORDER size
  2004-06-12 15:00           ` Martin J. Bligh
@ 2004-06-15 16:40             ` Dave Hansen
  0 siblings, 0 replies; 16+ messages in thread
From: Dave Hansen @ 2004-06-15 16:40 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Andi Kleen, torvalds, Linux Kernel Mailing List, Andy Whitcroft,
	Andrew Morton

On Sat, 2004-06-12 at 08:00, Martin J. Bligh wrote:
> >> Since vmalloc() maps the pages with small pagetable entries (unlike most
> >> of the rest of the kernel address space), do you think the interleaving
> >> will outweigh any negative TLB effects?  
> > 
> > I think so, yes (assuming you run the benchmark on all CPUs)
> 
> On the other hand, there's no reason we can't hack up a version of vmalloc
> to use large pages, and interleave only based on that. 

Think about ppc64 where the large page size is 16MB.  That might hurt
interleaving a bit if the structure is only 32MB.  It's better than
*everything* on node 0, but not by much.  

-- Dave


^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2004-06-15 16:43 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <263jX-5RZ-19@gated-at.bofh.it>
     [not found] ` <262nZ-56Z-5@gated-at.bofh.it>
     [not found]   ` <263jX-5RZ-17@gated-at.bofh.it>
2004-06-12  0:21     ` [PATCH] Permit inode & dentry hash tables to be allocated > MAX_ORDER size Andi Kleen
2004-06-12  6:00       ` Martin J. Bligh
2004-06-12  7:36       ` Dave Hansen
2004-06-12 13:11         ` Andi Kleen
2004-06-12 15:00           ` Martin J. Bligh
2004-06-15 16:40             ` Dave Hansen
2004-06-11 10:44 David Howells
2004-06-11 10:48 ` Andrew Morton
2004-06-11 11:12   ` David Howells
2004-06-11 22:04     ` Andrew Morton
2004-06-11 23:03       ` Martin J. Bligh
2004-06-11 23:19         ` Andrew Morton
2004-06-11 23:18           ` Martin J. Bligh
2004-06-11 23:30             ` Andrew Morton
2004-06-12 12:45               ` Andy Whitcroft
2004-06-13 16:09           ` Linus Torvalds

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox