* [PATCH] x86_64: Make the NUMA hash function nodemap allocation dynamic and remove NODEMAPSIZE
@ 2006-11-15 21:48 Amul Shah
2006-11-26 20:49 ` Andi Kleen
0 siblings, 1 reply; 10+ messages in thread
From: Amul Shah @ 2006-11-15 21:48 UTC (permalink / raw)
To: LKML, Andi Kleen, Eric Dumazet
This patch removes the statically allocated memory to NUMA node hash map
in favor of a dynamically allocated memory to node hash map (it is cache
aligned).
This patch has the nice side effect that it allows the hash map to
grow for systems that have large amounts of memory (256GB - 1TB) but suffer
from having a small PCI space tacked onto the boot node (which is
somewhere between 192MB and 512MB on the ES7000).
Signed-off-by: Amul Shah <amul.shah@unisys.com>
---
Patch applies to 2.6.19-rc4 and has been tested.
This patch needs testing on a K8 NUMA platform.
Thanks to Eric Dumazet and Andi Kleen for their improvement suggestions.
diff -upNr linux-2.6.19-rc4/arch/x86_64/kernel/e820.c linux-2.6.19-rc4-az/arch/x86_64/kernel/e820.c
--- linux-2.6.19-rc4/arch/x86_64/kernel/e820.c 2006-10-31 17:38:41.000000000 -0500
+++ linux-2.6.19-rc4-az/arch/x86_64/kernel/e820.c 2006-11-08 17:55:48.000000000 -0500
@@ -83,6 +83,12 @@ static inline int bad_addr(unsigned long
return 1;
}
+ /* NUMA memory to node map */
+ if (last >= nodemap_addr && addr < nodemap_addr + nodemap_size) {
+ *addrp = nodemap_addr + nodemap_size;
+ return 1;
+ }
+
/* XXX ramdisk image here? */
return 0;
}
diff -upNr linux-2.6.19-rc4/arch/x86_64/kernel/setup.c linux-2.6.19-rc4-az/arch/x86_64/kernel/setup.c
--- linux-2.6.19-rc4/arch/x86_64/kernel/setup.c 2006-10-31 17:38:41.000000000 -0500
+++ linux-2.6.19-rc4-az/arch/x86_64/kernel/setup.c 2006-11-08 17:55:48.000000000 -0500
@@ -445,6 +445,10 @@ void __init setup_arch(char **cmdline_p)
if (ebda_addr)
reserve_bootmem_generic(ebda_addr, ebda_size);
+ /* reserve nodemap region */
+ if (nodemap_addr)
+ reserve_bootmem_generic(nodemap_addr, nodemap_size);
+
#ifdef CONFIG_SMP
/*
* But first pinch a few for the stack/trampoline stuff
diff -upNr linux-2.6.19-rc4/arch/x86_64/mm/numa.c linux-2.6.19-rc4-az/arch/x86_64/mm/numa.c
--- linux-2.6.19-rc4/arch/x86_64/mm/numa.c 2006-10-31 17:38:41.000000000 -0500
+++ linux-2.6.19-rc4-az/arch/x86_64/mm/numa.c 2006-11-15 15:54:55.000000000 -0500
@@ -36,6 +36,8 @@ unsigned char apicid_to_node[MAX_LOCAL_A
cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
int numa_off __initdata;
+unsigned long __initdata nodemap_addr;
+unsigned long __initdata nodemap_size;
/*
@@ -52,34 +54,87 @@ populate_memnodemap(const struct bootnod
int res = -1;
unsigned long addr, end;
- if (shift >= 64)
- return -1;
- memset(memnodemap, 0xff, sizeof(memnodemap));
+ memset(memnodemap, 0xff, memnodemapsize);
for (i = 0; i < numnodes; i++) {
addr = nodes[i].start;
end = nodes[i].end;
if (addr >= end)
continue;
- if ((end >> shift) >= NODEMAPSIZE)
+ if ((end >> shift) >= memnodemapsize)
return 0;
do {
if (memnodemap[addr >> shift] != 0xff)
return -1;
memnodemap[addr >> shift] = i;
- addr += (1UL << shift);
+ addr += (1UL << shift);
} while (addr < end);
res = 1;
}
return res;
}
-int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
+static int __init allocate_cachealigned_memnodemap(void)
{
- int shift = 20;
+ unsigned long pad, pad_addr;
- while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
- shift++;
+ memnodemap = memnode.embedded_map;
+ if (memnodemapsize <= 48) {
+ printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
+ nodemap_addr, nodemap_addr + nodemap_size);
+ return 0;
+ }
+
+ pad = L1_CACHE_BYTES - 1;
+ pad_addr = 0x8000;
+ nodemap_size = pad + memnodemapsize;
+ nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT,
+ nodemap_size);
+ if (nodemap_addr == -1UL) {
+ printk(KERN_ERR
+ "NUMA: Unable to allocate Memory to Node hash map\n");
+ nodemap_addr = nodemap_size = 0;
+ return -1;
+ }
+ pad_addr = (nodemap_addr + pad) & ~pad;
+ memnodemap = phys_to_virt(pad_addr);
+
+ printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
+ nodemap_addr, nodemap_addr + nodemap_size);
+ return 0;
+}
+/*
+ * The LSB of all start and end addresses in the node map is the value of the
+ * maximum possible shift.
+ */
+static int __init
+extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes)
+{
+ int i;
+ unsigned long start, end;
+ unsigned long bitfield = 0, memtop = 0;
+
+ for (i = 0; i < numnodes; i++) {
+ start = nodes[i].start;
+ end = nodes[i].end;
+ if (start >= end)
+ continue;
+ bitfield |= start | end;
+ if (end > memtop)
+ memtop = end;
+ }
+ i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
+ memnodemapsize = (memtop >> i)+1;
+ return i;
+}
+
+int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
+{
+ int shift;
+
+ shift = extract_lsb_from_nodes(nodes, numnodes);
+ if (allocate_cachealigned_memnodemap())
+ return -1;
printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
shift);
diff -upNr linux-2.6.19-rc4/include/asm-x86_64/e820.h linux-2.6.19-rc4-az/include/asm-x86_64/e820.h
--- linux-2.6.19-rc4/include/asm-x86_64/e820.h 2006-10-31 17:39:24.000000000 -0500
+++ linux-2.6.19-rc4-az/include/asm-x86_64/e820.h 2006-11-08 17:55:48.000000000 -0500
@@ -56,6 +56,7 @@ extern void finish_e820_parsing(void);
extern struct e820map e820;
extern unsigned ebda_addr, ebda_size;
+extern unsigned long nodemap_addr, nodemap_size;
#endif/*!__ASSEMBLY__*/
#endif/*__E820_HEADER*/
diff -upNr linux-2.6.19-rc4/include/asm-x86_64/mmzone.h linux-2.6.19-rc4-az/include/asm-x86_64/mmzone.h
--- linux-2.6.19-rc4/include/asm-x86_64/mmzone.h 2006-09-19 23:42:06.000000000 -0400
+++ linux-2.6.19-rc4-az/include/asm-x86_64/mmzone.h 2006-11-15 15:26:16.000000000 -0500
@@ -11,24 +11,25 @@
#include <asm/smp.h>
-/* Should really switch to dynamic allocation at some point */
-#define NODEMAPSIZE 0x4fff
-
/* Simple perfect hash to map physical addresses to node numbers */
struct memnode {
int shift;
- u8 map[NODEMAPSIZE];
-} ____cacheline_aligned;
+ unsigned int mapsize;
+ u8 *map;
+ u8 embedded_map[64-16];
+} ____cacheline_aligned; /* total size = 64 bytes */
extern struct memnode memnode;
#define memnode_shift memnode.shift
#define memnodemap memnode.map
+#define memnodemapsize memnode.mapsize
extern struct pglist_data *node_data[];
static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
{
unsigned nid;
- VIRTUAL_BUG_ON((addr >> memnode_shift) >= NODEMAPSIZE);
+ VIRTUAL_BUG_ON(!memnodemap);
+ VIRTUAL_BUG_ON((addr >> memnode_shift) >= memnodemapsize);
nid = memnodemap[addr >> memnode_shift];
VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]);
return nid;
^ permalink raw reply [flat|nested] 10+ messages in thread* Re: [PATCH] x86_64: Make the NUMA hash function nodemap allocation dynamic and remove NODEMAPSIZE
2006-11-15 21:48 [PATCH] x86_64: Make the NUMA hash function nodemap allocation dynamic and remove NODEMAPSIZE Amul Shah
@ 2006-11-26 20:49 ` Andi Kleen
2006-11-27 10:23 ` Eric Dumazet
0 siblings, 1 reply; 10+ messages in thread
From: Andi Kleen @ 2006-11-26 20:49 UTC (permalink / raw)
To: Amul Shah; +Cc: LKML, Eric Dumazet
On Wednesday 15 November 2006 22:48, Amul Shah wrote:
> This patch removes the statically allocated memory to NUMA node hash map
> in favor of a dynamically allocated memory to node hash map (it is cache
> aligned).
>
> This patch has the nice side effect in that it allows the hash map to
> grow for systems with large amounts of memory (256GB - 1TB), but suffer
> from having small PCI space tacked onto the boot node (which is
> somewhere between 192MB to 512MB on the ES7000).
>
> Signed-off-by: Amul Shah <amul.shah@unisys.com>
>
> ---
> Patch applies to 2.6.19-rc4 and has been tested.
> This patch needs testing on a K8 NUMA platform.
> Thanks to Eric Dumazet and Andi Kleen for their improvement suggestions.
I had the patch in, but had to drop it again because it makes one of my
test systems triple fault. I haven't done much investigation yet.
BIOS-provided physical RAM map:
BIOS-e820: 0000000000000000 - 000000000009fc00 (usable)
BIOS-e820: 000000000009fc00 - 00000000000a0000 (reserved)
BIOS-e820: 00000000000e6000 - 0000000000100000 (reserved)
BIOS-e820: 0000000000100000 - 000000003ef30000 (usable)
BIOS-e820: 000000003ef30000 - 000000003ef40000 (ACPI data)
BIOS-e820: 000000003ef40000 - 000000003eff0000 (ACPI NVS)
BIOS-e820: 000000003eff0000 - 000000003f000000 (reserved)
BIOS-e820: 00000000fecf0000 - 00000000fecf1000 (reserved)
BIOS-e820: 00000000fed20000 - 00000000feda0000 (reserved)
end_pfn_map = 1043872
kernel direct mapping tables up to feda0000 @ 8000-d000
DMI 2.3 present.
No NUMA configuration found
Faking a node at 0000000000000000-000000003ef30000
<triple fault>
-Andi
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH] x86_64: Make the NUMA hash function nodemap allocation dynamic and remove NODEMAPSIZE
2006-11-26 20:49 ` Andi Kleen
@ 2006-11-27 10:23 ` Eric Dumazet
2006-11-27 15:32 ` Amul Shah
0 siblings, 1 reply; 10+ messages in thread
From: Eric Dumazet @ 2006-11-27 10:23 UTC (permalink / raw)
To: Andi Kleen; +Cc: Amul Shah, LKML
On Sunday 26 November 2006 21:49, Andi Kleen wrote:
> On Wednesday 15 November 2006 22:48, Amul Shah wrote:
> > This patch removes the statically allocated memory to NUMA node hash map
> > in favor of a dynamically allocated memory to node hash map (it is cache
> > aligned).
> >
> > This patch has the nice side effect in that it allows the hash map to
> > grow for systems with large amounts of memory (256GB - 1TB), but suffer
> > from having small PCI space tacked onto the boot node (which is
> > somewhere between 192MB to 512MB on the ES7000).
> >
> > Signed-off-by: Amul Shah <amul.shah@unisys.com>
> >
> > ---
> > Patch applies to 2.6.19-rc4 and has been tested.
> > This patch needs testing on a K8 NUMA platform.
> > Thanks to Eric Dumazet and Andi Kleen for their improvement suggestions.
>
> I had the patch in, but had to drop it again because it makes one of my
> test system triple fault. Haven't done much investigation yet.
>
> BIOS-provided physical RAM map:
> BIOS-e820: 0000000000000000 - 000000000009fc00 (usable)
> BIOS-e820: 000000000009fc00 - 00000000000a0000 (reserved)
> BIOS-e820: 00000000000e6000 - 0000000000100000 (reserved)
> BIOS-e820: 0000000000100000 - 000000003ef30000 (usable)
> BIOS-e820: 000000003ef30000 - 000000003ef40000 (ACPI data)
> BIOS-e820: 000000003ef40000 - 000000003eff0000 (ACPI NVS)
> BIOS-e820: 000000003eff0000 - 000000003f000000 (reserved)
> BIOS-e820: 00000000fecf0000 - 00000000fecf1000 (reserved)
> BIOS-e820: 00000000fed20000 - 00000000feda0000 (reserved)
> end_pfn_map = 1043872
> kernel direct mapping tables up to feda0000 @ 8000-d000
> DMI 2.3 present.
> No NUMA configuration found
> Faking a node at 0000000000000000-000000003ef30000
> <triple fault>
>
Well, I don't currently have an AMD64 test machine, so I cannot really help.
With the previous implementation, the minimum shift value was 20 (one megabyte).
If a memnode had a finer range (with chunks not a multiple of a megabyte), some
bits of memory could be ignored.
But with your fake node (0-3ef30000), Amul's patch may give a shift value of 16.
Maybe this breaks something in the kernel...
Eric
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH] x86_64: Make the NUMA hash function nodemap allocation dynamic and remove NODEMAPSIZE
2006-11-27 10:23 ` Eric Dumazet
@ 2006-11-27 15:32 ` Amul Shah
2006-11-27 15:38 ` Andi Kleen
0 siblings, 1 reply; 10+ messages in thread
From: Amul Shah @ 2006-11-27 15:32 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Andi Kleen, LKML
On Mon, 2006-11-27 at 11:23 +0100, Eric Dumazet wrote:
> On Sunday 26 November 2006 21:49, Andi Kleen wrote:
> > I had the patch in, but had to drop it again because it makes one of my
> > test system triple fault. Haven't done much investigation yet.
> >
> > No NUMA configuration found
> > Faking a node at 0000000000000000-000000003ef30000
> > <triple fault>
> >
>
> Well, I dont have currently an AMD64 test machine so I cannot really help.
>
> With previous implementation, the nimimum shift value was 20 (one megabytes)
>
> If a memnode had a finer range (with chunks not multiple of megabytes), some
> bits of memory could be ignored.
>
> But with your fake node (0-3ef30000), Amul patch may give a shift value of 16.
> Maybe this breaks something in the kernel...
I believe that this problem is related to a new patch that enhances the
fake NUMA code (see http://article.gmane.org/gmane.linux.kernel/469457).
I'll work with the submitter of said patches to make them compatible.
Hopefully that will fix the problem.
thanks,
Amul
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH] x86_64: Make the NUMA hash function nodemap allocation dynamic and remove NODEMAPSIZE
@ 2006-11-08 23:37 Amul Shah
2006-11-10 6:48 ` Andi Kleen
0 siblings, 1 reply; 10+ messages in thread
From: Amul Shah @ 2006-11-08 23:37 UTC (permalink / raw)
To: LKML, Andi Kleen
This patch removes the statically allocated memory to NUMA node hash map
in favor of a dynamically allocated memory to node hash map (it is cache
aligned).
This patch has the nice side effect that it allows the hash map to
grow for systems that have large amounts of memory (256GB - 1TB) but suffer
from having a small PCI space tacked onto the boot node (which is
somewhere between 192MB and 512MB on the ES7000).
Signed-off-by: Amul Shah <amul.shah@unisys.com>
---
Patch applies to 2.6.18 and 2.6.19-rc4 and has been tested with each.
This patch needs testing on a K8 NUMA platform.
diff -uprN linux-2.6.19-rc4/arch/x86_64/kernel/e820.c linux-2.6.19-rc4-az/arch/x86_64/kernel/e820.c
--- linux-2.6.19-rc4/arch/x86_64/kernel/e820.c 2006-10-31 17:38:41.000000000 -0500
+++ linux-2.6.19-rc4-az/arch/x86_64/kernel/e820.c 2006-11-08 17:55:48.000000000 -0500
@@ -83,6 +83,12 @@ static inline int bad_addr(unsigned long
return 1;
}
+ /* NUMA memory to node map */
+ if (last >= nodemap_addr && addr < nodemap_addr + nodemap_size) {
+ *addrp = nodemap_addr + nodemap_size;
+ return 1;
+ }
+
/* XXX ramdisk image here? */
return 0;
}
diff -uprN linux-2.6.19-rc4/arch/x86_64/kernel/setup.c linux-2.6.19-rc4-az/arch/x86_64/kernel/setup.c
--- linux-2.6.19-rc4/arch/x86_64/kernel/setup.c 2006-10-31 17:38:41.000000000 -0500
+++ linux-2.6.19-rc4-az/arch/x86_64/kernel/setup.c 2006-11-08 17:55:48.000000000 -0500
@@ -445,6 +445,10 @@ void __init setup_arch(char **cmdline_p)
if (ebda_addr)
reserve_bootmem_generic(ebda_addr, ebda_size);
+ /* reserve nodemap region */
+ if (nodemap_addr)
+ reserve_bootmem_generic(nodemap_addr, nodemap_size);
+
#ifdef CONFIG_SMP
/*
* But first pinch a few for the stack/trampoline stuff
diff -uprN linux-2.6.19-rc4/arch/x86_64/mm/numa.c linux-2.6.19-rc4-az/arch/x86_64/mm/numa.c
--- linux-2.6.19-rc4/arch/x86_64/mm/numa.c 2006-10-31 17:38:41.000000000 -0500
+++ linux-2.6.19-rc4-az/arch/x86_64/mm/numa.c 2006-11-08 19:27:00.000000000 -0500
@@ -36,6 +36,8 @@ unsigned char apicid_to_node[MAX_LOCAL_A
cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
int numa_off __initdata;
+unsigned long __initdata nodemap_addr;
+unsigned long __initdata nodemap_size;
/*
@@ -52,34 +54,82 @@ populate_memnodemap(const struct bootnod
int res = -1;
unsigned long addr, end;
- if (shift >= 64)
- return -1;
- memset(memnodemap, 0xff, sizeof(memnodemap));
+ memset(memnodemap, 0xff, memnodemapsize);
for (i = 0; i < numnodes; i++) {
addr = nodes[i].start;
end = nodes[i].end;
if (addr >= end)
continue;
- if ((end >> shift) >= NODEMAPSIZE)
+ if ((end >> shift) >= memnodemapsize)
return 0;
do {
if (memnodemap[addr >> shift] != 0xff)
return -1;
memnodemap[addr >> shift] = i;
- addr += (1UL << shift);
+ addr += (1UL << shift);
} while (addr < end);
res = 1;
}
return res;
}
-int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
+static int __init allocate_cachealigned_memnodemap(void)
+{
+ unsigned long pad, pad_addr;
+
+ pad = L1_CACHE_BYTES - 1;
+ pad_addr = 0x8000;
+ nodemap_size = pad + memnodemapsize;
+ nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT,
+ nodemap_size);
+ if (nodemap_addr == -1UL) {
+ printk(KERN_ERR
+ "NUMA: Unable to allocate Memory to Node hash map\n");
+ memnodemap = &(memnode.zero);
+ nodemap_addr = nodemap_size = 0;
+ return -1;
+ }
+ nodemap_size += nodemap_addr;
+ pad_addr = (nodemap_addr + pad) & ~pad;
+ memnodemap = phys_to_virt(pad_addr);
+
+ printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx to %lx\n",
+ nodemap_addr, nodemap_addr + nodemap_size);
+ return 0;
+}
+
+/*
+ * The LSB of all start and end addresses in the node map is the value of the
+ * maximum possible shift.
+ */
+static int __init
+extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes)
{
- int shift = 20;
+ int i;
+ unsigned long start, end;
+ unsigned long bitfield = 0, memtop = 0;
+
+ for (i = 0; i < numnodes; i++) {
+ start = nodes[i].start;
+ end = nodes[i].end;
+ if (start >= end)
+ continue;
+ bitfield |= start | end;
+ if ( end > memtop )
+ memtop = end;
+ }
+ for (i=20; !(bitfield&(1UL << i)) && i<BITS_PER_LONG; i++);
+ memnodemapsize = (memtop >> i)+1;
+ return i;
+}
- while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
- shift++;
+int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
+{
+ int shift;
+ shift = extract_lsb_from_nodes(nodes, numnodes);
+ if ( allocate_cachealigned_memnodemap() )
+ return -1;
printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
shift);
diff -uprN linux-2.6.19-rc4/include/asm-x86_64/e820.h linux-2.6.19-rc4-az/include/asm-x86_64/e820.h
--- linux-2.6.19-rc4/include/asm-x86_64/e820.h 2006-10-31 17:39:24.000000000 -0500
+++ linux-2.6.19-rc4-az/include/asm-x86_64/e820.h 2006-11-08 17:55:48.000000000 -0500
@@ -56,6 +56,7 @@ extern void finish_e820_parsing(void);
extern struct e820map e820;
extern unsigned ebda_addr, ebda_size;
+extern unsigned long nodemap_addr, nodemap_size;
#endif/*!__ASSEMBLY__*/
#endif/*__E820_HEADER*/
diff -uprN linux-2.6.19-rc4/include/asm-x86_64/mmzone.h linux-2.6.19-rc4-az/include/asm-x86_64/mmzone.h
--- linux-2.6.19-rc4/include/asm-x86_64/mmzone.h 2006-09-19 23:42:06.000000000 -0400
+++ linux-2.6.19-rc4-az/include/asm-x86_64/mmzone.h 2006-11-08 17:55:48.000000000 -0500
@@ -11,24 +11,25 @@
#include <asm/smp.h>
-/* Should really switch to dynamic allocation at some point */
-#define NODEMAPSIZE 0x4fff
-
/* Simple perfect hash to map physical addresses to node numbers */
struct memnode {
int shift;
- u8 map[NODEMAPSIZE];
+ u64 mapsize;
+ u8 *map;
+ u8 zero;
} ____cacheline_aligned;
extern struct memnode memnode;
#define memnode_shift memnode.shift
#define memnodemap memnode.map
+#define memnodemapsize memnode.mapsize
extern struct pglist_data *node_data[];
static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
{
unsigned nid;
- VIRTUAL_BUG_ON((addr >> memnode_shift) >= NODEMAPSIZE);
+ VIRTUAL_BUG_ON(!memnodemap);
+ VIRTUAL_BUG_ON((addr >> memnode_shift) >= memnodemapsize);
nid = memnodemap[addr >> memnode_shift];
VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]);
return nid;
^ permalink raw reply [flat|nested] 10+ messages in thread* Re: [PATCH] x86_64: Make the NUMA hash function nodemap allocation dynamic and remove NODEMAPSIZE
2006-11-08 23:37 Amul Shah
@ 2006-11-10 6:48 ` Andi Kleen
2006-11-10 9:43 ` Eric Dumazet
0 siblings, 1 reply; 10+ messages in thread
From: Andi Kleen @ 2006-11-10 6:48 UTC (permalink / raw)
To: Amul Shah; +Cc: LKML
> diff -uprN linux-2.6.19-rc4/arch/x86_64/kernel/e820.c linux-2.6.19-rc4-az/arch/x86_64/kernel/e820.c
> --- linux-2.6.19-rc4/arch/x86_64/kernel/e820.c 2006-10-31 17:38:41.000000000 -0500
> +++ linux-2.6.19-rc4-az/arch/x86_64/kernel/e820.c 2006-11-08 17:55:48.000000000 -0500
> @@ -83,6 +83,12 @@ static inline int bad_addr(unsigned long
> return 1;
> }
>
> + /* NUMA memory to node map */
> + if (last >= nodemap_addr && addr < nodemap_addr + nodemap_size) {
> + *addrp = nodemap_addr + nodemap_size;
> + return 1;
> + }
Using the e820 allocator will now mean it's rounded up to pages.
That will waste a bit of memory, but I suppose it's OK.
> + for (i=20; !(bitfield&(1UL << i)) && i<BITS_PER_LONG; i++);
That's find_first_bit()? Please use that.
>
> + shift = extract_lsb_from_nodes(nodes, numnodes);
> + if ( allocate_cachealigned_memnodemap() )
No extra spaces here please (and in some other places)
> + u8 *map;
> + u8 zero;
zero?
> } ____cacheline_aligned;
> extern struct memnode memnode;
> #define memnode_shift memnode.shift
> #define memnodemap memnode.map
> +#define memnodemapsize memnode.mapsize
Have you checked how much the code .text size changes because
of the pointer reference? If it's a lot, phys_to_nid might need to
be moved out of line.
-Andi
^ permalink raw reply [flat|nested] 10+ messages in thread* Re: [PATCH] x86_64: Make the NUMA hash function nodemap allocation dynamic and remove NODEMAPSIZE
2006-11-10 6:48 ` Andi Kleen
@ 2006-11-10 9:43 ` Eric Dumazet
2006-11-10 9:46 ` Eric Dumazet
2006-11-10 9:51 ` Andi Kleen
0 siblings, 2 replies; 10+ messages in thread
From: Eric Dumazet @ 2006-11-10 9:43 UTC (permalink / raw)
To: Andi Kleen; +Cc: Amul Shah, LKML
On Friday 10 November 2006 07:48, Andi Kleen wrote:
> Have you checked how much the code .text size changes because
> of the pointer reference? If it's a lot phys_to_nid might need to
> be out of lined.
Here I also see big numbers on pfn_to_page(), on a machine with mapsize=1
(NUMA kernel, but one node)
oprofile results L1_AND_L2_DTLB_MISSES /usr/src/linux-2.6.18/vmlinux
Counted L1_AND_L2_DTLB_MISSES events (L1 and L2 DTLB misses) with a unit mask
of 0x00 (No unit mask) count 10000
ffffffff80258fa0 <pfn_to_page>: /* pfn_to_page total: 48433 0.4914 */
So adding yet another indirection (to another cache line) might hurt.
Therefore I suggest using a structure like this:
struct memnode {
int shift;
unsigned int mapsize; /* no need to use 8 bytes here */
u8 *map;
u8 embedded_map[64-8]; /* total size = 64 bytes */
} ____cacheline_aligned;
and make memnode.map point to memnode.embedded_map if mapsize <= 56 ?
This way, most AMD64 dual/quad processors won't waste a full PAGE to store a few
bytes in it, and should use only one cache line.
Thank you
Eric
^ permalink raw reply [flat|nested] 10+ messages in thread* Re: [PATCH] x86_64: Make the NUMA hash function nodemap allocation dynamic and remove NODEMAPSIZE
2006-11-10 9:43 ` Eric Dumazet
@ 2006-11-10 9:46 ` Eric Dumazet
2006-11-10 9:51 ` Andi Kleen
1 sibling, 0 replies; 10+ messages in thread
From: Eric Dumazet @ 2006-11-10 9:46 UTC (permalink / raw)
To: Andi Kleen; +Cc: Amul Shah, LKML
On Friday 10 November 2006 10:43, Eric Dumazet wrote:
>
> Therefore I suggest to use a structure like that :
>
> struct memnode {
> int shift;
> unsigned int mapsize; /* no need to use 8 bytes here */
> u8 *map;
> u8 embedded_map[64-8]; /* total size = 64 bytes */
> } ____cacheline_aligned;
>
Arg... [64 - 16] sorry
> and make memnode.map point to memnode.embedded_map if mapsize <= 56 ?
mapsize <= 48
>
> This way, most AMD64 dual/quad processors wont waste a full PAGE to store
> few bytes in it, and should use only one cache line.
>
> Thank you
>
> Eric
>
^ permalink raw reply [flat|nested] 10+ messages in thread* Re: [PATCH] x86_64: Make the NUMA hash function nodemap allocation dynamic and remove NODEMAPSIZE
2006-11-10 9:43 ` Eric Dumazet
2006-11-10 9:46 ` Eric Dumazet
@ 2006-11-10 9:51 ` Andi Kleen
1 sibling, 0 replies; 10+ messages in thread
From: Andi Kleen @ 2006-11-10 9:51 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Amul Shah, LKML
> Therefore I suggest to use a structure like that :
>
> struct memnode {
> int shift;
> unsigned int mapsize; /* no need to use 8 bytes here */
> u8 *map;
> u8 embedded_map[64-8]; /* total size = 64 bytes */
> } ____cacheline_aligned;
>
> and make memnode.map point to memnode.embedded_map if mapsize <= 56 ?
That's a good idea yes.
-Andi
^ permalink raw reply [flat|nested] 10+ messages in thread
end of thread, other threads:[~2006-11-27 15:39 UTC | newest]
Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-11-15 21:48 [PATCH] x86_64: Make the NUMA hash function nodemap allocation dynamic and remove NODEMAPSIZE Amul Shah
2006-11-26 20:49 ` Andi Kleen
2006-11-27 10:23 ` Eric Dumazet
2006-11-27 15:32 ` Amul Shah
2006-11-27 15:38 ` Andi Kleen
-- strict thread matches above, loose matches on Subject: below --
2006-11-08 23:37 Amul Shah
2006-11-10 6:48 ` Andi Kleen
2006-11-10 9:43 ` Eric Dumazet
2006-11-10 9:46 ` Eric Dumazet
2006-11-10 9:51 ` Andi Kleen
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox