diff -Nur linux-2.4.20-base/arch/ia64/config.in linux-2.4.20-dcm/arch/ia64/config.in --- linux-2.4.20-base/arch/ia64/config.in Mon Mar 3 10:24:21 2003 +++ linux-2.4.20-dcm/arch/ia64/config.in Mon Mar 3 10:55:12 2003 @@ -66,6 +66,14 @@ fi if [ "$CONFIG_IA64_GENERIC" = "y" -o "$CONFIG_IA64_DIG" = "y" -o "$CONFIG_IA64_HP_ZX1" = "y" ]; then + bool ' Enable NUMA support' CONFIG_NUMA + if [ "$CONFIG_NUMA" = "y" ]; then + define_bool CONFIG_DISCONTIGMEM y + choice 'Maximum Memory per NUMA Node' \ + "16GB CONFIG_IA64_NODESIZE_16GB \ + 64GB CONFIG_IA64_NODESIZE_64GB \ + 256GB CONFIG_IA64_NODESIZE_256GB" 16GB + fi bool ' Enable IA-64 Machine Check Abort' CONFIG_IA64_MCA define_bool CONFIG_PM y fi diff -Nur linux-2.4.20-base/arch/ia64/kernel/acpi.c linux-2.4.20-dcm/arch/ia64/kernel/acpi.c --- linux-2.4.20-base/arch/ia64/kernel/acpi.c Mon Mar 3 10:24:21 2003 +++ linux-2.4.20-dcm/arch/ia64/kernel/acpi.c Wed Mar 12 13:36:18 2003 @@ -8,6 +8,9 @@ * Copyright (C) 2000 Intel Corp. * Copyright (C) 2000,2001 J.I. 
Lee * Copyright (C) 2001 Paul Diefenbaugh + * Copyright (C) 2001 Jenna Hall + * Copyright (C) 2001 Takayoshi Kochi + * Copyright (C) 2002 Erich Focht * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * @@ -38,11 +41,13 @@ #include #include #include +#include #include #include #include #include #include +#include #define PREFIX "ACPI: " @@ -222,7 +227,7 @@ acpi_status acpi_hp_csr_space(acpi_handle obj, u64 *csr_base, u64 *csr_length) { - int i, offset = 0; + int offset = 0; acpi_status status; acpi_buffer buf = { .length = ACPI_ALLOCATE_BUFFER, .pointer = NULL }; @@ -559,6 +564,191 @@ } +#ifdef CONFIG_ACPI_NUMA + +#define SLIT_DEBUG + +#define PXM_FLAG_LEN ((MAX_PXM_DOMAINS + 1)/32) + +static int __initdata srat_num_cpus; /* number of cpus */ +static u32 __initdata pxm_flag[PXM_FLAG_LEN]; +#define pxm_bit_set(bit) (set_bit(bit,(void *)pxm_flag)) +#define pxm_bit_test(bit) (test_bit(bit,(void *)pxm_flag)) +/* maps to convert between proximity domain and logical node ID */ +int __initdata pxm_to_nid_map[MAX_PXM_DOMAINS]; +int __initdata nid_to_pxm_map[NR_NODES]; +static struct acpi_table_slit __initdata *slit_table; + +/* + * ACPI 2.0 SLIT (System Locality Information Table) + * http://devresource.hp.com/devresource/Docs/TechPapers/IA64/slit.pdf + */ +void __init +acpi_numa_slit_init (struct acpi_table_slit *slit) +{ + u32 len; + + len = sizeof(struct acpi_table_header) + 8 + + slit->localities * slit->localities; + if (slit->header.length != len) { + printk("ACPI 2.0 SLIT: size mismatch: %d expected, %d actual\n", + len, slit->header.length); + memset(numa_slit, 10, sizeof(numa_slit)); + return; + } + slit_table = slit; +} + +void __init +acpi_numa_processor_affinity_init (struct acpi_table_processor_affinity *pa) +{ + /* record this node in proximity bitmap */ + pxm_bit_set(pa->proximity_domain); + + node_cpuid[srat_num_cpus].phys_id = (pa->apic_id << 8) | (pa->lsapic_eid); + /* nid should be overridden as logical node id later */ + 
node_cpuid[srat_num_cpus].nid = pa->proximity_domain; + srat_num_cpus++; +} + +void __init +acpi_numa_memory_affinity_init (struct acpi_table_memory_affinity *ma) +{ + unsigned long paddr, size, hole_size, min_hole_size; + u8 pxm; + struct node_memblk_s *p, *q, *pend; + + pxm = ma->proximity_domain; + + /* fill node memory chunk structure */ + paddr = ma->base_addr_hi; + paddr = (paddr << 32) | ma->base_addr_lo; + size = ma->length_hi; + size = (size << 32) | ma->length_lo; + + if (num_memblks >= NR_MEMBLKS) { + printk("Too many mem chunks in SRAT. Ignoring %ld MBytes at %lx\n", + size/(1024*1024), paddr); + return; + } + + /* Ignore disabled entries */ + if (!ma->flags.enabled) + return; + + /* + * When the chunk is not the first one in the node, check distance + * from the other chunks. When the hole is too huge ignore the chunk. + * This restriction should be removed when multiple chunks per node + * is supported. + */ + pend = &node_memblk[num_memblks]; + min_hole_size = 0; + for (p = &node_memblk[0]; p < pend; p++) { + if (p->nid != pxm) + continue; + if (p->start_paddr < paddr) + hole_size = paddr - (p->start_paddr + p->size); + else + hole_size = p->start_paddr - (paddr + size); + + if (!min_hole_size || hole_size < min_hole_size) + min_hole_size = hole_size; + } + +#if 0 /* test */ + if (min_hole_size) { + if (min_hole_size > size) { + printk("Too huge memory hole. 
Ignoring %ld MBytes at %lx\n", + size/(1024*1024), paddr); + return; + } + } +#endif + + /* record this node in proximity bitmap */ + pxm_bit_set(pxm); + + /* Insertion sort based on base address */ + pend = &node_memblk[num_memblks]; + for (p = &node_memblk[0]; p < pend; p++) { + if (paddr < p->start_paddr) + break; + } + if (p < pend) { + for (q = pend; q >= p; q--) + *(q + 1) = *q; + } + p->start_paddr = paddr; + p->size = size; + p->nid = pxm; + num_memblks++; +} + +void __init +acpi_numa_arch_fixup(void) +{ + int i, j, node_from, node_to; + + /* calculate total number of nodes in system from PXM bitmap */ + numnodes = 0; /* init total nodes in system */ + + memset(pxm_to_nid_map, -1, sizeof(pxm_to_nid_map)); + memset(nid_to_pxm_map, -1, sizeof(nid_to_pxm_map)); + for (i = 0; i < MAX_PXM_DOMAINS; i++) { + if (pxm_bit_test(i)) { + pxm_to_nid_map[i] = numnodes; + nid_to_pxm_map[numnodes++] = i; + } + } + + /* set logical node id in memory chunk structure */ + for (i = 0; i < num_memblks; i++) + node_memblk[i].nid = pxm_to_nid_map[node_memblk[i].nid]; + + /* assign memory bank numbers for each chunk on each node */ + for (i = 0; i < numnodes; i++) { + int bank; + + bank = 0; + for (j = 0; j < num_memblks; j++) + if (node_memblk[j].nid == i) + node_memblk[j].bank = bank++; + } + + /* set logical node id in cpu structure */ + for (i = 0; i < srat_num_cpus; i++) + node_cpuid[i].nid = pxm_to_nid_map[node_cpuid[i].nid]; + + printk("Number of logical nodes in system = %d\n", numnodes); + printk("Number of memory chunks in system = %d\n", num_memblks); + + if (!slit_table) return; + memset(numa_slit, -1, sizeof(numa_slit)); + for (i=0; ilocalities; i++) { + if (!pxm_bit_test(i)) + continue; + node_from = pxm_to_nid_map[i]; + for (j=0; jlocalities; j++) { + if (!pxm_bit_test(j)) + continue; + node_to = pxm_to_nid_map[j]; + node_distance(node_from, node_to) = + slit_table->entry[i*slit_table->localities + j]; + } + } + +#ifdef SLIT_DEBUG + printk("ACPI 2.0 SLIT locality 
table:\n"); + for (i = 0; i < numnodes; i++) { + for (j = 0; j < numnodes; j++) + printk("%03d ", node_distance(i,j)); + printk("\n"); + } +#endif +} +#endif /* CONFIG_ACPI_NUMA */ + static int __init acpi_parse_fadt (unsigned long phys_addr, unsigned long size) { @@ -665,12 +855,6 @@ int __init acpi_boot_init (char *cmdline) { - int result; - - /* Initialize the ACPI boot-time table parser */ - result = acpi_table_init(cmdline); - if (result) - return result; /* * MADT @@ -738,6 +922,10 @@ available_cpus = 1; /* We've got at least one of these, no? */ } smp_boot_data.cpu_count = total_cpus; + smp_build_cpu_map(); +#ifdef CONFIG_NUMA + build_cpu_to_node_map(); +#endif #endif /* Make boot-up look pretty */ printk("%d CPUs available, %d CPUs total\n", available_cpus, total_cpus); diff -Nur linux-2.4.20-base/arch/ia64/kernel/setup.c linux-2.4.20-dcm/arch/ia64/kernel/setup.c --- linux-2.4.20-base/arch/ia64/kernel/setup.c Fri Nov 29 08:53:09 2002 +++ linux-2.4.20-dcm/arch/ia64/kernel/setup.c Wed Mar 12 13:49:06 2003 @@ -34,6 +34,7 @@ #include #include +#include #include #include #include @@ -49,16 +50,9 @@ # error "struct cpuinfo_ia64 too big!" #endif -#define MIN(a,b) ((a) < (b) ? (a) : (b)) -#define MAX(a,b) ((a) > (b) ? (a) : (b)) - extern char _end; -#ifdef CONFIG_NUMA - struct cpuinfo_ia64 *boot_cpu_data; -#else struct cpuinfo_ia64 _cpu_data[NR_CPUS] __attribute__ ((section ("__special_page_section"))); -#endif unsigned long ia64_cycles_per_usec; struct ia64_boot_param *ia64_boot_param; @@ -95,6 +89,7 @@ static struct rsvd_region rsvd_region[IA64_MAX_RSVD_REGIONS + 1]; static int num_rsvd_regions; +#ifndef CONFIG_DISCONTIGMEM static unsigned long bootmap_start; /* physical address where the bootmem map is located */ static int @@ -107,18 +102,64 @@ *max_pfn = pfn; return 0; } +#endif /* !CONFIG_DISCONTIGMEM */ #define IGNORE_PFN0 1 /* XXX fix me: ignore pfn 0 until TLB miss handler is updated... 
*/ +#ifdef CONFIG_DISCONTIGMEM /* - * Free available memory based on the primitive map created from - * the boot parameters. This routine does not assume the incoming - * segments are sorted. + * efi_memmap_walk() knows nothing about layout of memory across nodes. Find + * out to which node a block of memory belongs. Ignore memory that we cannot + * identify, and split blocks that run across multiple nodes. + * + * Take this opportunity to round the start address up and the end address + * down to page boundaries. */ -static int -free_available_memory (unsigned long start, unsigned long end, void *arg) +void +call_pernode_memory (unsigned long start, unsigned long end, void *arg) +{ + unsigned long rs, re; + void (*func)(unsigned long, unsigned long, int, int); + int i; + + start = PAGE_ALIGN(start); + end &= PAGE_MASK; + if (start >= end) + return; + + func = arg; + + if (!num_memblks) { + /* this machine doesn't have SRAT, */ + /* so call func with nid=0, bank=0 */ + if (start < end) + (*func)(start, end - start, 0, 0); + return; + } + + for (i = 0; i < num_memblks; i++) { + rs = max(start, node_memblk[i].start_paddr); + re = min(end, node_memblk[i].start_paddr+node_memblk[i].size); + + if (rs < re) + (*func)(rs, re-rs, node_memblk[i].nid, + node_memblk[i].bank); + } +} +#endif /* CONFIG_DISCONTIGMEM */ + +/* + * Filter incoming memory segments based on the primitive map created from + * the boot parameters. Segments contained in the map are removed from the + * memory ranges. A caller-specified function is called with the memory + * ranges that remain after filtering. + * This routine does not assume the incoming segments are sorted. 
+ */ +int +filter_rsvd_memory (unsigned long start, unsigned long end, void *arg) { unsigned long range_start, range_end, prev_start; + void (*func)(unsigned long, unsigned long); int i; #if IGNORE_PFN0 @@ -132,13 +173,18 @@ * lowest possible address(walker uses virtual) */ prev_start = PAGE_OFFSET; + func = arg; for (i = 0; i < num_rsvd_regions; ++i) { - range_start = MAX(start, prev_start); - range_end = MIN(end, rsvd_region[i].start); + range_start = max(start, prev_start); + range_end = min(end, rsvd_region[i].start); if (range_start < range_end) - free_bootmem(__pa(range_start), range_end - range_start); +#ifdef CONFIG_DISCONTIGMEM + call_pernode_memory(__pa(range_start), __pa(range_end), func); +#else + (*func)(__pa(range_start), range_end - range_start); +#endif /* nothing more available in this segment */ if (range_end == end) return 0; @@ -150,6 +196,7 @@ } +#ifndef CONFIG_DISCONTIGMEM /* * Find a place to put the bootmap and return its starting address in bootmap_start. * This address must be page-aligned. 
@@ -171,8 +218,8 @@ free_start = PAGE_OFFSET; for (i = 0; i < num_rsvd_regions; i++) { - range_start = MAX(start, free_start); - range_end = MIN(end, rsvd_region[i].start & PAGE_MASK); + range_start = max(start, free_start); + range_end = min(end, rsvd_region[i].start & PAGE_MASK); if (range_end <= range_start) continue; /* skip over empty range */ @@ -188,6 +235,7 @@ } return 0; } +#endif /* CONFIG_DISCONTIGMEM */ static void sort_regions (struct rsvd_region *rsvd_region, int max) @@ -252,6 +300,14 @@ sort_regions(rsvd_region, num_rsvd_regions); +#ifdef CONFIG_DISCONTIGMEM + { + extern void discontig_mem_init(void); + bootmap_size = max_pfn = 0; /* stop gcc warnings */ + discontig_mem_init(); + } +#else /* !CONFIG_DISCONTIGMEM */ + /* first find highest page frame number */ max_pfn = 0; efi_memmap_walk(find_max_pfn, &max_pfn); @@ -268,8 +324,9 @@ bootmap_size = init_bootmem(bootmap_start >> PAGE_SHIFT, max_pfn); /* Free all available memory, then mark bootmem-map as being in use. */ - efi_memmap_walk(free_available_memory, 0); + efi_memmap_walk(filter_rsvd_memory, free_bootmem); reserve_bootmem(bootmap_start, bootmap_size); +#endif /* !CONFIG_DISCONTIGMEM */ #ifdef CONFIG_BLK_DEV_INITRD if (ia64_boot_param->initrd_start) { @@ -296,6 +353,19 @@ efi_init(); +#ifdef CONFIG_ACPI_BOOT + /* Initialize the ACPI boot-time table parser */ + acpi_table_init(*cmdline_p); + +#ifdef CONFIG_ACPI_NUMA + acpi_numa_init(); +#endif +#else +# ifdef CONFIG_SMP + smp_build_cpu_map(); /* happens, e.g., with the Ski simulator */ +# endif +#endif /* CONFIG_APCI_BOOT */ + iomem_resource.end = ~0UL; /* FIXME probably belongs elsewhere */ find_memory(); @@ -537,40 +607,11 @@ pal_vm_info_2_u_t vmi; unsigned int max_ctx; struct cpuinfo_ia64 *my_cpu_data; -#ifdef CONFIG_NUMA - int cpu, order; - /* - * If NUMA is configured, the cpu_data array is not preallocated. The boot cpu - * allocates entries for every possible cpu. 
As the remaining cpus come online, - * they reallocate a new cpu_data structure on their local node. This extra work - * is required because some boot code references all cpu_data structures - * before the cpus are actually started. - */ - if (!boot_cpu_data) { - my_cpu_data = alloc_bootmem_pages_node(NODE_DATA(numa_node_id()), - sizeof(struct cpuinfo_ia64)); - boot_cpu_data = my_cpu_data; - my_cpu_data->cpu_data[0] = my_cpu_data; - for (cpu = 1; cpu < NR_CPUS; ++cpu) - my_cpu_data->cpu_data[cpu] - = alloc_bootmem_pages_node(NODE_DATA(numa_node_id()), - sizeof(struct cpuinfo_ia64)); - for (cpu = 1; cpu < NR_CPUS; ++cpu) - memcpy(my_cpu_data->cpu_data[cpu]->cpu_data, - my_cpu_data->cpu_data, sizeof(my_cpu_data->cpu_data)); - } else { - order = get_order(sizeof(struct cpuinfo_ia64)); - my_cpu_data = page_address(alloc_pages_node(numa_node_id(), GFP_KERNEL, order)); - memcpy(my_cpu_data, boot_cpu_data->cpu_data[smp_processor_id()], - sizeof(struct cpuinfo_ia64)); - __free_pages(virt_to_page(boot_cpu_data->cpu_data[smp_processor_id()]), - order); - for (cpu = 0; cpu < NR_CPUS; ++cpu) - boot_cpu_data->cpu_data[cpu]->cpu_data[smp_processor_id()] = my_cpu_data; - } -#else my_cpu_data = cpu_data(smp_processor_id()); + +#ifdef CONFIG_DISCONTIGMEM + my_cpu_data->node_data = get_node_data_ptr(); #endif /* diff -Nur linux-2.4.20-base/arch/ia64/kernel/smpboot.c linux-2.4.20-dcm/arch/ia64/kernel/smpboot.c --- linux-2.4.20-base/arch/ia64/kernel/smpboot.c Mon Mar 3 10:24:21 2003 +++ linux-2.4.20-dcm/arch/ia64/kernel/smpboot.c Wed Mar 12 13:34:08 2003 @@ -575,3 +575,66 @@ smp_num_cpus = 1; } } + +/* + * Initialize the logical CPU number to SAPICID mapping + */ +void __init +smp_build_cpu_map (void) +{ + int sapicid, cpu, i; + int boot_cpu_id = hard_smp_processor_id(); + + for (cpu = 0; cpu < NR_CPUS; cpu++) + ia64_cpu_to_sapicid[cpu] = -1; + + ia64_cpu_to_sapicid[0] = boot_cpu_id; + + for (cpu = 1, i = 0; i < smp_boot_data.cpu_count; i++) { + sapicid = smp_boot_data.cpu_phys_id[i]; 
+ if (sapicid == -1 || sapicid == boot_cpu_id) + continue; + ia64_cpu_to_sapicid[cpu] = sapicid; + cpu++; + } +} + +#ifdef CONFIG_NUMA + +/* on which node is each logical CPU (one cacheline even for 64 CPUs) */ +volatile char cpu_to_node_map[NR_CPUS] __cacheline_aligned; +/* which logical CPUs are on which nodes */ +volatile unsigned long node_to_cpu_mask[MAX_NUMNODES] __cacheline_aligned; + +/* + * Build cpu to node mapping and initialize the per node cpu masks. + */ +void __init +build_cpu_to_node_map (void) +{ + int cpu, i, node; + + for(node=0; node= 0) + node_to_cpu_mask[node] |= (1UL << cpu); + } +} + +#endif /* CONFIG_NUMA */ diff -Nur linux-2.4.20-base/arch/ia64/mm/Makefile linux-2.4.20-dcm/arch/ia64/mm/Makefile --- linux-2.4.20-base/arch/ia64/mm/Makefile Mon Mar 3 10:24:21 2003 +++ linux-2.4.20-dcm/arch/ia64/mm/Makefile Mon Mar 3 10:55:12 2003 @@ -12,5 +12,7 @@ export-objs := init.o obj-y := init.o fault.o tlb.o extable.o +obj-$(CONFIG_NUMA) += numa.o +obj-$(CONFIG_DISCONTIGMEM) += discontig.o include $(TOPDIR)/Rules.make diff -Nur linux-2.4.20-base/arch/ia64/mm/discontig.c linux-2.4.20-dcm/arch/ia64/mm/discontig.c --- linux-2.4.20-base/arch/ia64/mm/discontig.c Thu Jan 1 09:00:00 1970 +++ linux-2.4.20-dcm/arch/ia64/mm/discontig.c Wed Mar 12 13:57:10 2003 @@ -0,0 +1,315 @@ +/* + * Copyright (c) 2000 Silicon Graphics, Inc. All rights reserved. + * Copyright (c) 2001 Intel Corp. + * Copyright (c) 2001 Tony Luck + * Copyright (c) 2002 NEC Corp. + * Copyright (c) 2002 Kimio Suganuma + */ + +/* + * Platform initialization for Discontig Memory + */ + +#include +#include +#include +#include +#include +#include + + +/* + * Round an address upward to the next multiple of GRANULE size. 
+ */ +#define GRANULEROUNDUP(n) (((n)+IA64_GRANULE_SIZE-1) & ~(IA64_GRANULE_SIZE-1)) + +static struct ia64_node_data *node_data[NR_NODES]; +static long boot_pg_data[8*NR_NODES+sizeof(pg_data_t)] __initdata; +static pg_data_t *pg_data_ptr[NR_NODES] __initdata; +static bootmem_data_t bdata[NR_NODES][NR_BANKS_PER_NODE+1] __initdata; + +extern int filter_rsvd_memory (unsigned long start, unsigned long end, void *arg); + +/* + * Return the compact node number of this cpu. Used prior to + * setting up the cpu_data area. + * Note - not fast, intended for boot use only!! + */ +int +boot_get_local_nodeid(void) +{ + int i; + + for (i = 0; i < NR_CPUS; i++) + if (node_cpuid[i].phys_id == hard_smp_processor_id()) + return node_cpuid[i].nid; + + /* node info missing, so nid should be 0.. */ + return 0; +} + +/* + * Return a pointer to the pg_data structure for a node. + * This function is used ONLY in early boot before the cpu_data + * structure is available. + */ +pg_data_t* __init +boot_get_pg_data_ptr(long node) +{ + return pg_data_ptr[node]; +} + + +/* + * Return a pointer to the node data for the current node. + * (boottime initialization only) + */ +struct ia64_node_data * +get_node_data_ptr(void) +{ + return node_data[boot_get_local_nodeid()]; +} + +/* + * We allocate one of the bootmem_data_t structs for each piece of memory + * that we wish to treat as a contiguous block. Each such block must start + * on a BANKSIZE boundary. Multiple banks per node is not supported. 
+ */ +static int __init +build_maps(unsigned long pstart, unsigned long length, int node) +{ + bootmem_data_t *bdp; + unsigned long cstart, epfn; + + bdp = pg_data_ptr[node]->bdata; + epfn = GRANULEROUNDUP(pstart + length) >> PAGE_SHIFT; + cstart = pstart & ~(BANKSIZE - 1); + + if (!bdp->node_low_pfn) { + bdp->node_boot_start = cstart; + bdp->node_low_pfn = epfn; + } else { + bdp->node_boot_start = min(cstart, bdp->node_boot_start); + bdp->node_low_pfn = max(epfn, bdp->node_low_pfn); + } + + min_low_pfn = min(min_low_pfn, bdp->node_boot_start>>PAGE_SHIFT); + max_low_pfn = max(max_low_pfn, bdp->node_low_pfn); + + return 0; +} + +/* + * Find space on each node for the bootmem map. + * + * Called by efi_memmap_walk to find boot memory on each node. Note that + * only blocks that are free are passed to this routine (currently filtered by + * free_available_memory). + */ +static int __init +find_bootmap_space(unsigned long pstart, unsigned long length, int node) +{ + unsigned long mapsize, pages, epfn; + bootmem_data_t *bdp; + + epfn = (pstart + length) >> PAGE_SHIFT; + bdp = &pg_data_ptr[node]->bdata[0]; + + if (pstart < bdp->node_boot_start || epfn > bdp->node_low_pfn) + return 0; + + if (!bdp->node_bootmem_map) { + pages = bdp->node_low_pfn - (bdp->node_boot_start>>PAGE_SHIFT); + mapsize = bootmem_bootmap_pages(pages) << PAGE_SHIFT; + if (length > mapsize) { + init_bootmem_node( + BOOT_NODE_DATA(node), + pstart>>PAGE_SHIFT, + bdp->node_boot_start>>PAGE_SHIFT, + bdp->node_low_pfn); + } + + } + + return 0; +} + + +/* + * Free available memory to the bootmem allocator. + * + * Note that only blocks that are free are passed to this routine (currently + * filtered by free_available_memory). + * + */ +static int __init +discontig_free_bootmem_node(unsigned long pstart, unsigned long length, int node) +{ + free_bootmem_node(BOOT_NODE_DATA(node), pstart, length); + + return 0; +} + + +/* + * Reserve the space used by the bootmem maps. 
+ */ +static void __init +discontig_reserve_bootmem(void) +{ + int node; + unsigned long mapbase, mapsize, pages; + bootmem_data_t *bdp; + + for (node = 0; node < numnodes; node++) { + bdp = BOOT_NODE_DATA(node)->bdata; + + pages = bdp->node_low_pfn - (bdp->node_boot_start>>PAGE_SHIFT); + mapsize = bootmem_bootmap_pages(pages) << PAGE_SHIFT; + mapbase = __pa(bdp->node_bootmem_map); + reserve_bootmem_node(BOOT_NODE_DATA(node), mapbase, mapsize); + } +} + +/* + * Allocate per node tables. + * - the pg_data structure is allocated on each node. This minimizes offnode + * memory references + * - the node data is allocated & initialized. Portions of this structure is read-only (after + * boot) and contains node-local pointers to usefuls data structures located on + * other nodes. + * + * We also switch to using the "real" pg_data structures at this point. Earlier in boot, we + * use a different structure. The only use for pg_data prior to the point in boot is to get + * the pointer to the bdata for the node. + */ +static void __init +allocate_pernode_structures(void) +{ + pg_data_t *pgdat=0, *new_pgdat_list=0; + int node, mynode; + + mynode = boot_get_local_nodeid(); + for (node = numnodes - 1; node >= 0 ; node--) { + node_data[node] = alloc_bootmem_node(BOOT_NODE_DATA(node), sizeof (struct ia64_node_data)); + pgdat = __alloc_bootmem_node(BOOT_NODE_DATA(node), sizeof(pg_data_t), SMP_CACHE_BYTES, 0); + pgdat->bdata = &(bdata[node][0]); + pg_data_ptr[node] = pgdat; + pgdat->node_next = new_pgdat_list; + new_pgdat_list = pgdat; + } + + memcpy(node_data[mynode]->pg_data_ptrs, pg_data_ptr, sizeof(pg_data_ptr)); + memcpy(node_data[mynode]->node_data_ptrs, node_data, sizeof(node_data)); + + pgdat_list = new_pgdat_list; +} + +/* + * Called early in boot to setup the boot memory allocator, and to + * allocate the node-local pg_data & node-directory data structures.. 
+ */ +void __init +discontig_mem_init(void) +{ + int node; + + if (numnodes == 0) { + printk("node info missing!\n"); + numnodes = 1; + } + + for (node = 0; node < numnodes; node++) { + pg_data_ptr[node] = (pg_data_t*) &boot_pg_data[node]; + pg_data_ptr[node]->bdata = &bdata[node][0]; + } + + min_low_pfn = -1; + max_low_pfn = 0; + + efi_memmap_walk(filter_rsvd_memory, build_maps); + efi_memmap_walk(filter_rsvd_memory, find_bootmap_space); + efi_memmap_walk(filter_rsvd_memory, discontig_free_bootmem_node); + discontig_reserve_bootmem(); + allocate_pernode_structures(); +} + +/* + * Initialize the paging system. + * - determine sizes of each node + * - initialize the paging system for the node + * - build the nodedir for the node. This contains pointers to + * the per-bank mem_map entries. + * - fix the page struct "virtual" pointers. These are bank specific + * values that the paging system doesnt understand. + * - replicate the nodedir structure to other nodes + */ + +void __init +discontig_paging_init(void) +{ + int node, mynode; + unsigned long max_dma, zones_size[MAX_NR_ZONES]; + unsigned long kaddr, ekaddr, bid; + struct page *page; + bootmem_data_t *bdp; + + max_mapnr = 0; + max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT; + + mynode = boot_get_local_nodeid(); + for (bid = 0; bid < NR_BANKS; bid++) { + node_data[mynode]->node_id_map[bid] = -1; + node_data[mynode]->bank_mem_map_base[bid] = NULL; + } + + for (node = 0; node < numnodes; node++) { + long pfn, startpfn; + + memset(zones_size, 0, sizeof(zones_size)); + + startpfn = -1; + bdp = BOOT_NODE_DATA(node)->bdata; + pfn = bdp->node_boot_start >> PAGE_SHIFT; + if (startpfn == -1) + startpfn = pfn; + if (pfn > max_dma) + zones_size[ZONE_NORMAL] += (bdp->node_low_pfn - pfn); + else if (bdp->node_low_pfn < max_dma) + zones_size[ZONE_DMA] += (bdp->node_low_pfn - pfn); + else { + zones_size[ZONE_DMA] += (max_dma - pfn); + zones_size[ZONE_NORMAL] += (bdp->node_low_pfn - max_dma); + } + + 
free_area_init_node(node, NODE_DATA(node), NULL, zones_size, startpfn<node_mem_map; + + bdp = BOOT_NODE_DATA(node)->bdata; + + kaddr = (unsigned long)__va(bdp->node_boot_start); + ekaddr = (unsigned long)__va(bdp->node_low_pfn << PAGE_SHIFT); + while (kaddr < ekaddr) { + if (paddr_to_nid(__pa(kaddr)) == node) { + bid = BANK_MEM_MAP_INDEX(kaddr); + node_data[mynode]->node_id_map[bid] = node; + node_data[mynode]->bank_mem_map_base[bid] = page; + printk("addr(%lx), bank(%ld) -> node(%d), page(%lx)\n", kaddr, bid, node, (unsigned long)page); + } + kaddr += BANKSIZE; + page += BANKSIZE/PAGE_SIZE; + } + max_mapnr = max(max_mapnr, (unsigned long)(page - mem_map)); + } + + /* + * Finish setting up the node data for this node, then copy it to the other nodes. + */ + for (node=0; node < numnodes; node++) + if (mynode != node) { + memcpy(node_data[node], node_data[mynode], sizeof(struct ia64_node_data)); + node_data[node]->node = node; + } +} + diff -Nur linux-2.4.20-base/arch/ia64/mm/init.c linux-2.4.20-dcm/arch/ia64/mm/init.c --- linux-2.4.20-base/arch/ia64/mm/init.c Mon Mar 3 10:24:21 2003 +++ linux-2.4.20-dcm/arch/ia64/mm/init.c Wed Mar 12 14:00:32 2003 @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -38,12 +39,14 @@ unsigned long MAX_DMA_ADDRESS = PAGE_OFFSET + 0x100000000UL; #define LARGE_GAP 0x40000000 /* Use virtual mem map if a hole is > than this */ -static unsigned long totalram_pages; +static unsigned long totalram_pages, reserved_pages; unsigned long vmalloc_end = VMALLOC_END_INIT; +#ifndef CONFIG_DISCONTIGMEM static struct page *vmem_map; static unsigned long num_dma_physpages; +#endif int do_check_pgt_cache (int low, int high) @@ -186,41 +189,48 @@ return; } +#ifdef CONFIG_DISCONTIGMEM void show_mem(void) { - int i, total = 0, reserved = 0; + int i, reserved = 0; int shared = 0, cached = 0; + pg_data_t *pgdat = pgdat_list; printk("Mem-info:\n"); show_free_areas(); -#ifdef CONFIG_DISCONTIGMEM - { - pg_data_t *pgdat = pgdat_list; - - 
printk("Free swap: %6dkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); - do { - printk("Node ID: %d\n", pgdat->node_id); - for(i = 0; i < pgdat->node_size; i++) { - if (PageReserved(pgdat->node_mem_map+i)) - reserved++; - else if (PageSwapCache(pgdat->node_mem_map+i)) - cached++; - else if (page_count(pgdat->node_mem_map + i)) - shared += page_count(pgdat->node_mem_map + i) - 1; - } - printk("\t%d pages of RAM\n", pgdat->node_size); - printk("\t%d reserved pages\n", reserved); - printk("\t%d pages shared\n", shared); - printk("\t%d pages swap cached\n", cached); - pgdat = pgdat->node_next; - } while (pgdat); - printk("Total of %ld pages in page table cache\n", pgtable_cache_size); - show_buffers(); - printk("%d free buffer pages\n", nr_free_buffer_pages()); - } + printk("Free swap: %6dkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); + do { + printk("Node ID: %d\n", pgdat->node_id); + for(i = 0; i < pgdat->node_size; i++) { + if (PageReserved(pgdat->node_mem_map+i)) + reserved++; + else if (PageSwapCache(pgdat->node_mem_map+i)) + cached++; + else if (page_count(pgdat->node_mem_map + i)) + shared += page_count(pgdat->node_mem_map + i) - 1; + } + printk("\t%ld pages of RAM\n", pgdat->node_size); + printk("\t%d reserved pages\n", reserved); + printk("\t%d pages shared\n", shared); + printk("\t%d pages swap cached\n", cached); + pgdat = pgdat->node_next; + } while (pgdat); + printk("Total of %ld pages in page table cache\n", pgtable_cache_size); + show_buffers(); + printk("%d free buffer pages\n", nr_free_buffer_pages()); +} #else /* !CONFIG_DISCONTIGMEM */ +void +show_mem(void) +{ + int i, total = 0, reserved = 0; + int shared = 0, cached = 0; + + printk("Mem-info:\n"); + show_free_areas(); + printk("Free swap: %6dkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); i = max_mapnr; while (i-- > 0) { @@ -240,8 +250,8 @@ printk("%d pages swap cached\n", cached); printk("%ld pages in page table cache\n", pgtable_cache_size); show_buffers(); -#endif /* !CONFIG_DISCONTIGMEM */ } +#endif /* 
!CONFIG_DISCONTIGMEM */ /* * This is like put_dirty_page() but installs a clean page with PAGE_GATE protection @@ -357,6 +367,7 @@ ia64_tlb_init(); } +#ifndef CONFIG_DISCONTIGMEM static int create_mem_map_page_table (u64 start, u64 end, void *arg) { @@ -466,6 +477,7 @@ *count += (end - start) >> PAGE_SHIFT; return 0; } +#endif /* CONFIG_DISCONTIGMEM */ int ia64_page_valid (struct page *page) @@ -498,20 +510,28 @@ last_end = end; return 0; } -#endif +#endif /* CONFIG_DISCONTIGMEM */ /* * Set up the page tables. */ +#ifdef CONFIG_DISCONTIGMEM +void +paging_init (void) +{ + extern void discontig_paging_init(void); + + discontig_paging_init(); + efi_memmap_walk(count_pages, &num_physpages); +} +#else /* !CONFIG_DISCONTIGMEM */ void paging_init (void) { unsigned long max_dma; unsigned long zones_size[MAX_NR_ZONES]; unsigned long zholes_size[MAX_NR_ZONES]; -#ifndef CONFIG_DISCONTIGMEM unsigned long max_gap; -#endif /* initialize mem_map[] */ @@ -539,9 +559,6 @@ } } -#ifdef CONFIG_DISCONTIGMEM - free_area_init_node(0, NULL, NULL, zones_size, 0, zholes_size); -#else max_gap = 0; efi_memmap_walk(find_largest_hole, (u64 *)&max_gap); @@ -562,20 +579,19 @@ free_area_init_node(0, NULL, vmem_map, zones_size, 0, zholes_size); printk("Virtual mem_map starts at 0x%p\n", mem_map); } -#endif } +#endif /* !CONFIG_DISCONTIGMEM */ static int -count_reserved_pages (u64 start, u64 end, void *arg) +count_reserved_pages (u64 start, u64 end) { unsigned long num_reserved = 0; - unsigned long *count = arg; struct page *pg; for (pg = virt_to_page((void *)start); pg < virt_to_page((void *)end); ++pg) if (PageReserved(pg)) ++num_reserved; - *count += num_reserved; + reserved_pages += num_reserved; return 0; } @@ -583,8 +599,11 @@ mem_init (void) { extern char __start_gate_section[]; - long reserved_pages, codesize, datasize, initsize; + long codesize, datasize, initsize; unsigned long num_pgt_pages; + pg_data_t *pgdat; + extern int filter_rsvd_memory (unsigned long start, unsigned long end, void 
*arg); + #ifdef CONFIG_PCI /* @@ -595,16 +614,19 @@ platform_pci_dma_init(); #endif +#ifndef CONFIG_DISCONTIGMEM if (!mem_map) BUG(); max_mapnr = max_low_pfn; +#endif high_memory = __va(max_low_pfn * PAGE_SIZE); - totalram_pages += free_all_bootmem(); + for_each_pgdat(pgdat) + totalram_pages += free_all_bootmem_node(pgdat); reserved_pages = 0; - efi_memmap_walk(count_reserved_pages, &reserved_pages); + efi_memmap_walk(filter_rsvd_memory, count_reserved_pages); codesize = (unsigned long) &_etext - (unsigned long) &_stext; datasize = (unsigned long) &_edata - (unsigned long) &_etext; diff -Nur linux-2.4.20-base/arch/ia64/mm/numa.c linux-2.4.20-dcm/arch/ia64/mm/numa.c --- linux-2.4.20-base/arch/ia64/mm/numa.c Thu Jan 1 09:00:00 1970 +++ linux-2.4.20-dcm/arch/ia64/mm/numa.c Wed Mar 12 13:34:10 2003 @@ -0,0 +1,46 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * This file contains NUMA specific variables and functions which can + * be split away from DISCONTIGMEM and are used on NUMA machines with + * contiguous memory. + * + * 2002/08/07 Erich Focht + */ + +#include +#include +#include +#include +#include +#include +#include + +/* + * The following structures are usually initialized by ACPI or + * similar mechanisms and describe the NUMA characteristics of the machine. + */ +int num_memblks = 0; +struct node_memblk_s node_memblk[NR_MEMBLKS]; +struct node_cpuid_s node_cpuid[NR_CPUS]; +/* + * This is a matrix with "distances" between nodes, they should be + * proportional to the memory access latency ratios. 
+ */ +u8 numa_slit[NR_NODES * NR_NODES]; + +/* Identify which cnode a physical address resides on */ +int +paddr_to_nid(unsigned long paddr) +{ + int i; + + for (i = 0; i < num_memblks; i++) + if (paddr >= node_memblk[i].start_paddr && + paddr < node_memblk[i].start_paddr + node_memblk[i].size) + break; + + return (i < num_memblks) ? node_memblk[i].nid : -1; +} diff -Nur linux-2.4.20-base/drivers/acpi/Config.in linux-2.4.20-dcm/drivers/acpi/Config.in --- linux-2.4.20-base/drivers/acpi/Config.in Mon Mar 3 10:24:21 2003 +++ linux-2.4.20-dcm/drivers/acpi/Config.in Mon Mar 3 10:55:12 2003 @@ -36,6 +36,9 @@ tristate ' Fan' CONFIG_ACPI_FAN tristate ' Processor' CONFIG_ACPI_PROCESSOR dep_tristate ' Thermal Zone' CONFIG_ACPI_THERMAL $CONFIG_ACPI_PROCESSOR + if [ "$CONFIG_NUMA" = "y" ]; then + bool ' NUMA support' CONFIG_ACPI_NUMA + fi bool ' Debug Statements' CONFIG_ACPI_DEBUG fi @@ -119,6 +122,9 @@ tristate ' Fan' CONFIG_ACPI_FAN tristate ' Processor' CONFIG_ACPI_PROCESSOR dep_tristate ' Thermal Zone' CONFIG_ACPI_THERMAL $CONFIG_ACPI_PROCESSOR + if [ "$CONFIG_NUMA" = "y" ]; then + bool ' NUMA support' CONFIG_ACPI_NUMA + fi bool ' Debug Statements' CONFIG_ACPI_DEBUG endmenu fi diff -Nur linux-2.4.20-base/drivers/acpi/Makefile linux-2.4.20-dcm/drivers/acpi/Makefile --- linux-2.4.20-base/drivers/acpi/Makefile Mon Mar 3 10:24:21 2003 +++ linux-2.4.20-dcm/drivers/acpi/Makefile Mon Mar 3 10:55:12 2003 @@ -50,6 +50,7 @@ obj-$(CONFIG_ACPI_PROCESSOR) += processor.o obj-$(CONFIG_ACPI_THERMAL) += thermal.o obj-$(CONFIG_ACPI_SYSTEM) += system.o +obj-$(CONFIG_ACPI_NUMA) += numa.o endif include $(TOPDIR)/Rules.make diff -Nur linux-2.4.20-base/drivers/acpi/numa.c linux-2.4.20-dcm/drivers/acpi/numa.c --- linux-2.4.20-base/drivers/acpi/numa.c Thu Jan 1 09:00:00 1970 +++ linux-2.4.20-dcm/drivers/acpi/numa.c Mon Mar 3 10:55:12 2003 @@ -0,0 +1,186 @@ +/* + * acpi_numa.c - ACPI NUMA support + * + * Copyright (C) 2002 Takayoshi Kochi + * + * 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + */ + +#include +#include +#include +#include +#include +#include +#include "acpi_bus.h" + +extern int __init acpi_table_parse_madt_family (enum acpi_table_id id, unsigned long madt_size, int entry_id, acpi_madt_entry_handler handler); + +void __init +acpi_table_print_srat_entry ( + acpi_table_entry_header *header) +{ + if (!header) + return; + + switch (header->type) { + + case ACPI_SRAT_PROCESSOR_AFFINITY: + { + struct acpi_table_processor_affinity *p = + (struct acpi_table_processor_affinity*) header; + printk(KERN_INFO "SRAT Processor (id[0x%02x] eid[0x%02x]) in proximity domain %d %s\n", + p->apic_id, p->lsapic_eid, p->proximity_domain, + p->flags.enabled?"enabled":"disabled"); + } + break; + + case ACPI_SRAT_MEMORY_AFFINITY: + { + struct acpi_table_memory_affinity *p = + (struct acpi_table_memory_affinity*) header; + printk(KERN_INFO "SRAT Memory (0x%08x%08x length 0x%08x%08x type 0x%x) in proximity domain %d %s%s\n", + p->base_addr_hi, p->base_addr_lo, p->length_hi, p->length_lo, + p->memory_type, p->proximity_domain, + p->flags.enabled ? 
"enabled" : "disabled", + p->flags.hot_pluggable ? " hot-pluggable" : ""); + } + break; + + default: + printk(KERN_WARNING "Found unsupported SRAT entry (type = 0x%x)\n", + header->type); + break; + } +} + + +static int __init +acpi_parse_slit (unsigned long phys_addr, unsigned long size) +{ + struct acpi_table_slit *slit; + u32 localities; + + if (!phys_addr || !size) + return -EINVAL; + + slit = (struct acpi_table_slit *) __va(phys_addr); + + /* downcast just for %llu vs %lu for i386/ia64 */ + localities = (u32) slit->localities; + + printk(KERN_INFO "SLIT localities %ux%u\n", localities, localities); + + acpi_numa_slit_init(slit); + + return 0; +} + + +static int __init +acpi_parse_processor_affinity (acpi_table_entry_header *header) +{ + struct acpi_table_processor_affinity *processor_affinity = NULL; + + processor_affinity = (struct acpi_table_processor_affinity*) header; + if (!processor_affinity) + return -EINVAL; + + acpi_table_print_srat_entry(header); + + /* let architecture-dependent part to do it */ + acpi_numa_processor_affinity_init(processor_affinity); + + return 0; +} + + +static int __init +acpi_parse_memory_affinity (acpi_table_entry_header *header) +{ + struct acpi_table_memory_affinity *memory_affinity = NULL; + + memory_affinity = (struct acpi_table_memory_affinity*) header; + if (!memory_affinity) + return -EINVAL; + + acpi_table_print_srat_entry(header); + + /* let architecture-dependent part to do it */ + acpi_numa_memory_affinity_init(memory_affinity); + + return 0; +} + + +static int __init +acpi_parse_srat (unsigned long phys_addr, unsigned long size) +{ + struct acpi_table_srat *srat = NULL; + + if (!phys_addr || !size) + return -EINVAL; + + srat = (struct acpi_table_srat *) __va(phys_addr); + + printk(KERN_INFO "SRAT revision %d\n", srat->table_revision); + + return 0; +} + + +int __init +acpi_table_parse_srat ( + enum acpi_srat_entry_id id, + acpi_madt_entry_handler handler) +{ + return acpi_table_parse_madt_family(ACPI_SRAT, 
sizeof(struct acpi_table_srat), + id, handler); +} + + +int __init +acpi_numa_init() +{ + int result; + + /* SRAT: Static Resource Affinity Table */ + result = acpi_table_parse(ACPI_SRAT, acpi_parse_srat); + + if (result > 0) { + result = acpi_table_parse_srat(ACPI_SRAT_PROCESSOR_AFFINITY, + acpi_parse_processor_affinity); + result = acpi_table_parse_srat(ACPI_SRAT_MEMORY_AFFINITY, + acpi_parse_memory_affinity); + } else { + /* FIXME */ + printk("Warning: acpi_table_parse(ACPI_SRAT) returned %d!\n",result); + } + + /* SLIT: System Locality Information Table */ + result = acpi_table_parse(ACPI_SLIT, acpi_parse_slit); + if (result < 1) { + /* FIXME */ + printk("Warning: acpi_table_parse(ACPI_SLIT) returned %d!\n",result); + } + + acpi_numa_arch_fixup(); + return 0; +} diff -Nur linux-2.4.20-base/drivers/acpi/tables.c linux-2.4.20-dcm/drivers/acpi/tables.c --- linux-2.4.20-base/drivers/acpi/tables.c Mon Mar 3 10:24:22 2003 +++ linux-2.4.20-dcm/drivers/acpi/tables.c Mon Mar 3 10:55:12 2003 @@ -224,11 +224,13 @@ int __init -acpi_table_parse_madt ( +acpi_table_parse_madt_family ( enum acpi_table_id id, + unsigned long madt_size, + int entry_id, acpi_madt_entry_handler handler) { - struct acpi_table_madt *madt = NULL; + void *madt = NULL; acpi_table_entry_header *entry = NULL; unsigned long count = 0; unsigned long madt_end = 0; @@ -240,19 +242,21 @@ /* Locate the MADT (if exists). There should only be one. 
*/ for (i = 0; i < sdt.count; i++) { - if (sdt.entry[i].id != ACPI_APIC) + if (sdt.entry[i].id != id) continue; - madt = (struct acpi_table_madt *) + madt = (void *) __acpi_map_table(sdt.entry[i].pa, sdt.entry[i].size); if (!madt) { - printk(KERN_WARNING PREFIX "Unable to map MADT\n"); + printk(KERN_WARNING PREFIX "Unable to map %s\n", + acpi_table_signatures[id]); return -ENODEV; } break; } if (!madt) { - printk(KERN_WARNING PREFIX "MADT not present\n"); + printk(KERN_WARNING PREFIX "%s not present\n", + acpi_table_signatures[id]); return -ENODEV; } @@ -261,18 +265,28 @@ /* Parse all entries looking for a match. */ entry = (acpi_table_entry_header *) - ((unsigned long) madt + sizeof(struct acpi_table_madt)); + ((unsigned long) madt + madt_size); while (((unsigned long) entry) < madt_end) { - if (entry->type == id) { + if (entry->type == entry_id) { count++; handler(entry); } entry = (acpi_table_entry_header *) - ((unsigned long) entry += entry->length); + ((unsigned long) entry + entry->length); } return count; +} + + +int __init +acpi_table_parse_madt ( + enum acpi_madt_entry_id id, + acpi_madt_entry_handler handler) +{ + return acpi_table_parse_madt_family(ACPI_APIC, sizeof(struct acpi_table_madt), + id, handler); } diff -Nur linux-2.4.20-base/include/asm-ia64/acpi.h linux-2.4.20-dcm/include/asm-ia64/acpi.h --- linux-2.4.20-base/include/asm-ia64/acpi.h Mon Mar 3 10:24:23 2003 +++ linux-2.4.20-dcm/include/asm-ia64/acpi.h Wed Mar 12 14:17:39 2003 @@ -97,17 +97,18 @@ } while (0) const char *acpi_get_sysname (void); -int acpi_boot_init (char *cdline); int acpi_request_vector (u32 int_type); int acpi_get_prt (struct pci_vector_struct **vectors, int *count); int acpi_get_interrupt_model (int *type); int acpi_irq_to_vector (u32 irq); -#ifdef CONFIG_DISCONTIGMEM -#define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */ -#define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */ -#define MAX_PXM_DOMAINS (256) -#endif /* CONFIG_DISCONTIGMEM */ +#ifdef CONFIG_ACPI_NUMA 
+#include +/* Proximity bitmap length; _PXM is at most 255 (8 bit)*/ +#define MAX_PXM_DOMAINS (256) +extern int __initdata pxm_to_nid_map[MAX_PXM_DOMAINS]; +extern int __initdata nid_to_pxm_map[NR_NODES]; +#endif #endif /*__KERNEL__*/ diff -Nur linux-2.4.20-base/include/asm-ia64/mmzone.h linux-2.4.20-dcm/include/asm-ia64/mmzone.h --- linux-2.4.20-base/include/asm-ia64/mmzone.h Thu Jan 1 09:00:00 1970 +++ linux-2.4.20-dcm/include/asm-ia64/mmzone.h Wed Mar 12 13:41:08 2003 @@ -0,0 +1,143 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (c) 2000 Silicon Graphics, Inc. All rights reserved. + * Copyright (c) 2002 NEC Corp. + * Copyright (c) 2002 Erich Focht + * Copyright (c) 2002 Kimio Suganuma + */ +#ifndef _ASM_IA64_MMZONE_H +#define _ASM_IA64_MMZONE_H + +#include +#include + +/* + * Given a kaddr, find the base mem_map address for the start of the mem_map + * entries for the bank containing the kaddr. + */ +#define BANK_MEM_MAP_BASE(kaddr) local_node_data->bank_mem_map_base[BANK_MEM_MAP_INDEX(kaddr)] + +/* + * Given a kaddr, this macro return the relative map number + * within the bank. + */ +#define BANK_MAP_NR(kaddr) (BANK_OFFSET(kaddr) >> PAGE_SHIFT) + +/* + * Given a pte, this macro returns a pointer to the page struct for the pte. + */ +#define pte_page(pte) virt_to_page(PAGE_OFFSET | (pte_val(pte)&_PFN_MASK)) + +/* + * Determine if a kaddr is a valid memory address of memory that + * actually exists. + * + * The check consists of 2 parts: + * - verify that the address is a region 7 address & does not + * contain any bits that preclude it from being a valid platform + * memory address + * - verify that the chunk actually exists. + * + * Note that IO addresses are NOT considered valid addresses. + * + * Note, many platforms can simply check if kaddr exceeds a specific size. 
+ * (However, this wont work on SGI platforms since IO space is embedded + * within the range of valid memory addresses & nodes have holes in the + * address range between banks). + */ +#define kern_addr_valid(kaddr) ({long _kav=(long)(kaddr); \ + VALID_MEM_KADDR(_kav);}) + +/* + * Given a kaddr, return a pointer to the page struct for the page. + * If the kaddr does not represent RAM memory that potentially exists, return + * a pointer the page struct for max_mapnr. IO addresses will + * return the page for max_nr. Addresses in unpopulated RAM banks may + * return undefined results OR may panic the system. + * + */ +#define virt_to_page(kaddr) ({long _kvtp=(long)(kaddr); \ + (VALID_MEM_KADDR(_kvtp)) \ + ? BANK_MEM_MAP_BASE(_kvtp) + BANK_MAP_NR(_kvtp) \ + : NULL;}) + +/* + * Given a page struct entry, return the physical address that the page struct represents. + * Since IA64 has all memory in the DMA zone, the following works: + */ +#define page_to_phys(page) __pa(page_address(page)) + +#define node_mem_map(nid) (NODE_DATA(nid)->node_mem_map) + +#define node_localnr(pfn, nid) ((pfn) - NODE_DATA(nid)->node_start_pfn) + +#define pfn_to_page(pfn) (struct page *)(node_mem_map(pfn_to_nid(pfn)) + node_localnr(pfn, pfn_to_nid(pfn))) + +#define pfn_to_nid(pfn) local_node_data->node_id_map[(pfn << PAGE_SHIFT) >> DIG_BANKSHIFT] + +#define page_to_pfn(page) (long)((page - page_zone(page)->zone_mem_map) + page_zone(page)->zone_start_pfn) + + +/* + * pfn_valid should be made as fast as possible, and the current definition + * is valid for machines that are NUMA, but still contiguous, which is what + * is currently supported. A more generalised, but slower definition would + * be something like this - mbligh: + * ( pfn_to_pgdat(pfn) && (pfn < node_end_pfn(pfn_to_nid(pfn))) ) + */ +#define pfn_valid(pfn) (pfn < max_low_pfn) +extern unsigned long max_low_pfn; + + +#ifdef CONFIG_NUMA + +/* + * Platform definitions for DIG platform with contiguous memory. 
+ */ +#define MAX_PHYSNODE_ID 8 /* Maximum node number +1 */ +#define NR_NODES 8 /* Maximum number of nodes in SSI */ + +#define MAX_PHYS_MEMORY (1UL << 40) /* 1 TB */ + +/* + * Bank definitions. + * Configurable settings for DIG: 512MB/bank: 16GB/node, + * 2048MB/bank: 64GB/node, + * 8192MB/bank: 256GB/node. + */ +#define NR_BANKS_PER_NODE 32 +#if defined(CONFIG_IA64_NODESIZE_16GB) +# define DIG_BANKSHIFT 29 +#elif defined(CONFIG_IA64_NODESIZE_64GB) +# define DIG_BANKSHIFT 31 +#elif defined(CONFIG_IA64_NODESIZE_256GB) +# define DIG_BANKSHIFT 33 +#else +# error Unsupported bank and nodesize! +#endif +#define BANKSIZE (1UL << DIG_BANKSHIFT) +#define BANK_OFFSET(addr) ((unsigned long)(addr) & (BANKSIZE-1)) +#define NR_BANKS (NR_BANKS_PER_NODE * NR_NODES) + +/* + * VALID_MEM_KADDR returns a boolean to indicate if a kaddr is + * potentially a valid cacheable identity mapped RAM memory address. + * Note that the RAM may or may not actually be present!! + */ + #define VALID_MEM_KADDR(kaddr) 1 +/* #define VALID_MEM_KADDR(kaddr) (BANK_MEM_MAP_BASE(kaddr) == NULL ? NULL : 1) */ + +/* + * Given a nodeid & a bank number, find the address of the mem_map + * entry for the first page of the bank. + */ +#define BANK_MEM_MAP_INDEX(kaddr) \ + (((unsigned long)(kaddr) & (MAX_PHYS_MEMORY-1)) >> DIG_BANKSHIFT) + +extern void build_cpu_to_node_map(void); + +#endif /* CONFIG_NUMA */ +#endif /* _ASM_IA64_MMZONE_H */ diff -Nur linux-2.4.20-base/include/asm-ia64/nodedata.h linux-2.4.20-dcm/include/asm-ia64/nodedata.h --- linux-2.4.20-base/include/asm-ia64/nodedata.h Thu Jan 1 09:00:00 1970 +++ linux-2.4.20-dcm/include/asm-ia64/nodedata.h Wed Mar 12 13:41:15 2003 @@ -0,0 +1,75 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (c) 2000 Silicon Graphics, Inc. All rights reserved. + * Copyright (c) 2002 NEC Corp. 
+ * Copyright (c) 2002 Erich Focht + * Copyright (c) 2002 Kimio Suganuma + */ + + +#ifndef _ASM_IA64_NODEDATA_H +#define _ASM_IA64_NODEDATA_H + + +#include + +/* + * Node Data. One of these structures is located on each node of a NUMA system. + */ + +struct pglist_data; +struct ia64_node_data { + short node; + struct pglist_data *pg_data_ptrs[NR_NODES]; + struct page *bank_mem_map_base[NR_BANKS]; + struct ia64_node_data *node_data_ptrs[NR_NODES]; + short node_id_map[NR_BANKS]; +}; + + +/* + * Return a pointer to the node_data structure for the executing cpu. + */ +#define local_node_data (local_cpu_data->node_data) + + +/* + * Return a pointer to the node_data structure for the specified node. + */ +#define node_data(node) (local_node_data->node_data_ptrs[node]) + +/* + * Get a pointer to the node_id/node_data for the current cpu. + * (boot time only) + */ +extern int boot_get_local_nodeid(void); +extern struct ia64_node_data *get_node_data_ptr(void); + +/* + * Given a node id, return a pointer to the pg_data_t for the node. + * The following 2 macros are similar. + * + * NODE_DATA - should be used in all code not related to system + * initialization. It uses pernode data structures to minimize + * offnode memory references. However, these structure are not + * present during boot. This macro can be used once cpu_init + * completes. + * + * BOOT_NODE_DATA + * - should be used during system initialization + * prior to freeing __initdata. It does not depend on the percpu + * area being present. + * + * NOTE: The names of these macros are misleading but are difficult to change + * since they are used in generic linux & on other architecures. 
+ */ +#define NODE_DATA(nid) (local_node_data->pg_data_ptrs[nid]) +#define BOOT_NODE_DATA(nid) boot_get_pg_data_ptr((long)(nid)) + +struct pglist_data; +extern struct pglist_data * __init boot_get_pg_data_ptr(long); + +#endif /* _ASM_IA64_NODEDATA_H */ diff -Nur linux-2.4.20-base/include/asm-ia64/numa.h linux-2.4.20-dcm/include/asm-ia64/numa.h --- linux-2.4.20-base/include/asm-ia64/numa.h Thu Jan 1 09:00:00 1970 +++ linux-2.4.20-dcm/include/asm-ia64/numa.h Wed Mar 12 13:41:15 2003 @@ -0,0 +1,70 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * This file contains NUMA specific prototypes and definitions. + * + * 2002/08/05 Erich Focht + * + */ +#ifndef _ASM_IA64_NUMA_H +#define _ASM_IA64_NUMA_H + +#ifdef CONFIG_NUMA + +#ifdef CONFIG_DISCONTIGMEM +# include +# define NR_MEMBLKS (NR_BANKS) +#else +# define NR_NODES (8) +# define NR_MEMBLKS (NR_NODES * 8) +#endif + +#include +#include +extern volatile char cpu_to_node_map[NR_CPUS] __cacheline_aligned; +extern volatile unsigned long node_to_cpu_mask[NR_NODES] __cacheline_aligned; + +/* Stuff below this line could be architecture independent */ + +extern int num_memblks; /* total number of memory chunks */ + +/* + * List of node memory chunks. Filled when parsing SRAT table to + * obtain information about memory nodes. +*/ + +struct node_memblk_s { + unsigned long start_paddr; + unsigned long size; + int nid; /* which logical node contains this chunk? 
/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * This file contains NUMA specific prototypes and definitions.
 *
 * 2002/08/05 Erich Focht
 */
#ifndef _ASM_IA64_NUMA_H
#define _ASM_IA64_NUMA_H

#ifdef CONFIG_NUMA

#ifdef CONFIG_DISCONTIGMEM
/* NOTE(review): original #include target lost during extraction. */
# include <asm/mmzone.h>
# define NR_MEMBLKS	(NR_BANKS)
#else
# define NR_NODES	(8)
# define NR_MEMBLKS	(NR_NODES * 8)
#endif

/* NOTE(review): original #include targets lost during extraction. */
#include <linux/cache.h>
#include <linux/threads.h>

extern volatile char cpu_to_node_map[NR_CPUS] __cacheline_aligned;
extern volatile unsigned long node_to_cpu_mask[NR_NODES] __cacheline_aligned;

/* Stuff below this line could be architecture independent */

extern int num_memblks;		/* total number of memory chunks */

/*
 * List of node memory chunks.  Filled when parsing the SRAT table to
 * obtain information about memory nodes.
 */
struct node_memblk_s {
	unsigned long start_paddr;
	unsigned long size;
	int nid;		/* which logical node contains this chunk? */
	int bank;		/* which mem bank on this node */
};

struct node_cpuid_s {
	u16 phys_id;		/* id << 8 | eid */
	int nid;		/* logical node containing this CPU */
};

extern struct node_memblk_s node_memblk[NR_MEMBLKS];
extern struct node_cpuid_s node_cpuid[NR_CPUS];

/*
 * ACPI 2.0 SLIT (System Locality Information Table)
 * http://devresource.hp.com/devresource/Docs/TechPapers/IA64/slit.pdf
 *
 * This is a matrix with "distances" between nodes, they should be
 * proportional to the memory access latency ratios.
 */
extern u8 numa_slit[NR_NODES * NR_NODES];

/*
 * Fixed: parenthesize the macro arguments.  The previous definition
 * expanded node_distance(i + 1, j) to numa_slit[i + 1 * numnodes + j],
 * indexing the wrong matrix element.
 */
#define node_distance(from,to) (numa_slit[(from) * numnodes + (to)])

extern int paddr_to_nid(unsigned long paddr);

#define local_nodeid (cpu_to_node_map[smp_processor_id()])

#endif /* CONFIG_NUMA */

#endif /* _ASM_IA64_NUMA_H */
define page_to_phys(page) ((page - mem_map) << PAGE_SHIFT) #endif - +#endif struct page; extern int ia64_page_valid (struct page *); #define VALID_PAGE(page) (((page - mem_map) < max_mapnr) && ia64_page_valid(page)) diff -Nur linux-2.4.20-base/include/asm-ia64/pgtable.h linux-2.4.20-dcm/include/asm-ia64/pgtable.h --- linux-2.4.20-base/include/asm-ia64/pgtable.h Mon Mar 3 10:24:23 2003 +++ linux-2.4.20-dcm/include/asm-ia64/pgtable.h Wed Mar 12 13:41:38 2003 @@ -206,6 +206,15 @@ * Conversion functions: convert a page and protection to a page entry, * and a page entry and page directory to the page they refer to. */ +#ifdef CONFIG_DISCONTIGMEM +#define mk_pte(page,pgprot) \ +({ \ + pte_t __pte; \ + \ + pte_val(__pte) = (unsigned long)page_address(page) - PAGE_OFFSET + pgprot_val(pgprot); \ + __pte; \ +}) +#else #define mk_pte(page,pgprot) \ ({ \ pte_t __pte; \ @@ -213,6 +222,7 @@ pte_val(__pte) = ((page - mem_map) << PAGE_SHIFT) | pgprot_val(pgprot); \ __pte; \ }) +#endif /* This takes a physical page address that is used by the remapping functions */ #define mk_pte_phys(physpage, pgprot) \ @@ -440,6 +450,7 @@ */ #define pgtable_cache_init() do { } while (0) +#ifndef CONFIG_DISCONTIGMEM /* arch mem_map init routines are needed due to holes in a virtual mem_map */ #define HAVE_ARCH_MEMMAP_INIT @@ -449,7 +460,7 @@ extern unsigned long arch_memmap_init (memmap_init_callback_t *callback, struct page *start, struct page *end, int zone, unsigned long start_paddr, int highmem); - +#endif /* CONFIG_DISCONTIGMEM */ # endif /* !__ASSEMBLY__ */ /* diff -Nur linux-2.4.20-base/include/asm-ia64/processor.h linux-2.4.20-dcm/include/asm-ia64/processor.h --- linux-2.4.20-base/include/asm-ia64/processor.h Mon Mar 3 10:24:23 2003 +++ linux-2.4.20-dcm/include/asm-ia64/processor.h Wed Mar 12 13:41:15 2003 @@ -87,6 +87,9 @@ #include #include #include +#ifdef CONFIG_NUMA +#include +#endif /* like above but expressed as bitfields for more efficient access: */ struct ia64_psr { @@ -187,9 
+190,8 @@ } ipi; #endif #ifdef CONFIG_NUMA - void *node_directory; - int numa_node_id; - struct cpuinfo_ia64 *cpu_data[NR_CPUS]; + struct ia64_node_data *node_data; + int nodeid; #endif /* Platform specific word. MUST BE LAST IN STRUCT */ __u64 platform_specific; @@ -201,23 +203,8 @@ */ #define local_cpu_data ((struct cpuinfo_ia64 *) PERCPU_ADDR) -/* - * On NUMA systems, cpu_data for each cpu is allocated during cpu_init() & is allocated on - * the node that contains the cpu. This minimizes off-node memory references. cpu_data - * for each cpu contains an array of pointers to the cpu_data structures of each of the - * other cpus. - * - * On non-NUMA systems, cpu_data is a static array allocated at compile time. References - * to the cpu_data of another cpu is done by direct references to the appropriate entry of - * the array. - */ -#ifdef CONFIG_NUMA -# define cpu_data(cpu) local_cpu_data->cpu_data[cpu] -# define numa_node_id() (local_cpu_data->numa_node_id) -#else - extern struct cpuinfo_ia64 _cpu_data[NR_CPUS]; -# define cpu_data(cpu) (&_cpu_data[cpu]) -#endif +extern struct cpuinfo_ia64 _cpu_data[NR_CPUS]; +#define cpu_data(cpu) (&_cpu_data[cpu]) extern void identify_cpu (struct cpuinfo_ia64 *); extern void print_cpu_info (struct cpuinfo_ia64 *); diff -Nur linux-2.4.20-base/include/asm-ia64/smp.h linux-2.4.20-dcm/include/asm-ia64/smp.h --- linux-2.4.20-base/include/asm-ia64/smp.h Sat Nov 10 07:26:17 2001 +++ linux-2.4.20-dcm/include/asm-ia64/smp.h Wed Mar 12 13:41:36 2003 @@ -122,6 +122,8 @@ extern int smp_call_function_single (int cpuid, void (*func) (void *info), void *info, int retry, int wait); +extern void smp_build_cpu_map(void); + #endif /* CONFIG_SMP */ #endif /* _ASM_IA64_SMP_H */ diff -Nur linux-2.4.20-base/include/asm-ia64/topology.h linux-2.4.20-dcm/include/asm-ia64/topology.h --- linux-2.4.20-base/include/asm-ia64/topology.h Thu Jan 1 09:00:00 1970 +++ linux-2.4.20-dcm/include/asm-ia64/topology.h Wed Mar 12 14:17:42 2003 @@ -0,0 +1,63 @@ +/* + * 
linux/include/asm-ia64/topology.h + * + * Copyright (C) 2002, Erich Focht, NEC + * + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ +#ifndef _ASM_IA64_TOPOLOGY_H +#define _ASM_IA64_TOPOLOGY_H + +#include +#include +#include + +#ifdef CONFIG_NUMA +/* + * Returns the number of the node containing CPU 'cpu' + */ +#define __cpu_to_node(cpu) (int)(cpu_to_node_map[cpu]) + +/* + * Returns a bitmask of CPUs on Node 'node'. + */ +#define __node_to_cpu_mask(node) (node_to_cpu_mask[node]) + +#else +#define __cpu_to_node(cpu) (0) +#define __node_to_cpu_mask(node) (phys_cpu_present_map) +#endif + +/* + * Returns the number of the node containing MemBlk 'memblk' + */ +#ifdef CONFIG_ACPI_NUMA +#define __memblk_to_node(memblk) (node_memblk[memblk].nid) +#else +#define __memblk_to_node(memblk) (memblk) +#endif + +/* + * Returns the number of the node containing Node 'nid'. + * Not implemented here. Multi-level hierarchies detected with + * the help of node_distance(). + */ +#define __parent_node(nid) (nid) + +/* + * Returns the number of the first CPU on Node 'node'. + */ +#define __node_to_first_cpu(node) (__ffs(__node_to_cpu_mask(node))) + +/* + * Returns the number of the first MemBlk on Node 'node' + * Should be fixed when IA64 discontigmem goes in. 
+ */ +#define __node_to_memblk(node) (node) + +#endif /* _ASM_IA64_TOPOLOGY_H */ diff -Nur linux-2.4.20-base/include/linux/acpi.h linux-2.4.20-dcm/include/linux/acpi.h --- linux-2.4.20-base/include/linux/acpi.h Mon Mar 3 10:24:23 2003 +++ linux-2.4.20-dcm/include/linux/acpi.h Wed Mar 12 14:17:40 2003 @@ -344,6 +344,14 @@ void acpi_table_print (struct acpi_table_header *, unsigned long); void acpi_table_print_madt_entry (acpi_table_entry_header *); +#ifdef CONFIG_ACPI_NUMA +int __init acpi_numa_init(void); +void __init acpi_numa_slit_init (struct acpi_table_slit *); +void __init acpi_numa_processor_affinity_init (struct acpi_table_processor_affinity *); +void __init acpi_numa_memory_affinity_init (struct acpi_table_memory_affinity *); +void __init acpi_numa_arch_fixup(void); +#endif + #endif /*CONFIG_ACPI_BOOT*/ diff -Nur linux-2.4.20-base/include/linux/mmzone.h linux-2.4.20-dcm/include/linux/mmzone.h --- linux-2.4.20-base/include/linux/mmzone.h Mon Mar 3 10:24:23 2003 +++ linux-2.4.20-dcm/include/linux/mmzone.h Wed Mar 12 14:17:42 2003 @@ -8,6 +8,12 @@ #include #include #include +#ifdef CONFIG_DISCONTIGMEM +#include +#endif +#ifndef MAX_NUMNODES +#define MAX_NUMNODES 1 +#endif /* * Free memory management - zoned buddy allocator. @@ -212,6 +218,15 @@ #define for_each_zone(zone) \ for(zone = pgdat_list->node_zones; zone; zone = next_zone(zone)) +#ifdef CONFIG_NUMA +#define MAX_NR_MEMBLKS BITS_PER_LONG /* Max number of Memory Blocks */ +#else /* !CONFIG_NUMA */ +#define MAX_NR_MEMBLKS 1 +#endif /* CONFIG_NUMA */ + +#include +/* Returns the number of the current Node. 
*/ +#define numa_node_id() (__cpu_to_node(smp_processor_id())) #ifndef CONFIG_DISCONTIGMEM diff -Nur linux-2.4.20-base/init/main.c linux-2.4.20-dcm/init/main.c --- linux-2.4.20-base/init/main.c Mon Mar 3 10:24:23 2003 +++ linux-2.4.20-dcm/init/main.c Wed Mar 12 13:33:32 2003 @@ -290,6 +290,7 @@ extern void setup_arch(char **); +extern void __init build_all_zonelists(void); extern void cpu_idle(void); unsigned long wait_init_idle; @@ -360,6 +361,7 @@ lock_kernel(); printk(linux_banner); setup_arch(&command_line); + build_all_zonelists(); printk("Kernel command line: %s\n", saved_command_line); parse_options(command_line); trap_init(); diff -Nur linux-2.4.20-base/mm/page_alloc.c linux-2.4.20-dcm/mm/page_alloc.c --- linux-2.4.20-base/mm/page_alloc.c Mon Mar 3 10:24:23 2003 +++ linux-2.4.20-dcm/mm/page_alloc.c Mon Mar 3 10:55:12 2003 @@ -586,13 +586,41 @@ /* * Builds allocation fallback zone lists. */ -static inline void build_zonelists(pg_data_t *pgdat) +static int __init build_zonelists_node(pg_data_t *pgdat, zonelist_t *zonelist, int j, int k) { - int i, j, k; + switch (k) { + zone_t *zone; + default: + BUG(); + case ZONE_HIGHMEM: + zone = pgdat->node_zones + ZONE_HIGHMEM; + if (zone->memsize) { +#ifndef CONFIG_HIGHMEM + BUG(); +#endif + zonelist->zones[j++] = zone; + } + case ZONE_NORMAL: + zone = pgdat->node_zones + ZONE_NORMAL; + if (zone->memsize) + zonelist->zones[j++] = zone; + case ZONE_DMA: + zone = pgdat->node_zones + ZONE_DMA; + if (zone->memsize) + zonelist->zones[j++] = zone; + } + return j; +} + +static void __init build_zonelists(pg_data_t *pgdat) +{ + int i, j, k, node, local_node; + + local_node = pgdat->node_id; + printk("Building zonelist for node : %d\n", local_node); for (i = 0; i <= GFP_ZONEMASK; i++) { zonelist_t *zonelist; - zone_t *zone; zonelist = pgdat->node_zonelists + i; memset(zonelist, 0, sizeof(*zonelist)); @@ -604,31 +632,30 @@ if (i & __GFP_DMA) k = ZONE_DMA; - switch (k) { - default: - BUG(); - /* - * fallthrough: - */ - case 
ZONE_HIGHMEM: - zone = pgdat->node_zones + ZONE_HIGHMEM; - if (zone->memsize) { -#ifndef CONFIG_HIGHMEM - BUG(); -#endif - zonelist->zones[j++] = zone; - } - case ZONE_NORMAL: - zone = pgdat->node_zones + ZONE_NORMAL; - if (zone->memsize) - zonelist->zones[j++] = zone; - case ZONE_DMA: - zone = pgdat->node_zones + ZONE_DMA; - if (zone->memsize) - zonelist->zones[j++] = zone; - } + j = build_zonelists_node(pgdat, zonelist, j, k); + /* + * Now we build the zonelist so that it contains the zones + * of all the other nodes. + * We don't want to pressure a particular node, so when + * building the zones for node N, we make sure that the + * zones coming right after the local ones are those from + * node N+1 (modulo N) + */ + for (node = local_node + 1; node < numnodes; node++) + j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); + for (node = 0; node < local_node; node++) + j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); + zonelist->zones[j++] = NULL; - } + } +} + +void __init build_all_zonelists(void) +{ + int i; + + for(i = 0 ; i < numnodes ; i++) + build_zonelists(NODE_DATA(i)); } /* @@ -806,6 +833,7 @@ * up by free_all_bootmem() once the early boot process is * done. Non-atomic initialization, single-pass. */ + zone_start_paddr = MEMMAP_INIT(mem_map + offset, mem_map + offset + size, nid * MAX_NR_ZONES + j, zone_start_paddr, @@ -850,7 +878,6 @@ (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size); } } - build_zonelists(pgdat); } void __init free_area_init(unsigned long *zones_size)