From: Bjorn Helgaas <bjorn.helgaas@hp.com>
To: linux-ia64@vger.kernel.org
Subject: Re: Discontig patch for 2.4.21
Date: Tue, 12 Aug 2003 23:34:45 +0000 [thread overview]
Message-ID: <marc-linux-ia64-106073158319638@msgid-missing> (raw)
I applied this patch for 2.4. There were a couple conflicts (for
instance, some of the non-ia64 ACPI stuff is already in 2.4), so I
wouldn't be too surprised if I messed something up, so please
look things over.
Bjorn
On Friday 01 August 2003 11:38 am, Jack Steiner wrote:
> Attached is the patch for discontig memory for 2.4.21. This patch
> has been tested on the ZX1 & NEC platforms & appears to work ok. It
> also works on SN2 but there are additional patches (unrelated to
> discontig) that are still needed in 2.4.21.
>
>
> Jesse barnes is pushing the patch into 2.6 & is still doing
> minor cleanup. Once he finishes, I'll update this patch with the
> cleanup that he has added to his patch. However, as far as I can tell,
> this patch is ok.
>
>
>
> diff -Naur linux_base/arch/ia64/config.in linux/arch/ia64/config.in
> --- linux_base/arch/ia64/config.in Sat Jul 5 22:59:38 2003
> +++ linux/arch/ia64/config.in Mon Jul 28 10:19:02 2003
> @@ -66,6 +66,10 @@
> fi
>
> if [ "$CONFIG_IA64_GENERIC" = "y" -o "$CONFIG_IA64_DIG" = "y" -o "$CONFIG_IA64_HP_ZX1" = "y" ]; then
> + bool ' Enable NUMA support' CONFIG_NUMA
> + if [ "$CONFIG_NUMA" = "y" ]; then
> + define_bool CONFIG_DISCONTIGMEM y
> + fi
> bool ' Enable IA-64 Machine Check Abort' CONFIG_IA64_MCA
> define_bool CONFIG_PM y
> fi
> diff -Naur linux_base/arch/ia64/kernel/acpi.c linux/arch/ia64/kernel/acpi.c
> --- linux_base/arch/ia64/kernel/acpi.c Sat Jul 5 22:59:38 2003
> +++ linux/arch/ia64/kernel/acpi.c Tue Jul 29 10:12:40 2003
> @@ -8,6 +8,9 @@
> * Copyright (C) 2000 Intel Corp.
> * Copyright (C) 2000,2001 J.I. Lee <jung-ik.lee@intel.com>
> * Copyright (C) 2001 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
> + * Copyright (C) 2001 Jenna Hall <jenna.s.hall@intel.com>
> + * Copyright (C) 2001 Takayoshi Kochi <t-kouchi@cq.jp.nec.com>
> + * Copyright (C) 2002 Erich Focht <efocht@ess.nec.de>
> *
> * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> *
> @@ -38,11 +41,14 @@
> #include <linux/irq.h>
> #include <linux/acpi.h>
> #include <linux/efi.h>
> +#include <linux/mm.h>
> +#include <linux/mmzone.h>
> #include <asm/io.h>
> #include <asm/iosapic.h>
> #include <asm/machvec.h>
> #include <asm/page.h>
> #include <asm/system.h>
> +#include <asm/numa.h>
>
>
> #define PREFIX "ACPI: "
> @@ -179,7 +185,6 @@
> acpi_status status;
> u8 *data;
> u32 length;
> - int i;
>
> status = acpi_find_vendor_resource(obj, &hp_ccsr_descriptor, &data, &length);
>
> @@ -437,6 +442,194 @@
> }
>
>
> +#ifdef CONFIG_ACPI_NUMA
> +
> +#define PXM_FLAG_LEN ((MAX_PXM_DOMAINS + 1)/32)
> +
> +static int __initdata srat_num_cpus; /* number of cpus */
> +static u32 __initdata pxm_flag[PXM_FLAG_LEN];
> +#define pxm_bit_set(bit) (set_bit(bit,(void *)pxm_flag))
> +#define pxm_bit_test(bit) (test_bit(bit,(void *)pxm_flag))
> +/* maps to convert between proximity domain and logical node ID */
> +int __initdata pxm_to_nid_map[MAX_PXM_DOMAINS];
> +int __initdata nid_to_pxm_map[NR_NODES];
> +struct acpi_table_slit __initdata *slit_table;
> +
> +/*
> + * ACPI 2.0 SLIT (System Locality Information Table)
> + * http://devresource.hp.com/devresource/Docs/TechPapers/IA64/slit.pdf
> + */
> +void __init
> +acpi_numa_slit_init (struct acpi_table_slit *slit)
> +{
> + u32 len;
> +
> + len = sizeof(struct acpi_table_header) + 8
> + + slit->localities * slit->localities;
> + if (slit->header.length != len) {
> + printk(KERN_INFO "ACPI 2.0 SLIT: size mismatch: %d expected, %d actual\n",
> + len, slit->header.length);
> + memset(numa_slit, 10, sizeof(numa_slit));
> + return;
> + }
> + slit_table = slit;
> +}
> +
> +void __init
> +acpi_numa_processor_affinity_init (struct acpi_table_processor_affinity *pa)
> +{
> + /* record this node in proximity bitmap */
> + pxm_bit_set(pa->proximity_domain);
> +
> + node_cpuid[srat_num_cpus].phys_id = (pa->apic_id << 8) | (pa->lsapic_eid);
> + /* nid should be overridden as logical node id later */
> + node_cpuid[srat_num_cpus].nid = pa->proximity_domain;
> + srat_num_cpus++;
> +}
> +
> +void __init
> +acpi_numa_memory_affinity_init (struct acpi_table_memory_affinity *ma)
> +{
> + unsigned long paddr, size, hole_size, min_hole_size;
> + u8 pxm;
> + struct node_memblk_s *p, *q, *pend;
> +
> + pxm = ma->proximity_domain;
> +
> + /* fill node memory chunk structure */
> + paddr = ma->base_addr_hi;
> + paddr = (paddr << 32) | ma->base_addr_lo;
> + size = ma->length_hi;
> + size = (size << 32) | ma->length_lo;
> +
> + if (num_memblks >= NR_MEMBLKS) {
> + printk(KERN_ERR "Too many mem chunks in SRAT. Ignoring %ld MBytes at %lx\n",
> + size/(1024*1024), paddr);
> + return;
> + }
> +
> + /* Ignore disabled entries */
> + if (!ma->flags.enabled)
> + return;
> +
> + /*
> + * When the chunk is not the first one in the node, check distance
> + * from the other chunks. When the hole is too huge ignore the chunk.
> + * This restriction should be removed when multiple chunks per node
> + * is supported.
> + */
> + pend = &node_memblk[num_memblks];
> + min_hole_size = 0;
> + for (p = &node_memblk[0]; p < pend; p++) {
> + if (p->nid != pxm)
> + continue;
> + if (p->start_paddr < paddr)
> + hole_size = paddr - (p->start_paddr + p->size);
> + else
> + hole_size = p->start_paddr - (paddr + size);
> +
> + if (!min_hole_size || hole_size < min_hole_size)
> + min_hole_size = hole_size;
> + }
> +
> +#if 0 /* test */
> + if (min_hole_size) {
> + if (min_hole_size > size) {
> + printk(KERN_ERR "Too huge memory hole. Ignoring %ld MBytes at %lx\n",
> + size/(1024*1024), paddr);
> + return;
> + }
> + }
> +#endif
> +
> + /* record this node in proximity bitmap */
> + pxm_bit_set(pxm);
> +
> + /* Insertion sort based on base address */
> + pend = &node_memblk[num_memblks];
> + for (p = &node_memblk[0]; p < pend; p++) {
> + if (paddr < p->start_paddr)
> + break;
> + }
> + if (p < pend) {
> + for (q = pend; q >= p; q--)
> + *(q + 1) = *q;
> + }
> + p->start_paddr = paddr;
> + p->size = size;
> + p->nid = pxm;
> + num_memblks++;
> +}
> +
> +void __init
> +acpi_numa_arch_fixup(void)
> +{
> + int i, j, node_from, node_to;
> +
> + if (srat_num_cpus == 0) {
> + node_cpuid[0].phys_id = hard_smp_processor_id();
> + return;
> + }
> +
> + /* calculate total number of nodes in system from PXM bitmap */
> + numnodes = 0; /* init total nodes in system */
> +
> + memset(pxm_to_nid_map, -1, sizeof(pxm_to_nid_map));
> + memset(nid_to_pxm_map, -1, sizeof(nid_to_pxm_map));
> + for (i = 0; i < MAX_PXM_DOMAINS; i++) {
> + if (pxm_bit_test(i)) {
> + pxm_to_nid_map[i] = numnodes;
> + nid_to_pxm_map[numnodes++] = i;
> + }
> + }
> +
> + /* set logical node id in memory chunk structure */
> + for (i = 0; i < num_memblks; i++)
> + node_memblk[i].nid = pxm_to_nid_map[node_memblk[i].nid];
> +
> + /* assign memory bank numbers for each chunk on each node */
> + for (i = 0; i < numnodes; i++) {
> + int bank;
> +
> + bank = 0;
> + for (j = 0; j < num_memblks; j++)
> + if (node_memblk[j].nid == i)
> + node_memblk[j].bank = bank++;
> + }
> +
> + /* set logical node id in cpu structure */
> + for (i = 0; i < srat_num_cpus; i++)
> + node_cpuid[i].nid = pxm_to_nid_map[node_cpuid[i].nid];
> +
> + printk(KERN_INFO "Number of logical nodes in system = %d\n", numnodes);
> + printk(KERN_INFO "Number of memory chunks in system = %d\n", num_memblks);
> +
> + if (!slit_table) return;
> + memset(numa_slit, -1, sizeof(numa_slit));
> + for (i=0; i<slit_table->localities; i++) {
> + if (!pxm_bit_test(i))
> + continue;
> + node_from = pxm_to_nid_map[i];
> + for (j=0; j<slit_table->localities; j++) {
> + if (!pxm_bit_test(j))
> + continue;
> + node_to = pxm_to_nid_map[j];
> + node_distance(node_from, node_to) =
> + slit_table->entry[i*slit_table->localities + j];
> + }
> + }
> +
> +#ifdef SLIT_DEBUG
> + printk(KERN_DEBUG "ACPI 2.0 SLIT locality table:\n");
> + for (i = 0; i < numnodes; i++) {
> + for (j = 0; j < numnodes; j++)
> + printk(KERN_DEBUG "%03d ", node_distance(i,j));
> + printk("\n");
> + }
> +#endif
> +}
> +#endif /* CONFIG_ACPI_NUMA */
> +
> static int __init
> acpi_parse_fadt (unsigned long phys_addr, unsigned long size)
> {
> @@ -487,12 +680,6 @@
> int __init
> acpi_boot_init (char *cmdline)
> {
> - int result;
> -
> - /* Initialize the ACPI boot-time table parser */
> - result = acpi_table_init(cmdline);
> - if (result)
> - return result;
>
> /*
> * MADT
> @@ -556,6 +743,22 @@
> available_cpus = 1; /* We've got at least one of these, no? */
> }
> smp_boot_data.cpu_count = total_cpus;
> + smp_build_cpu_map();
> +
> +# ifdef CONFIG_NUMA
> + /* If the platform did not have an SRAT table, initialize the
> + * node_cpuid table from the smp_boot_data array. All cpus
> + * will be on node 0.
> + */
> + if (srat_num_cpus == 0) {
> + int cpu, i=1;
> + for (cpu=0; cpu<smp_boot_data.cpu_count; cpu++)
> + if (smp_boot_data.cpu_phys_id[cpu] != hard_smp_processor_id())
> + node_cpuid[i++].phys_id = smp_boot_data.cpu_phys_id[cpu];
> + }
> + build_cpu_to_node_map();
> +# endif
> +
> #endif
> /* Make boot-up look pretty */
> printk(KERN_INFO "%d CPUs available, %d CPUs total\n", available_cpus, total_cpus);
> diff -Naur linux_base/arch/ia64/kernel/setup.c linux/arch/ia64/kernel/setup.c
> --- linux_base/arch/ia64/kernel/setup.c Sat Jul 5 22:59:38 2003
> +++ linux/arch/ia64/kernel/setup.c Tue Jul 29 15:29:42 2003
> @@ -40,6 +40,8 @@
> #include <asm/system.h>
> #include <asm/mca.h>
> #include <asm/smp.h>
> +#include <asm/pgtable.h>
> +#include <asm/pgalloc.h>
> #include <asm/tlb.h>
>
> #ifdef CONFIG_BLK_DEV_RAM
> @@ -56,7 +58,7 @@
> extern char _end;
>
> #ifdef CONFIG_NUMA
> - struct cpuinfo_ia64 *boot_cpu_data;
> + struct cpuinfo_ia64 *_cpu_data[NR_CPUS];
> #else
> struct cpuinfo_ia64 _cpu_data[NR_CPUS] __attribute__ ((section ("__special_page_section")));
> mmu_gather_t mmu_gathers[NR_CPUS];
> @@ -99,6 +101,7 @@
> static struct rsvd_region rsvd_region[IA64_MAX_RSVD_REGIONS + 1];
> static int num_rsvd_regions;
>
> +#ifndef CONFIG_DISCONTIGMEM
> static unsigned long bootmap_start; /* physical address where the bootmem map is located */
>
> static int
> @@ -111,18 +114,74 @@
> *max_pfn = pfn;
> return 0;
> }
> +#endif /* !CONFIG_DISCONTIGMEM */
>
> #define IGNORE_PFN0 1 /* XXX fix me: ignore pfn 0 until TLB miss handler is updated... */
>
> +#ifdef CONFIG_DISCONTIGMEM
> /*
> - * Free available memory based on the primitive map created from
> - * the boot parameters. This routine does not assume the incoming
> - * segments are sorted.
> + * efi_memmap_walk() knows nothing about layout of memory across nodes. Find
> + * out to which node a block of memory belongs. Ignore memory that we cannot
> + * identify, and split blocks that run across multiple nodes.
> + *
> + * Take this opportunity to round the start address up and the end address
> + * down to page boundaries.
> */
> +void
> +call_pernode_memory (unsigned long start, unsigned long end, void *arg)
> +{
> + unsigned long rs, re;
> + void (*func)(unsigned long, unsigned long, int);
> + int i;
> +
> + start = PAGE_ALIGN(start);
> + end &= PAGE_MASK;
> + if (start >= end)
> + return;
> +
> + func = arg;
> +
> + if (!num_memblks) {
> + /* this machine doesn't have SRAT, */
> + /* so call func with nid=0, bank=0 */
> + if (start < end)
> + (*func)(start, end, 0);
> + return;
> + }
> +
> + for (i = 0; i < num_memblks; i++) {
> + rs = MAX(__pa(start), node_memblk[i].start_paddr);
> + re = MIN(__pa(end), node_memblk[i].start_paddr+node_memblk[i].size);
> +
> + if (rs < re)
> + (*func)((unsigned long)__va(rs), (unsigned long)__va(re), node_memblk[i].nid);
> + if ((unsigned long)__va(re) == end)
> + break;
> + }
> +}
> +
> +#else /* CONFIG_DISCONTIGMEM */
> +
> static int
> free_available_memory (unsigned long start, unsigned long end, void *arg)
> {
> + free_bootmem(__pa(start), end - start);
> + return 0;
> +}
> +#endif /* CONFIG_DISCONTIGMEM */
> +
> +/*
> + * Filter incoming memory segments based on the primitive map created from
> + * the boot parameters. Segments contained in the map are removed from the
> + * memory ranges. A caller-specified function is called with the memory
> + * ranges that remain after filtering.
> + * This routine does not assume the incoming segments are sorted.
> + */
> +int
> +filter_rsvd_memory (unsigned long start, unsigned long end, void *arg)
> +{
> unsigned long range_start, range_end, prev_start;
> + void (*func)(unsigned long, unsigned long, int);
> int i;
>
> #if IGNORE_PFN0
> @@ -136,13 +195,18 @@
> * lowest possible address(walker uses virtual)
> */
> prev_start = PAGE_OFFSET;
> + func = arg;
>
> for (i = 0; i < num_rsvd_regions; ++i) {
> range_start = MAX(start, prev_start);
> range_end = MIN(end, rsvd_region[i].start);
>
> if (range_start < range_end)
> - free_bootmem(__pa(range_start), range_end - range_start);
> +#ifdef CONFIG_DISCONTIGMEM
> + call_pernode_memory(range_start, range_end, func);
> +#else
> + (*func)(range_start, range_end, 0);
> +#endif
>
> /* nothing more available in this segment */
> if (range_end == end) return 0;
> @@ -154,6 +218,7 @@
> }
>
>
> +#ifndef CONFIG_DISCONTIGMEM
> /*
> * Find a place to put the bootmap and return its starting address in bootmap_start.
> * This address must be page-aligned.
> @@ -192,6 +257,7 @@
> }
> return 0;
> }
> +#endif /* CONFIG_DISCONTIGMEM */
>
> static void
> sort_regions (struct rsvd_region *rsvd_region, int max)
> @@ -256,6 +322,14 @@
>
> sort_regions(rsvd_region, num_rsvd_regions);
>
> +#ifdef CONFIG_DISCONTIGMEM
> + {
> + extern void discontig_mem_init(void);
> + bootmap_size = max_pfn = 0; /* stop gcc warnings */
> + discontig_mem_init();
> + }
> +#else /* !CONFIG_DISCONTIGMEM */
> +
> /* first find highest page frame number */
> max_pfn = 0;
> efi_memmap_walk(find_max_pfn, &max_pfn);
> @@ -272,8 +346,9 @@
> bootmap_size = init_bootmem(bootmap_start >> PAGE_SHIFT, max_pfn);
>
> /* Free all available memory, then mark bootmem-map as being in use. */
> - efi_memmap_walk(free_available_memory, 0);
> + efi_memmap_walk(filter_rsvd_memory, free_available_memory);
> reserve_bootmem(bootmap_start, bootmap_size);
> +#endif /* !CONFIG_DISCONTIGMEM */
>
> #ifdef CONFIG_BLK_DEV_INITRD
> if (ia64_boot_param->initrd_start) {
> @@ -300,6 +375,19 @@
>
> efi_init();
>
> +#ifdef CONFIG_ACPI_BOOT
> + /* Initialize the ACPI boot-time table parser */
> + acpi_table_init(*cmdline_p);
> +
> +# ifdef CONFIG_ACPI_NUMA
> + acpi_numa_init();
> +# endif
> +#else
> +# ifdef CONFIG_SMP
> + smp_build_cpu_map(); /* happens, e.g., with the Ski simulator */
> +# endif
> +#endif /* CONFIG_ACPI_BOOT */
> +
> iomem_resource.end = ~0UL; /* FIXME probably belongs elsewhere */
> find_memory();
>
> @@ -448,6 +536,8 @@
> c->itc_freq / 1000000, c->itc_freq % 1000000,
> lpj*HZ/500000, (lpj*HZ/5000) % 100);
> return 0;
> +#undef lpj
> +#undef cpu
> }
>
> static void *
> @@ -548,7 +638,7 @@
> unsigned int max_ctx;
> struct cpuinfo_ia64 *my_cpu_data;
> #ifdef CONFIG_NUMA
> - int cpu, order;
> + int cpu;
>
> /*
> * If NUMA is configured, the cpu_data array is not preallocated. The boot cpu
> @@ -557,34 +647,14 @@
> * is required because some boot code references all cpu_data structures
> * before the cpus are actually started.
> */
> - if (!boot_cpu_data) {
> - my_cpu_data = alloc_bootmem_pages_node(NODE_DATA(numa_node_id()),
> - sizeof(struct cpuinfo_ia64));
> - boot_cpu_data = my_cpu_data;
> - my_cpu_data->cpu_data[0] = my_cpu_data;
> - for (cpu = 1; cpu < NR_CPUS; ++cpu)
> - my_cpu_data->cpu_data[cpu]
> - = alloc_bootmem_pages_node(NODE_DATA(numa_node_id()),
> - sizeof(struct cpuinfo_ia64));
> - for (cpu = 1; cpu < NR_CPUS; ++cpu)
> - memcpy(my_cpu_data->cpu_data[cpu]->cpu_data,
> - my_cpu_data->cpu_data, sizeof(my_cpu_data->cpu_data));
> - my_cpu_data->mmu_gathers = alloc_bootmem_pages_node(BOOT_NODE_DATA(boot_get_local_cnodeid()),
> - sizeof(mmu_gather_t));
> - } else {
> - order = get_order(sizeof(struct cpuinfo_ia64));
> - my_cpu_data = page_address(alloc_pages_node(numa_node_id(), GFP_KERNEL, order));
> - memcpy(my_cpu_data, boot_cpu_data->cpu_data[smp_processor_id()],
> - sizeof(struct cpuinfo_ia64));
> - __free_pages(virt_to_page(boot_cpu_data->cpu_data[smp_processor_id()]),
> - order);
> - for (cpu = 0; cpu < NR_CPUS; ++cpu)
> - boot_cpu_data->cpu_data[cpu]->cpu_data[smp_processor_id()] = my_cpu_data;
> -
> - my_cpu_data->mmu_gathers = page_address(boot_alloc_pages_node(boot_get_local_cnodeid(),
> - GFP_KERNEL,
> - get_order(sizeof(mmu_gather_t)));
> - }
> + for (cpu=0; cpu < NR_CPUS; cpu++)
> + if (node_cpuid[cpu].phys_id == hard_smp_processor_id())
> + break;
> + my_cpu_data = _cpu_data[cpu];
> + my_cpu_data->node_data->active_cpu_count++;
> +
> + for (cpu=0; cpu<NR_CPUS; cpu++)
> + _cpu_data[cpu]->cpu_data[smp_processor_id()] = my_cpu_data;
> #else
> my_cpu_data = cpu_data(smp_processor_id());
> my_cpu_data->mmu_gathers = &mmu_gathers[smp_processor_id()];
> diff -Naur linux_base/arch/ia64/kernel/smpboot.c linux/arch/ia64/kernel/smpboot.c
> --- linux_base/arch/ia64/kernel/smpboot.c Sat Jul 5 22:59:38 2003
> +++ linux/arch/ia64/kernel/smpboot.c Mon Jul 28 16:10:20 2003
> @@ -584,3 +584,27 @@
> smp_num_cpus = 1;
> }
> }
> +
> +/*
> + * Initialize the logical CPU number to SAPICID mapping
> + */
> +void __init
> +smp_build_cpu_map (void)
> +{
> + int sapicid, cpu, i;
> + int boot_cpu_id = hard_smp_processor_id();
> +
> + for (cpu = 0; cpu < NR_CPUS; cpu++)
> + ia64_cpu_to_sapicid[cpu] = -1;
> +
> + ia64_cpu_to_sapicid[0] = boot_cpu_id;
> +
> + for (cpu = 1, i = 0; i < smp_boot_data.cpu_count; i++) {
> + sapicid = smp_boot_data.cpu_phys_id[i];
> + if (sapicid == -1 || sapicid == boot_cpu_id)
> + continue;
> + ia64_cpu_to_sapicid[cpu] = sapicid;
> + cpu++;
> + }
> +}
> +
> diff -Naur linux_base/arch/ia64/mm/Makefile linux/arch/ia64/mm/Makefile
> --- linux_base/arch/ia64/mm/Makefile Sat Jul 5 22:59:38 2003
> +++ linux/arch/ia64/mm/Makefile Mon Jul 28 10:19:02 2003
> @@ -12,6 +12,8 @@
> export-objs := init.o
>
> obj-y := init.o fault.o tlb.o extable.o
> +obj-$(CONFIG_NUMA) += numa.o
> +obj-$(CONFIG_DISCONTIGMEM) += discontig.o
> obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
>
> include $(TOPDIR)/Rules.make
> --- linux_base/arch/ia64/mm/discontig.c Wed Dec 31 18:00:00 1969
> +++ linux/arch/ia64/mm/discontig.c Tue Jul 29 20:10:49 2003
> @@ -0,0 +1,282 @@
> +/*
> + * Copyright (c) 2000, 2003 Silicon Graphics, Inc. All rights reserved.
> + * Copyright (c) 2001 Intel Corp.
> + * Copyright (c) 2001 Tony Luck <tony.luck@intel.com>
> + * Copyright (c) 2002 NEC Corp.
> + * Copyright (c) 2002 Kimio Suganuma <k-suganuma@da.jp.nec.com>
> + */
> +
> +/*
> + * Platform initialization for Discontig Memory
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/mm.h>
> +#include <linux/bootmem.h>
> +#include <linux/mmzone.h>
> +#include <linux/acpi.h>
> +#include <linux/efi.h>
> +#include <asm/pgalloc.h>
> +#include <asm/tlb.h>
> +
> +
> +/*
> + * Round an address upward to the next multiple of GRANULE size.
> + */
> +#define GRANULEROUNDDOWN(n) ((n) & ~(IA64_GRANULE_SIZE-1))
> +#define GRANULEROUNDUP(n) (((n)+IA64_GRANULE_SIZE-1) & ~(IA64_GRANULE_SIZE-1))
> +
> +/*
> + * Used to locate BOOT_DATA prior to initializing the node data area.
> + */
> +#define BOOT_NODE_DATA(node) pg_data_ptr[node]
> +
> +/*
> + * To prevent cache aliasing effects, align per-node structures so that they
> + * start at addresses that are strided by node number.
> + */
> +#define NODEDATA_ALIGN(addr, node) ((((addr) + 1024*1024-1) & ~(1024*1024-1)) + (node)*PAGE_SIZE)
> +
> +
> +static struct ia64_node_data *boot_node_data[NR_NODES] __initdata;
> +static pg_data_t *pg_data_ptr[NR_NODES] __initdata;
> +static bootmem_data_t bdata[NR_NODES] __initdata;
> +static unsigned long boot_pernode[NR_NODES] __initdata;
> +static unsigned long boot_pernodesize[NR_NODES] __initdata;
> +
> +extern int filter_rsvd_memory (unsigned long start, unsigned long end, void *arg);
> +extern struct cpuinfo_ia64 *_cpu_data[NR_CPUS];
> +
> +
> +
> +/*
> + * We allocate one of the bootmem_data_t structs for each piece of memory
> + * that we wish to treat as a contiguous block. Each such block must start
> + * on a GRANULE boundary. Multiple banks per node is not supported.
> + * (Note: on SN2, all memory on a node is treated as a single bank.
> + * Holes within the bank are supported. This works because memory
> + * from different banks is not interleaved. The bootmap bitmap
> + * for the node is somewhat large but not too large).
> + */
> +static int __init
> +build_maps(unsigned long start, unsigned long end, int node)
> +{
> + bootmem_data_t *bdp;
> + unsigned long cstart, epfn;
> +
> + bdp = &bdata[node];
> + epfn = GRANULEROUNDUP(__pa(end)) >> PAGE_SHIFT;
> + cstart = GRANULEROUNDDOWN(__pa(start));
> +
> + if (!bdp->node_low_pfn) {
> + bdp->node_boot_start = cstart;
> + bdp->node_low_pfn = epfn;
> + } else {
> + bdp->node_boot_start = min(cstart, bdp->node_boot_start);
> + bdp->node_low_pfn = max(epfn, bdp->node_low_pfn);
> + }
> +
> + min_low_pfn = min(min_low_pfn, bdp->node_boot_start>>PAGE_SHIFT);
> + max_low_pfn = max(max_low_pfn, bdp->node_low_pfn);
> +
> + return 0;
> +}
> +
> +
> +/*
> + * Count the number of cpus on the node
> + */
> +static __inline__ int
> +count_cpus(int node)
> +{
> + int cpu, n=0;
> +
> + for (cpu=0; cpu < NR_CPUS; cpu++)
> + if (node == node_cpuid[cpu].nid)
> + n++;
> + return n;
> +}
> +
> +
> +/*
> + * Find space on each node for the bootmem map & other per-node data structures.
> + *
> + * Called by efi_memmap_walk to find boot memory on each node. Note that
> + * only blocks that are free are passed to this routine (currently filtered by
> + * free_available_memory).
> + */
> +static int __init
> +find_pernode_space(unsigned long start, unsigned long end, int node)
> +{
> + unsigned long mapsize, pages, epfn, map=0, cpu, cpus;
> + unsigned long pernodesize=0, pernode;
> + unsigned long cpu_data, mmu_gathers;
> + unsigned long pstart, length;
> + bootmem_data_t *bdp;
> +
> + pstart = __pa(start);
> + length = end - start;
> + epfn = (pstart + length) >> PAGE_SHIFT;
> + bdp = &bdata[node];
> +
> + if (pstart < bdp->node_boot_start || epfn > bdp->node_low_pfn)
> + return 0;
> +
> + if (!boot_pernode[node]) {
> + cpus = count_cpus(node);
> + pernodesize += PAGE_ALIGN(sizeof(struct cpuinfo_ia64)) * cpus;
> + pernodesize += L1_CACHE_ALIGN(sizeof(mmu_gather_t)) * cpus;
> + pernodesize += L1_CACHE_ALIGN(sizeof(pg_data_t));
> + pernodesize += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
> + pernodesize = PAGE_ALIGN(pernodesize);
> + pernode = NODEDATA_ALIGN(pstart, node);
> +
> + if (pstart + length > (pernode + pernodesize)) {
> + boot_pernode[node] = pernode;
> + boot_pernodesize[node] = pernodesize;
> + memset(__va(pernode), 0, pernodesize);
> +
> + cpu_data = pernode;
> + pernode += PAGE_ALIGN(sizeof(struct cpuinfo_ia64)) * cpus;
> +
> + mmu_gathers = pernode;
> + pernode += L1_CACHE_ALIGN(sizeof(mmu_gather_t)) * cpus;
> +
> + pg_data_ptr[node] = __va(pernode);
> + pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
> +
> + boot_node_data[node] = __va(pernode);
> + pernode += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
> +
> + pg_data_ptr[node]->bdata = &bdata[node];
> + pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
> +
> + for (cpu=0; cpu < NR_CPUS; cpu++) {
> + if (node == node_cpuid[cpu].nid) {
> + _cpu_data[cpu] = __va(cpu_data);
> + _cpu_data[cpu]->node_data = boot_node_data[node];
> + _cpu_data[cpu]->nodeid = node;
> + _cpu_data[cpu]->mmu_gathers = __va(mmu_gathers);
> + cpu_data += PAGE_ALIGN(sizeof(struct cpuinfo_ia64));
> + mmu_gathers += L1_CACHE_ALIGN(sizeof(mmu_gather_t));
> + }
> + }
> +
> + }
> + }
> +
> + pernode = boot_pernode[node];
> + pernodesize = boot_pernodesize[node];
> + if (pernode && !bdp->node_bootmem_map) {
> + pages = bdp->node_low_pfn - (bdp->node_boot_start>>PAGE_SHIFT);
> + mapsize = bootmem_bootmap_pages(pages) << PAGE_SHIFT;
> +
> + if (pernode - pstart > mapsize)
> + map = pstart;
> + else if (pstart + length - pernode - pernodesize > mapsize)
> + map = pernode + pernodesize;
> +
> + if (map) {
> + init_bootmem_node(
> + BOOT_NODE_DATA(node),
> + map>>PAGE_SHIFT,
> + bdp->node_boot_start>>PAGE_SHIFT,
> + bdp->node_low_pfn);
> + }
> +
> + }
> +
> + return 0;
> +}
> +
> +
> +/*
> + * Free available memory to the bootmem allocator.
> + *
> + * Note that only blocks that are free are passed to this routine (currently
> + * filtered by free_available_memory).
> + *
> + */
> +static int __init
> +discontig_free_bootmem_node(unsigned long start, unsigned long end, int node)
> +{
> + free_bootmem_node(BOOT_NODE_DATA(node), __pa(start), end - start);
> +
> + return 0;
> +}
> +
> +
> +/*
> + * Reserve the space used by the bootmem maps.
> + */
> +static void __init
> +discontig_reserve_bootmem(void)
> +{
> + int node;
> + unsigned long base, size, pages;
> + bootmem_data_t *bdp;
> +
> + for (node = 0; node < numnodes; node++) {
> + bdp = BOOT_NODE_DATA(node)->bdata;
> +
> + pages = bdp->node_low_pfn - (bdp->node_boot_start>>PAGE_SHIFT);
> + size = bootmem_bootmap_pages(pages) << PAGE_SHIFT;
> + base = __pa(bdp->node_bootmem_map);
> + reserve_bootmem_node(BOOT_NODE_DATA(node), base, size);
> +
> + size = boot_pernodesize[node];
> + base = __pa(boot_pernode[node]);
> + reserve_bootmem_node(BOOT_NODE_DATA(node), base, size);
> + }
> +}
> +
> +/*
> + * Initialize per-node data
> + *
> + * Finish setting up the node data for this node, then copy it to the other nodes.
> + *
> + */
> +static void __init
> +initialize_pernode_data(void)
> +{
> + int cpu, node;
> +
> + memcpy(boot_node_data[0]->pg_data_ptrs, pg_data_ptr, sizeof(pg_data_ptr));
> + memcpy(boot_node_data[0]->node_data_ptrs, boot_node_data, sizeof(boot_node_data));
> +
> + for (node=1; node < numnodes; node++) {
> + memcpy(boot_node_data[node], boot_node_data[0], sizeof(struct ia64_node_data));
> + boot_node_data[node]->node = node;
> + }
> +
> + for (cpu=0; cpu < NR_CPUS; cpu++) {
> + node = node_cpuid[cpu].nid;
> + _cpu_data[cpu]->node_data = boot_node_data[node];
> + _cpu_data[cpu]->nodeid = node;
> + }
> +}
> +
> +
> +/*
> + * Called early in boot to setup the boot memory allocator, and to
> + * allocate the node-local pg_data & node-directory data structures..
> + */
> +void __init
> +discontig_mem_init(void)
> +{
> + if (numnodes == 0) {
> + printk("node info missing!\n");
> + numnodes = 1;
> + }
> +
> + min_low_pfn = -1;
> + max_low_pfn = 0;
> +
> + efi_memmap_walk(filter_rsvd_memory, build_maps);
> + efi_memmap_walk(filter_rsvd_memory, find_pernode_space);
> + efi_memmap_walk(filter_rsvd_memory, discontig_free_bootmem_node);
> +
> + discontig_reserve_bootmem();
> + initialize_pernode_data();
> +}
> +
> diff -Naur linux_base/arch/ia64/mm/init.c linux/arch/ia64/mm/init.c
> --- linux_base/arch/ia64/mm/init.c Sat Jul 5 22:59:38 2003
> +++ linux/arch/ia64/mm/init.c Wed Jul 30 11:58:26 2003
> @@ -16,6 +16,7 @@
> #include <linux/slab.h>
> #include <linux/swap.h>
> #include <linux/efi.h>
> +#include <linux/mmzone.h>
>
> #include <asm/bitops.h>
> #include <asm/dma.h>
> @@ -26,16 +27,21 @@
> #include <asm/sal.h>
> #include <asm/system.h>
> #include <asm/uaccess.h>
> +#include <asm/tlb.h>
> +#include <asm/numa.h>
>
> /* References to section boundaries: */
> extern char _stext, _etext, _edata, __init_begin, __init_end;
>
> extern void ia64_tlb_init (void);
> +extern int filter_rsvd_memory (unsigned long, unsigned long, void *);
>
> +/* Note - may be changed by platform_setup */
> unsigned long MAX_DMA_ADDRESS = PAGE_OFFSET + 0x100000000UL;
> #define LARGE_GAP 0x40000000 /* Use virtual mem map if a hole is > than this */
>
> -static unsigned long totalram_pages;
> +static unsigned long totalram_pages, reserved_pages;
> +struct page *zero_page_memmap_ptr; /* map entry for zero page */
>
> unsigned long vmalloc_end = VMALLOC_END_INIT;
>
> @@ -107,10 +113,11 @@
> void
> free_initmem (void)
> {
> - unsigned long addr;
> + unsigned long addr, eaddr;
>
> addr = (unsigned long) &__init_begin;
> - for (; addr < (unsigned long) &__init_end; addr += PAGE_SIZE) {
> + eaddr = (unsigned long) &__init_end;
> + for (; addr < eaddr; addr += PAGE_SIZE) {
> clear_bit(PG_reserved, &virt_to_page((void *)addr)->flags);
> set_page_count(virt_to_page((void *)addr), 1);
> free_page(addr);
> @@ -186,58 +193,39 @@
> void
> show_mem(void)
> {
> - int i, total = 0, reserved = 0;
> - int shared = 0, cached = 0;
> + int i, reserved;
> + int shared, cached;
> + pg_data_t *pgdat;
> + char *tchar = (numnodes > 1) ? "\t" : "";
>
> printk("Mem-info:\n");
> show_free_areas();
>
> -#ifdef CONFIG_DISCONTIGMEM
> - {
> - pg_data_t *pgdat = pgdat_list;
> -
> - printk("Free swap: %6dkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
> - do {
> - printk("Node ID: %d\n", pgdat->node_id);
> - for(i = 0; i < pgdat->node_size; i++) {
> - if (PageReserved(pgdat->node_mem_map+i))
> - reserved++;
> - else if (PageSwapCache(pgdat->node_mem_map+i))
> - cached++;
> - else if (page_count(pgdat->node_mem_map + i))
> - shared += page_count(pgdat->node_mem_map + i) - 1;
> - }
> - printk("\t%d pages of RAM\n", pgdat->node_size);
> - printk("\t%d reserved pages\n", reserved);
> - printk("\t%d pages shared\n", shared);
> - printk("\t%d pages swap cached\n", cached);
> - pgdat = pgdat->node_next;
> - } while (pgdat);
> - printk("Total of %ld pages in page table cache\n", pgtable_cache_size);
> - show_buffers();
> - printk("%d free buffer pages\n", nr_free_buffer_pages());
> - }
> -#else /* !CONFIG_DISCONTIGMEM */
> printk("Free swap: %6dkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
> - i = max_mapnr;
> - while (i-- > 0) {
> - if (!VALID_PAGE(mem_map + i))
> - continue;
> - total++;
> - if (PageReserved(mem_map+i))
> - reserved++;
> - else if (PageSwapCache(mem_map+i))
> - cached++;
> - else if (page_count(mem_map + i))
> - shared += page_count(mem_map + i) - 1;
> + for_each_pgdat(pgdat) {
> + reserved=0;
> + cached=0;
> + shared=0;
> + if (numnodes > 1)
> + printk("Node ID: %d\n", pgdat->node_id);
> + for(i = 0; i < pgdat->node_size; i++) {
> + if (!VALID_PAGE(pgdat->node_mem_map+i))
> + continue;
> + if (PageReserved(pgdat->node_mem_map+i))
> + reserved++;
> + else if (PageSwapCache(pgdat->node_mem_map+i))
> + cached++;
> + else if (page_count(pgdat->node_mem_map + i))
> + shared += page_count(pgdat->node_mem_map + i) - 1;
> + }
> + printk("%s%ld pages of RAM\n", tchar, pgdat->node_size);
> + printk("%s%d reserved pages\n", tchar, reserved);
> + printk("%s%d pages shared\n", tchar, shared);
> + printk("%s%d pages swap cached\n", tchar, cached);
> }
> - printk("%d pages of RAM\n", total);
> - printk("%d reserved pages\n", reserved);
> - printk("%d pages shared\n", shared);
> - printk("%d pages swap cached\n", cached);
> - printk("%ld pages in page table cache\n", pgtable_cache_size);
> + printk("Total of %ld pages in page table cache\n", pgtable_cache_size);
> show_buffers();
> -#endif /* !CONFIG_DISCONTIGMEM */
> + printk("%d free buffer pages\n", nr_free_buffer_pages());
> }
>
> /*
> @@ -357,8 +345,10 @@
> static int
> create_mem_map_page_table (u64 start, u64 end, void *arg)
> {
> - unsigned long address, start_page, end_page;
> + unsigned long address, start_page, end_page, next_blk_page;
> + unsigned long blk_start;
> struct page *map_start, *map_end;
> + int node=0;
> pgd_t *pgd;
> pmd_t *pmd;
> pte_t *pte;
> @@ -371,18 +361,35 @@
> start_page = (unsigned long) map_start & PAGE_MASK;
> end_page = PAGE_ALIGN((unsigned long) map_end);
>
> + /* force the first iteration to get node id */
> + blk_start = start;
> + next_blk_page = 0;
> +
> for (address = start_page; address < end_page; address += PAGE_SIZE) {
> +
> + /* if we went across a node boundary, get new nid */
> + if (address >= next_blk_page) {
> + struct page *map_next_blk;
> +
> + node = paddr_to_nid(__pa(blk_start));
> +
> + /* get end addr of this memblk as next blk_start */
> + blk_start = (unsigned long) __va(min(end, memblk_endpaddr(__pa(blk_start))));
> + map_next_blk = vmem_map + MAP_NR_DENSE(blk_start);
> + next_blk_page = PAGE_ALIGN((unsigned long) map_next_blk);
> + }
> +
> pgd = pgd_offset_k(address);
> if (pgd_none(*pgd))
> - pgd_populate(&init_mm, pgd, alloc_bootmem_pages(PAGE_SIZE));
> + pgd_populate(&init_mm, pgd, alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE));
> pmd = pmd_offset(pgd, address);
>
> if (pmd_none(*pmd))
> - pmd_populate(&init_mm, pmd, alloc_bootmem_pages(PAGE_SIZE));
> + pmd_populate(&init_mm, pmd, alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE));
> pte = pte_offset(pmd, address);
>
> if (pte_none(*pte))
> - set_pte(pte, mk_pte_phys(__pa(alloc_bootmem_pages(PAGE_SIZE)),
> + set_pte(pte, mk_pte_phys(__pa(alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE)),
> PAGE_KERNEL));
> }
> return 0;
> @@ -396,6 +403,14 @@
> int highmem;
> };
>
> +struct memmap_count_callback_data {
> + int node;
> + unsigned long num_physpages;
> + unsigned long num_dma_physpages;
> + unsigned long min_pfn;
> + unsigned long max_pfn;
> +} cdata;
> +
> static int
> virtual_memmap_init (u64 start, u64 end, void *arg)
> {
> @@ -451,17 +466,7 @@
> efi_memmap_walk(virtual_memmap_init, &args);
> }
>
> - return page_to_phys(end);
> -}
> -
> -static int
> -count_dma_pages (u64 start, u64 end, void *arg)
> -{
> - unsigned long *count = arg;
> -
> - if (end <= MAX_DMA_ADDRESS)
> - *count += (end - start) >> PAGE_SHIFT;
> - return 0;
> + return page_to_phys(end-1) + PAGE_SIZE;
> }
>
> int
> @@ -472,16 +477,27 @@
> return __get_user(byte, (char *) page) == 0;
> }
>
> +#define GRANULEROUNDDOWN(n) ((n) & ~(IA64_GRANULE_SIZE-1))
> +#define GRANULEROUNDUP(n) (((n)+IA64_GRANULE_SIZE-1) & ~(IA64_GRANULE_SIZE-1))
> +#define ORDERROUNDDOWN(n) ((n) & ~((PAGE_SIZE<<MAX_ORDER)-1))
> static int
> -count_pages (u64 start, u64 end, void *arg)
> +count_pages (u64 start, u64 end, int node)
> {
> - unsigned long *count = arg;
> -
> - *count += (end - start) >> PAGE_SHIFT;
> + start = __pa(start);
> + end = __pa(end);
> + if (node == cdata.node) {
> + cdata.num_physpages += (end - start) >> PAGE_SHIFT;
> + if (start <= __pa(MAX_DMA_ADDRESS))
> + cdata.num_dma_physpages += (min(end, __pa(MAX_DMA_ADDRESS)) - start) >> PAGE_SHIFT;
> + start = GRANULEROUNDDOWN(__pa(start));
> + start = ORDERROUNDDOWN(start);
> + end = GRANULEROUNDUP(__pa(end));
> + cdata.max_pfn = max(cdata.max_pfn, end >> PAGE_SHIFT);
> + cdata.min_pfn = min(cdata.min_pfn, start >> PAGE_SHIFT);
> + }
> return 0;
> }
>
> -#ifndef CONFIG_DISCONTIGMEM
> static int
> find_largest_hole(u64 start, u64 end, void *arg)
> {
> @@ -495,7 +511,6 @@
> last_end = end;
> return 0;
> }
> -#endif
>
> /*
> * Set up the page tables.
> @@ -506,73 +521,76 @@
> unsigned long max_dma;
> unsigned long zones_size[MAX_NR_ZONES];
> unsigned long zholes_size[MAX_NR_ZONES];
> -#ifndef CONFIG_DISCONTIGMEM
> unsigned long max_gap;
> -#endif
> + int node;
>
> /* initialize mem_map[] */
>
> - memset(zones_size, 0, sizeof(zones_size));
> - memset(zholes_size, 0, sizeof(zholes_size));
> -
> - num_physpages = 0;
> - efi_memmap_walk(count_pages, &num_physpages);
> -
> - num_dma_physpages = 0;
> - efi_memmap_walk(count_dma_pages, &num_dma_physpages);
> -
> max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT;
> -
> - if (max_low_pfn < max_dma) {
> - zones_size[ZONE_DMA] = max_low_pfn;
> - zholes_size[ZONE_DMA] = max_low_pfn - num_dma_physpages;
> - } else {
> - zones_size[ZONE_DMA] = max_dma;
> - zholes_size[ZONE_DMA] = max_dma - num_dma_physpages;
> - if (num_physpages > num_dma_physpages) {
> - zones_size[ZONE_NORMAL] = max_low_pfn - max_dma;
> - zholes_size[ZONE_NORMAL] = (max_low_pfn - max_dma)
> - - (num_physpages - num_dma_physpages);
> - }
> - }
> -
> -#ifdef CONFIG_DISCONTIGMEM
> - free_area_init_node(0, NULL, NULL, zones_size, 0, zholes_size);
> -#else
> max_gap = 0;
> efi_memmap_walk(find_largest_hole, (u64 *)&max_gap);
>
> - if (max_gap < LARGE_GAP) {
> - vmem_map = (struct page *)0;
> - free_area_init_node(0, NULL, NULL, zones_size, 0, zholes_size);
> + for (node=0; node < numnodes; node++) {
> + memset(zones_size, 0, sizeof(zones_size));
> + memset(zholes_size, 0, sizeof(zholes_size));
> + memset(&cdata, 0, sizeof(cdata));
> +
> + cdata.node = node;
> + cdata.min_pfn = ~0;
> +
> + efi_memmap_walk(filter_rsvd_memory, count_pages);
> + num_dma_physpages += cdata.num_dma_physpages;
> + num_physpages += cdata.num_physpages;
> +
> + if (cdata.min_pfn >= max_dma) {
> + zones_size[ZONE_NORMAL] = cdata.max_pfn - cdata.min_pfn;
> + zholes_size[ZONE_NORMAL] = cdata.max_pfn - cdata.min_pfn - cdata.num_physpages;
> + } else if (cdata.max_pfn < max_dma) {
> + zones_size[ZONE_DMA] = cdata.max_pfn - cdata.min_pfn;
> + zholes_size[ZONE_DMA] = cdata.max_pfn - cdata.min_pfn - cdata.num_dma_physpages;
> + } else {
> + zones_size[ZONE_DMA] = max_dma - cdata.min_pfn;
> + zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] - cdata.num_dma_physpages;
> + zones_size[ZONE_NORMAL] = cdata.max_pfn - max_dma;
> + zholes_size[ZONE_NORMAL] = zones_size[ZONE_NORMAL] - (cdata.num_physpages - cdata.num_dma_physpages);
> + }
> +
> + if (numnodes == 1 && max_gap < LARGE_GAP) {
> + vmem_map = (struct page *)0;
> + zones_size[ZONE_DMA] += cdata.min_pfn;
> + zholes_size[ZONE_DMA] += cdata.min_pfn;
> + free_area_init_core(0, NODE_DATA(node), &mem_map, zones_size, 0, zholes_size, NULL);
> + } else {
> +
> + /* allocate virtual mem_map */
> +
> + if (node == 0) {
> + unsigned long map_size;
> + map_size = PAGE_ALIGN(max_low_pfn*sizeof(struct page));
> + vmalloc_end -= map_size;
> + mem_map = vmem_map = (struct page *) vmalloc_end;
> + efi_memmap_walk(create_mem_map_page_table, 0);
> + printk(KERN_INFO "Virtual mem_map starts at 0x%p\n", mem_map);
> + }
> +
> + free_area_init_node(node, NODE_DATA(node), vmem_map+cdata.min_pfn, zones_size,
> + cdata.min_pfn<<PAGE_SHIFT, zholes_size);
> + }
> }
> - else {
> - unsigned long map_size;
>
> - /* allocate virtual mem_map */
> -
> - map_size = PAGE_ALIGN(max_low_pfn*sizeof(struct page));
> - vmalloc_end -= map_size;
> - vmem_map = (struct page *) vmalloc_end;
> - efi_memmap_walk(create_mem_map_page_table, 0);
> -
> - free_area_init_node(0, NULL, vmem_map, zones_size, 0, zholes_size);
> - printk(KERN_INFO "Virtual mem_map starts at 0x%p\n", mem_map);
> - }
> -#endif
> + zero_page_memmap_ptr = virt_to_page(empty_zero_page);
> }
>
> static int
> count_reserved_pages (u64 start, u64 end, void *arg)
> {
> unsigned long num_reserved = 0;
> - unsigned long *count = arg;
> struct page *pg;
>
> for (pg = virt_to_page((void *)start); pg < virt_to_page((void *)end); ++pg)
> if (PageReserved(pg))
> ++num_reserved;
> - *count += num_reserved;
> + reserved_pages += num_reserved;
> return 0;
> }
>
> @@ -580,8 +598,10 @@
> mem_init (void)
> {
> extern char __start_gate_section[];
> - long reserved_pages, codesize, datasize, initsize;
> + long codesize, datasize, initsize;
> unsigned long num_pgt_pages;
> + pg_data_t *pgdat;
> +
>
> #ifdef CONFIG_PCI
> /*
> @@ -598,10 +618,11 @@
> max_mapnr = max_low_pfn;
> high_memory = __va(max_low_pfn * PAGE_SIZE);
>
> - totalram_pages += free_all_bootmem();
> + for_each_pgdat(pgdat)
> + totalram_pages += free_all_bootmem_node(pgdat);
>
> reserved_pages = 0;
> - efi_memmap_walk(count_reserved_pages, &reserved_pages);
> + efi_memmap_walk(filter_rsvd_memory, count_reserved_pages);
>
> codesize = (unsigned long) &_etext - (unsigned long) &_stext;
> datasize = (unsigned long) &_edata - (unsigned long) &_etext;
> --- linux_base/arch/ia64/mm/numa.c Wed Dec 31 18:00:00 1969
> +++ linux/arch/ia64/mm/numa.c Wed Jul 30 09:45:38 2003
> @@ -0,0 +1,104 @@
> +/*
> + * This file is subject to the terms and conditions of the GNU General Public
> + * License. See the file "COPYING" in the main directory of this archive
> + * for more details.
> + *
> + * This file contains NUMA specific variables and functions which can
> + * be split away from DISCONTIGMEM and are used on NUMA machines with
> + * contiguous memory.
> + *
> + * 2002/08/07 Erich Focht <efocht@ess.nec.de>
> + */
> +
> +#include <linux/config.h>
> +#include <linux/kernel.h>
> +#include <linux/mm.h>
> +#include <linux/init.h>
> +#include <linux/bootmem.h>
> +#include <linux/mmzone.h>
> +#include <linux/smp.h>
> +#include <asm/numa.h>
> +
> +/*
> + * The following structures are usually initialized by ACPI or
> + * similar mechanisms and describe the NUMA characteristics of the machine.
> + */
> +int num_memblks = 0;
> +struct node_memblk_s node_memblk[NR_MEMBLKS];
> +struct node_cpuid_s node_cpuid[NR_CPUS];
> +/*
> + * This is a matrix with "distances" between nodes, they should be
> + * proportional to the memory access latency ratios.
> + */
> +u8 numa_slit[NR_NODES * NR_NODES];
> +
> +/* Identify which cnode a physical address resides on */
> +int
> +paddr_to_nid(unsigned long paddr)
> +{
> + int i;
> +
> + for (i = 0; i < num_memblks; i++)
> + if (paddr >= node_memblk[i].start_paddr &&
> + paddr < node_memblk[i].start_paddr + node_memblk[i].size)
> + break;
> +
> + return (i < num_memblks) ? node_memblk[i].nid : (num_memblks ? -1 : 0);
> +}
> +
> +/* return end addr of a memblk */
> +unsigned long
> +memblk_endpaddr(unsigned long paddr)
> +{
> + int i;
> +
> + for (i = 0; i < num_memblks; i++)
> + if (paddr >= node_memblk[i].start_paddr &&
> + paddr < node_memblk[i].start_paddr + node_memblk[i].size)
> + return node_memblk[i].start_paddr + node_memblk[i].size;
> +
> + return 0;
> +}
> +
> +
> +/* on which node is each logical CPU (one cacheline even for 64 CPUs) */
> +volatile char cpu_to_node_map[NR_CPUS] __cacheline_aligned;
> +
> +/* which logical CPUs are on which nodes */
> +volatile unsigned long node_to_cpu_mask[NR_NODES] __cacheline_aligned;
> +
> +/*
> + * Build cpu to node mapping and initialize the per node cpu masks.
> + */
> +void __init
> +build_cpu_to_node_map (void)
> +{
> + int cpu, i, node;
> +
> + for(cpu = 0; cpu < NR_CPUS; ++cpu) {
> + /*
> + * All Itanium NUMA platforms I know use ACPI, so maybe we
> + * can drop this ifdef completely. [EF]
> + */
> +#ifdef CONFIG_SMP
> +# ifdef CONFIG_ACPI_NUMA
> + node = -1;
> + for (i = 0; i < NR_CPUS; ++i) {
> + extern volatile int ia64_cpu_to_sapicid[];
> + if (ia64_cpu_to_sapicid[cpu] == node_cpuid[i].phys_id) {
> + node = node_cpuid[i].nid;
> + break;
> + }
> + }
> +# else
> +# error Fixme: Dunno how to build CPU-to-node map.
> +# endif
> + cpu_to_node_map[cpu] = node;
> + if (node >= 0)
> + __set_bit(cpu, &node_to_cpu_mask[node]);
> +#else
> + __set_bit(0, &node_to_cpu_mask[0]);
> +#endif
> + }
> +}
> +
> diff -Naur linux_base/drivers/acpi/Config.in linux/drivers/acpi/Config.in
> --- linux_base/drivers/acpi/Config.in Sat Jul 5 22:59:38 2003
> +++ linux/drivers/acpi/Config.in Wed Jul 30 09:45:38 2003
> @@ -36,6 +36,9 @@
> tristate ' Fan' CONFIG_ACPI_FAN
> tristate ' Processor' CONFIG_ACPI_PROCESSOR
> dep_tristate ' Thermal Zone' CONFIG_ACPI_THERMAL $CONFIG_ACPI_PROCESSOR
> + if [ "$CONFIG_NUMA" = "y" ]; then
> + bool ' NUMA support' CONFIG_ACPI_NUMA
> + fi
> bool ' Debug Statements' CONFIG_ACPI_DEBUG
> fi
>
> @@ -99,6 +102,7 @@
> define_bool CONFIG_ACPI_FAN n
> define_bool CONFIG_ACPI_PROCESSOR n
> define_bool CONFIG_ACPI_THERMAL n
> + define_bool CONFIG_ACPI_NUMA y
> endmenu
> fi
>
> @@ -119,8 +123,10 @@
> tristate ' Fan' CONFIG_ACPI_FAN
> tristate ' Processor' CONFIG_ACPI_PROCESSOR
> dep_tristate ' Thermal Zone' CONFIG_ACPI_THERMAL $CONFIG_ACPI_PROCESSOR
> - bool ' Debug Statements' CONFIG_ACPI_DEBUG
> + if [ "$CONFIG_NUMA" = "y" ]; then
> + bool ' NUMA support' CONFIG_ACPI_NUMA
> + fi
> + bool ' Debug Statements' CONFIG_ACPI_DEBUG
> endmenu
> fi
> -
> fi
> diff -Naur linux_base/drivers/acpi/Makefile linux/drivers/acpi/Makefile
> --- linux_base/drivers/acpi/Makefile Sat Jul 5 22:59:38 2003
> +++ linux/drivers/acpi/Makefile Mon Jul 28 10:19:02 2003
> @@ -51,5 +51,6 @@
> obj-$(CONFIG_ACPI_THERMAL) += thermal.o
> obj-$(CONFIG_ACPI_SYSTEM) += system.o
> endif
> +obj-$(CONFIG_ACPI_NUMA) += numa.o
>
> include $(TOPDIR)/Rules.make
> --- linux_base/drivers/acpi/numa.c Wed Dec 31 18:00:00 1969
> +++ linux/drivers/acpi/numa.c Mon Jul 28 16:10:20 2003
> @@ -0,0 +1,190 @@
> +/*
> + * acpi_numa.c - ACPI NUMA support
> + *
> + * Copyright (C) 2002 Takayoshi Kochi <t-kouchi@cq.jp.nec.com>
> + *
> + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
> + *
> + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> + *
> + */
> +
> +#include <linux/config.h>
> +#include <linux/init.h>
> +#include <linux/kernel.h>
> +#include <linux/types.h>
> +#include <linux/errno.h>
> +#include <linux/acpi.h>
> +#include "acpi_bus.h"
> +
> +extern int __init acpi_table_parse_madt_family (enum acpi_table_id id, unsigned long madt_size, int entry_id, acpi_madt_entry_handler handler);
> +
> +#define SRAT_DEBUG 0
> +
> +void __init
> +acpi_table_print_srat_entry (
> + acpi_table_entry_header *header)
> +{
> + if (!header)
> + return;
> +
> + switch (header->type) {
> +
> + case ACPI_SRAT_PROCESSOR_AFFINITY:
> + {
> + struct acpi_table_processor_affinity *p =
> + (struct acpi_table_processor_affinity*) header;
> + if (SRAT_DEBUG || !p->flags.enabled)
> + printk(KERN_INFO "SRAT Processor (id[0x%02x] eid[0x%02x]) in proximity domain %d %s\n",
> + p->apic_id, p->lsapic_eid, p->proximity_domain,
> + p->flags.enabled?"enabled":"disabled");
> + }
> + break;
> +
> + case ACPI_SRAT_MEMORY_AFFINITY:
> + {
> + struct acpi_table_memory_affinity *p =
> + (struct acpi_table_memory_affinity*) header;
> + if (SRAT_DEBUG || !p->flags.enabled)
> + printk(KERN_INFO "SRAT Memory (0x%08x%08x length 0x%08x%08x type 0x%x) in proximity domain %d %s%s\n",
> + p->base_addr_hi, p->base_addr_lo, p->length_hi, p->length_lo,
> + p->memory_type, p->proximity_domain,
> + p->flags.enabled ? "enabled" : "disabled",
> + p->flags.hot_pluggable ? " hot-pluggable" : "");
> + }
> + break;
> +
> + default:
> + printk(KERN_WARNING "Found unsupported SRAT entry (type = 0x%x)\n",
> + header->type);
> + break;
> + }
> +}
> +
> +
> +static int __init
> +acpi_parse_slit (unsigned long phys_addr, unsigned long size)
> +{
> + struct acpi_table_slit *slit;
> + u32 localities;
> +
> + if (!phys_addr || !size)
> + return -EINVAL;
> +
> + slit = (struct acpi_table_slit *) __va(phys_addr);
> +
> + /* downcast just for %llu vs %lu for i386/ia64 */
> + localities = (u32) slit->localities;
> +
> + printk(KERN_INFO "SLIT localities %ux%u\n", localities, localities);
> +
> + acpi_numa_slit_init(slit);
> +
> + return 0;
> +}
> +
> +
> +static int __init
> +acpi_parse_processor_affinity (acpi_table_entry_header *header)
> +{
> + struct acpi_table_processor_affinity *processor_affinity = NULL;
> +
> + processor_affinity = (struct acpi_table_processor_affinity*) header;
> + if (!processor_affinity)
> + return -EINVAL;
> +
> + acpi_table_print_srat_entry(header);
> +
> + /* let architecture-dependent part to do it */
> + acpi_numa_processor_affinity_init(processor_affinity);
> +
> + return 0;
> +}
> +
> +
> +static int __init
> +acpi_parse_memory_affinity (acpi_table_entry_header *header)
> +{
> + struct acpi_table_memory_affinity *memory_affinity = NULL;
> +
> + memory_affinity = (struct acpi_table_memory_affinity*) header;
> + if (!memory_affinity)
> + return -EINVAL;
> +
> + acpi_table_print_srat_entry(header);
> +
> + /* let architecture-dependent part to do it */
> + acpi_numa_memory_affinity_init(memory_affinity);
> +
> + return 0;
> +}
> +
> +
> +static int __init
> +acpi_parse_srat (unsigned long phys_addr, unsigned long size)
> +{
> + struct acpi_table_srat *srat = NULL;
> +
> + if (!phys_addr || !size)
> + return -EINVAL;
> +
> + srat = (struct acpi_table_srat *) __va(phys_addr);
> +
> + printk(KERN_INFO "SRAT revision %d\n", srat->table_revision);
> +
> + return 0;
> +}
> +
> +
> +int __init
> +acpi_table_parse_srat (
> + enum acpi_srat_entry_id id,
> + acpi_madt_entry_handler handler)
> +{
> + return acpi_table_parse_madt_family(ACPI_SRAT, sizeof(struct acpi_table_srat),
> + id, handler);
> +}
> +
> +
> +int __init
> +acpi_numa_init()
> +{
> + int result;
> +
> + /* SRAT: Static Resource Affinity Table */
> + result = acpi_table_parse(ACPI_SRAT, acpi_parse_srat);
> +
> + if (result > 0) {
> + result = acpi_table_parse_srat(ACPI_SRAT_PROCESSOR_AFFINITY,
> + acpi_parse_processor_affinity);
> + result = acpi_table_parse_srat(ACPI_SRAT_MEMORY_AFFINITY,
> + acpi_parse_memory_affinity);
> + } else {
> + /* FIXME */
> + printk("Warning: acpi_table_parse(ACPI_SRAT) returned %d!\n",result);
> + }
> +
> + /* SLIT: System Locality Information Table */
> + result = acpi_table_parse(ACPI_SLIT, acpi_parse_slit);
> + if (result < 1) {
> + /* FIXME */
> + printk("Warning: acpi_table_parse(ACPI_SLIT) returned %d!\n",result);
> + }
> +
> + acpi_numa_arch_fixup();
> + return 0;
> +}
> diff -Naur linux_base/drivers/acpi/tables.c linux/drivers/acpi/tables.c
> --- linux_base/drivers/acpi/tables.c Sat Jul 5 22:59:38 2003
> +++ linux/drivers/acpi/tables.c Mon Jul 28 16:10:20 2003
> @@ -224,11 +224,13 @@
>
>
> int __init
> -acpi_table_parse_madt (
> +acpi_table_parse_madt_family (
> enum acpi_table_id id,
> + unsigned long madt_size,
> + int entry_id,
> acpi_madt_entry_handler handler)
> {
> - struct acpi_table_madt *madt = NULL;
> + void *madt = NULL;
> acpi_table_entry_header *entry = NULL;
> unsigned long count = 0;
> unsigned long madt_end = 0;
> @@ -240,19 +242,21 @@
> /* Locate the MADT (if exists). There should only be one. */
>
> for (i = 0; i < sdt.count; i++) {
> - if (sdt.entry[i].id != ACPI_APIC)
> + if (sdt.entry[i].id != id)
> continue;
> - madt = (struct acpi_table_madt *)
> + madt = (void *)
> __acpi_map_table(sdt.entry[i].pa, sdt.entry[i].size);
> if (!madt) {
> - printk(KERN_WARNING PREFIX "Unable to map MADT\n");
> + printk(KERN_WARNING PREFIX "Unable to map %s\n",
> + acpi_table_signatures[id]);
> return -ENODEV;
> }
> break;
> }
>
> if (!madt) {
> - printk(KERN_WARNING PREFIX "MADT not present\n");
> + printk(KERN_WARNING PREFIX "%s not present\n",
> + acpi_table_signatures[id]);
> return -ENODEV;
> }
>
> @@ -261,21 +265,31 @@
> /* Parse all entries looking for a match. */
>
> entry = (acpi_table_entry_header *)
> - ((unsigned long) madt + sizeof(struct acpi_table_madt));
> + ((unsigned long) madt + madt_size);
>
> while (((unsigned long) entry) < madt_end) {
> - if (entry->type == id) {
> + if (entry->type == entry_id) {
> count++;
> handler(entry);
> }
> entry = (acpi_table_entry_header *)
> - ((unsigned long) entry += entry->length);
> + ((unsigned long) entry + entry->length);
> }
>
> return count;
> }
>
>
> +int __init
> +acpi_table_parse_madt (
> + enum acpi_madt_entry_id id,
> + acpi_madt_entry_handler handler)
> +{
> + return acpi_table_parse_madt_family(ACPI_APIC, sizeof(struct acpi_table_madt),
> + id, handler);
> +}
> +
> +
> int __init
> acpi_table_parse (
> enum acpi_table_id id,
> diff -Naur linux_base/include/asm-ia64/acpi.h linux/include/asm-ia64/acpi.h
> --- linux_base/include/asm-ia64/acpi.h Sat Jul 5 22:46:22 2003
> +++ linux/include/asm-ia64/acpi.h Wed Jul 30 12:07:29 2003
> @@ -97,17 +97,18 @@
> } while (0)
>
> const char *acpi_get_sysname (void);
> -int acpi_boot_init (char *cdline);
> int acpi_request_vector (u32 int_type);
> int acpi_get_prt (struct pci_vector_struct **vectors, int *count);
> int acpi_get_interrupt_model (int *type);
> int acpi_irq_to_vector (u32 irq);
>
> -#ifdef CONFIG_DISCONTIGMEM
> -#define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */
> -#define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */
> -#define MAX_PXM_DOMAINS (256)
> -#endif /* CONFIG_DISCONTIGMEM */
> +#ifdef CONFIG_ACPI_NUMA
> +#include <asm/numa.h>
> +/* Proximity bitmap length; _PXM is at most 255 (8 bit)*/
> +#define MAX_PXM_DOMAINS (256)
> +extern int __initdata pxm_to_nid_map[MAX_PXM_DOMAINS];
> +extern int __initdata nid_to_pxm_map[NR_NODES];
> +#endif
>
> #endif /*__KERNEL__*/
>
> --- linux_base/include/asm-ia64/mmzone.h Wed Dec 31 18:00:00 1969
> +++ linux/include/asm-ia64/mmzone.h Wed Jul 30 12:04:50 2003
> @@ -0,0 +1,63 @@
> +/*
> + * This file is subject to the terms and conditions of the GNU General Public
> + * License. See the file "COPYING" in the main directory of this archive
> + * for more details.
> + *
> + * Copyright (c) 2000,2003 Silicon Graphics, Inc. All rights reserved.
> + * Copyright (c) 2002 NEC Corp.
> + * Copyright (c) 2002 Erich Focht <efocht@ess.nec.de>
> + * Copyright (c) 2002 Kimio Suganuma <k-suganuma@da.jp.nec.com>
> + */
> +#ifndef _ASM_IA64_MMZONE_H
> +#define _ASM_IA64_MMZONE_H
> +
> +#include <linux/config.h>
> +#include <linux/init.h>
> +
> +
> +#ifdef CONFIG_NUMA
> +
> +#ifdef CONFIG_IA64_DIG
> +
> +/*
> + * Platform definitions for DIG platform with contiguous memory.
> + */
> +#define MAX_PHYSNODE_ID 8 /* Maximum node number +1 */
> +#define NR_NODES 8 /* Maximum number of nodes in SSI */
> +#define NR_MEMBLKS (NR_NODES * 32)
> +
> +
> +
> +
> +#elif CONFIG_IA64_SGI_SN2
> +
> +/*
> + * Platform definitions for DIG platform with contiguous memory.
> + */
> +#define MAX_PHYSNODE_ID 2048 /* Maximum node number +1 */
> +#define NR_NODES 256 /* Maximum number of compute nodes in SSI */
> +#define NR_MEMBLKS (NR_NODES)
> +
> +#elif CONFIG_IA64_GENERIC
> +
> +
> +/*
> + * Platform definitions for GENERIC platform with contiguous or discontiguous memory.
> + */
> +#define MAX_PHYSNODE_ID 2048 /* Maximum node number +1 */
> +#define NR_NODES 256 /* Maximum number of nodes in SSI */
> +#define NR_MEMBLKS (NR_NODES)
> +
> +
> +#else
> +#error unknown platform
> +#endif
> +
> +extern void build_cpu_to_node_map(void);
> +
> +#else /* CONFIG_NUMA */
> +
> +#define NR_NODES 1
> +
> +#endif /* CONFIG_NUMA */
> +#endif /* _ASM_IA64_MMZONE_H */
> --- linux_base/include/asm-ia64/nodedata.h Wed Dec 31 18:00:00 1969
> +++ linux/include/asm-ia64/nodedata.h Wed Jul 30 12:04:50 2003
> @@ -0,0 +1,66 @@
> +/*
> + * This file is subject to the terms and conditions of the GNU General Public
> + * License. See the file "COPYING" in the main directory of this archive
> + * for more details.
> + *
> + * Copyright (c) 2000 Silicon Graphics, Inc. All rights reserved.
> + * Copyright (c) 2002 NEC Corp.
> + * Copyright (c) 2002 Erich Focht <efocht@ess.nec.de>
> + * Copyright (c) 2002 Kimio Suganuma <k-suganuma@da.jp.nec.com>
> + */
> +
> +
> +#ifndef _ASM_IA64_NODEDATA_H
> +#define _ASM_IA64_NODEDATA_H
> +
> +
> +#include <asm/mmzone.h>
> +
> +/*
> + * Node Data. One of these structures is located on each node of a NUMA system.
> + */
> +
> +struct pglist_data;
> +struct ia64_node_data {
> + short node;
> + short active_cpu_count;
> + /*
> + * The fields are read-only (after boot). They contain pointers to various structures
> + * located on other nodes. Ths data is replicated on each node in order to reduce
> + * off-node references.
> + */
> + struct pglist_data *pg_data_ptrs[NR_NODES];
> + struct ia64_node_data *node_data_ptrs[NR_NODES];
> +};
> +
> +
> +/*
> + * Return a pointer to the node_data structure for the executing cpu.
> + */
> +#define local_node_data (local_cpu_data->node_data)
> +
> +
> +/*
> + * Return a pointer to the node_data structure for the specified node.
> + */
> +#define node_data(node) (local_node_data->node_data_ptrs[node])
> +
> +
> +/*
> + * Given a node id, return a pointer to the pg_data_t for the node.
> + * The following 2 macros are similar.
> + *
> + * NODE_DATA - should be used in all code not related to system
> + * initialization. It uses pernode data structures to minimize
> + * offnode memory references. However, these structure are not
> + * present during boot. This macro can be used once cpu_init
> + * completes.
> + *
> + * NOTE: The names of these macros are misleading but are difficult to change
> + * since they are used in generic linux & on other architecures.
> + */
> +#define NODE_DATA(nid) (local_node_data->pg_data_ptrs[nid])
> +
> +extern struct pglist_data * __init boot_get_pg_data_ptr(long);
> +
> +#endif /* _ASM_IA64_NODEDATA_H */
> --- linux_base/include/asm-ia64/numa.h Wed Dec 31 18:00:00 1969
> +++ linux/include/asm-ia64/numa.h Wed Jul 30 12:07:23 2003
> @@ -0,0 +1,85 @@
> +/*
> + * This file is subject to the terms and conditions of the GNU General Public
> + * License. See the file "COPYING" in the main directory of this archive
> + * for more details.
> + *
> + * This file contains NUMA specific prototypes and definitions.
> + *
> + * 2002/08/05 Erich Focht <efocht@ess.nec.de>
> + *
> + */
> +#ifndef _ASM_IA64_NUMA_H
> +#define _ASM_IA64_NUMA_H
> +
> +#ifdef CONFIG_NUMA
> +
> +#ifdef CONFIG_DISCONTIGMEM
> +# include <asm/mmzone.h>
> +#else
> +# define NR_NODES (8)
> +# define NR_MEMBLKS (NR_NODES * 8)
> +#endif
> +
> +#include <linux/cache.h>
> +#include <linux/threads.h>
> +#include <linux/smp.h>
> +
> +#define NODEMASK_WORDCOUNT ((NR_NODES+(BITS_PER_LONG-1))/BITS_PER_LONG)
> +
> +#define NODE_MASK_NONE { [0 ... ((NR_NODES+BITS_PER_LONG-1)/BITS_PER_LONG)-1] = 0 }
> +
> +typedef unsigned long nodemask_t[NODEMASK_WORDCOUNT];
> +
> +extern volatile char cpu_to_node_map[NR_CPUS] __cacheline_aligned;
> +extern volatile unsigned long node_to_cpu_mask[NR_NODES] __cacheline_aligned;
> +
> +/* Stuff below this line could be architecture independent */
> +
> +extern int num_memblks; /* total number of memory chunks */
> +
> +/*
> + * List of node memory chunks. Filled when parsing SRAT table to
> + * obtain information about memory nodes.
> +*/
> +
> +struct node_memblk_s {
> + unsigned long start_paddr;
> + unsigned long size;
> + int nid; /* which logical node contains this chunk? */
> + int bank; /* which mem bank on this node */
> +};
> +
> +struct node_cpuid_s {
> + u16 phys_id; /* id << 8 | eid */
> + int nid; /* logical node containing this CPU */
> +};
> +
> +extern struct node_memblk_s node_memblk[NR_MEMBLKS];
> +extern struct node_cpuid_s node_cpuid[NR_CPUS];
> +
> +/*
> + * ACPI 2.0 SLIT (System Locality Information Table)
> + * http://devresource.hp.com/devresource/Docs/TechPapers/IA64/slit.pdf
> + *
> + * This is a matrix with "distances" between nodes, they should be
> + * proportional to the memory access latency ratios.
> + */
> +
> +extern u8 numa_slit[NR_NODES * NR_NODES];
> +#define node_distance(from,to) (numa_slit[from * numnodes + to])
> +
> +extern int paddr_to_nid(unsigned long paddr);
> +extern unsigned long memblk_endpaddr(unsigned long paddr);
> +
> +#define local_nodeid (cpu_to_node_map[smp_processor_id()])
> +
> +#else /* !CONFIG_NUMA */
> +
> +#define node_distance(from,to) 10
> +#define paddr_to_nid(x) 0
> +#define memblk_endpaddr(x) ~0UL
> +#define local_nodeid 0
> +
> +#endif /* CONFIG_NUMA */
> +
> +#endif /* _ASM_IA64_NUMA_H */
> --- linux_base/include/asm-ia64/numnodes.h Wed Dec 31 18:00:00 1969
> +++ linux/include/asm-ia64/numnodes.h Wed Jul 30 12:04:50 2003
> @@ -0,0 +1,7 @@
> +#ifndef _ASM_MAX_NUMNODES_H
> +#define _ASM_MAX_NUMNODES_H
> +
> +#include <asm/mmzone.h>
> +#define MAX_NUMNODES NR_NODES
> +
> +#endif /* _ASM_MAX_NUMNODES_H */
> diff -Naur linux_base/include/asm-ia64/page.h linux/include/asm-ia64/page.h
> --- linux_base/include/asm-ia64/page.h Tue Jul 29 14:43:58 2003
> +++ linux/include/asm-ia64/page.h Mon Jul 28 11:06:42 2003
> @@ -80,19 +80,8 @@
> */
> #define MAP_NR_DENSE(addr) (((unsigned long) (addr) - PAGE_OFFSET) >> PAGE_SHIFT)
>
> -#ifdef CONFIG_IA64_GENERIC
> -# include <asm/machvec.h>
> -# define virt_to_page(kaddr) (mem_map + platform_map_nr(kaddr))
> -# define page_to_phys(page) ((page - mem_map) << PAGE_SHIFT)
> -#elif defined (CONFIG_IA64_SGI_SN1)
> -# ifndef CONFIG_DISCONTIGMEM
> -# define virt_to_page(kaddr) (mem_map + MAP_NR_DENSE(kaddr))
> -# define page_to_phys(page) XXX fix me
> -# endif
> -#else
> -# define virt_to_page(kaddr) (mem_map + MAP_NR_DENSE(kaddr))
> -# define page_to_phys(page) ((page - mem_map) << PAGE_SHIFT)
> -#endif
> +#define virt_to_page(kaddr) (mem_map + MAP_NR_DENSE(kaddr))
> +#define page_to_phys(page) ((page - mem_map) << PAGE_SHIFT)
>
> struct page;
> extern int ia64_page_valid (struct page *);
> diff -Naur linux_base/include/asm-ia64/pgtable.h linux/include/asm-ia64/pgtable.h
> --- linux_base/include/asm-ia64/pgtable.h Tue Jul 29 14:44:02 2003
> +++ linux/include/asm-ia64/pgtable.h Wed Jul 30 12:07:32 2003
> @@ -163,7 +163,6 @@
> return (addr & (local_cpu_data->unimpl_pa_mask)) == 0;
> }
>
> -#ifndef CONFIG_DISCONTIGMEM
> /*
> * kern_addr_valid(ADDR) tests if ADDR is pointing to valid kernel
> * memory. For the return value to be meaningful, ADDR must be >=
> @@ -179,7 +178,6 @@
> */
> #define kern_addr_valid(addr) (1)
>
> -#endif
>
> /*
> * Now come the defines and routines to manage and access the three-level
> @@ -227,10 +225,8 @@
> #define pte_none(pte) (!pte_val(pte))
> #define pte_present(pte) (pte_val(pte) & (_PAGE_P | _PAGE_PROTNONE))
> #define pte_clear(pte) (pte_val(*(pte)) = 0UL)
> -#ifndef CONFIG_DISCONTIGMEM
> /* pte_page() returns the "struct page *" corresponding to the PTE: */
> #define pte_page(pte) (mem_map + (unsigned long) ((pte_val(pte) & _PFN_MASK) >> PAGE_SHIFT))
> -#endif
>
> #define pmd_none(pmd) (!pmd_val(pmd))
> #define pmd_bad(pmd) (!ia64_phys_addr_valid(pmd_val(pmd)))
> @@ -430,7 +426,8 @@
> * for zero-mapped memory areas etc..
> */
> extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
> -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
> +extern struct page *zero_page_memmap_ptr;
> +#define ZERO_PAGE(vaddr) (zero_page_memmap_ptr)
>
> /* We provide our own get_unmapped_area to cope with VA holes for userland */
> #define HAVE_ARCH_UNMAPPED_AREA
> diff -Naur linux_base/include/asm-ia64/processor.h linux/include/asm-ia64/processor.h
> --- linux_base/include/asm-ia64/processor.h Tue Jul 29 14:43:59 2003
> +++ linux/include/asm-ia64/processor.h Wed Jul 30 12:04:50 2003
> @@ -87,6 +87,9 @@
> #include <asm/rse.h>
> #include <asm/unwind.h>
> #include <asm/atomic.h>
> +#ifdef CONFIG_NUMA
> +#include <asm/nodedata.h>
> +#endif
>
> /* like above but expressed as bitfields for more efficient access: */
> struct ia64_psr {
> @@ -188,8 +191,8 @@
> } ipi;
> #endif
> #ifdef CONFIG_NUMA
> - void *node_directory;
> - int numa_node_id;
> + struct ia64_node_data *node_data;
> + int nodeid;
> struct cpuinfo_ia64 *cpu_data[NR_CPUS];
> #endif
> /* Platform specific word. MUST BE LAST IN STRUCT */
> @@ -214,9 +217,9 @@
> */
> #ifdef CONFIG_NUMA
> # define cpu_data(cpu) local_cpu_data->cpu_data[cpu]
> -# define numa_node_id() (local_cpu_data->numa_node_id)
> +# define numa_node_id() (local_cpu_data->nodeid)
> #else
> - extern struct cpuinfo_ia64 _cpu_data[NR_CPUS];
> + extern struct cpuinfo_ia64 _cpu_data[NR_CPUS];
> # define cpu_data(cpu) (&_cpu_data[cpu])
> #endif
>
> diff -Naur linux_base/include/asm-ia64/smp.h linux/include/asm-ia64/smp.h
> --- linux_base/include/asm-ia64/smp.h Tue Jul 29 14:43:59 2003
> +++ linux/include/asm-ia64/smp.h Wed Jul 30 12:04:50 2003
> @@ -124,6 +124,7 @@
> extern int smp_call_function_single (int cpuid, void (*func) (void *info), void *info,
> int retry, int wait);
>
> +extern void smp_build_cpu_map(void);
>
> #endif /* CONFIG_SMP */
> #endif /* _ASM_IA64_SMP_H */
> --- linux_base/include/asm-ia64/topology.h Wed Dec 31 18:00:00 1969
> +++ linux/include/asm-ia64/topology.h Wed Jul 30 12:07:30 2003
> @@ -0,0 +1,63 @@
> +/*
> + * linux/include/asm-ia64/topology.h
> + *
> + * Copyright (C) 2002, Erich Focht, NEC
> + *
> + * All rights reserved.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + */
> +#ifndef _ASM_IA64_TOPOLOGY_H
> +#define _ASM_IA64_TOPOLOGY_H
> +
> +#include <asm/acpi.h>
> +#include <asm/numa.h>
> +#include <asm/smp.h>
> +
> +#ifdef CONFIG_NUMA
> +/*
> + * Returns the number of the node containing CPU 'cpu'
> + */
> +#define __cpu_to_node(cpu) (int)(cpu_to_node_map[cpu])
> +
> +/*
> + * Returns a bitmask of CPUs on Node 'node'.
> + */
> +#define __node_to_cpu_mask(node) (node_to_cpu_mask[node])
> +
> +#else
> +#define __cpu_to_node(cpu) (0)
> +#define __node_to_cpu_mask(node) (&phys_cpu_present_map)
> +#endif
> +
> +/*
> + * Returns the number of the node containing MemBlk 'memblk'
> + */
> +#ifdef CONFIG_ACPI_NUMA
> +#define __memblk_to_node(memblk) (node_memblk[memblk].nid)
> +#else
> +#define __memblk_to_node(memblk) (memblk)
> +#endif
> +
> +/*
> + * Returns the number of the node containing Node 'nid'.
> + * Not implemented here. Multi-level hierarchies detected with
> + * the help of node_distance().
> + */
> +#define __parent_node(nid) (nid)
> +
> +/*
> + * Returns the number of the first CPU on Node 'node'.
> + */
> +#define __node_to_first_cpu(node) (__ffs(__node_to_cpu_mask(node)))
> +
> +/*
> + * Returns the number of the first MemBlk on Node 'node'
> + * Should be fixed when IA64 discontigmem goes in.
> + */
> +#define __node_to_memblk(node) (node)
> +
> +#endif /* _ASM_IA64_TOPOLOGY_H */
> diff -Naur linux_base/include/linux/acpi.h linux/include/linux/acpi.h
> --- linux_base/include/linux/acpi.h Tue Jul 29 14:43:59 2003
> +++ linux/include/linux/acpi.h Wed Jul 30 12:07:30 2003
> @@ -344,6 +344,14 @@
> void acpi_table_print (struct acpi_table_header *, unsigned long);
> void acpi_table_print_madt_entry (acpi_table_entry_header *);
>
> +#ifdef CONFIG_ACPI_NUMA
> +int __init acpi_numa_init(void);
> +void __init acpi_numa_slit_init (struct acpi_table_slit *);
> +void __init acpi_numa_processor_affinity_init (struct acpi_table_processor_affinity *);
> +void __init acpi_numa_memory_affinity_init (struct acpi_table_memory_affinity *);
> +void __init acpi_numa_arch_fixup(void);
> +#endif
> +
> #endif /*CONFIG_ACPI_BOOT*/
>
>
> diff -Naur linux_base/include/linux/mmzone.h linux/include/linux/mmzone.h
> --- linux_base/include/linux/mmzone.h Tue Jul 29 14:43:59 2003
> +++ linux/include/linux/mmzone.h Wed Jul 30 12:07:31 2003
> @@ -8,6 +8,12 @@
> #include <linux/spinlock.h>
> #include <linux/list.h>
> #include <linux/wait.h>
> +#ifdef CONFIG_DISCONTIGMEM
> +#include <asm/numnodes.h>
> +#endif
> +#ifndef MAX_NUMNODES
> +#define MAX_NUMNODES 1
> +#endif
>
> /*
> * Free memory management - zoned buddy allocator.
> @@ -110,7 +116,7 @@
> * footprint of this construct is very small.
> */
> typedef struct zonelist_struct {
> - zone_t * zones [MAX_NR_ZONES+1]; // NULL delimited
> + zone_t * zones [MAX_NUMNODES*MAX_NR_ZONES+1]; // NULL delimited
> } zonelist_t;
>
> #define GFP_ZONEMASK 0x0f
> @@ -144,8 +150,8 @@
> extern int numnodes;
> extern pg_data_t *pgdat_list;
>
> -#define memclass(pgzone, classzone) (((pgzone)->zone_pgdat == (classzone)->zone_pgdat) \
> - && ((pgzone) <= (classzone)))
> +#define memclass(pgzone, classzone) (((pgzone) - (pgzone)->zone_pgdat->node_zones) <= \
> +((classzone) - (classzone)->zone_pgdat->node_zones))
>
> /*
> * The following two are not meant for general usage. They are here as
> @@ -212,6 +218,18 @@
> #define for_each_zone(zone) \
> for(zone = pgdat_list->node_zones; zone; zone = next_zone(zone))
>
> +#ifdef CONFIG_NUMA
> +#define MAX_NR_MEMBLKS BITS_PER_LONG /* Max number of Memory Blocks */
> +#include <asm/topology.h>
> +#else /* !CONFIG_NUMA */
> +#define MAX_NR_MEMBLKS 1
> +#endif /* CONFIG_NUMA */
> +
> +/* Returns the number of the current Node. */
> +
> +#ifndef CONFIG_NUMA
> +#define numa_node_id() (__cpu_to_node(smp_processor_id()))
> +#endif
>
> #ifndef CONFIG_DISCONTIGMEM
>
> diff -Naur linux_base/init/main.c linux/init/main.c
> --- linux_base/init/main.c Sat Jul 5 22:59:38 2003
> +++ linux/init/main.c Mon Jul 28 16:10:20 2003
> @@ -290,6 +290,7 @@
>
>
> extern void setup_arch(char **);
> +extern void __init build_all_zonelists(void);
> extern void cpu_idle(void);
>
> unsigned long wait_init_idle;
> @@ -360,6 +361,7 @@
> lock_kernel();
> printk(linux_banner);
> setup_arch(&command_line);
> + build_all_zonelists();
> printk("Kernel command line: %s\n", saved_command_line);
> parse_options(command_line);
> trap_init();
> diff -Naur linux_base/mm/bootmem.c linux/mm/bootmem.c
> --- linux_base/mm/bootmem.c Sat Jul 5 22:59:38 2003
> +++ linux/mm/bootmem.c Mon Jul 28 16:10:20 2003
> @@ -49,8 +49,24 @@
> bootmem_data_t *bdata = pgdat->bdata;
> unsigned long mapsize = ((end - start)+7)/8;
>
> - pgdat->node_next = pgdat_list;
> - pgdat_list = pgdat;
> +
> + /*
> + * sort pgdat_list so that the lowest one comes first,
> + * which makes alloc_bootmem_low_pages work as desired.
> + */
> + if (!pgdat_list || pgdat_list->node_start_paddr > pgdat->node_start_paddr) {
> + pgdat->node_next = pgdat_list;
> + pgdat_list = pgdat;
> + } else {
> + pg_data_t *tmp = pgdat_list;
> + while (tmp->node_next) {
> + if (tmp->node_next->node_start_paddr > pgdat->node_start_paddr)
> + break;
> + tmp = tmp->node_next;
> + }
> + pgdat->node_next = tmp->node_next;
> + tmp->node_next = pgdat;
> + }
>
> mapsize = (mapsize + (sizeof(long) - 1UL)) & ~(sizeof(long) - 1UL);
> bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT);
> @@ -259,16 +275,16 @@
> if (!bdata->node_bootmem_map) BUG();
>
> count = 0;
> + page = virt_to_page(phys_to_virt(bdata->node_boot_start));
> idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT);
> for (i = find_first_zero_bit(bdata->node_bootmem_map, idx);
> i < idx;
> i = find_next_zero_bit(bdata->node_bootmem_map, idx, i + 1))
> {
> - page = pgdat->node_mem_map + i;
> count++;
> - ClearPageReserved(page);
> - set_page_count(page, 1);
> - __free_page(page);
> + ClearPageReserved(page+i);
> + set_page_count(page+i, 1);
> + __free_page(page+i);
> }
> total += count;
>
> diff -Naur linux_base/mm/page_alloc.c linux/mm/page_alloc.c
> --- linux_base/mm/page_alloc.c Sat Jul 5 22:59:38 2003
> +++ linux/mm/page_alloc.c Mon Jul 28 16:10:20 2003
> @@ -586,13 +586,44 @@
> /*
> * Builds allocation fallback zone lists.
> */
> -static inline void build_zonelists(pg_data_t *pgdat)
> +static int __init build_zonelists_node(pg_data_t *pgdat, zonelist_t *zonelist, int j, int k)
> {
> - int i, j, k;
> + zone_t *zone;
> + switch (k) {
> + default:
> + BUG();
> + /*
> + * fallthrough:
> + */
> + case ZONE_HIGHMEM:
> + zone = pgdat->node_zones + ZONE_HIGHMEM;
> + if (zone->memsize) {
> +#ifndef CONFIG_HIGHMEM
> + BUG();
> +#endif
> + zonelist->zones[j++] = zone;
> + }
> + case ZONE_NORMAL:
> + zone = pgdat->node_zones + ZONE_NORMAL;
> + if (zone->memsize)
> + zonelist->zones[j++] = zone;
> + case ZONE_DMA:
> + zone = pgdat->node_zones + ZONE_DMA;
> + if (zone->memsize)
> + zonelist->zones[j++] = zone;
> + }
> +
> + return j;
> +}
> +
> +static void __init build_zonelists(pg_data_t *pgdat)
> +{
> + int i, j, k, node, local_node;
>
> + local_node = pgdat->node_id;
> + printk("Building zonelist for node : %d\n", local_node);
> for (i = 0; i <= GFP_ZONEMASK; i++) {
> zonelist_t *zonelist;
> - zone_t *zone;
>
> zonelist = pgdat->node_zonelists + i;
> memset(zonelist, 0, sizeof(*zonelist));
> @@ -604,33 +635,32 @@
> if (i & __GFP_DMA)
> k = ZONE_DMA;
>
> - switch (k) {
> - default:
> - BUG();
> - /*
> - * fallthrough:
> - */
> - case ZONE_HIGHMEM:
> - zone = pgdat->node_zones + ZONE_HIGHMEM;
> - if (zone->memsize) {
> -#ifndef CONFIG_HIGHMEM
> - BUG();
> -#endif
> - zonelist->zones[j++] = zone;
> - }
> - case ZONE_NORMAL:
> - zone = pgdat->node_zones + ZONE_NORMAL;
> - if (zone->memsize)
> - zonelist->zones[j++] = zone;
> - case ZONE_DMA:
> - zone = pgdat->node_zones + ZONE_DMA;
> - if (zone->memsize)
> - zonelist->zones[j++] = zone;
> - }
> + j = build_zonelists_node(pgdat, zonelist, j, k);
> + /*
> + * Now we build the zonelist so that it contains the zones
> + * of all the other nodes.
> + * We don't want to pressure a particular node, so when
> + * building the zones for node N, we make sure that the
> + * zones coming right after the local ones are those from
> + * node N+1 (modulo N)
> + */
> + for (node = local_node + 1; node < numnodes; node++)
> + j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
> + for (node = 0; node < local_node; node++)
> + j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
> +
> zonelist->zones[j++] = NULL;
> }
> }
>
> +void __init build_all_zonelists(void)
> +{
> + int i;
> +
> + for(i = 0 ; i < numnodes ; i++)
> + build_zonelists(NODE_DATA(i));
> +}
> +
> /*
> * Helper functions to size the waitqueue hash table.
> * Essentially these want to choose hash table sizes sufficiently
> @@ -742,7 +772,7 @@
> MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET));
> }
> *gmap = pgdat->node_mem_map = lmem_map;
> - pgdat->node_size = totalpages;
> + pgdat->node_size = 0;
> pgdat->node_start_paddr = zone_start_paddr;
> pgdat->node_start_mapnr = (lmem_map - mem_map);
> pgdat->nr_zones = 0;
> @@ -766,6 +796,7 @@
> zone->zone_pgdat = pgdat;
> zone->free_pages = 0;
> zone->need_balance = 0;
> + pgdat->node_size += realsize;
> if (!size)
> continue;
>
> @@ -850,7 +881,6 @@
> (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
> }
> }
> - build_zonelists(pgdat);
> }
>
> void __init free_area_init(unsigned long *zones_size)
>
>
>
> --
> Thanks
>
> Jack Steiner (651-683-5302) (vnet 233-5302) steiner@sgi.com
>
>
--
Bjorn Helgaas - bjorn.helgaas at hp.com
Linux and Open Source Lab
Hewlett-Packard Company
next reply other threads:[~2003-08-12 23:34 UTC|newest]
Thread overview: 2+ messages / expand[flat|nested] mbox.gz Atom feed top
2003-08-12 23:34 Bjorn Helgaas [this message]
2003-08-15 1:28 ` Discontig patch for 2.4.21 Jack Steiner
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=marc-linux-ia64-106073158319638@msgid-missing \
--to=bjorn.helgaas@hp.com \
--cc=linux-ia64@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox