* [PATCH 6/7] Break out memory initialisation code from page_alloc.c to mem_init.c
  2006-04-12 23:20 [PATCH 0/7] [RFC] Sizing zones and holes in an architecture independent manner V2 Mel Gorman
@ 2006-04-12 23:22 ` Mel Gorman
  0 siblings, 0 replies; 11+ messages in thread
From: Mel Gorman @ 2006-04-12 23:22 UTC (permalink / raw)
  To: davej, tony.luck, linuxppc-dev, linux-kernel, bob.picco, ak,
	linux-mm
  Cc: Mel Gorman


page_alloc.c contains a large amount of memory initialisation code. This patch
breaks that initialisation code out into a separate file, mm/mem_init.c, to make
page_alloc.c easier to read.
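
For reference, the functions gathered into mem_init.c include the entry points
an architecture calls at boot: add_active_range() to register each node's PFN
ranges and free_area_init_nodes() to hand the rest over to the arch-independent
code. A minimal sketch of that usage is below; the function name, node IDs and
PFN values are illustrative placeholders, and the prototypes are assumed to be
visible through linux/mm.h as elsewhere in this series.

#include <linux/init.h>
#include <linux/mm.h>

/*
 * Illustrative sketch only: feed a made-up two-node layout to the code
 * that now lives in mm/mem_init.c.  The node IDs and PFN values are
 * placeholders, and the caller passes the highest PFN reachable by each
 * zone (DMA, DMA32, Normal, HighMem), matching free_area_init_nodes().
 */
void __init example_arch_memory_init(unsigned long dma_pfn,
				     unsigned long dma32_pfn,
				     unsigned long low_pfn,
				     unsigned long high_pfn)
{
	/* Register each physically present PFN range with its node */
	add_active_range(0, 0, 0x80000);		/* node 0 */
	add_active_range(1, 0x100000, 0x180000);	/* node 1 */

	/* Let the arch-independent code size the zones and holes */
	free_area_init_nodes(dma_pfn, dma32_pfn, low_pfn, high_pfn);
}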


 Makefile     |    2 
 mem_init.c   | 1014 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 page_alloc.c |  993 ----------------------------------------------------
 3 files changed, 1015 insertions(+), 994 deletions(-)

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc1-105-ia64_use_init_nodes/mm/Makefile linux-2.6.17-rc1-106-breakout_mem_init/mm/Makefile
--- linux-2.6.17-rc1-105-ia64_use_init_nodes/mm/Makefile	2006-04-03 04:22:10.000000000 +0100
+++ linux-2.6.17-rc1-106-breakout_mem_init/mm/Makefile	2006-04-12 20:06:12.000000000 +0100
@@ -8,7 +8,7 @@ mmu-$(CONFIG_MMU)	:= fremap.o highmem.o 
 			   vmalloc.o
 
 obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
-			   page_alloc.o page-writeback.o pdflush.o \
+			   page_alloc.o mem_init.o page-writeback.o pdflush.o \
 			   readahead.o swap.o truncate.o vmscan.o \
 			   prio_tree.o util.o mmzone.o $(mmu-y)
 
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc1-105-ia64_use_init_nodes/mm/mem_init.c linux-2.6.17-rc1-106-breakout_mem_init/mm/mem_init.c
--- linux-2.6.17-rc1-105-ia64_use_init_nodes/mm/mem_init.c	2006-04-12 23:06:24.000000000 +0100
+++ linux-2.6.17-rc1-106-breakout_mem_init/mm/mem_init.c	2006-04-12 21:03:42.000000000 +0100
@@ -0,0 +1,1014 @@
+/*
+ * mm/mem_init.c
+ * Initialises the architecture-independent view of memory: pgdats, zones, etc.
+ *
+ *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *  Copyright (C) 1995, Stephen Tweedie
+ *  Copyright (C) July 1999, Gerhard Wichert, Siemens AG
+ *  Copyright (C) 1999, Ingo Molnar, Red Hat
+ *  Copyright (C) 1999, 2000, Kanoj Sarcar, SGI
+ *  Copyright (C) Sept 2000, Martin J. Bligh
+ *	(lots of bits borrowed from Ingo Molnar & Andrew Morton)
+ *  Copyright (C) Apr 2006, Mel Gorman, IBM
+ *	(lots of bits taken from architecture-specific code)
+ */
+#include <linux/config.h>
+#include <linux/sort.h>
+#include <linux/pfn.h>
+#include <linux/mm.h>
+#include <linux/bootmem.h>
+#include <linux/module.h>
+#include <linux/cpuset.h>
+#include <linux/cpu.h>
+#include <linux/mempolicy.h>
+#include <linux/swap.h>
+#include <linux/sysctl.h>
+
+static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
+int percpu_pagelist_fraction;
+
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+  #ifdef CONFIG_MAX_ACTIVE_REGIONS
+    #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
+  #else
+    #define MAX_ACTIVE_REGIONS (MAX_NR_ZONES * MAX_NUMNODES + 1)
+  #endif
+
+  struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS];
+  unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
+  unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
+#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+
+/*
+ * Builds allocation fallback zone lists.
+ *
+ * Add all populated zones of a node to the zonelist.
+ */
+static int __init build_zonelists_node(pg_data_t *pgdat,
+			struct zonelist *zonelist, int nr_zones, int zone_type)
+{
+	struct zone *zone;
+
+	BUG_ON(zone_type > ZONE_HIGHMEM);
+
+	do {
+		zone = pgdat->node_zones + zone_type;
+		if (populated_zone(zone)) {
+#ifndef CONFIG_HIGHMEM
+			BUG_ON(zone_type > ZONE_NORMAL);
+#endif
+			zonelist->zones[nr_zones++] = zone;
+			check_highest_zone(zone_type);
+		}
+		zone_type--;
+
+	} while (zone_type >= 0);
+	return nr_zones;
+}
+
+static inline int highest_zone(int zone_bits)
+{
+	int res = ZONE_NORMAL;
+	if (zone_bits & (__force int)__GFP_HIGHMEM)
+		res = ZONE_HIGHMEM;
+	if (zone_bits & (__force int)__GFP_DMA32)
+		res = ZONE_DMA32;
+	if (zone_bits & (__force int)__GFP_DMA)
+		res = ZONE_DMA;
+	return res;
+}
+
+#ifdef CONFIG_NUMA
+#define MAX_NODE_LOAD (num_online_nodes())
+static int __initdata node_load[MAX_NUMNODES];
+/**
+ * find_next_best_node - find the next node that should appear in a given node's fallback list
+ * @node: node whose fallback list we're appending
+ * @used_node_mask: nodemask_t of already used nodes
+ *
+ * We use a number of factors to determine which is the next node that should
+ * appear on a given node's fallback list.  The node should not have appeared
+ * already in @node's fallback list, and it should be the next closest node
+ * according to the distance array (which contains arbitrary distance values
+ * from each node to each node in the system), and should also prefer nodes
+ * with no CPUs, since presumably they'll have very little allocation pressure
+ * on them otherwise.
+ * It returns -1 if no node is found.
+ */
+static int __init find_next_best_node(int node, nodemask_t *used_node_mask)
+{
+	int n, val;
+	int min_val = INT_MAX;
+	int best_node = -1;
+
+	/* Use the local node if we haven't already */
+	if (!node_isset(node, *used_node_mask)) {
+		node_set(node, *used_node_mask);
+		return node;
+	}
+
+	for_each_online_node(n) {
+		cpumask_t tmp;
+
+		/* Don't want a node to appear more than once */
+		if (node_isset(n, *used_node_mask))
+			continue;
+
+		/* Use the distance array to find the distance */
+		val = node_distance(node, n);
+
+		/* Penalize nodes under us ("prefer the next node") */
+		val += (n < node);
+
+		/* Give preference to headless and unused nodes */
+		tmp = node_to_cpumask(n);
+		if (!cpus_empty(tmp))
+			val += PENALTY_FOR_NODE_WITH_CPUS;
+
+		/* Slight preference for less loaded node */
+		val *= (MAX_NODE_LOAD*MAX_NUMNODES);
+		val += node_load[n];
+
+		if (val < min_val) {
+			min_val = val;
+			best_node = n;
+		}
+	}
+
+	if (best_node >= 0)
+		node_set(best_node, *used_node_mask);
+
+	return best_node;
+}
+
+static void __init build_zonelists(pg_data_t *pgdat)
+{
+	int i, j, k, node, local_node;
+	int prev_node, load;
+	struct zonelist *zonelist;
+	nodemask_t used_mask;
+
+	/* initialize zonelists */
+	for (i = 0; i < GFP_ZONETYPES; i++) {
+		zonelist = pgdat->node_zonelists + i;
+		zonelist->zones[0] = NULL;
+	}
+
+	/* NUMA-aware ordering of nodes */
+	local_node = pgdat->node_id;
+	load = num_online_nodes();
+	prev_node = local_node;
+	nodes_clear(used_mask);
+	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
+		int distance = node_distance(local_node, node);
+
+		/*
+		 * If another node is sufficiently far away then it is better
+		 * to reclaim pages in a zone before going off node.
+		 */
+		if (distance > RECLAIM_DISTANCE)
+			zone_reclaim_mode = 1;
+
+		/*
+		 * We don't want to pressure a particular node.
+		 * So adding penalty to the first node in same
+		 * distance group to make it round-robin.
+		 */
+
+		if (distance != node_distance(local_node, prev_node))
+			node_load[node] += load;
+		prev_node = node;
+		load--;
+		for (i = 0; i < GFP_ZONETYPES; i++) {
+			zonelist = pgdat->node_zonelists + i;
+			for (j = 0; zonelist->zones[j] != NULL; j++);
+
+			k = highest_zone(i);
+
+	 		j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
+			zonelist->zones[j] = NULL;
+		}
+	}
+}
+
+#else	/* CONFIG_NUMA */
+
+static void __init build_zonelists(pg_data_t *pgdat)
+{
+	int i, j, k, node, local_node;
+
+	local_node = pgdat->node_id;
+	for (i = 0; i < GFP_ZONETYPES; i++) {
+		struct zonelist *zonelist;
+
+		zonelist = pgdat->node_zonelists + i;
+
+		j = 0;
+		k = highest_zone(i);
+ 		j = build_zonelists_node(pgdat, zonelist, j, k);
+ 		/*
+ 		 * Now we build the zonelist so that it contains the zones
+ 		 * of all the other nodes.
+ 		 * We don't want to pressure a particular node, so when
+ 		 * building the zones for node N, we make sure that the
+ 		 * zones coming right after the local ones are those from
+ 		 * node N+1 (modulo N)
+ 		 */
+		for (node = local_node + 1; node < MAX_NUMNODES; node++) {
+			if (!node_online(node))
+				continue;
+			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
+		}
+		for (node = 0; node < local_node; node++) {
+			if (!node_online(node))
+				continue;
+			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
+		}
+
+		zonelist->zones[j] = NULL;
+	}
+}
+
+#endif	/* CONFIG_NUMA */
+
+void __init build_all_zonelists(void)
+{
+	int i;
+
+	for_each_online_node(i)
+		build_zonelists(NODE_DATA(i));
+	printk("Built %i zonelists\n", num_online_nodes());
+	cpuset_init_current_mems_allowed();
+}
+
+/*
+ * Helper functions to size the waitqueue hash table.
+ * Essentially these want to choose hash table sizes sufficiently
+ * large so that collisions trying to wait on pages are rare.
+ * But in fact, the number of active page waitqueues on typical
+ * systems is ridiculously low, less than 200. So this is even
+ * conservative, even though it seems large.
+ *
+ * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
+ * waitqueues, i.e. the size of the waitq table given the number of pages.
+ */
+#define PAGES_PER_WAITQUEUE	256
+
+static inline unsigned long wait_table_size(unsigned long pages)
+{
+	unsigned long size = 1;
+
+	pages /= PAGES_PER_WAITQUEUE;
+
+	while (size < pages)
+		size <<= 1;
+
+	/*
+	 * Once we have dozens or even hundreds of threads sleeping
+	 * on IO we've got bigger problems than wait queue collision.
+	 * Limit the size of the wait table to a reasonable size.
+	 */
+	size = min(size, 4096UL);
+
+	return max(size, 4UL);
+}
+
+/*
+ * This is an integer logarithm so that shifts can be used later
+ * to extract the more random high bits from the multiplicative
+ * hash function before the remainder is taken.
+ */
+static inline unsigned long wait_table_bits(unsigned long size)
+{
+	return ffz(~size);
+}
+
+#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
+
+#ifndef __HAVE_ARCH_MEMMAP_INIT
+#define memmap_init(size, nid, zone, start_pfn) \
+	memmap_init_zone((size), (nid), (zone), (start_pfn))
+#endif
+
+/*
+ * Initially all pages are reserved - free ones are freed
+ * up by free_all_bootmem() once the early boot process is
+ * done. Non-atomic initialization, single-pass.
+ */
+void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
+		unsigned long start_pfn)
+{
+	struct page *page;
+	unsigned long end_pfn = start_pfn + size;
+	unsigned long pfn;
+
+	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+		if (!early_pfn_valid(pfn))
+			continue;
+		page = pfn_to_page(pfn);
+		set_page_links(page, zone, nid, pfn);
+		init_page_count(page);
+		reset_page_mapcount(page);
+		SetPageReserved(page);
+		INIT_LIST_HEAD(&page->lru);
+#ifdef WANT_PAGE_VIRTUAL
+		/* The shift won't overflow because ZONE_NORMAL is below 4G. */
+		if (!is_highmem_idx(zone))
+			set_page_address(page, __va(pfn << PAGE_SHIFT));
+#endif
+	}
+}
+
+void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
+				unsigned long size)
+{
+	int order;
+	for (order = 0; order < MAX_ORDER ; order++) {
+		INIT_LIST_HEAD(&zone->free_area[order].free_list);
+		zone->free_area[order].nr_free = 0;
+	}
+}
+
+#define ZONETABLE_INDEX(x, zone_nr)	((x << ZONES_SHIFT) | zone_nr)
+void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
+		unsigned long size)
+{
+	unsigned long snum = pfn_to_section_nr(pfn);
+	unsigned long end = pfn_to_section_nr(pfn + size);
+
+	if (FLAGS_HAS_NODE)
+		zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
+	else
+		for (; snum <= end; snum++)
+			zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
+}
+
+static __meminit
+void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
+{
+	int i;
+	struct pglist_data *pgdat = zone->zone_pgdat;
+
+	/*
+	 * The per-page waitqueue mechanism uses hashed waitqueues
+	 * per zone.
+	 */
+	zone->wait_table_size = wait_table_size(zone_size_pages);
+	zone->wait_table_bits =	wait_table_bits(zone->wait_table_size);
+	zone->wait_table = (wait_queue_head_t *)
+		alloc_bootmem_node(pgdat, zone->wait_table_size
+					* sizeof(wait_queue_head_t));
+
+	for(i = 0; i < zone->wait_table_size; ++i)
+		init_waitqueue_head(zone->wait_table + i);
+}
+
+/*
+ * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
+ * to the value high for the pageset p.
+ */
+static void setup_pagelist_highmark(struct per_cpu_pageset *p,
+				unsigned long high)
+{
+	struct per_cpu_pages *pcp;
+
+	pcp = &p->pcp[0]; /* hot list */
+	pcp->high = high;
+	pcp->batch = max(1UL, high/4);
+	if ((high/4) > (PAGE_SHIFT * 8))
+		pcp->batch = PAGE_SHIFT * 8;
+}
+
+/*
+ * percpu_pagelist_fraction - changes the pcp->high for each zone on each
+ * cpu.  It is the fraction of total pages in each zone that a hot per cpu pagelist
+ * can have before it gets flushed back to buddy allocator.
+ */
+int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
+	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+	struct zone *zone;
+	unsigned int cpu;
+	int ret;
+
+	ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+	if (!write || (ret == -EINVAL))
+		return ret;
+	for_each_zone(zone) {
+		for_each_online_cpu(cpu) {
+			unsigned long  high;
+			high = zone->present_pages / percpu_pagelist_fraction;
+			setup_pagelist_highmark(zone_pcp(zone, cpu), high);
+		}
+	}
+	return 0;
+}
+
+static int __cpuinit zone_batchsize(struct zone *zone)
+{
+	int batch;
+
+	/*
+	 * The per-cpu-pages pools are set to around 1000th of the
+	 * size of the zone.  But no more than 1/2 of a meg.
+	 *
+	 * OK, so we don't know how big the cache is.  So guess.
+	 */
+	batch = zone->present_pages / 1024;
+	if (batch * PAGE_SIZE > 512 * 1024)
+		batch = (512 * 1024) / PAGE_SIZE;
+	batch /= 4;		/* We effectively *= 4 below */
+	if (batch < 1)
+		batch = 1;
+
+	/*
+	 * Clamp the batch to a 2^n - 1 value. Having a power
+	 * of 2 value was found to be more likely to have
+	 * suboptimal cache aliasing properties in some cases.
+	 *
+	 * For example if 2 tasks are alternately allocating
+	 * batches of pages, one task can end up with a lot
+	 * of pages of one half of the possible page colors
+	 * and the other with pages of the other colors.
+	 */
+	batch = (1 << (fls(batch + batch/2)-1)) - 1;
+
+	return batch;
+}
+
+inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
+{
+	struct per_cpu_pages *pcp;
+
+	memset(p, 0, sizeof(*p));
+
+	pcp = &p->pcp[0];		/* hot */
+	pcp->count = 0;
+	pcp->high = 6 * batch;
+	pcp->batch = max(1UL, 1 * batch);
+	INIT_LIST_HEAD(&pcp->list);
+
+	pcp = &p->pcp[1];		/* cold*/
+	pcp->count = 0;
+	pcp->high = 2 * batch;
+	pcp->batch = max(1UL, batch/2);
+	INIT_LIST_HEAD(&pcp->list);
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * Boot pageset table. One per cpu which is going to be used for all
+ * zones and all nodes. The parameters will be set in such a way
+ * that an item put on a list will immediately be handed over to
+ * the buddy list. This is safe since pageset manipulation is done
+ * with interrupts disabled.
+ *
+ * Some NUMA counter updates may also be caught by the boot pagesets.
+ *
+ * The boot_pagesets must be kept even after bootup is complete for
+ * unused processors and/or zones. They do play a role for bootstrapping
+ * hotplugged processors.
+ *
+ * zoneinfo_show() and maybe other functions do
+ * not check if the processor is online before following the pageset pointer.
+ * Other parts of the kernel may not check if the zone is available.
+ */
+static struct per_cpu_pageset boot_pageset[NR_CPUS];
+
+/*
+ * Dynamically allocate memory for the
+ * per cpu pageset array in struct zone.
+ */
+static int __cpuinit process_zones(int cpu)
+{
+	struct zone *zone, *dzone;
+
+	for_each_zone(zone) {
+
+		zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
+					 GFP_KERNEL, cpu_to_node(cpu));
+		if (!zone_pcp(zone, cpu))
+			goto bad;
+
+		setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
+
+		if (percpu_pagelist_fraction)
+			setup_pagelist_highmark(zone_pcp(zone, cpu),
+			 	(zone->present_pages / percpu_pagelist_fraction));
+	}
+
+	return 0;
+bad:
+	for_each_zone(dzone) {
+		if (dzone == zone)
+			break;
+		kfree(zone_pcp(dzone, cpu));
+		zone_pcp(dzone, cpu) = NULL;
+	}
+	return -ENOMEM;
+}
+
+static inline void free_zone_pagesets(int cpu)
+{
+	struct zone *zone;
+
+	for_each_zone(zone) {
+		struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
+
+		zone_pcp(zone, cpu) = NULL;
+		kfree(pset);
+	}
+}
+
+static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
+		unsigned long action,
+		void *hcpu)
+{
+	int cpu = (long)hcpu;
+	int ret = NOTIFY_OK;
+
+	switch (action) {
+		case CPU_UP_PREPARE:
+			if (process_zones(cpu))
+				ret = NOTIFY_BAD;
+			break;
+		case CPU_UP_CANCELED:
+		case CPU_DEAD:
+			free_zone_pagesets(cpu);
+			break;
+		default:
+			break;
+	}
+	return ret;
+}
+
+static struct notifier_block pageset_notifier =
+	{ &pageset_cpuup_callback, NULL, 0 };
+
+void __init setup_per_cpu_pageset(void)
+{
+	int err;
+
+	/* Initialize per_cpu_pageset for cpu 0.
+	 * A cpuup callback will do this for every cpu
+	 * as it comes online
+	 */
+	err = process_zones(smp_processor_id());
+	BUG_ON(err);
+	register_cpu_notifier(&pageset_notifier);
+}
+#endif
+
+static __meminit void zone_pcp_init(struct zone *zone)
+{
+	int cpu;
+	unsigned long batch = zone_batchsize(zone);
+
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+#ifdef CONFIG_NUMA
+		/* Early boot. Slab allocator not functional yet */
+		zone_pcp(zone, cpu) = &boot_pageset[cpu];
+		setup_pageset(&boot_pageset[cpu],0);
+#else
+		setup_pageset(zone_pcp(zone,cpu), batch);
+#endif
+	}
+	if (zone->present_pages)
+		printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
+			zone->name, zone->present_pages, batch);
+}
+
+static __meminit void init_currently_empty_zone(struct zone *zone,
+		unsigned long zone_start_pfn, unsigned long size)
+{
+	struct pglist_data *pgdat = zone->zone_pgdat;
+
+	zone_wait_table_init(zone, size);
+	pgdat->nr_zones = zone_idx(zone) + 1;
+
+	zone->zone_start_pfn = zone_start_pfn;
+
+	memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
+
+	zone_init_free_lists(pgdat, zone, zone->spanned_pages);
+}
+
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+static int __init first_active_region_index_in_nid(int nid)
+{
+	int i;
+	for (i = 0; early_node_map[i].end_pfn; i++) {
+		if (early_node_map[i].nid == nid)
+			return i;
+	}
+
+	return MAX_ACTIVE_REGIONS;
+}
+
+static int __init next_active_region_index_in_nid(unsigned int index, int nid)
+{
+	for (index = index + 1; early_node_map[index].end_pfn; index++) {
+		if (early_node_map[index].nid == nid)
+			return index;
+	}
+
+	return MAX_ACTIVE_REGIONS;
+}
+
+#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
+int __init early_pfn_to_nid(unsigned long pfn)
+{
+	int i;
+
+	for (i = 0; early_node_map[i].end_pfn; i++) {
+		unsigned long start_pfn = early_node_map[i].start_pfn;
+		unsigned long end_pfn = early_node_map[i].end_pfn;
+
+		if ((start_pfn <= pfn) && (pfn < end_pfn))
+			return early_node_map[i].nid;
+	}
+
+	return -1;
+}
+#endif
+
+#define for_each_active_range_index_in_nid(i, nid) \
+	for (i = first_active_region_index_in_nid(nid); \
+				i != MAX_ACTIVE_REGIONS; \
+				i = next_active_region_index_in_nid(i, nid))
+
+void __init free_bootmem_with_active_regions(int nid,
+						unsigned long max_low_pfn)
+{
+	unsigned int i;
+	for_each_active_range_index_in_nid(i, nid) {
+		unsigned long size_pages = 0;
+		unsigned long end_pfn = early_node_map[i].end_pfn;
+		if (early_node_map[i].start_pfn >= max_low_pfn)
+			continue;
+
+		if (end_pfn > max_low_pfn)
+			end_pfn = max_low_pfn;
+
+		size_pages = end_pfn - early_node_map[i].start_pfn;
+		free_bootmem_node(NODE_DATA(early_node_map[i].nid),
+				PFN_PHYS(early_node_map[i].start_pfn),
+				PFN_PHYS(size_pages));
+	}
+}
+
+void __init memory_present_with_active_regions(int nid)
+{
+	unsigned int i;
+	for_each_active_range_index_in_nid(i, nid)
+		memory_present(early_node_map[i].nid,
+				early_node_map[i].start_pfn,
+				early_node_map[i].end_pfn);
+}
+
+void __init get_pfn_range_for_nid(unsigned int nid,
+			unsigned long *start_pfn, unsigned long *end_pfn)
+{
+	unsigned int i;
+	*start_pfn = -1UL;
+	*end_pfn = 0;
+
+	for_each_active_range_index_in_nid(i, nid) {
+		if (early_node_map[i].start_pfn < *start_pfn)
+			*start_pfn = early_node_map[i].start_pfn;
+
+		if (early_node_map[i].end_pfn > *end_pfn)
+			*end_pfn = early_node_map[i].end_pfn;
+	}
+
+	if (*start_pfn == -1UL) {
+		printk(KERN_WARNING "Node %u active with no memory\n", nid);
+		*start_pfn = 0;
+	}
+}
+
+unsigned long __init zone_present_pages_in_node(int nid,
+					unsigned long zone_type,
+					unsigned long *ignored)
+{
+	unsigned long node_start_pfn, node_end_pfn;
+	unsigned long zone_start_pfn, zone_end_pfn;
+
+	/* Get the start and end of the node and zone */
+	get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
+	zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
+	zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
+
+	/* Check that this node has pages within the zone's required range */
+	if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
+		return 0;
+
+	/* Move the zone boundaries inside the node if necessary */
+	if (zone_end_pfn > node_end_pfn)
+		zone_end_pfn = node_end_pfn;
+	if (zone_start_pfn < node_start_pfn)
+		zone_start_pfn = node_start_pfn;
+
+	/* Return the spanned pages */
+	return zone_end_pfn - zone_start_pfn;
+}
+
+unsigned long __init zone_absent_pages_in_node(int nid,
+					unsigned long zone_type,
+					unsigned long *ignored)
+{
+	int i = 0;
+	unsigned long prev_end_pfn = 0, hole_pages = 0;
+	unsigned long start_pfn;
+
+	/* Find the end_pfn of the first active range of pfns in the node */
+	i = first_active_region_index_in_nid(nid);
+	prev_end_pfn = early_node_map[i].start_pfn;
+
+	/* Find all holes for the zone within the node */
+	for (; i != MAX_ACTIVE_REGIONS;
+			i = next_active_region_index_in_nid(i, nid)) {
+
+		/* No need to continue if prev_end_pfn is outside the zone */
+		if (prev_end_pfn >= arch_zone_highest_possible_pfn[zone_type])
+			break;
+
+		/* Make sure the end of the zone is not within the hole */
+		start_pfn = early_node_map[i].start_pfn;
+		if (start_pfn > arch_zone_highest_possible_pfn[zone_type])
+			start_pfn = arch_zone_highest_possible_pfn[zone_type];
+		BUG_ON(prev_end_pfn > start_pfn);
+
+		/* Update the hole size count and move on */
+		hole_pages += start_pfn - prev_end_pfn;
+		prev_end_pfn = early_node_map[i].end_pfn;
+	}
+
+	return hole_pages;
+}
+#else
+static inline unsigned long zone_present_pages_in_node(int nid,
+					unsigned long zone_type,
+					unsigned long *zones_size)
+{
+	return zones_size[zone_type];
+}
+
+static inline unsigned long zone_absent_pages_in_node(int nid,
+						unsigned long zone_type,
+						unsigned long *zholes_size)
+{
+	if (!zholes_size)
+		return 0;
+
+	return zholes_size[zone_type];
+}
+#endif
+
+static void __init calculate_node_totalpages(struct pglist_data *pgdat,
+		unsigned long *zones_size, unsigned long *zholes_size)
+{
+	unsigned long realtotalpages, totalpages = 0;
+	int i;
+
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		totalpages += zone_present_pages_in_node(pgdat->node_id, i,
+								zones_size);
+	}
+	pgdat->node_spanned_pages = totalpages;
+
+	realtotalpages = totalpages;
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		realtotalpages -=
+			zone_absent_pages_in_node(pgdat->node_id, i, zholes_size);
+	}
+	pgdat->node_present_pages = realtotalpages;
+	printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
+							realtotalpages);
+}
+
+/*
+ * Set up the zone data structures:
+ *   - mark all pages reserved
+ *   - mark all memory queues empty
+ *   - clear the memory bitmaps
+ */
+static void __init free_area_init_core(struct pglist_data *pgdat,
+		unsigned long *zones_size, unsigned long *zholes_size)
+{
+	unsigned long j;
+	int nid = pgdat->node_id;
+	unsigned long zone_start_pfn = pgdat->node_start_pfn;
+
+	pgdat_resize_init(pgdat);
+	pgdat->nr_zones = 0;
+	init_waitqueue_head(&pgdat->kswapd_wait);
+	pgdat->kswapd_max_order = 0;
+
+	for (j = 0; j < MAX_NR_ZONES; j++) {
+		struct zone *zone = pgdat->node_zones + j;
+		unsigned long size, realsize;
+
+		size = zone_present_pages_in_node(nid, j, zones_size);
+		realsize = size - zone_absent_pages_in_node(nid, j,
+								zholes_size);
+		if (j < ZONE_HIGHMEM)
+			nr_kernel_pages += realsize;
+		nr_all_pages += realsize;
+
+		zone->spanned_pages = size;
+		zone->present_pages = realsize;
+		zone->name = zone_names[j];
+		spin_lock_init(&zone->lock);
+		spin_lock_init(&zone->lru_lock);
+		zone_seqlock_init(zone);
+		zone->zone_pgdat = pgdat;
+		zone->free_pages = 0;
+
+		zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
+
+		zone_pcp_init(zone);
+		INIT_LIST_HEAD(&zone->active_list);
+		INIT_LIST_HEAD(&zone->inactive_list);
+		zone->nr_scan_active = 0;
+		zone->nr_scan_inactive = 0;
+		zone->nr_active = 0;
+		zone->nr_inactive = 0;
+		atomic_set(&zone->reclaim_in_progress, 0);
+		if (!size)
+			continue;
+
+		zonetable_add(zone, nid, j, zone_start_pfn, size);
+		init_currently_empty_zone(zone, zone_start_pfn, size);
+		zone_start_pfn += size;
+	}
+}
+
+static void __init alloc_node_mem_map(struct pglist_data *pgdat)
+{
+	/* Skip empty nodes */
+	if (!pgdat->node_spanned_pages)
+		return;
+
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
+	/* ia64 gets its own node_mem_map, before this, without bootmem */
+	if (!pgdat->node_mem_map) {
+		unsigned long size;
+		struct page *map;
+
+		size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
+		map = alloc_remap(pgdat->node_id, size);
+		if (!map)
+			map = alloc_bootmem_node(pgdat, size);
+		pgdat->node_mem_map = map;
+	}
+#ifdef CONFIG_FLATMEM
+	/*
+	 * With no DISCONTIG, the global mem_map is just set as node 0's
+	 */
+	if (pgdat == NODE_DATA(0))
+		mem_map = NODE_DATA(0)->node_mem_map;
+#endif
+#endif /* CONFIG_FLAT_NODE_MEM_MAP */
+}
+
+void __init free_area_init_node(int nid, struct pglist_data *pgdat,
+		unsigned long *zones_size, unsigned long node_start_pfn,
+		unsigned long *zholes_size)
+{
+	pgdat->node_id = nid;
+	pgdat->node_start_pfn = node_start_pfn;
+	calculate_node_totalpages(pgdat, zones_size, zholes_size);
+
+	alloc_node_mem_map(pgdat);
+
+	free_area_init_core(pgdat, zones_size, zholes_size);
+}
+
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+void __init add_active_range(unsigned int nid, unsigned long start_pfn,
+						unsigned long end_pfn)
+{
+	unsigned int i;
+
+	/* Merge with existing active regions if possible */
+	for (i = 0; early_node_map[i].end_pfn; i++) {
+		if (early_node_map[i].nid != nid)
+			continue;
+
+		/* Merge forward if suitable */
+		if (start_pfn <= early_node_map[i].end_pfn &&
+				end_pfn > early_node_map[i].end_pfn) {
+			early_node_map[i].end_pfn = end_pfn;
+			return;
+		}
+
+		/* Merge backward if suitable */
+		if (start_pfn < early_node_map[i].end_pfn &&
+				end_pfn >= early_node_map[i].start_pfn) {
+			early_node_map[i].start_pfn = start_pfn;
+			return;
+		}
+	}
+
+	/*
+	 * Leave last entry NULL so we don't iterate off the end (we use
+	 * entry.end_pfn to terminate the walk).
+	 */
+	if (i >= MAX_ACTIVE_REGIONS - 1) {
+		printk(KERN_ERR "WARNING: too many memory regions in "
+				"numa code, truncating\n");
+		return;
+	}
+
+	early_node_map[i].nid = nid;
+	early_node_map[i].start_pfn = start_pfn;
+	early_node_map[i].end_pfn = end_pfn;
+}
+
+/* Compare two active node_active_regions */
+static int __init cmp_node_active_region(const void *a, const void *b)
+{
+	struct node_active_region *arange = (struct node_active_region *)a;
+	struct node_active_region *brange = (struct node_active_region *)b;
+
+	/* Done this way to avoid overflows */
+	if (arange->start_pfn > brange->start_pfn)
+		return 1;
+	if (arange->start_pfn < brange->start_pfn)
+		return -1;
+
+	return 0;
+}
+
+/* sort the node_map by start_pfn */
+static void __init sort_node_map(void)
+{
+	size_t num = 0;
+	while (early_node_map[num].end_pfn)
+		num++;
+
+	sort(early_node_map, num, sizeof(struct node_active_region),
+						cmp_node_active_region, NULL);
+}
+
+unsigned long __init find_min_pfn(void)
+{
+	int i;
+	unsigned long min_pfn = -1UL;
+
+	for (i = 0; early_node_map[i].end_pfn; i++) {
+		if (early_node_map[i].start_pfn < min_pfn)
+			min_pfn = early_node_map[i].start_pfn;
+	}
+
+	return min_pfn;
+}
+
+/* Find the lowest pfn in a node. This depends on a sorted early_node_map */
+unsigned long __init find_start_pfn_for_node(unsigned long nid)
+{
+	int i;
+
+	/* Assuming a sorted map, the first range found has the starting pfn */
+	for_each_active_range_index_in_nid(i, nid) {
+		return early_node_map[i].start_pfn;
+	}
+
+	/* nid does not exist in early_node_map */
+	printk(KERN_WARNING "Could not find start_pfn for node %lu\n", nid);
+	return 0;
+}
+
+void __init free_area_init_nodes(unsigned long arch_max_dma_pfn,
+				unsigned long arch_max_dma32_pfn,
+				unsigned long arch_max_low_pfn,
+				unsigned long arch_max_high_pfn)
+{
+	unsigned long nid;
+
+	/* Record where the zone boundaries are */
+	memset(arch_zone_lowest_possible_pfn, 0,
+				sizeof(arch_zone_lowest_possible_pfn));
+	memset(arch_zone_highest_possible_pfn, 0,
+				sizeof(arch_zone_highest_possible_pfn));
+	arch_zone_lowest_possible_pfn[ZONE_DMA] = find_min_pfn();
+	arch_zone_highest_possible_pfn[ZONE_DMA] = arch_max_dma_pfn;
+	arch_zone_lowest_possible_pfn[ZONE_DMA32] = arch_max_dma_pfn;
+	arch_zone_highest_possible_pfn[ZONE_DMA32] = arch_max_dma32_pfn;
+	arch_zone_lowest_possible_pfn[ZONE_NORMAL] = arch_max_dma32_pfn;
+	arch_zone_highest_possible_pfn[ZONE_NORMAL] = arch_max_low_pfn;
+	arch_zone_lowest_possible_pfn[ZONE_HIGHMEM] = arch_max_low_pfn;
+	arch_zone_highest_possible_pfn[ZONE_HIGHMEM] = arch_max_high_pfn;
+
+	/* Regions in the early_node_map can be in any order */
+	sort_node_map();
+
+	for_each_online_node(nid) {
+		pg_data_t *pgdat = NODE_DATA(nid);
+		free_area_init_node(nid, pgdat, NULL,
+				find_start_pfn_for_node(nid), NULL);
+	}
+}
+#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc1-105-ia64_use_init_nodes/mm/page_alloc.c linux-2.6.17-rc1-106-breakout_mem_init/mm/page_alloc.c
--- linux-2.6.17-rc1-105-ia64_use_init_nodes/mm/page_alloc.c	2006-04-12 21:03:59.000000000 +0100
+++ linux-2.6.17-rc1-106-breakout_mem_init/mm/page_alloc.c	2006-04-12 20:07:52.000000000 +0100
@@ -37,8 +37,6 @@
 #include <linux/nodemask.h>
 #include <linux/vmalloc.h>
 #include <linux/mempolicy.h>
-#include <linux/sort.h>
-#include <linux/pfn.h>
 
 #include <asm/tlbflush.h>
 #include "internal.h"
@@ -54,7 +52,6 @@ EXPORT_SYMBOL(node_possible_map);
 unsigned long totalram_pages __read_mostly;
 unsigned long totalhigh_pages __read_mostly;
 long nr_swap_pages;
-int percpu_pagelist_fraction;
 
 static void __free_pages_ok(struct page *page, unsigned int order);
 
@@ -80,24 +77,11 @@ EXPORT_SYMBOL(totalram_pages);
 struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
 EXPORT_SYMBOL(zone_table);
 
-static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
 int min_free_kbytes = 1024;
 
 unsigned long __initdata nr_kernel_pages;
 unsigned long __initdata nr_all_pages;
 
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
-  #ifdef CONFIG_MAX_ACTIVE_REGIONS
-    #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
-  #else
-    #define MAX_ACTIVE_REGIONS (MAX_NR_ZONES * MAX_NUMNODES + 1)
-  #endif
-
-  struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS];
-  unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
-  unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
-#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
-
 #ifdef CONFIG_DEBUG_VM
 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
 {
@@ -1511,957 +1495,6 @@ void show_free_areas(void)
 	show_swap_cache_info();
 }
 
-/*
- * Builds allocation fallback zone lists.
- *
- * Add all populated zones of a node to the zonelist.
- */
-static int __init build_zonelists_node(pg_data_t *pgdat,
-			struct zonelist *zonelist, int nr_zones, int zone_type)
-{
-	struct zone *zone;
-
-	BUG_ON(zone_type > ZONE_HIGHMEM);
-
-	do {
-		zone = pgdat->node_zones + zone_type;
-		if (populated_zone(zone)) {
-#ifndef CONFIG_HIGHMEM
-			BUG_ON(zone_type > ZONE_NORMAL);
-#endif
-			zonelist->zones[nr_zones++] = zone;
-			check_highest_zone(zone_type);
-		}
-		zone_type--;
-
-	} while (zone_type >= 0);
-	return nr_zones;
-}
-
-static inline int highest_zone(int zone_bits)
-{
-	int res = ZONE_NORMAL;
-	if (zone_bits & (__force int)__GFP_HIGHMEM)
-		res = ZONE_HIGHMEM;
-	if (zone_bits & (__force int)__GFP_DMA32)
-		res = ZONE_DMA32;
-	if (zone_bits & (__force int)__GFP_DMA)
-		res = ZONE_DMA;
-	return res;
-}
-
-#ifdef CONFIG_NUMA
-#define MAX_NODE_LOAD (num_online_nodes())
-static int __initdata node_load[MAX_NUMNODES];
-/**
- * find_next_best_node - find the next node that should appear in a given node's fallback list
- * @node: node whose fallback list we're appending
- * @used_node_mask: nodemask_t of already used nodes
- *
- * We use a number of factors to determine which is the next node that should
- * appear on a given node's fallback list.  The node should not have appeared
- * already in @node's fallback list, and it should be the next closest node
- * according to the distance array (which contains arbitrary distance values
- * from each node to each node in the system), and should also prefer nodes
- * with no CPUs, since presumably they'll have very little allocation pressure
- * on them otherwise.
- * It returns -1 if no node is found.
- */
-static int __init find_next_best_node(int node, nodemask_t *used_node_mask)
-{
-	int n, val;
-	int min_val = INT_MAX;
-	int best_node = -1;
-
-	/* Use the local node if we haven't already */
-	if (!node_isset(node, *used_node_mask)) {
-		node_set(node, *used_node_mask);
-		return node;
-	}
-
-	for_each_online_node(n) {
-		cpumask_t tmp;
-
-		/* Don't want a node to appear more than once */
-		if (node_isset(n, *used_node_mask))
-			continue;
-
-		/* Use the distance array to find the distance */
-		val = node_distance(node, n);
-
-		/* Penalize nodes under us ("prefer the next node") */
-		val += (n < node);
-
-		/* Give preference to headless and unused nodes */
-		tmp = node_to_cpumask(n);
-		if (!cpus_empty(tmp))
-			val += PENALTY_FOR_NODE_WITH_CPUS;
-
-		/* Slight preference for less loaded node */
-		val *= (MAX_NODE_LOAD*MAX_NUMNODES);
-		val += node_load[n];
-
-		if (val < min_val) {
-			min_val = val;
-			best_node = n;
-		}
-	}
-
-	if (best_node >= 0)
-		node_set(best_node, *used_node_mask);
-
-	return best_node;
-}
-
-static void __init build_zonelists(pg_data_t *pgdat)
-{
-	int i, j, k, node, local_node;
-	int prev_node, load;
-	struct zonelist *zonelist;
-	nodemask_t used_mask;
-
-	/* initialize zonelists */
-	for (i = 0; i < GFP_ZONETYPES; i++) {
-		zonelist = pgdat->node_zonelists + i;
-		zonelist->zones[0] = NULL;
-	}
-
-	/* NUMA-aware ordering of nodes */
-	local_node = pgdat->node_id;
-	load = num_online_nodes();
-	prev_node = local_node;
-	nodes_clear(used_mask);
-	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
-		int distance = node_distance(local_node, node);
-
-		/*
-		 * If another node is sufficiently far away then it is better
-		 * to reclaim pages in a zone before going off node.
-		 */
-		if (distance > RECLAIM_DISTANCE)
-			zone_reclaim_mode = 1;
-
-		/*
-		 * We don't want to pressure a particular node.
-		 * So adding penalty to the first node in same
-		 * distance group to make it round-robin.
-		 */
-
-		if (distance != node_distance(local_node, prev_node))
-			node_load[node] += load;
-		prev_node = node;
-		load--;
-		for (i = 0; i < GFP_ZONETYPES; i++) {
-			zonelist = pgdat->node_zonelists + i;
-			for (j = 0; zonelist->zones[j] != NULL; j++);
-
-			k = highest_zone(i);
-
-	 		j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
-			zonelist->zones[j] = NULL;
-		}
-	}
-}
-
-#else	/* CONFIG_NUMA */
-
-static void __init build_zonelists(pg_data_t *pgdat)
-{
-	int i, j, k, node, local_node;
-
-	local_node = pgdat->node_id;
-	for (i = 0; i < GFP_ZONETYPES; i++) {
-		struct zonelist *zonelist;
-
-		zonelist = pgdat->node_zonelists + i;
-
-		j = 0;
-		k = highest_zone(i);
- 		j = build_zonelists_node(pgdat, zonelist, j, k);
- 		/*
- 		 * Now we build the zonelist so that it contains the zones
- 		 * of all the other nodes.
- 		 * We don't want to pressure a particular node, so when
- 		 * building the zones for node N, we make sure that the
- 		 * zones coming right after the local ones are those from
- 		 * node N+1 (modulo N)
- 		 */
-		for (node = local_node + 1; node < MAX_NUMNODES; node++) {
-			if (!node_online(node))
-				continue;
-			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
-		}
-		for (node = 0; node < local_node; node++) {
-			if (!node_online(node))
-				continue;
-			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
-		}
-
-		zonelist->zones[j] = NULL;
-	}
-}
-
-#endif	/* CONFIG_NUMA */
-
-void __init build_all_zonelists(void)
-{
-	int i;
-
-	for_each_online_node(i)
-		build_zonelists(NODE_DATA(i));
-	printk("Built %i zonelists\n", num_online_nodes());
-	cpuset_init_current_mems_allowed();
-}
-
-/*
- * Helper functions to size the waitqueue hash table.
- * Essentially these want to choose hash table sizes sufficiently
- * large so that collisions trying to wait on pages are rare.
- * But in fact, the number of active page waitqueues on typical
- * systems is ridiculously low, less than 200. So this is even
- * conservative, even though it seems large.
- *
- * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
- * waitqueues, i.e. the size of the waitq table given the number of pages.
- */
-#define PAGES_PER_WAITQUEUE	256
-
-static inline unsigned long wait_table_size(unsigned long pages)
-{
-	unsigned long size = 1;
-
-	pages /= PAGES_PER_WAITQUEUE;
-
-	while (size < pages)
-		size <<= 1;
-
-	/*
-	 * Once we have dozens or even hundreds of threads sleeping
-	 * on IO we've got bigger problems than wait queue collision.
-	 * Limit the size of the wait table to a reasonable size.
-	 */
-	size = min(size, 4096UL);
-
-	return max(size, 4UL);
-}
-
-/*
- * This is an integer logarithm so that shifts can be used later
- * to extract the more random high bits from the multiplicative
- * hash function before the remainder is taken.
- */
-static inline unsigned long wait_table_bits(unsigned long size)
-{
-	return ffz(~size);
-}
-
-#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
-
-/*
- * Initially all pages are reserved - free ones are freed
- * up by free_all_bootmem() once the early boot process is
- * done. Non-atomic initialization, single-pass.
- */
-void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
-		unsigned long start_pfn)
-{
-	struct page *page;
-	unsigned long end_pfn = start_pfn + size;
-	unsigned long pfn;
-
-	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
-		if (!early_pfn_valid(pfn))
-			continue;
-		page = pfn_to_page(pfn);
-		set_page_links(page, zone, nid, pfn);
-		init_page_count(page);
-		reset_page_mapcount(page);
-		SetPageReserved(page);
-		INIT_LIST_HEAD(&page->lru);
-#ifdef WANT_PAGE_VIRTUAL
-		/* The shift won't overflow because ZONE_NORMAL is below 4G. */
-		if (!is_highmem_idx(zone))
-			set_page_address(page, __va(pfn << PAGE_SHIFT));
-#endif
-	}
-}
-
-void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
-				unsigned long size)
-{
-	int order;
-	for (order = 0; order < MAX_ORDER ; order++) {
-		INIT_LIST_HEAD(&zone->free_area[order].free_list);
-		zone->free_area[order].nr_free = 0;
-	}
-}
-
-#define ZONETABLE_INDEX(x, zone_nr)	((x << ZONES_SHIFT) | zone_nr)
-void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
-		unsigned long size)
-{
-	unsigned long snum = pfn_to_section_nr(pfn);
-	unsigned long end = pfn_to_section_nr(pfn + size);
-
-	if (FLAGS_HAS_NODE)
-		zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
-	else
-		for (; snum <= end; snum++)
-			zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
-}
-
-#ifndef __HAVE_ARCH_MEMMAP_INIT
-#define memmap_init(size, nid, zone, start_pfn) \
-	memmap_init_zone((size), (nid), (zone), (start_pfn))
-#endif
-
-static int __cpuinit zone_batchsize(struct zone *zone)
-{
-	int batch;
-
-	/*
-	 * The per-cpu-pages pools are set to around 1000th of the
-	 * size of the zone.  But no more than 1/2 of a meg.
-	 *
-	 * OK, so we don't know how big the cache is.  So guess.
-	 */
-	batch = zone->present_pages / 1024;
-	if (batch * PAGE_SIZE > 512 * 1024)
-		batch = (512 * 1024) / PAGE_SIZE;
-	batch /= 4;		/* We effectively *= 4 below */
-	if (batch < 1)
-		batch = 1;
-
-	/*
-	 * Clamp the batch to a 2^n - 1 value. Having a power
-	 * of 2 value was found to be more likely to have
-	 * suboptimal cache aliasing properties in some cases.
-	 *
-	 * For example if 2 tasks are alternately allocating
-	 * batches of pages, one task can end up with a lot
-	 * of pages of one half of the possible page colors
-	 * and the other with pages of the other colors.
-	 */
-	batch = (1 << (fls(batch + batch/2)-1)) - 1;
-
-	return batch;
-}
-
-inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
-{
-	struct per_cpu_pages *pcp;
-
-	memset(p, 0, sizeof(*p));
-
-	pcp = &p->pcp[0];		/* hot */
-	pcp->count = 0;
-	pcp->high = 6 * batch;
-	pcp->batch = max(1UL, 1 * batch);
-	INIT_LIST_HEAD(&pcp->list);
-
-	pcp = &p->pcp[1];		/* cold*/
-	pcp->count = 0;
-	pcp->high = 2 * batch;
-	pcp->batch = max(1UL, batch/2);
-	INIT_LIST_HEAD(&pcp->list);
-}
-
-/*
- * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
- * to the value high for the pageset p.
- */
-
-static void setup_pagelist_highmark(struct per_cpu_pageset *p,
-				unsigned long high)
-{
-	struct per_cpu_pages *pcp;
-
-	pcp = &p->pcp[0]; /* hot list */
-	pcp->high = high;
-	pcp->batch = max(1UL, high/4);
-	if ((high/4) > (PAGE_SHIFT * 8))
-		pcp->batch = PAGE_SHIFT * 8;
-}
-
-
-#ifdef CONFIG_NUMA
-/*
- * Boot pageset table. One per cpu which is going to be used for all
- * zones and all nodes. The parameters will be set in such a way
- * that an item put on a list will immediately be handed over to
- * the buddy list. This is safe since pageset manipulation is done
- * with interrupts disabled.
- *
- * Some NUMA counter updates may also be caught by the boot pagesets.
- *
- * The boot_pagesets must be kept even after bootup is complete for
- * unused processors and/or zones. They do play a role for bootstrapping
- * hotplugged processors.
- *
- * zoneinfo_show() and maybe other functions do
- * not check if the processor is online before following the pageset pointer.
- * Other parts of the kernel may not check if the zone is available.
- */
-static struct per_cpu_pageset boot_pageset[NR_CPUS];
-
-/*
- * Dynamically allocate memory for the
- * per cpu pageset array in struct zone.
- */
-static int __cpuinit process_zones(int cpu)
-{
-	struct zone *zone, *dzone;
-
-	for_each_zone(zone) {
-
-		zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
-					 GFP_KERNEL, cpu_to_node(cpu));
-		if (!zone_pcp(zone, cpu))
-			goto bad;
-
-		setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
-
-		if (percpu_pagelist_fraction)
-			setup_pagelist_highmark(zone_pcp(zone, cpu),
-			 	(zone->present_pages / percpu_pagelist_fraction));
-	}
-
-	return 0;
-bad:
-	for_each_zone(dzone) {
-		if (dzone == zone)
-			break;
-		kfree(zone_pcp(dzone, cpu));
-		zone_pcp(dzone, cpu) = NULL;
-	}
-	return -ENOMEM;
-}
-
-static inline void free_zone_pagesets(int cpu)
-{
-	struct zone *zone;
-
-	for_each_zone(zone) {
-		struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
-
-		zone_pcp(zone, cpu) = NULL;
-		kfree(pset);
-	}
-}
-
-static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
-		unsigned long action,
-		void *hcpu)
-{
-	int cpu = (long)hcpu;
-	int ret = NOTIFY_OK;
-
-	switch (action) {
-		case CPU_UP_PREPARE:
-			if (process_zones(cpu))
-				ret = NOTIFY_BAD;
-			break;
-		case CPU_UP_CANCELED:
-		case CPU_DEAD:
-			free_zone_pagesets(cpu);
-			break;
-		default:
-			break;
-	}
-	return ret;
-}
-
-static struct notifier_block pageset_notifier =
-	{ &pageset_cpuup_callback, NULL, 0 };
-
-void __init setup_per_cpu_pageset(void)
-{
-	int err;
-
-	/* Initialize per_cpu_pageset for cpu 0.
-	 * A cpuup callback will do this for every cpu
-	 * as it comes online
-	 */
-	err = process_zones(smp_processor_id());
-	BUG_ON(err);
-	register_cpu_notifier(&pageset_notifier);
-}
-
-#endif
-
-static __meminit
-void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
-{
-	int i;
-	struct pglist_data *pgdat = zone->zone_pgdat;
-
-	/*
-	 * The per-page waitqueue mechanism uses hashed waitqueues
-	 * per zone.
-	 */
-	zone->wait_table_size = wait_table_size(zone_size_pages);
-	zone->wait_table_bits =	wait_table_bits(zone->wait_table_size);
-	zone->wait_table = (wait_queue_head_t *)
-		alloc_bootmem_node(pgdat, zone->wait_table_size
-					* sizeof(wait_queue_head_t));
-
-	for(i = 0; i < zone->wait_table_size; ++i)
-		init_waitqueue_head(zone->wait_table + i);
-}
-
-static __meminit void zone_pcp_init(struct zone *zone)
-{
-	int cpu;
-	unsigned long batch = zone_batchsize(zone);
-
-	for (cpu = 0; cpu < NR_CPUS; cpu++) {
-#ifdef CONFIG_NUMA
-		/* Early boot. Slab allocator not functional yet */
-		zone_pcp(zone, cpu) = &boot_pageset[cpu];
-		setup_pageset(&boot_pageset[cpu],0);
-#else
-		setup_pageset(zone_pcp(zone,cpu), batch);
-#endif
-	}
-	if (zone->present_pages)
-		printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
-			zone->name, zone->present_pages, batch);
-}
-
-static __meminit void init_currently_empty_zone(struct zone *zone,
-		unsigned long zone_start_pfn, unsigned long size)
-{
-	struct pglist_data *pgdat = zone->zone_pgdat;
-
-	zone_wait_table_init(zone, size);
-	pgdat->nr_zones = zone_idx(zone) + 1;
-
-	zone->zone_start_pfn = zone_start_pfn;
-
-	memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
-
-	zone_init_free_lists(pgdat, zone, zone->spanned_pages);
-}
-
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
-static int __init first_active_region_index_in_nid(int nid)
-{
-	int i;
-	for (i = 0; early_node_map[i].end_pfn; i++) {
-		if (early_node_map[i].nid == nid)
-			return i;
-	}
-
-	return MAX_ACTIVE_REGIONS;
-}
-
-static int __init next_active_region_index_in_nid(unsigned int index, int nid)
-{
-	for (index = index + 1; early_node_map[index].end_pfn; index++) {
-		if (early_node_map[index].nid == nid)
-			return index;
-	}
-
-	return MAX_ACTIVE_REGIONS;
-}
-
-#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
-int __init early_pfn_to_nid(unsigned long pfn)
-{
-	int i;
-
-	for (i = 0; early_node_map[i].end_pfn; i++) {
-		unsigned long start_pfn = early_node_map[i].start_pfn;
-		unsigned long end_pfn = early_node_map[i].end_pfn;
-
-		if ((start_pfn <= pfn) && (pfn < end_pfn))
-			return early_node_map[i].nid;
-	}
-
-	return -1;
-}
-#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
-
-#define for_each_active_range_index_in_nid(i, nid) \
-	for (i = first_active_region_index_in_nid(nid); \
-				i != MAX_ACTIVE_REGIONS; \
-				i = next_active_region_index_in_nid(i, nid))
-
-void __init free_bootmem_with_active_regions(int nid,
-						unsigned long max_low_pfn)
-{
-	unsigned int i;
-	for_each_active_range_index_in_nid(i, nid) {
-		unsigned long size_pages = 0;
-		unsigned long end_pfn = early_node_map[i].end_pfn;
-		if (early_node_map[i].start_pfn >= max_low_pfn)
-			continue;
-
-		if (end_pfn > max_low_pfn)
-			end_pfn = max_low_pfn;
-
-		size_pages = end_pfn - early_node_map[i].start_pfn;
-		free_bootmem_node(NODE_DATA(early_node_map[i].nid),
-				PFN_PHYS(early_node_map[i].start_pfn),
-				PFN_PHYS(size_pages));
-	}
-}
-
-void __init memory_present_with_active_regions(int nid)
-{
-	unsigned int i;
-	for_each_active_range_index_in_nid(i, nid)
-		memory_present(early_node_map[i].nid,
-				early_node_map[i].start_pfn,
-				early_node_map[i].end_pfn);
-}
-
-void __init get_pfn_range_for_nid(unsigned int nid,
-			unsigned long *start_pfn, unsigned long *end_pfn)
-{
-	unsigned int i;
-	*start_pfn = -1UL;
-	*end_pfn = 0;
-
-	for_each_active_range_index_in_nid(i, nid) {
-		if (early_node_map[i].start_pfn < *start_pfn)
-			*start_pfn = early_node_map[i].start_pfn;
-
-		if (early_node_map[i].end_pfn > *end_pfn)
-			*end_pfn = early_node_map[i].end_pfn;
-	}
-
-	if (*start_pfn == -1UL) {
-		printk(KERN_WARNING "Node %u active with no memory\n", nid);
-		*start_pfn = 0;
-	}
-}
-
-unsigned long __init zone_present_pages_in_node(int nid,
-					unsigned long zone_type,
-					unsigned long *ignored)
-{
-	unsigned long node_start_pfn, node_end_pfn;
-	unsigned long zone_start_pfn, zone_end_pfn;
-
-	/* Get the start and end of the node and zone */
-	get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
-	zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
-	zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
-
-	/* Check that this node has pages within the zone's required range */
-	if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
-		return 0;
-
-	/* Move the zone boundaries inside the node if necessary */
-	if (zone_end_pfn > node_end_pfn)
-		zone_end_pfn = node_end_pfn;
-	if (zone_start_pfn < node_start_pfn)
-		zone_start_pfn = node_start_pfn;
-
-	/* Return the spanned pages */
-	return zone_end_pfn - zone_start_pfn;
-}
-
-unsigned long __init zone_absent_pages_in_node(int nid,
-					unsigned long zone_type,
-					unsigned long *ignored)
-{
-	int i = 0;
-	unsigned long prev_end_pfn = 0, hole_pages = 0;
-	unsigned long start_pfn;
-
-	/* Find the end_pfn of the first active range of pfns in the node */
-	i = first_active_region_index_in_nid(nid);
-	prev_end_pfn = early_node_map[i].start_pfn;
-
-	/* Find all holes for the zone within the node */
-	for (; i != MAX_ACTIVE_REGIONS;
-			i = next_active_region_index_in_nid(i, nid)) {
-
-		/* No need to continue if prev_end_pfn is outside the zone */
-		if (prev_end_pfn >= arch_zone_highest_possible_pfn[zone_type])
-			break;
-
-		/* Make sure the end of the zone is not within the hole */
-		start_pfn = early_node_map[i].start_pfn;
-		if (start_pfn > arch_zone_highest_possible_pfn[zone_type])
-			start_pfn = arch_zone_highest_possible_pfn[zone_type];
-		BUG_ON(prev_end_pfn > start_pfn);
-
-		/* Update the hole size cound and move on */
-		hole_pages += start_pfn - prev_end_pfn;
-		prev_end_pfn = early_node_map[i].end_pfn;
-	}
-
-	return hole_pages;
-}
-#else
-static inline unsigned long zone_present_pages_in_node(int nid,
-					unsigned long zone_type,
-					unsigned long *zones_size)
-{
-	return zones_size[zone_type];
-}
-
-static inline unsigned long zone_absent_pages_in_node(int nid,
-						unsigned long zone_type,
-						unsigned long *zholes_size)
-{
-	if (!zholes_size)
-		return 0;
-
-	return zholes_size[zone_type];
-}
-#endif
-
-static void __init calculate_node_totalpages(struct pglist_data *pgdat,
-		unsigned long *zones_size, unsigned long *zholes_size)
-{
-	unsigned long realtotalpages, totalpages = 0;
-	int i;
-
-	for (i = 0; i < MAX_NR_ZONES; i++) {
-		totalpages += zone_present_pages_in_node(pgdat->node_id, i,
-								zones_size);
-	}
-	pgdat->node_spanned_pages = totalpages;
-
-	realtotalpages = totalpages;
-	for (i = 0; i < MAX_NR_ZONES; i++) {
-		realtotalpages -=
-			zone_absent_pages_in_node(pgdat->node_id, i, zholes_size);
-	}
-	pgdat->node_present_pages = realtotalpages;
-	printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
-							realtotalpages);
-}
-
-/*
- * Set up the zone data structures:
- *   - mark all pages reserved
- *   - mark all memory queues empty
- *   - clear the memory bitmaps
- */
-static void __init free_area_init_core(struct pglist_data *pgdat,
-		unsigned long *zones_size, unsigned long *zholes_size)
-{
-	unsigned long j;
-	int nid = pgdat->node_id;
-	unsigned long zone_start_pfn = pgdat->node_start_pfn;
-
-	pgdat_resize_init(pgdat);
-	pgdat->nr_zones = 0;
-	init_waitqueue_head(&pgdat->kswapd_wait);
-	pgdat->kswapd_max_order = 0;
-	
-	for (j = 0; j < MAX_NR_ZONES; j++) {
-		struct zone *zone = pgdat->node_zones + j;
-		unsigned long size, realsize;
-
-		size = zone_present_pages_in_node(nid, j, zones_size);
-		realsize = size - zone_absent_pages_in_node(nid, j,
-								zholes_size);
-		if (j < ZONE_HIGHMEM)
-			nr_kernel_pages += realsize;
-		nr_all_pages += realsize;
-
-		zone->spanned_pages = size;
-		zone->present_pages = realsize;
-		zone->name = zone_names[j];
-		spin_lock_init(&zone->lock);
-		spin_lock_init(&zone->lru_lock);
-		zone_seqlock_init(zone);
-		zone->zone_pgdat = pgdat;
-		zone->free_pages = 0;
-
-		zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
-
-		zone_pcp_init(zone);
-		INIT_LIST_HEAD(&zone->active_list);
-		INIT_LIST_HEAD(&zone->inactive_list);
-		zone->nr_scan_active = 0;
-		zone->nr_scan_inactive = 0;
-		zone->nr_active = 0;
-		zone->nr_inactive = 0;
-		atomic_set(&zone->reclaim_in_progress, 0);
-		if (!size)
-			continue;
-
-		zonetable_add(zone, nid, j, zone_start_pfn, size);
-		init_currently_empty_zone(zone, zone_start_pfn, size);
-		zone_start_pfn += size;
-	}
-}
-
-static void __init alloc_node_mem_map(struct pglist_data *pgdat)
-{
-	/* Skip empty nodes */
-	if (!pgdat->node_spanned_pages)
-		return;
-
-#ifdef CONFIG_FLAT_NODE_MEM_MAP
-	/* ia64 gets its own node_mem_map, before this, without bootmem */
-	if (!pgdat->node_mem_map) {
-		unsigned long size;
-		struct page *map;
-
-		size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
-		map = alloc_remap(pgdat->node_id, size);
-		if (!map)
-			map = alloc_bootmem_node(pgdat, size);
-		pgdat->node_mem_map = map;
-	}
-#ifdef CONFIG_FLATMEM
-	/*
-	 * With no DISCONTIG, the global mem_map is just set as node 0's
-	 */
-	if (pgdat == NODE_DATA(0))
-		mem_map = NODE_DATA(0)->node_mem_map;
-#endif
-#endif /* CONFIG_FLAT_NODE_MEM_MAP */
-}
-
-void __init free_area_init_node(int nid, struct pglist_data *pgdat,
-		unsigned long *zones_size, unsigned long node_start_pfn,
-		unsigned long *zholes_size)
-{
-	pgdat->node_id = nid;
-	pgdat->node_start_pfn = node_start_pfn;
-	calculate_node_totalpages(pgdat, zones_size, zholes_size);
-
-	alloc_node_mem_map(pgdat);
-
-	free_area_init_core(pgdat, zones_size, zholes_size);
-}
-
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
-void __init add_active_range(unsigned int nid, unsigned long start_pfn,
-						unsigned long end_pfn)
-{
-	unsigned int i;
-
-	/* Merge with existing active regions if possible */
-	for (i = 0; early_node_map[i].end_pfn; i++) {
-		if (early_node_map[i].nid != nid)
-			continue;
-
-		/* Merge forward if suitable */
-		if (start_pfn <= early_node_map[i].end_pfn &&
-				end_pfn > early_node_map[i].end_pfn) {
-			early_node_map[i].end_pfn = end_pfn;
-			return;
-		}
-
-		/* Merge backward if suitable */
-		if (start_pfn < early_node_map[i].end_pfn &&
-				end_pfn >= early_node_map[i].start_pfn) {
-			early_node_map[i].start_pfn = start_pfn;
-			return;
-		}
-	}
-
-	/*
-	 * Leave last entry NULL so we dont iterate off the end (we use
-	 * entry.end_pfn to terminate the walk).
-	 */
-	if (i >= MAX_ACTIVE_REGIONS - 1) {
-		printk(KERN_ERR "WARNING: too many memory regions in "
-				"numa code, truncating\n");
-		return;
-	}
-
-	early_node_map[i].nid = nid;
-	early_node_map[i].start_pfn = start_pfn;
-	early_node_map[i].end_pfn = end_pfn;
-}
-
-/* Compare two active node_active_regions */
-static int __init cmp_node_active_region(const void *a, const void *b)
-{
-	struct node_active_region *arange = (struct node_active_region *)a;
-	struct node_active_region *brange = (struct node_active_region *)b;
-
-	/* Done this way to avoid overflows */
-	if (arange->start_pfn > brange->start_pfn)
-		return 1;
-	if (arange->start_pfn < brange->start_pfn)
-		return -1;
-
-	return 0;
-}
-
-/* sort the node_map by start_pfn */
-static void __init sort_node_map(void)
-{
-	size_t num = 0;
-	while (early_node_map[num].end_pfn)
-		num++;
-
-	sort(early_node_map, num, sizeof(struct node_active_region),
-						cmp_node_active_region, NULL);
-}
-
-unsigned long __init find_min_pfn(void)
-{
-	int i;
-	unsigned long min_pfn = -1UL;
-
-	for (i = 0; early_node_map[i].end_pfn; i++) {
-		if (early_node_map[i].start_pfn < min_pfn)
-			min_pfn = early_node_map[i].start_pfn;
-	}
-
-	return min_pfn;
-}
-
-/* Find the lowest pfn in a node. This depends on a sorted early_node_map */
-unsigned long __init find_start_pfn_for_node(unsigned long nid)
-{
-	int i;
-
-	/* Assuming a sorted map, the first range found has the starting pfn */
-	for_each_active_range_index_in_nid(i, nid) {
-		return early_node_map[i].start_pfn;
-	}
-
-	/* nid does not exist in early_node_map */
-	printk(KERN_WARNING "Could not find start_pfn for node %lu\n", nid);
-	return 0;
-}
-
-void __init free_area_init_nodes(unsigned long arch_max_dma_pfn,
-				unsigned long arch_max_dma32_pfn,
-				unsigned long arch_max_low_pfn,
-				unsigned long arch_max_high_pfn)
-{
-	unsigned long nid;
-
-	/* Record where the zone boundaries are */
-	memset(arch_zone_lowest_possible_pfn, 0,
-				sizeof(arch_zone_lowest_possible_pfn));
-	memset(arch_zone_highest_possible_pfn, 0,
-				sizeof(arch_zone_highest_possible_pfn));
-	arch_zone_lowest_possible_pfn[ZONE_DMA] = find_min_pfn();
-	arch_zone_highest_possible_pfn[ZONE_DMA] = arch_max_dma_pfn;
-	arch_zone_lowest_possible_pfn[ZONE_DMA32] = arch_max_dma_pfn;
-	arch_zone_highest_possible_pfn[ZONE_DMA32] = arch_max_dma32_pfn;
-	arch_zone_lowest_possible_pfn[ZONE_NORMAL] = arch_max_dma32_pfn;
-	arch_zone_highest_possible_pfn[ZONE_NORMAL] = arch_max_low_pfn;
-	arch_zone_lowest_possible_pfn[ZONE_HIGHMEM] = arch_max_low_pfn;
-	arch_zone_highest_possible_pfn[ZONE_HIGHMEM] = arch_max_high_pfn;
-
-	/* Regions in the early_node_map can be in any order */
-	sort_node_map();
-
-	for_each_online_node(nid) {
-		pg_data_t *pgdat = NODE_DATA(nid);
-		free_area_init_node(nid, pgdat, NULL,
-				find_start_pfn_for_node(nid), NULL);
-	}
-}
-#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
-
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 static bootmem_data_t contig_bootmem_data;
 struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
@@ -2944,32 +1977,6 @@ int lowmem_reserve_ratio_sysctl_handler(
 	return 0;
 }
 
-/*
- * percpu_pagelist_fraction - changes the pcp->high for each zone on each
- * cpu.  It is the fraction of total pages in each zone that a hot per cpu pagelist
- * can have before it gets flushed back to buddy allocator.
- */
-
-int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
-	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
-{
-	struct zone *zone;
-	unsigned int cpu;
-	int ret;
-
-	ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
-	if (!write || (ret == -EINVAL))
-		return ret;
-	for_each_zone(zone) {
-		for_each_online_cpu(cpu) {
-			unsigned long  high;
-			high = zone->present_pages / percpu_pagelist_fraction;
-			setup_pagelist_highmark(zone_pcp(zone, cpu), high);
-		}
-	}
-	return 0;
-}
-
 __initdata int hashdist = HASHDIST_DEFAULT;
 
 #ifdef CONFIG_NUMA


* [PATCH 6/7] Break out memory initialisation code from page_alloc.c to mem_init.c
  2006-04-18 13:00 [PATCH 0/7] [RFC] Sizing zones and holes in an architecture independent manner V3 Mel Gorman
@ 2006-04-18 13:02 ` Mel Gorman
  0 siblings, 0 replies; 11+ messages in thread
From: Mel Gorman @ 2006-04-18 13:02 UTC (permalink / raw)
  To: davej, tony.luck, linuxppc-dev, linux-kernel, bob.picco, ak,
	linux-mm
  Cc: Mel Gorman


page_alloc.c contains a large amount of memory initialisation code. This patch
breaks out the initialisation code to a separate file to make page_alloc.c
a bit easier to read.


 Makefile     |    2 
 mem_init.c   | 1040 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 page_alloc.c | 1021 -----------------------------------------------------
 3 files changed, 1041 insertions(+), 1022 deletions(-)

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc1-105-ia64_use_init_nodes/mm/Makefile linux-2.6.17-rc1-106-breakout_mem_init/mm/Makefile
--- linux-2.6.17-rc1-105-ia64_use_init_nodes/mm/Makefile	2006-04-03 04:22:10.000000000 +0100
+++ linux-2.6.17-rc1-106-breakout_mem_init/mm/Makefile	2006-04-18 10:22:00.000000000 +0100
@@ -8,7 +8,7 @@ mmu-$(CONFIG_MMU)	:= fremap.o highmem.o 
 			   vmalloc.o
 
 obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
-			   page_alloc.o page-writeback.o pdflush.o \
+			   page_alloc.o mem_init.o page-writeback.o pdflush.o \
 			   readahead.o swap.o truncate.o vmscan.o \
 			   prio_tree.o util.o mmzone.o $(mmu-y)
 
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc1-105-ia64_use_init_nodes/mm/mem_init.c linux-2.6.17-rc1-106-breakout_mem_init/mm/mem_init.c
--- linux-2.6.17-rc1-105-ia64_use_init_nodes/mm/mem_init.c	2006-04-12 20:09:44.000000000 +0100
+++ linux-2.6.17-rc1-106-breakout_mem_init/mm/mem_init.c	2006-04-18 10:22:00.000000000 +0100
@@ -0,0 +1,1040 @@
+/*
+ * mm/mem_init.c
+ * Initialises the architecture-independent view of memory: pgdats, zones, etc.
+ *
+ *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *  Copyright (C) 1995, Stephen Tweedie
+ *  Copyright (C) July 1999, Gerhard Wichert, Siemens AG
+ *  Copyright (C) 1999, Ingo Molnar, Red Hat
+ *  Copyright (C) 1999, 2000, Kanoj Sarcar, SGI
+ *  Copyright (C) Sept 2000, Martin J. Bligh
+ *	(lots of bits borrowed from Ingo Molnar & Andrew Morton)
+ *  Copyright (C) Apr 2006, Mel Gorman, IBM
+ *	(lots of bits taken from architecture-specific code)
+ */
+#include <linux/config.h>
+#include <linux/sort.h>
+#include <linux/pfn.h>
+#include <linux/mm.h>
+#include <linux/bootmem.h>
+#include <linux/cpuset.h>
+#include <linux/mempolicy.h>
+#include <linux/sysctl.h>
+#include <linux/swap.h>
+#include <linux/cpu.h>
+
+static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
+int percpu_pagelist_fraction;
+
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+  #ifdef CONFIG_MAX_ACTIVE_REGIONS
+    #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
+  #else
+    #define MAX_ACTIVE_REGIONS (MAX_NR_ZONES * MAX_NUMNODES + 1)
+  #endif
+
+  struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS];
+  unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
+  unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
+#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+
+/*
+ * Builds allocation fallback zone lists.
+ *
+ * Add all populated zones of a node to the zonelist.
+ */
+static int __init build_zonelists_node(pg_data_t *pgdat,
+			struct zonelist *zonelist, int nr_zones, int zone_type)
+{
+	struct zone *zone;
+
+	BUG_ON(zone_type > ZONE_HIGHMEM);
+
+	do {
+		zone = pgdat->node_zones + zone_type;
+		if (populated_zone(zone)) {
+#ifndef CONFIG_HIGHMEM
+			BUG_ON(zone_type > ZONE_NORMAL);
+#endif
+			zonelist->zones[nr_zones++] = zone;
+			check_highest_zone(zone_type);
+		}
+		zone_type--;
+
+	} while (zone_type >= 0);
+	return nr_zones;
+}
+
+static inline int highest_zone(int zone_bits)
+{
+	int res = ZONE_NORMAL;
+	if (zone_bits & (__force int)__GFP_HIGHMEM)
+		res = ZONE_HIGHMEM;
+	if (zone_bits & (__force int)__GFP_DMA32)
+		res = ZONE_DMA32;
+	if (zone_bits & (__force int)__GFP_DMA)
+		res = ZONE_DMA;
+	return res;
+}
+
+#ifdef CONFIG_NUMA
+#define MAX_NODE_LOAD (num_online_nodes())
+static int __initdata node_load[MAX_NUMNODES];
+/**
+ * find_next_best_node - find the next node that should appear in a given node's fallback list
+ * @node: node whose fallback list we're appending
+ * @used_node_mask: nodemask_t of already used nodes
+ *
+ * We use a number of factors to determine which is the next node that should
+ * appear on a given node's fallback list.  The node should not have appeared
+ * already in @node's fallback list, and it should be the next closest node
+ * according to the distance array (which contains arbitrary distance values
+ * from each node to each node in the system), and should also prefer nodes
+ * with no CPUs, since presumably they'll have very little allocation pressure
+ * on them otherwise.
+ * It returns -1 if no node is found.
+ */
+static int __init find_next_best_node(int node, nodemask_t *used_node_mask)
+{
+	int n, val;
+	int min_val = INT_MAX;
+	int best_node = -1;
+
+	/* Use the local node if we haven't already */
+	if (!node_isset(node, *used_node_mask)) {
+		node_set(node, *used_node_mask);
+		return node;
+	}
+
+	for_each_online_node(n) {
+		cpumask_t tmp;
+
+		/* Don't want a node to appear more than once */
+		if (node_isset(n, *used_node_mask))
+			continue;
+
+		/* Use the distance array to find the distance */
+		val = node_distance(node, n);
+
+		/* Penalize nodes under us ("prefer the next node") */
+		val += (n < node);
+
+		/* Give preference to headless and unused nodes */
+		tmp = node_to_cpumask(n);
+		if (!cpus_empty(tmp))
+			val += PENALTY_FOR_NODE_WITH_CPUS;
+
+		/* Slight preference for less loaded node */
+		val *= (MAX_NODE_LOAD*MAX_NUMNODES);
+		val += node_load[n];
+
+		if (val < min_val) {
+			min_val = val;
+			best_node = n;
+		}
+	}
+
+	if (best_node >= 0)
+		node_set(best_node, *used_node_mask);
+
+	return best_node;
+}
+
+static void __init build_zonelists(pg_data_t *pgdat)
+{
+	int i, j, k, node, local_node;
+	int prev_node, load;
+	struct zonelist *zonelist;
+	nodemask_t used_mask;
+
+	/* initialize zonelists */
+	for (i = 0; i < GFP_ZONETYPES; i++) {
+		zonelist = pgdat->node_zonelists + i;
+		zonelist->zones[0] = NULL;
+	}
+
+	/* NUMA-aware ordering of nodes */
+	local_node = pgdat->node_id;
+	load = num_online_nodes();
+	prev_node = local_node;
+	nodes_clear(used_mask);
+	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
+		int distance = node_distance(local_node, node);
+
+		/*
+		 * If another node is sufficiently far away then it is better
+		 * to reclaim pages in a zone before going off node.
+		 */
+		if (distance > RECLAIM_DISTANCE)
+			zone_reclaim_mode = 1;
+
+		/*
+		 * We don't want to pressure a particular node.
+		 * So adding penalty to the first node in same
+		 * distance group to make it round-robin.
+		 */
+
+		if (distance != node_distance(local_node, prev_node))
+			node_load[node] += load;
+		prev_node = node;
+		load--;
+		for (i = 0; i < GFP_ZONETYPES; i++) {
+			zonelist = pgdat->node_zonelists + i;
+			for (j = 0; zonelist->zones[j] != NULL; j++);
+
+			k = highest_zone(i);
+
+	 		j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
+			zonelist->zones[j] = NULL;
+		}
+	}
+}
+
+#else	/* CONFIG_NUMA */
+
+static void __init build_zonelists(pg_data_t *pgdat)
+{
+	int i, j, k, node, local_node;
+
+	local_node = pgdat->node_id;
+	for (i = 0; i < GFP_ZONETYPES; i++) {
+		struct zonelist *zonelist;
+
+		zonelist = pgdat->node_zonelists + i;
+
+		j = 0;
+		k = highest_zone(i);
+ 		j = build_zonelists_node(pgdat, zonelist, j, k);
+ 		/*
+ 		 * Now we build the zonelist so that it contains the zones
+ 		 * of all the other nodes.
+ 		 * We don't want to pressure a particular node, so when
+ 		 * building the zones for node N, we make sure that the
+ 		 * zones coming right after the local ones are those from
+ 		 * node N+1 (modulo N)
+ 		 */
+		for (node = local_node + 1; node < MAX_NUMNODES; node++) {
+			if (!node_online(node))
+				continue;
+			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
+		}
+		for (node = 0; node < local_node; node++) {
+			if (!node_online(node))
+				continue;
+			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
+		}
+
+		zonelist->zones[j] = NULL;
+	}
+}
+
+#endif	/* CONFIG_NUMA */
+
+void __init build_all_zonelists(void)
+{
+	int i;
+
+	for_each_online_node(i)
+		build_zonelists(NODE_DATA(i));
+	printk("Built %i zonelists\n", num_online_nodes());
+	cpuset_init_current_mems_allowed();
+}
+
+/*
+ * Helper functions to size the waitqueue hash table.
+ * Essentially these want to choose hash table sizes sufficiently
+ * large so that collisions trying to wait on pages are rare.
+ * But in fact, the number of active page waitqueues on typical
+ * systems is ridiculously low, less than 200. So this is even
+ * conservative, even though it seems large.
+ *
+ * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
+ * waitqueues, i.e. the size of the waitq table given the number of pages.
+ */
+#define PAGES_PER_WAITQUEUE	256
+
+static inline unsigned long wait_table_size(unsigned long pages)
+{
+	unsigned long size = 1;
+
+	pages /= PAGES_PER_WAITQUEUE;
+
+	while (size < pages)
+		size <<= 1;
+
+	/*
+	 * Once we have dozens or even hundreds of threads sleeping
+	 * on IO we've got bigger problems than wait queue collision.
+	 * Limit the size of the wait table to a reasonable size.
+	 */
+	size = min(size, 4096UL);
+
+	return max(size, 4UL);
+}
+
+/*
+ * This is an integer logarithm so that shifts can be used later
+ * to extract the more random high bits from the multiplicative
+ * hash function before the remainder is taken.
+ */
+static inline unsigned long wait_table_bits(unsigned long size)
+{
+	return ffz(~size);
+}
+
+#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
+
+#ifndef __HAVE_ARCH_MEMMAP_INIT
+#define memmap_init(size, nid, zone, start_pfn) \
+	memmap_init_zone((size), (nid), (zone), (start_pfn))
+#endif
+
+/*
+ * Initially all pages are reserved - free ones are freed
+ * up by free_all_bootmem() once the early boot process is
+ * done. Non-atomic initialization, single-pass.
+ */
+void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
+		unsigned long start_pfn)
+{
+	struct page *page;
+	unsigned long end_pfn = start_pfn + size;
+	unsigned long pfn;
+
+	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+		if (!early_pfn_valid(pfn))
+			continue;
+		page = pfn_to_page(pfn);
+		set_page_links(page, zone, nid, pfn);
+		init_page_count(page);
+		reset_page_mapcount(page);
+		SetPageReserved(page);
+		INIT_LIST_HEAD(&page->lru);
+#ifdef WANT_PAGE_VIRTUAL
+		/* The shift won't overflow because ZONE_NORMAL is below 4G. */
+		if (!is_highmem_idx(zone))
+			set_page_address(page, __va(pfn << PAGE_SHIFT));
+#endif
+	}
+}
+
+void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
+				unsigned long size)
+{
+	int order;
+	for (order = 0; order < MAX_ORDER ; order++) {
+		INIT_LIST_HEAD(&zone->free_area[order].free_list);
+		zone->free_area[order].nr_free = 0;
+	}
+}
+
+#define ZONETABLE_INDEX(x, zone_nr)	((x << ZONES_SHIFT) | zone_nr)
+void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
+		unsigned long size)
+{
+	unsigned long snum = pfn_to_section_nr(pfn);
+	unsigned long end = pfn_to_section_nr(pfn + size);
+
+	if (FLAGS_HAS_NODE)
+		zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
+	else
+		for (; snum <= end; snum++)
+			zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
+}
+
+static __meminit
+void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
+{
+	int i;
+	struct pglist_data *pgdat = zone->zone_pgdat;
+
+	/*
+	 * The per-page waitqueue mechanism uses hashed waitqueues
+	 * per zone.
+	 */
+	zone->wait_table_size = wait_table_size(zone_size_pages);
+	zone->wait_table_bits =	wait_table_bits(zone->wait_table_size);
+	zone->wait_table = (wait_queue_head_t *)
+		alloc_bootmem_node(pgdat, zone->wait_table_size
+					* sizeof(wait_queue_head_t));
+
+	for(i = 0; i < zone->wait_table_size; ++i)
+		init_waitqueue_head(zone->wait_table + i);
+}
+
+/*
+ * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
+ * to the value high for the pageset p.
+ */
+static void setup_pagelist_highmark(struct per_cpu_pageset *p,
+				unsigned long high)
+{
+	struct per_cpu_pages *pcp;
+
+	pcp = &p->pcp[0]; /* hot list */
+	pcp->high = high;
+	pcp->batch = max(1UL, high/4);
+	if ((high/4) > (PAGE_SHIFT * 8))
+		pcp->batch = PAGE_SHIFT * 8;
+}
+
+/*
+ * percpu_pagelist_fraction - changes the pcp->high for each zone on each
+ * cpu.  It is the fraction of total pages in each zone that a hot per cpu pagelist
+ * can have before it gets flushed back to buddy allocator.
+ */
+int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
+	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+	struct zone *zone;
+	unsigned int cpu;
+	int ret;
+
+	ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+	if (!write || (ret == -EINVAL))
+		return ret;
+	for_each_zone(zone) {
+		for_each_online_cpu(cpu) {
+			unsigned long  high;
+			high = zone->present_pages / percpu_pagelist_fraction;
+			setup_pagelist_highmark(zone_pcp(zone, cpu), high);
+		}
+	}
+	return 0;
+}
+
+static int __cpuinit zone_batchsize(struct zone *zone)
+{
+	int batch;
+
+	/*
+	 * The per-cpu-pages pools are set to around 1000th of the
+	 * size of the zone.  But no more than 1/2 of a meg.
+	 *
+	 * OK, so we don't know how big the cache is.  So guess.
+	 */
+	batch = zone->present_pages / 1024;
+	if (batch * PAGE_SIZE > 512 * 1024)
+		batch = (512 * 1024) / PAGE_SIZE;
+	batch /= 4;		/* We effectively *= 4 below */
+	if (batch < 1)
+		batch = 1;
+
+	/*
+	 * Clamp the batch to a 2^n - 1 value. Having a power
+	 * of 2 value was found to be more likely to have
+	 * suboptimal cache aliasing properties in some cases.
+	 *
+	 * For example if 2 tasks are alternately allocating
+	 * batches of pages, one task can end up with a lot
+	 * of pages of one half of the possible page colors
+	 * and the other with pages of the other colors.
+	 */
+	batch = (1 << (fls(batch + batch/2)-1)) - 1;
+
+	return batch;
+}
+
+inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
+{
+	struct per_cpu_pages *pcp;
+
+	memset(p, 0, sizeof(*p));
+
+	pcp = &p->pcp[0];		/* hot */
+	pcp->count = 0;
+	pcp->high = 6 * batch;
+	pcp->batch = max(1UL, 1 * batch);
+	INIT_LIST_HEAD(&pcp->list);
+
+	pcp = &p->pcp[1];		/* cold */
+	pcp->count = 0;
+	pcp->high = 2 * batch;
+	pcp->batch = max(1UL, batch/2);
+	INIT_LIST_HEAD(&pcp->list);
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * Boot pageset table. One per cpu which is going to be used for all
+ * zones and all nodes. The parameters will be set in such a way
+ * that an item put on a list will immediately be handed over to
+ * the buddy list. This is safe since pageset manipulation is done
+ * with interrupts disabled.
+ *
+ * Some NUMA counter updates may also be caught by the boot pagesets.
+ *
+ * The boot_pagesets must be kept even after bootup is complete for
+ * unused processors and/or zones. They do play a role for bootstrapping
+ * hotplugged processors.
+ *
+ * zoneinfo_show() and maybe other functions do
+ * not check if the processor is online before following the pageset pointer.
+ * Other parts of the kernel may not check if the zone is available.
+ */
+static struct per_cpu_pageset boot_pageset[NR_CPUS];
+
+/*
+ * Dynamically allocate memory for the
+ * per cpu pageset array in struct zone.
+ */
+static int __cpuinit process_zones(int cpu)
+{
+	struct zone *zone, *dzone;
+
+	for_each_zone(zone) {
+
+		zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
+					 GFP_KERNEL, cpu_to_node(cpu));
+		if (!zone_pcp(zone, cpu))
+			goto bad;
+
+		setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
+
+		if (percpu_pagelist_fraction)
+			setup_pagelist_highmark(zone_pcp(zone, cpu),
+			 	(zone->present_pages / percpu_pagelist_fraction));
+	}
+
+	return 0;
+bad:
+	for_each_zone(dzone) {
+		if (dzone == zone)
+			break;
+		kfree(zone_pcp(dzone, cpu));
+		zone_pcp(dzone, cpu) = NULL;
+	}
+	return -ENOMEM;
+}
+
+static inline void free_zone_pagesets(int cpu)
+{
+	struct zone *zone;
+
+	for_each_zone(zone) {
+		struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
+
+		zone_pcp(zone, cpu) = NULL;
+		kfree(pset);
+	}
+}
+
+static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
+		unsigned long action,
+		void *hcpu)
+{
+	int cpu = (long)hcpu;
+	int ret = NOTIFY_OK;
+
+	switch (action) {
+		case CPU_UP_PREPARE:
+			if (process_zones(cpu))
+				ret = NOTIFY_BAD;
+			break;
+		case CPU_UP_CANCELED:
+		case CPU_DEAD:
+			free_zone_pagesets(cpu);
+			break;
+		default:
+			break;
+	}
+	return ret;
+}
+
+static struct notifier_block pageset_notifier =
+	{ &pageset_cpuup_callback, NULL, 0 };
+
+void __init setup_per_cpu_pageset(void)
+{
+	int err;
+
+	/* Initialize per_cpu_pageset for cpu 0.
+	 * A cpuup callback will do this for every cpu
+	 * as it comes online
+	 */
+	err = process_zones(smp_processor_id());
+	BUG_ON(err);
+	register_cpu_notifier(&pageset_notifier);
+}
+#endif
+
+static __meminit void zone_pcp_init(struct zone *zone)
+{
+	int cpu;
+	unsigned long batch = zone_batchsize(zone);
+
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+#ifdef CONFIG_NUMA
+		/* Early boot. Slab allocator not functional yet */
+		zone_pcp(zone, cpu) = &boot_pageset[cpu];
+		setup_pageset(&boot_pageset[cpu],0);
+#else
+		setup_pageset(zone_pcp(zone,cpu), batch);
+#endif
+	}
+	if (zone->present_pages)
+		printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
+			zone->name, zone->present_pages, batch);
+}
+
+static __meminit void init_currently_empty_zone(struct zone *zone,
+		unsigned long zone_start_pfn, unsigned long size)
+{
+	struct pglist_data *pgdat = zone->zone_pgdat;
+
+	zone_wait_table_init(zone, size);
+	pgdat->nr_zones = zone_idx(zone) + 1;
+
+	zone->zone_start_pfn = zone_start_pfn;
+
+	memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
+
+	zone_init_free_lists(pgdat, zone, zone->spanned_pages);
+}
+
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+/* Note: nid == MAX_NUMNODES returns first region */
+static int __init first_active_region_index_in_nid(int nid)
+{
+	int i;
+	for (i = 0; early_node_map[i].end_pfn; i++) {
+		if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
+			return i;
+	}
+
+	return MAX_ACTIVE_REGIONS;
+}
+
+/* Note: nid == MAX_NUMNODES returns next region */
+static int __init next_active_region_index_in_nid(unsigned int index, int nid)
+{
+	for (index = index + 1; early_node_map[index].end_pfn; index++) {
+		if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
+			return index;
+	}
+
+	return MAX_ACTIVE_REGIONS;
+}
+
+#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
+int __init early_pfn_to_nid(unsigned long pfn)
+{
+	int i;
+
+	for (i = 0; early_node_map[i].end_pfn; i++) {
+		unsigned long start_pfn = early_node_map[i].start_pfn;
+		unsigned long end_pfn = early_node_map[i].end_pfn;
+
+		if ((start_pfn <= pfn) && (pfn < end_pfn))
+			return early_node_map[i].nid;
+	}
+
+	return -1;
+}
+#endif
+
+#define for_each_active_range_index_in_nid(i, nid) \
+	for (i = first_active_region_index_in_nid(nid); \
+				i != MAX_ACTIVE_REGIONS; \
+				i = next_active_region_index_in_nid(i, nid))
+
+void __init free_bootmem_with_active_regions(int nid,
+						unsigned long max_low_pfn)
+{
+	unsigned int i;
+	for_each_active_range_index_in_nid(i, nid) {
+		unsigned long size_pages = 0;
+		unsigned long end_pfn = early_node_map[i].end_pfn;
+		if (early_node_map[i].start_pfn >= max_low_pfn)
+			continue;
+
+		if (end_pfn > max_low_pfn)
+			end_pfn = max_low_pfn;
+
+		size_pages = end_pfn - early_node_map[i].start_pfn;
+		free_bootmem_node(NODE_DATA(early_node_map[i].nid),
+				PFN_PHYS(early_node_map[i].start_pfn),
+				PFN_PHYS(size_pages));
+	}
+}
+
+void __init sparse_memory_present_with_active_regions(int nid)
+{
+	unsigned int i;
+	for_each_active_range_index_in_nid(i, nid)
+		memory_present(early_node_map[i].nid,
+				early_node_map[i].start_pfn,
+				early_node_map[i].end_pfn);
+}
+
+void __init get_pfn_range_for_nid(unsigned int nid,
+			unsigned long *start_pfn, unsigned long *end_pfn)
+{
+	unsigned int i;
+	*start_pfn = -1UL;
+	*end_pfn = 0;
+
+	for_each_active_range_index_in_nid(i, nid) {
+		if (early_node_map[i].start_pfn < *start_pfn)
+			*start_pfn = early_node_map[i].start_pfn;
+
+		if (early_node_map[i].end_pfn > *end_pfn)
+			*end_pfn = early_node_map[i].end_pfn;
+	}
+
+	if (*start_pfn == -1UL) {
+		printk(KERN_WARNING "Node %u active with no memory\n", nid);
+		*start_pfn = 0;
+	}
+}
+
+unsigned long __init zone_present_pages_in_node(int nid,
+					unsigned long zone_type,
+					unsigned long *ignored)
+{
+	unsigned long node_start_pfn, node_end_pfn;
+	unsigned long zone_start_pfn, zone_end_pfn;
+
+	/* Get the start and end of the node and zone */
+	get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
+	zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
+	zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
+
+	/* Check that this node has pages within the zone's required range */
+	if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
+		return 0;
+
+	/* Move the zone boundaries inside the node if necessary */
+	if (zone_end_pfn > node_end_pfn)
+		zone_end_pfn = node_end_pfn;
+	if (zone_start_pfn < node_start_pfn)
+		zone_start_pfn = node_start_pfn;
+
+	/* Return the spanned pages */
+	return zone_end_pfn - zone_start_pfn;
+}
+
+unsigned long __init __absent_pages_in_range(int nid,
+				unsigned long range_start_pfn,
+				unsigned long range_end_pfn)
+{
+	int i = 0;
+	unsigned long prev_end_pfn = 0, hole_pages = 0;
+	unsigned long start_pfn;
+
+	/* Find the end_pfn of the first active range of pfns in the node */
+	i = first_active_region_index_in_nid(nid);
+	prev_end_pfn = early_node_map[i].start_pfn;
+
+	/* Find all holes for the zone within the node */
+	for (; i != MAX_ACTIVE_REGIONS;
+			i = next_active_region_index_in_nid(i, nid)) {
+
+		/* No need to continue if prev_end_pfn is outside the zone */
+		if (prev_end_pfn >= range_end_pfn)
+			break;
+
+		/* Make sure the end of the zone is not within the hole */
+		start_pfn = early_node_map[i].start_pfn;
+		if (start_pfn > range_end_pfn)
+			start_pfn = range_end_pfn;
+		if (prev_end_pfn < range_start_pfn)
+			prev_end_pfn = range_start_pfn;
+
+		/* Update the hole size count and move on */
+		if (start_pfn > range_start_pfn) {
+			BUG_ON(prev_end_pfn > start_pfn);
+			hole_pages += start_pfn - prev_end_pfn;
+		}
+		prev_end_pfn = early_node_map[i].end_pfn;
+	}
+
+	return hole_pages;
+}
+
+unsigned long __init absent_pages_in_range(unsigned long start_pfn,
+							unsigned long end_pfn)
+{
+	return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
+}
+
+unsigned long __init zone_absent_pages_in_node(int nid,
+					unsigned long zone_type,
+					unsigned long *ignored)
+{
+	return __absent_pages_in_range(nid,
+				arch_zone_lowest_possible_pfn[zone_type],
+				arch_zone_highest_possible_pfn[zone_type]);
+}
+#else
+static inline unsigned long zone_present_pages_in_node(int nid,
+					unsigned long zone_type,
+					unsigned long *zones_size)
+{
+	return zones_size[zone_type];
+}
+
+static inline unsigned long zone_absent_pages_in_node(int nid,
+						unsigned long zone_type,
+						unsigned long *zholes_size)
+{
+	if (!zholes_size)
+		return 0;
+
+	return zholes_size[zone_type];
+}
+#endif
+
+static void __init calculate_node_totalpages(struct pglist_data *pgdat,
+		unsigned long *zones_size, unsigned long *zholes_size)
+{
+	unsigned long realtotalpages, totalpages = 0;
+	int i;
+
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		totalpages += zone_present_pages_in_node(pgdat->node_id, i,
+								zones_size);
+	}
+	pgdat->node_spanned_pages = totalpages;
+
+	realtotalpages = totalpages;
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		realtotalpages -=
+			zone_absent_pages_in_node(pgdat->node_id, i, zholes_size);
+	}
+	pgdat->node_present_pages = realtotalpages;
+	printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
+							realtotalpages);
+}
+
+/*
+ * Set up the zone data structures:
+ *   - mark all pages reserved
+ *   - mark all memory queues empty
+ *   - clear the memory bitmaps
+ */
+static void __init free_area_init_core(struct pglist_data *pgdat,
+		unsigned long *zones_size, unsigned long *zholes_size)
+{
+	unsigned long j;
+	int nid = pgdat->node_id;
+	unsigned long zone_start_pfn = pgdat->node_start_pfn;
+
+	pgdat_resize_init(pgdat);
+	pgdat->nr_zones = 0;
+	init_waitqueue_head(&pgdat->kswapd_wait);
+	pgdat->kswapd_max_order = 0;
+
+	for (j = 0; j < MAX_NR_ZONES; j++) {
+		struct zone *zone = pgdat->node_zones + j;
+		unsigned long size, realsize;
+
+		size = zone_present_pages_in_node(nid, j, zones_size);
+		realsize = size - zone_absent_pages_in_node(nid, j,
+								zholes_size);
+		if (j < ZONE_HIGHMEM)
+			nr_kernel_pages += realsize;
+		nr_all_pages += realsize;
+
+		zone->spanned_pages = size;
+		zone->present_pages = realsize;
+		zone->name = zone_names[j];
+		spin_lock_init(&zone->lock);
+		spin_lock_init(&zone->lru_lock);
+		zone_seqlock_init(zone);
+		zone->zone_pgdat = pgdat;
+		zone->free_pages = 0;
+
+		zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
+
+		zone_pcp_init(zone);
+		INIT_LIST_HEAD(&zone->active_list);
+		INIT_LIST_HEAD(&zone->inactive_list);
+		zone->nr_scan_active = 0;
+		zone->nr_scan_inactive = 0;
+		zone->nr_active = 0;
+		zone->nr_inactive = 0;
+		atomic_set(&zone->reclaim_in_progress, 0);
+		if (!size)
+			continue;
+
+		zonetable_add(zone, nid, j, zone_start_pfn, size);
+		init_currently_empty_zone(zone, zone_start_pfn, size);
+		zone_start_pfn += size;
+	}
+}
+
+static void __init alloc_node_mem_map(struct pglist_data *pgdat)
+{
+	/* Skip empty nodes */
+	if (!pgdat->node_spanned_pages)
+		return;
+
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
+	/* ia64 gets its own node_mem_map, before this, without bootmem */
+	if (!pgdat->node_mem_map) {
+		unsigned long size;
+		struct page *map;
+
+		size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
+		map = alloc_remap(pgdat->node_id, size);
+		if (!map)
+			map = alloc_bootmem_node(pgdat, size);
+		pgdat->node_mem_map = map;
+	}
+#ifdef CONFIG_FLATMEM
+	/*
+	 * With no DISCONTIG, the global mem_map is just set as node 0's
+	 */
+	if (pgdat == NODE_DATA(0))
+		mem_map = NODE_DATA(0)->node_mem_map;
+#endif
+#endif /* CONFIG_FLAT_NODE_MEM_MAP */
+}
+
+void __init free_area_init_node(int nid, struct pglist_data *pgdat,
+		unsigned long *zones_size, unsigned long node_start_pfn,
+		unsigned long *zholes_size)
+{
+	pgdat->node_id = nid;
+	pgdat->node_start_pfn = node_start_pfn;
+	calculate_node_totalpages(pgdat, zones_size, zholes_size);
+
+	alloc_node_mem_map(pgdat);
+
+	free_area_init_core(pgdat, zones_size, zholes_size);
+}
+
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+void __init add_active_range(unsigned int nid, unsigned long start_pfn,
+						unsigned long end_pfn)
+{
+	unsigned int i;
+
+	/* Merge with existing active regions if possible */
+	for (i = 0; early_node_map[i].end_pfn; i++) {
+		if (early_node_map[i].nid != nid)
+			continue;
+
+		/* Skip if an existing region covers this new one */
+		if (start_pfn >= early_node_map[i].start_pfn &&
+				end_pfn <= early_node_map[i].end_pfn)
+			return;
+
+		/* Merge forward if suitable */
+		if (start_pfn <= early_node_map[i].end_pfn &&
+				end_pfn > early_node_map[i].end_pfn) {
+			early_node_map[i].end_pfn = end_pfn;
+			return;
+		}
+
+		/* Merge backward if suitable */
+		if (start_pfn < early_node_map[i].end_pfn &&
+				end_pfn >= early_node_map[i].start_pfn) {
+			early_node_map[i].start_pfn = start_pfn;
+			return;
+		}
+	}
+
+	/* Leave last entry NULL, we use range.end_pfn to terminate the walk */
+	if (i >= MAX_ACTIVE_REGIONS - 1) {
+		printk(KERN_ERR "Too many memory regions, truncating\n");
+		return;
+	}
+
+	early_node_map[i].nid = nid;
+	early_node_map[i].start_pfn = start_pfn;
+	early_node_map[i].end_pfn = end_pfn;
+}
+
+/* Compare two active node_active_regions */
+static int __init cmp_node_active_region(const void *a, const void *b)
+{
+	struct node_active_region *arange = (struct node_active_region *)a;
+	struct node_active_region *brange = (struct node_active_region *)b;
+
+	/* Done this way to avoid overflows */
+	if (arange->start_pfn > brange->start_pfn)
+		return 1;
+	if (arange->start_pfn < brange->start_pfn)
+		return -1;
+
+	return 0;
+}
+
+/* sort the node_map by start_pfn */
+static void __init sort_node_map(void)
+{
+	size_t num = 0;
+	while (early_node_map[num].end_pfn)
+		num++;
+
+	sort(early_node_map, num, sizeof(struct node_active_region),
+						cmp_node_active_region, NULL);
+}
+
+/* Find the lowest pfn for a node. This depends on a sorted early_node_map */
+unsigned long __init find_min_pfn_for_node(unsigned long nid)
+{
+	int i;
+
+	/* Assuming a sorted map, the first range found has the starting pfn */
+	for_each_active_range_index_in_nid(i, nid)
+		return early_node_map[i].start_pfn;
+
+	/* nid does not exist in early_node_map */
+	printk(KERN_WARNING "Could not find start_pfn for node %lu\n", nid);
+	return 0;
+}
+
+
+unsigned long __init find_min_pfn_with_active_regions(void)
+{
+	return find_min_pfn_for_node(MAX_NUMNODES);
+}
+
+unsigned long __init find_max_pfn_with_active_regions(void)
+{
+	int i;
+	unsigned long max_pfn = 0;
+
+	for (i = 0; early_node_map[i].end_pfn; i++)
+		max_pfn = max(max_pfn, early_node_map[i].end_pfn);
+
+	return max_pfn;
+}
+
+void __init free_area_init_nodes(unsigned long arch_max_dma_pfn,
+				unsigned long arch_max_dma32_pfn,
+				unsigned long arch_max_low_pfn,
+				unsigned long arch_max_high_pfn)
+{
+	unsigned long nid;
+	int zone_index;
+
+	/* Record where the zone boundaries are */
+	memset(arch_zone_lowest_possible_pfn, 0,
+				sizeof(arch_zone_lowest_possible_pfn));
+	memset(arch_zone_highest_possible_pfn, 0,
+				sizeof(arch_zone_highest_possible_pfn));
+	arch_zone_lowest_possible_pfn[ZONE_DMA] =
+					find_min_pfn_with_active_regions();
+	arch_zone_highest_possible_pfn[ZONE_DMA] = arch_max_dma_pfn;
+	arch_zone_highest_possible_pfn[ZONE_DMA32] = arch_max_dma32_pfn;
+	arch_zone_highest_possible_pfn[ZONE_NORMAL] = arch_max_low_pfn;
+	arch_zone_highest_possible_pfn[ZONE_HIGHMEM] = arch_max_high_pfn;
+	for (zone_index = 1; zone_index < MAX_NR_ZONES; zone_index++) {
+		arch_zone_lowest_possible_pfn[zone_index] = 
+			arch_zone_highest_possible_pfn[zone_index-1];
+	}
+
+	/* Regions in the early_node_map can be in any order */
+	sort_node_map();
+
+	for_each_online_node(nid) {
+		pg_data_t *pgdat = NODE_DATA(nid);
+		free_area_init_node(nid, pgdat, NULL,
+				find_min_pfn_for_node(nid), NULL);
+	}
+}
+#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc1-105-ia64_use_init_nodes/mm/page_alloc.c linux-2.6.17-rc1-106-breakout_mem_init/mm/page_alloc.c
--- linux-2.6.17-rc1-105-ia64_use_init_nodes/mm/page_alloc.c	2006-04-18 10:17:49.000000000 +0100
+++ linux-2.6.17-rc1-106-breakout_mem_init/mm/page_alloc.c	2006-04-18 10:22:55.000000000 +0100
@@ -37,8 +37,6 @@
 #include <linux/nodemask.h>
 #include <linux/vmalloc.h>
 #include <linux/mempolicy.h>
-#include <linux/sort.h>
-#include <linux/pfn.h>
 
 #include <asm/tlbflush.h>
 #include "internal.h"
@@ -54,7 +52,6 @@ EXPORT_SYMBOL(node_possible_map);
 unsigned long totalram_pages __read_mostly;
 unsigned long totalhigh_pages __read_mostly;
 long nr_swap_pages;
-int percpu_pagelist_fraction;
 
 static void __free_pages_ok(struct page *page, unsigned int order);
 
@@ -80,24 +77,11 @@ EXPORT_SYMBOL(totalram_pages);
 struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
 EXPORT_SYMBOL(zone_table);
 
-static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
 int min_free_kbytes = 1024;
 
 unsigned long __initdata nr_kernel_pages;
 unsigned long __initdata nr_all_pages;
 
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
-  #ifdef CONFIG_MAX_ACTIVE_REGIONS
-    #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
-  #else
-    #define MAX_ACTIVE_REGIONS (MAX_NR_ZONES * MAX_NUMNODES + 1)
-  #endif
-
-  struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS];
-  unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
-  unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
-#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
-
 #ifdef CONFIG_DEBUG_VM
 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
 {
@@ -1511,985 +1495,6 @@ void show_free_areas(void)
 	show_swap_cache_info();
 }
 
-/*
- * Builds allocation fallback zone lists.
- *
- * Add all populated zones of a node to the zonelist.
- */
-static int __init build_zonelists_node(pg_data_t *pgdat,
-			struct zonelist *zonelist, int nr_zones, int zone_type)
-{
-	struct zone *zone;
-
-	BUG_ON(zone_type > ZONE_HIGHMEM);
-
-	do {
-		zone = pgdat->node_zones + zone_type;
-		if (populated_zone(zone)) {
-#ifndef CONFIG_HIGHMEM
-			BUG_ON(zone_type > ZONE_NORMAL);
-#endif
-			zonelist->zones[nr_zones++] = zone;
-			check_highest_zone(zone_type);
-		}
-		zone_type--;
-
-	} while (zone_type >= 0);
-	return nr_zones;
-}
-
-static inline int highest_zone(int zone_bits)
-{
-	int res = ZONE_NORMAL;
-	if (zone_bits & (__force int)__GFP_HIGHMEM)
-		res = ZONE_HIGHMEM;
-	if (zone_bits & (__force int)__GFP_DMA32)
-		res = ZONE_DMA32;
-	if (zone_bits & (__force int)__GFP_DMA)
-		res = ZONE_DMA;
-	return res;
-}
-
-#ifdef CONFIG_NUMA
-#define MAX_NODE_LOAD (num_online_nodes())
-static int __initdata node_load[MAX_NUMNODES];
-/**
- * find_next_best_node - find the next node that should appear in a given node's fallback list
- * @node: node whose fallback list we're appending
- * @used_node_mask: nodemask_t of already used nodes
- *
- * We use a number of factors to determine which is the next node that should
- * appear on a given node's fallback list.  The node should not have appeared
- * already in @node's fallback list, and it should be the next closest node
- * according to the distance array (which contains arbitrary distance values
- * from each node to each node in the system), and should also prefer nodes
- * with no CPUs, since presumably they'll have very little allocation pressure
- * on them otherwise.
- * It returns -1 if no node is found.
- */
-static int __init find_next_best_node(int node, nodemask_t *used_node_mask)
-{
-	int n, val;
-	int min_val = INT_MAX;
-	int best_node = -1;
-
-	/* Use the local node if we haven't already */
-	if (!node_isset(node, *used_node_mask)) {
-		node_set(node, *used_node_mask);
-		return node;
-	}
-
-	for_each_online_node(n) {
-		cpumask_t tmp;
-
-		/* Don't want a node to appear more than once */
-		if (node_isset(n, *used_node_mask))
-			continue;
-
-		/* Use the distance array to find the distance */
-		val = node_distance(node, n);
-
-		/* Penalize nodes under us ("prefer the next node") */
-		val += (n < node);
-
-		/* Give preference to headless and unused nodes */
-		tmp = node_to_cpumask(n);
-		if (!cpus_empty(tmp))
-			val += PENALTY_FOR_NODE_WITH_CPUS;
-
-		/* Slight preference for less loaded node */
-		val *= (MAX_NODE_LOAD*MAX_NUMNODES);
-		val += node_load[n];
-
-		if (val < min_val) {
-			min_val = val;
-			best_node = n;
-		}
-	}
-
-	if (best_node >= 0)
-		node_set(best_node, *used_node_mask);
-
-	return best_node;
-}
-
-static void __init build_zonelists(pg_data_t *pgdat)
-{
-	int i, j, k, node, local_node;
-	int prev_node, load;
-	struct zonelist *zonelist;
-	nodemask_t used_mask;
-
-	/* initialize zonelists */
-	for (i = 0; i < GFP_ZONETYPES; i++) {
-		zonelist = pgdat->node_zonelists + i;
-		zonelist->zones[0] = NULL;
-	}
-
-	/* NUMA-aware ordering of nodes */
-	local_node = pgdat->node_id;
-	load = num_online_nodes();
-	prev_node = local_node;
-	nodes_clear(used_mask);
-	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
-		int distance = node_distance(local_node, node);
-
-		/*
-		 * If another node is sufficiently far away then it is better
-		 * to reclaim pages in a zone before going off node.
-		 */
-		if (distance > RECLAIM_DISTANCE)
-			zone_reclaim_mode = 1;
-
-		/*
-		 * We don't want to pressure a particular node.
-		 * So adding penalty to the first node in same
-		 * distance group to make it round-robin.
-		 */
-
-		if (distance != node_distance(local_node, prev_node))
-			node_load[node] += load;
-		prev_node = node;
-		load--;
-		for (i = 0; i < GFP_ZONETYPES; i++) {
-			zonelist = pgdat->node_zonelists + i;
-			for (j = 0; zonelist->zones[j] != NULL; j++);
-
-			k = highest_zone(i);
-
-	 		j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
-			zonelist->zones[j] = NULL;
-		}
-	}
-}
-
-#else	/* CONFIG_NUMA */
-
-static void __init build_zonelists(pg_data_t *pgdat)
-{
-	int i, j, k, node, local_node;
-
-	local_node = pgdat->node_id;
-	for (i = 0; i < GFP_ZONETYPES; i++) {
-		struct zonelist *zonelist;
-
-		zonelist = pgdat->node_zonelists + i;
-
-		j = 0;
-		k = highest_zone(i);
- 		j = build_zonelists_node(pgdat, zonelist, j, k);
- 		/*
- 		 * Now we build the zonelist so that it contains the zones
- 		 * of all the other nodes.
- 		 * We don't want to pressure a particular node, so when
- 		 * building the zones for node N, we make sure that the
- 		 * zones coming right after the local ones are those from
- 		 * node N+1 (modulo N)
- 		 */
-		for (node = local_node + 1; node < MAX_NUMNODES; node++) {
-			if (!node_online(node))
-				continue;
-			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
-		}
-		for (node = 0; node < local_node; node++) {
-			if (!node_online(node))
-				continue;
-			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
-		}
-
-		zonelist->zones[j] = NULL;
-	}
-}
-
-#endif	/* CONFIG_NUMA */
-
-void __init build_all_zonelists(void)
-{
-	int i;
-
-	for_each_online_node(i)
-		build_zonelists(NODE_DATA(i));
-	printk("Built %i zonelists\n", num_online_nodes());
-	cpuset_init_current_mems_allowed();
-}
-
-/*
- * Helper functions to size the waitqueue hash table.
- * Essentially these want to choose hash table sizes sufficiently
- * large so that collisions trying to wait on pages are rare.
- * But in fact, the number of active page waitqueues on typical
- * systems is ridiculously low, less than 200. So this is even
- * conservative, even though it seems large.
- *
- * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
- * waitqueues, i.e. the size of the waitq table given the number of pages.
- */
-#define PAGES_PER_WAITQUEUE	256
-
-static inline unsigned long wait_table_size(unsigned long pages)
-{
-	unsigned long size = 1;
-
-	pages /= PAGES_PER_WAITQUEUE;
-
-	while (size < pages)
-		size <<= 1;
-
-	/*
-	 * Once we have dozens or even hundreds of threads sleeping
-	 * on IO we've got bigger problems than wait queue collision.
-	 * Limit the size of the wait table to a reasonable size.
-	 */
-	size = min(size, 4096UL);
-
-	return max(size, 4UL);
-}
-
-/*
- * This is an integer logarithm so that shifts can be used later
- * to extract the more random high bits from the multiplicative
- * hash function before the remainder is taken.
- */
-static inline unsigned long wait_table_bits(unsigned long size)
-{
-	return ffz(~size);
-}
-
-#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
-
-/*
- * Initially all pages are reserved - free ones are freed
- * up by free_all_bootmem() once the early boot process is
- * done. Non-atomic initialization, single-pass.
- */
-void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
-		unsigned long start_pfn)
-{
-	struct page *page;
-	unsigned long end_pfn = start_pfn + size;
-	unsigned long pfn;
-
-	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
-		if (!early_pfn_valid(pfn))
-			continue;
-		page = pfn_to_page(pfn);
-		set_page_links(page, zone, nid, pfn);
-		init_page_count(page);
-		reset_page_mapcount(page);
-		SetPageReserved(page);
-		INIT_LIST_HEAD(&page->lru);
-#ifdef WANT_PAGE_VIRTUAL
-		/* The shift won't overflow because ZONE_NORMAL is below 4G. */
-		if (!is_highmem_idx(zone))
-			set_page_address(page, __va(pfn << PAGE_SHIFT));
-#endif
-	}
-}
-
-void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
-				unsigned long size)
-{
-	int order;
-	for (order = 0; order < MAX_ORDER ; order++) {
-		INIT_LIST_HEAD(&zone->free_area[order].free_list);
-		zone->free_area[order].nr_free = 0;
-	}
-}
-
-#define ZONETABLE_INDEX(x, zone_nr)	((x << ZONES_SHIFT) | zone_nr)
-void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
-		unsigned long size)
-{
-	unsigned long snum = pfn_to_section_nr(pfn);
-	unsigned long end = pfn_to_section_nr(pfn + size);
-
-	if (FLAGS_HAS_NODE)
-		zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
-	else
-		for (; snum <= end; snum++)
-			zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
-}
-
-#ifndef __HAVE_ARCH_MEMMAP_INIT
-#define memmap_init(size, nid, zone, start_pfn) \
-	memmap_init_zone((size), (nid), (zone), (start_pfn))
-#endif
-
-static int __cpuinit zone_batchsize(struct zone *zone)
-{
-	int batch;
-
-	/*
-	 * The per-cpu-pages pools are set to around 1000th of the
-	 * size of the zone.  But no more than 1/2 of a meg.
-	 *
-	 * OK, so we don't know how big the cache is.  So guess.
-	 */
-	batch = zone->present_pages / 1024;
-	if (batch * PAGE_SIZE > 512 * 1024)
-		batch = (512 * 1024) / PAGE_SIZE;
-	batch /= 4;		/* We effectively *= 4 below */
-	if (batch < 1)
-		batch = 1;
-
-	/*
-	 * Clamp the batch to a 2^n - 1 value. Having a power
-	 * of 2 value was found to be more likely to have
-	 * suboptimal cache aliasing properties in some cases.
-	 *
-	 * For example if 2 tasks are alternately allocating
-	 * batches of pages, one task can end up with a lot
-	 * of pages of one half of the possible page colors
-	 * and the other with pages of the other colors.
-	 */
-	batch = (1 << (fls(batch + batch/2)-1)) - 1;
-
-	return batch;
-}
-
-inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
-{
-	struct per_cpu_pages *pcp;
-
-	memset(p, 0, sizeof(*p));
-
-	pcp = &p->pcp[0];		/* hot */
-	pcp->count = 0;
-	pcp->high = 6 * batch;
-	pcp->batch = max(1UL, 1 * batch);
-	INIT_LIST_HEAD(&pcp->list);
-
-	pcp = &p->pcp[1];		/* cold*/
-	pcp->count = 0;
-	pcp->high = 2 * batch;
-	pcp->batch = max(1UL, batch/2);
-	INIT_LIST_HEAD(&pcp->list);
-}
-
-/*
- * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
- * to the value high for the pageset p.
- */
-
-static void setup_pagelist_highmark(struct per_cpu_pageset *p,
-				unsigned long high)
-{
-	struct per_cpu_pages *pcp;
-
-	pcp = &p->pcp[0]; /* hot list */
-	pcp->high = high;
-	pcp->batch = max(1UL, high/4);
-	if ((high/4) > (PAGE_SHIFT * 8))
-		pcp->batch = PAGE_SHIFT * 8;
-}
-
-
-#ifdef CONFIG_NUMA
-/*
- * Boot pageset table. One per cpu which is going to be used for all
- * zones and all nodes. The parameters will be set in such a way
- * that an item put on a list will immediately be handed over to
- * the buddy list. This is safe since pageset manipulation is done
- * with interrupts disabled.
- *
- * Some NUMA counter updates may also be caught by the boot pagesets.
- *
- * The boot_pagesets must be kept even after bootup is complete for
- * unused processors and/or zones. They do play a role for bootstrapping
- * hotplugged processors.
- *
- * zoneinfo_show() and maybe other functions do
- * not check if the processor is online before following the pageset pointer.
- * Other parts of the kernel may not check if the zone is available.
- */
-static struct per_cpu_pageset boot_pageset[NR_CPUS];
-
-/*
- * Dynamically allocate memory for the
- * per cpu pageset array in struct zone.
- */
-static int __cpuinit process_zones(int cpu)
-{
-	struct zone *zone, *dzone;
-
-	for_each_zone(zone) {
-
-		zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
-					 GFP_KERNEL, cpu_to_node(cpu));
-		if (!zone_pcp(zone, cpu))
-			goto bad;
-
-		setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
-
-		if (percpu_pagelist_fraction)
-			setup_pagelist_highmark(zone_pcp(zone, cpu),
-			 	(zone->present_pages / percpu_pagelist_fraction));
-	}
-
-	return 0;
-bad:
-	for_each_zone(dzone) {
-		if (dzone == zone)
-			break;
-		kfree(zone_pcp(dzone, cpu));
-		zone_pcp(dzone, cpu) = NULL;
-	}
-	return -ENOMEM;
-}
-
-static inline void free_zone_pagesets(int cpu)
-{
-	struct zone *zone;
-
-	for_each_zone(zone) {
-		struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
-
-		zone_pcp(zone, cpu) = NULL;
-		kfree(pset);
-	}
-}
-
-static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
-		unsigned long action,
-		void *hcpu)
-{
-	int cpu = (long)hcpu;
-	int ret = NOTIFY_OK;
-
-	switch (action) {
-		case CPU_UP_PREPARE:
-			if (process_zones(cpu))
-				ret = NOTIFY_BAD;
-			break;
-		case CPU_UP_CANCELED:
-		case CPU_DEAD:
-			free_zone_pagesets(cpu);
-			break;
-		default:
-			break;
-	}
-	return ret;
-}
-
-static struct notifier_block pageset_notifier =
-	{ &pageset_cpuup_callback, NULL, 0 };
-
-void __init setup_per_cpu_pageset(void)
-{
-	int err;
-
-	/* Initialize per_cpu_pageset for cpu 0.
-	 * A cpuup callback will do this for every cpu
-	 * as it comes online
-	 */
-	err = process_zones(smp_processor_id());
-	BUG_ON(err);
-	register_cpu_notifier(&pageset_notifier);
-}
-
-#endif
-
-static __meminit
-void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
-{
-	int i;
-	struct pglist_data *pgdat = zone->zone_pgdat;
-
-	/*
-	 * The per-page waitqueue mechanism uses hashed waitqueues
-	 * per zone.
-	 */
-	zone->wait_table_size = wait_table_size(zone_size_pages);
-	zone->wait_table_bits =	wait_table_bits(zone->wait_table_size);
-	zone->wait_table = (wait_queue_head_t *)
-		alloc_bootmem_node(pgdat, zone->wait_table_size
-					* sizeof(wait_queue_head_t));
-
-	for(i = 0; i < zone->wait_table_size; ++i)
-		init_waitqueue_head(zone->wait_table + i);
-}
-
-static __meminit void zone_pcp_init(struct zone *zone)
-{
-	int cpu;
-	unsigned long batch = zone_batchsize(zone);
-
-	for (cpu = 0; cpu < NR_CPUS; cpu++) {
-#ifdef CONFIG_NUMA
-		/* Early boot. Slab allocator not functional yet */
-		zone_pcp(zone, cpu) = &boot_pageset[cpu];
-		setup_pageset(&boot_pageset[cpu],0);
-#else
-		setup_pageset(zone_pcp(zone,cpu), batch);
-#endif
-	}
-	if (zone->present_pages)
-		printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
-			zone->name, zone->present_pages, batch);
-}
-
-static __meminit void init_currently_empty_zone(struct zone *zone,
-		unsigned long zone_start_pfn, unsigned long size)
-{
-	struct pglist_data *pgdat = zone->zone_pgdat;
-
-	zone_wait_table_init(zone, size);
-	pgdat->nr_zones = zone_idx(zone) + 1;
-
-	zone->zone_start_pfn = zone_start_pfn;
-
-	memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
-
-	zone_init_free_lists(pgdat, zone, zone->spanned_pages);
-}
-
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
-/* Note: nid == MAX_NUMNODES returns first region */
-static int __init first_active_region_index_in_nid(int nid)
-{
-	int i;
-	for (i = 0; early_node_map[i].end_pfn; i++) {
-		if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
-			return i;
-	}
-
-	return MAX_ACTIVE_REGIONS;
-}
-
-/* Note: nid == MAX_NUMNODES returns next region */
-static int __init next_active_region_index_in_nid(unsigned int index, int nid)
-{
-	for (index = index + 1; early_node_map[index].end_pfn; index++) {
-		if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
-			return index;
-	}
-
-	return MAX_ACTIVE_REGIONS;
-}
-
-#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
-int __init early_pfn_to_nid(unsigned long pfn)
-{
-	int i;
-
-	for (i = 0; early_node_map[i].end_pfn; i++) {
-		unsigned long start_pfn = early_node_map[i].start_pfn;
-		unsigned long end_pfn = early_node_map[i].end_pfn;
-
-		if ((start_pfn <= pfn) && (pfn < end_pfn))
-			return early_node_map[i].nid;
-	}
-
-	return -1;
-}
-#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
-
-#define for_each_active_range_index_in_nid(i, nid) \
-	for (i = first_active_region_index_in_nid(nid); \
-				i != MAX_ACTIVE_REGIONS; \
-				i = next_active_region_index_in_nid(i, nid))
-
-void __init free_bootmem_with_active_regions(int nid,
-						unsigned long max_low_pfn)
-{
-	unsigned int i;
-	for_each_active_range_index_in_nid(i, nid) {
-		unsigned long size_pages = 0;
-		unsigned long end_pfn = early_node_map[i].end_pfn;
-		if (early_node_map[i].start_pfn >= max_low_pfn)
-			continue;
-
-		if (end_pfn > max_low_pfn)
-			end_pfn = max_low_pfn;
-
-		size_pages = end_pfn - early_node_map[i].start_pfn;
-		free_bootmem_node(NODE_DATA(early_node_map[i].nid),
-				PFN_PHYS(early_node_map[i].start_pfn),
-				PFN_PHYS(size_pages));
-	}
-}
-
-void __init sparse_memory_present_with_active_regions(int nid)
-{
-	unsigned int i;
-	for_each_active_range_index_in_nid(i, nid)
-		memory_present(early_node_map[i].nid,
-				early_node_map[i].start_pfn,
-				early_node_map[i].end_pfn);
-}
-
-void __init get_pfn_range_for_nid(unsigned int nid,
-			unsigned long *start_pfn, unsigned long *end_pfn)
-{
-	unsigned int i;
-	*start_pfn = -1UL;
-	*end_pfn = 0;
-
-	for_each_active_range_index_in_nid(i, nid) {
-		if (early_node_map[i].start_pfn < *start_pfn)
-			*start_pfn = early_node_map[i].start_pfn;
-
-		if (early_node_map[i].end_pfn > *end_pfn)
-			*end_pfn = early_node_map[i].end_pfn;
-	}
-
-	if (*start_pfn == -1UL) {
-		printk(KERN_WARNING "Node %u active with no memory\n", nid);
-		*start_pfn = 0;
-	}
-}
-
-unsigned long __init zone_present_pages_in_node(int nid,
-					unsigned long zone_type,
-					unsigned long *ignored)
-{
-	unsigned long node_start_pfn, node_end_pfn;
-	unsigned long zone_start_pfn, zone_end_pfn;
-
-	/* Get the start and end of the node and zone */
-	get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
-	zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
-	zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
-
-	/* Check that this node has pages within the zone's required range */
-	if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
-		return 0;
-
-	/* Move the zone boundaries inside the node if necessary */
-	if (zone_end_pfn > node_end_pfn)
-		zone_end_pfn = node_end_pfn;
-	if (zone_start_pfn < node_start_pfn)
-		zone_start_pfn = node_start_pfn;
-
-	/* Return the spanned pages */
-	return zone_end_pfn - zone_start_pfn;
-}
-
-unsigned long __init __absent_pages_in_range(int nid,
-				unsigned long range_start_pfn,
-				unsigned long range_end_pfn)
-{
-	int i = 0;
-	unsigned long prev_end_pfn = 0, hole_pages = 0;
-	unsigned long start_pfn;
-
-	/* Find the end_pfn of the first active range of pfns in the node */
-	i = first_active_region_index_in_nid(nid);
-	prev_end_pfn = early_node_map[i].start_pfn;
-
-	/* Find all holes for the zone within the node */
-	for (; i != MAX_ACTIVE_REGIONS;
-			i = next_active_region_index_in_nid(i, nid)) {
-
-		/* No need to continue if prev_end_pfn is outside the zone */
-		if (prev_end_pfn >= range_end_pfn)
-			break;
-
-		/* Make sure the end of the zone is not within the hole */
-		start_pfn = early_node_map[i].start_pfn;
-		if (start_pfn > range_end_pfn)
-			start_pfn = range_end_pfn;
-		if (prev_end_pfn < range_start_pfn)
-			prev_end_pfn = range_start_pfn;
-
-		/* Update the hole size cound and move on */
-		if (start_pfn > range_start_pfn) {
-			BUG_ON(prev_end_pfn > start_pfn);
-			hole_pages += start_pfn - prev_end_pfn;
-		}
-		prev_end_pfn = early_node_map[i].end_pfn;
-	}
-
-	return hole_pages;
-}
-
-unsigned long __init absent_pages_in_range(unsigned long start_pfn,
-							unsigned long end_pfn)
-{
-	return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
-}
-
-unsigned long __init zone_absent_pages_in_node(int nid,
-					unsigned long zone_type,
-					unsigned long *ignored)
-{
-	return __absent_pages_in_range(nid,
-				arch_zone_lowest_possible_pfn[zone_type],
-				arch_zone_highest_possible_pfn[zone_type]);
-}
-#else
-static inline unsigned long zone_present_pages_in_node(int nid,
-					unsigned long zone_type,
-					unsigned long *zones_size)
-{
-	return zones_size[zone_type];
-}
-
-static inline unsigned long zone_absent_pages_in_node(int nid,
-						unsigned long zone_type,
-						unsigned long *zholes_size)
-{
-	if (!zholes_size)
-		return 0;
-
-	return zholes_size[zone_type];
-}
-#endif
-
-static void __init calculate_node_totalpages(struct pglist_data *pgdat,
-		unsigned long *zones_size, unsigned long *zholes_size)
-{
-	unsigned long realtotalpages, totalpages = 0;
-	int i;
-
-	for (i = 0; i < MAX_NR_ZONES; i++) {
-		totalpages += zone_present_pages_in_node(pgdat->node_id, i,
-								zones_size);
-	}
-	pgdat->node_spanned_pages = totalpages;
-
-	realtotalpages = totalpages;
-	for (i = 0; i < MAX_NR_ZONES; i++) {
-		realtotalpages -=
-			zone_absent_pages_in_node(pgdat->node_id, i, zholes_size);
-	}
-	pgdat->node_present_pages = realtotalpages;
-	printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
-							realtotalpages);
-}
-
-/*
- * Set up the zone data structures:
- *   - mark all pages reserved
- *   - mark all memory queues empty
- *   - clear the memory bitmaps
- */
-static void __init free_area_init_core(struct pglist_data *pgdat,
-		unsigned long *zones_size, unsigned long *zholes_size)
-{
-	unsigned long j;
-	int nid = pgdat->node_id;
-	unsigned long zone_start_pfn = pgdat->node_start_pfn;
-
-	pgdat_resize_init(pgdat);
-	pgdat->nr_zones = 0;
-	init_waitqueue_head(&pgdat->kswapd_wait);
-	pgdat->kswapd_max_order = 0;
-	
-	for (j = 0; j < MAX_NR_ZONES; j++) {
-		struct zone *zone = pgdat->node_zones + j;
-		unsigned long size, realsize;
-
-		size = zone_present_pages_in_node(nid, j, zones_size);
-		realsize = size - zone_absent_pages_in_node(nid, j,
-								zholes_size);
-		if (j < ZONE_HIGHMEM)
-			nr_kernel_pages += realsize;
-		nr_all_pages += realsize;
-
-		zone->spanned_pages = size;
-		zone->present_pages = realsize;
-		zone->name = zone_names[j];
-		spin_lock_init(&zone->lock);
-		spin_lock_init(&zone->lru_lock);
-		zone_seqlock_init(zone);
-		zone->zone_pgdat = pgdat;
-		zone->free_pages = 0;
-
-		zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
-
-		zone_pcp_init(zone);
-		INIT_LIST_HEAD(&zone->active_list);
-		INIT_LIST_HEAD(&zone->inactive_list);
-		zone->nr_scan_active = 0;
-		zone->nr_scan_inactive = 0;
-		zone->nr_active = 0;
-		zone->nr_inactive = 0;
-		atomic_set(&zone->reclaim_in_progress, 0);
-		if (!size)
-			continue;
-
-		zonetable_add(zone, nid, j, zone_start_pfn, size);
-		init_currently_empty_zone(zone, zone_start_pfn, size);
-		zone_start_pfn += size;
-	}
-}
-
-static void __init alloc_node_mem_map(struct pglist_data *pgdat)
-{
-	/* Skip empty nodes */
-	if (!pgdat->node_spanned_pages)
-		return;
-
-#ifdef CONFIG_FLAT_NODE_MEM_MAP
-	/* ia64 gets its own node_mem_map, before this, without bootmem */
-	if (!pgdat->node_mem_map) {
-		unsigned long size;
-		struct page *map;
-
-		size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
-		map = alloc_remap(pgdat->node_id, size);
-		if (!map)
-			map = alloc_bootmem_node(pgdat, size);
-		pgdat->node_mem_map = map;
-	}
-#ifdef CONFIG_FLATMEM
-	/*
-	 * With no DISCONTIG, the global mem_map is just set as node 0's
-	 */
-	if (pgdat == NODE_DATA(0))
-		mem_map = NODE_DATA(0)->node_mem_map;
-#endif
-#endif /* CONFIG_FLAT_NODE_MEM_MAP */
-}
-
-void __init free_area_init_node(int nid, struct pglist_data *pgdat,
-		unsigned long *zones_size, unsigned long node_start_pfn,
-		unsigned long *zholes_size)
-{
-	pgdat->node_id = nid;
-	pgdat->node_start_pfn = node_start_pfn;
-	calculate_node_totalpages(pgdat, zones_size, zholes_size);
-
-	alloc_node_mem_map(pgdat);
-
-	free_area_init_core(pgdat, zones_size, zholes_size);
-}
-
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
-void __init add_active_range(unsigned int nid, unsigned long start_pfn,
-						unsigned long end_pfn)
-{
-	unsigned int i;
-
-	/* Merge with existing active regions if possible */
-	for (i = 0; early_node_map[i].end_pfn; i++) {
-		if (early_node_map[i].nid != nid)
-			continue;
-
-		/* Skip if an existing region covers this new one */
-		if (start_pfn >= early_node_map[i].start_pfn &&
-				end_pfn <= early_node_map[i].end_pfn)
-			return;
-
-		/* Merge forward if suitable */
-		if (start_pfn <= early_node_map[i].end_pfn &&
-				end_pfn > early_node_map[i].end_pfn) {
-			early_node_map[i].end_pfn = end_pfn;
-			return;
-		}
-
-		/* Merge backward if suitable */
-		if (start_pfn < early_node_map[i].end_pfn &&
-				end_pfn >= early_node_map[i].start_pfn) {
-			early_node_map[i].start_pfn = start_pfn;
-			return;
-		}
-	}
-
-	/* Leave last entry NULL, we use range.end_pfn to terminate the walk */
-	if (i >= MAX_ACTIVE_REGIONS - 1) {
-		printk(KERN_ERR "Too many memory regions, truncating\n");
-		return;
-	}
-
-	early_node_map[i].nid = nid;
-	early_node_map[i].start_pfn = start_pfn;
-	early_node_map[i].end_pfn = end_pfn;
-}
-
-/* Compare two active node_active_regions */
-static int __init cmp_node_active_region(const void *a, const void *b)
-{
-	struct node_active_region *arange = (struct node_active_region *)a;
-	struct node_active_region *brange = (struct node_active_region *)b;
-
-	/* Done this way to avoid overflows */
-	if (arange->start_pfn > brange->start_pfn)
-		return 1;
-	if (arange->start_pfn < brange->start_pfn)
-		return -1;
-
-	return 0;
-}
-
-/* sort the node_map by start_pfn */
-static void __init sort_node_map(void)
-{
-	size_t num = 0;
-	while (early_node_map[num].end_pfn)
-		num++;
-
-	sort(early_node_map, num, sizeof(struct node_active_region),
-						cmp_node_active_region, NULL);
-}
-
-/* Find the lowest pfn for a node. This depends on a sorted early_node_map */
-unsigned long __init find_min_pfn_for_node(unsigned long nid)
-{
-	int i;
-
-	/* Assuming a sorted map, the first range found has the starting pfn */
-	for_each_active_range_index_in_nid(i, nid)
-		return early_node_map[i].start_pfn;
-
-	/* nid does not exist in early_node_map */
-	printk(KERN_WARNING "Could not find start_pfn for node %lu\n", nid);
-	return 0;
-}
-
-
-unsigned long __init find_min_pfn_with_active_regions(void)
-{
-	return find_min_pfn_for_node(MAX_NUMNODES);
-}
-
-unsigned long __init find_max_pfn_with_active_regions(void)
-{
-	int i;
-	unsigned long max_pfn = -1UL;
-
-	for (i = 0; early_node_map[i].end_pfn; i++)
-		max_pfn = max(max_pfn, early_node_map[i].start_pfn);
-
-	return max_pfn;
-}
-
-void __init free_area_init_nodes(unsigned long arch_max_dma_pfn,
-				unsigned long arch_max_dma32_pfn,
-				unsigned long arch_max_low_pfn,
-				unsigned long arch_max_high_pfn)
-{
-	unsigned long nid;
-	int zone_index;
-
-	/* Record where the zone boundaries are */
-	memset(arch_zone_lowest_possible_pfn, 0,
-				sizeof(arch_zone_lowest_possible_pfn));
-	memset(arch_zone_highest_possible_pfn, 0,
-				sizeof(arch_zone_highest_possible_pfn));
-	arch_zone_lowest_possible_pfn[ZONE_DMA] =
-					find_min_pfn_with_active_regions();
-	arch_zone_highest_possible_pfn[ZONE_DMA] = arch_max_dma_pfn;
-	arch_zone_highest_possible_pfn[ZONE_DMA32] = arch_max_dma32_pfn;
-	arch_zone_highest_possible_pfn[ZONE_NORMAL] = arch_max_low_pfn;
-	arch_zone_highest_possible_pfn[ZONE_HIGHMEM] = arch_max_high_pfn;
-	for (zone_index = 1; zone_index < MAX_NR_ZONES; zone_index++) {
-		arch_zone_lowest_possible_pfn[zone_index] =
-			arch_zone_highest_possible_pfn[zone_index-1];
-	}
-
-	/* Regions in the early_node_map can be in any order */
-	sort_node_map();
-
-	for_each_online_node(nid) {
-		pg_data_t *pgdat = NODE_DATA(nid);
-		free_area_init_node(nid, pgdat, NULL,
-				find_min_pfn_for_node(nid), NULL);
-	}
-}
-#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
-
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 static bootmem_data_t contig_bootmem_data;
 struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
@@ -2972,32 +1977,6 @@ int lowmem_reserve_ratio_sysctl_handler(
 	return 0;
 }
 
-/*
- * percpu_pagelist_fraction - changes the pcp->high for each zone on each
- * cpu.  It is the fraction of total pages in each zone that a hot per cpu pagelist
- * can have before it gets flushed back to buddy allocator.
- */
-
-int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
-	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
-{
-	struct zone *zone;
-	unsigned int cpu;
-	int ret;
-
-	ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
-	if (!write || (ret == -EINVAL))
-		return ret;
-	for_each_zone(zone) {
-		for_each_online_cpu(cpu) {
-			unsigned long  high;
-			high = zone->present_pages / percpu_pagelist_fraction;
-			setup_pagelist_highmark(zone_pcp(zone, cpu), high);
-		}
-	}
-	return 0;
-}
-
 __initdata int hashdist = HASHDIST_DEFAULT;
 
 #ifdef CONFIG_NUMA


* [PATCH 6/7] Break out memory initialisation code from page_alloc.c to mem_init.c
  2006-04-24 20:20 [PATCH 0/7] [RFC] Sizing zones and holes in an architecture independent manner V4 Mel Gorman
@ 2006-04-24 20:22 ` Mel Gorman
  0 siblings, 0 replies; 11+ messages in thread
From: Mel Gorman @ 2006-04-24 20:22 UTC (permalink / raw)
  To: davej, tony.luck, linuxppc-dev, linux-kernel, bob.picco, ak,
	linux-mm
  Cc: Mel Gorman


page_alloc.c contains a large amount of memory initialisation code. This patch
breaks out the initialisation code to a separate file to make page_alloc.c
a bit easier to read.
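
For context, the interface that ends up in mm/mem_init.c is driven by
add_active_range() and free_area_init_nodes(). The following is only an
illustrative sketch (not part of this patch) of how an architecture
selecting CONFIG_ARCH_POPULATES_NODE_MAP would be expected to use it;
the memblock[] array and the *_pfn variables are placeholders for
whatever boot-time information the architecture really has:

	void __init arch_paging_init(void)
	{
		int i;

		/* Register every physically present range with its node */
		for (i = 0; i < nr_memblocks; i++)
			add_active_range(memblock[i].nid,
					memblock[i].start_pfn,
					memblock[i].end_pfn);

		/* Let the generic code size the zones and the holes */
		free_area_init_nodes(max_dma_pfn, max_dma32_pfn,
					max_low_pfn, max_pfn);
	}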


 Makefile     |    2 
 mem_init.c   | 1044 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 page_alloc.c | 1025 -----------------------------------------------------
 3 files changed, 1045 insertions(+), 1026 deletions(-)

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc2-105-ia64_use_init_nodes/mm/Makefile linux-2.6.17-rc2-106-breakout_mem_init/mm/Makefile
--- linux-2.6.17-rc2-105-ia64_use_init_nodes/mm/Makefile	2006-04-19 04:00:49.000000000 +0100
+++ linux-2.6.17-rc2-106-breakout_mem_init/mm/Makefile	2006-04-24 15:53:22.000000000 +0100
@@ -8,7 +8,7 @@ mmu-$(CONFIG_MMU)	:= fremap.o highmem.o 
 			   vmalloc.o
 
 obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
-			   page_alloc.o page-writeback.o pdflush.o \
+			   page_alloc.o mem_init.o page-writeback.o pdflush.o \
 			   readahead.o swap.o truncate.o vmscan.o \
 			   prio_tree.o util.o mmzone.o $(mmu-y)
 
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc2-105-ia64_use_init_nodes/mm/mem_init.c linux-2.6.17-rc2-106-breakout_mem_init/mm/mem_init.c
--- linux-2.6.17-rc2-105-ia64_use_init_nodes/mm/mem_init.c	2006-04-24 09:36:35.000000000 +0100
+++ linux-2.6.17-rc2-106-breakout_mem_init/mm/mem_init.c	2006-04-24 15:53:22.000000000 +0100
@@ -0,0 +1,1044 @@
+/*
+ * mm/mem_init.c
+ * Initialises the architecture independent view of memory. pgdats, zones, etc
+ *
+ *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *  Copyright (C) 1995, Stephen Tweedie
+ *  Copyright (C) July 1999, Gerhard Wichert, Siemens AG
+ *  Copyright (C) 1999, Ingo Molnar, Red Hat
+ *  Copyright (C) 1999, 2000, Kanoj Sarcar, SGI
+ *  Copyright (C) Sept 2000, Martin J. Bligh
+ *	(lots of bits borrowed from Ingo Molnar & Andrew Morton)
+ *  Copyright (C) Apr 2006, Mel Gorman, IBM
+ *	(lots of bits taken from architecture-specific code)
+ */
+#include <linux/config.h>
+#include <linux/sort.h>
+#include <linux/pfn.h>
+#include <linux/mm.h>
+#include <linux/bootmem.h>
+#include <linux/cpuset.h>
+#include <linux/mempolicy.h>
+#include <linux/sysctl.h>
+#include <linux/swap.h>
+#include <linux/cpu.h>
+
+static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
+int percpu_pagelist_fraction;
+
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+  #ifdef CONFIG_MAX_ACTIVE_REGIONS
+    #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
+  #else
+    #define MAX_ACTIVE_REGIONS (MAX_NR_ZONES * MAX_NUMNODES + 1)
+  #endif
+
+  struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS];
+  unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
+  unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
+#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+
+/*
+ * Builds allocation fallback zone lists.
+ *
+ * Add all populated zones of a node to the zonelist.
+ */
+static int __init build_zonelists_node(pg_data_t *pgdat,
+			struct zonelist *zonelist, int nr_zones, int zone_type)
+{
+	struct zone *zone;
+
+	BUG_ON(zone_type > ZONE_HIGHMEM);
+
+	do {
+		zone = pgdat->node_zones + zone_type;
+		if (populated_zone(zone)) {
+#ifndef CONFIG_HIGHMEM
+			BUG_ON(zone_type > ZONE_NORMAL);
+#endif
+			zonelist->zones[nr_zones++] = zone;
+			check_highest_zone(zone_type);
+		}
+		zone_type--;
+
+	} while (zone_type >= 0);
+	return nr_zones;
+}
+
+static inline int highest_zone(int zone_bits)
+{
+	int res = ZONE_NORMAL;
+	if (zone_bits & (__force int)__GFP_HIGHMEM)
+		res = ZONE_HIGHMEM;
+	if (zone_bits & (__force int)__GFP_DMA32)
+		res = ZONE_DMA32;
+	if (zone_bits & (__force int)__GFP_DMA)
+		res = ZONE_DMA;
+	return res;
+}
+
+#ifdef CONFIG_NUMA
+#define MAX_NODE_LOAD (num_online_nodes())
+static int __initdata node_load[MAX_NUMNODES];
+/**
+ * find_next_best_node - find the next node that should appear in a given node's fallback list
+ * @node: node whose fallback list we're appending
+ * @used_node_mask: nodemask_t of already used nodes
+ *
+ * We use a number of factors to determine which is the next node that should
+ * appear on a given node's fallback list.  The node should not have appeared
+ * already in @node's fallback list, and it should be the next closest node
+ * according to the distance array (which contains arbitrary distance values
+ * from each node to each node in the system), and should also prefer nodes
+ * with no CPUs, since presumably they'll have very little allocation pressure
+ * on them otherwise.
+ * It returns -1 if no node is found.
+ */
+static int __init find_next_best_node(int node, nodemask_t *used_node_mask)
+{
+	int n, val;
+	int min_val = INT_MAX;
+	int best_node = -1;
+
+	/* Use the local node if we haven't already */
+	if (!node_isset(node, *used_node_mask)) {
+		node_set(node, *used_node_mask);
+		return node;
+	}
+
+	for_each_online_node(n) {
+		cpumask_t tmp;
+
+		/* Don't want a node to appear more than once */
+		if (node_isset(n, *used_node_mask))
+			continue;
+
+		/* Use the distance array to find the distance */
+		val = node_distance(node, n);
+
+		/* Penalize nodes under us ("prefer the next node") */
+		val += (n < node);
+
+		/* Give preference to headless and unused nodes */
+		tmp = node_to_cpumask(n);
+		if (!cpus_empty(tmp))
+			val += PENALTY_FOR_NODE_WITH_CPUS;
+
+		/* Slight preference for less loaded node */
+		val *= (MAX_NODE_LOAD*MAX_NUMNODES);
+		val += node_load[n];
+
+		if (val < min_val) {
+			min_val = val;
+			best_node = n;
+		}
+	}
+
+	if (best_node >= 0)
+		node_set(best_node, *used_node_mask);
+
+	return best_node;
+}
+
+static void __init build_zonelists(pg_data_t *pgdat)
+{
+	int i, j, k, node, local_node;
+	int prev_node, load;
+	struct zonelist *zonelist;
+	nodemask_t used_mask;
+
+	/* initialize zonelists */
+	for (i = 0; i < GFP_ZONETYPES; i++) {
+		zonelist = pgdat->node_zonelists + i;
+		zonelist->zones[0] = NULL;
+	}
+
+	/* NUMA-aware ordering of nodes */
+	local_node = pgdat->node_id;
+	load = num_online_nodes();
+	prev_node = local_node;
+	nodes_clear(used_mask);
+	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
+		int distance = node_distance(local_node, node);
+
+		/*
+		 * If another node is sufficiently far away then it is better
+		 * to reclaim pages in a zone before going off node.
+		 */
+		if (distance > RECLAIM_DISTANCE)
+			zone_reclaim_mode = 1;
+
+		/*
+		 * We don't want to pressure a particular node.
+		 * So adding penalty to the first node in same
+		 * distance group to make it round-robin.
+		 */
+
+		if (distance != node_distance(local_node, prev_node))
+			node_load[node] += load;
+		prev_node = node;
+		load--;
+		for (i = 0; i < GFP_ZONETYPES; i++) {
+			zonelist = pgdat->node_zonelists + i;
+			for (j = 0; zonelist->zones[j] != NULL; j++);
+
+			k = highest_zone(i);
+
+	 		j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
+			zonelist->zones[j] = NULL;
+		}
+	}
+}
+
+#else	/* CONFIG_NUMA */
+
+static void __init build_zonelists(pg_data_t *pgdat)
+{
+	int i, j, k, node, local_node;
+
+	local_node = pgdat->node_id;
+	for (i = 0; i < GFP_ZONETYPES; i++) {
+		struct zonelist *zonelist;
+
+		zonelist = pgdat->node_zonelists + i;
+
+		j = 0;
+		k = highest_zone(i);
+ 		j = build_zonelists_node(pgdat, zonelist, j, k);
+ 		/*
+ 		 * Now we build the zonelist so that it contains the zones
+ 		 * of all the other nodes.
+ 		 * We don't want to pressure a particular node, so when
+ 		 * building the zones for node N, we make sure that the
+ 		 * zones coming right after the local ones are those from
+ 		 * node N+1 (modulo N)
+ 		 */
+		for (node = local_node + 1; node < MAX_NUMNODES; node++) {
+			if (!node_online(node))
+				continue;
+			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
+		}
+		for (node = 0; node < local_node; node++) {
+			if (!node_online(node))
+				continue;
+			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
+		}
+
+		zonelist->zones[j] = NULL;
+	}
+}
+
+#endif	/* CONFIG_NUMA */
+
+void __init build_all_zonelists(void)
+{
+	int i;
+
+	for_each_online_node(i)
+		build_zonelists(NODE_DATA(i));
+	printk("Built %i zonelists\n", num_online_nodes());
+	cpuset_init_current_mems_allowed();
+}
+
+/*
+ * Helper functions to size the waitqueue hash table.
+ * Essentially these want to choose hash table sizes sufficiently
+ * large so that collisions trying to wait on pages are rare.
+ * But in fact, the number of active page waitqueues on typical
+ * systems is ridiculously low, less than 200. So this is even
+ * conservative, even though it seems large.
+ *
+ * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
+ * waitqueues, i.e. the size of the waitq table given the number of pages.
+ */
+#define PAGES_PER_WAITQUEUE	256
+
+static inline unsigned long wait_table_size(unsigned long pages)
+{
+	unsigned long size = 1;
+
+	pages /= PAGES_PER_WAITQUEUE;
+
+	while (size < pages)
+		size <<= 1;
+
+	/*
+	 * Once we have dozens or even hundreds of threads sleeping
+	 * on IO we've got bigger problems than wait queue collision.
+	 * Limit the size of the wait table to a reasonable size.
+	 */
+	size = min(size, 4096UL);
+
+	return max(size, 4UL);
+}
+
+/*
+ * This is an integer logarithm so that shifts can be used later
+ * to extract the more random high bits from the multiplicative
+ * hash function before the remainder is taken.
+ */
+static inline unsigned long wait_table_bits(unsigned long size)
+{
+	return ffz(~size);
+}
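/*
 * Editorial note, not part of the patch: as a worked example, a 1GB
 * zone with 4K pages spans 262144 pages, so pages/PAGES_PER_WAITQUEUE
 * is 1024; wait_table_size() rounds that up to the power of two 1024
 * (already within the [4, 4096] clamp) and wait_table_bits(1024)
 * returns 10.
 */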
+
+#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
+
+#ifndef __HAVE_ARCH_MEMMAP_INIT
+#define memmap_init(size, nid, zone, start_pfn) \
+	memmap_init_zone((size), (nid), (zone), (start_pfn))
+#endif
+
+/*
+ * Initially all pages are reserved - free ones are freed
+ * up by free_all_bootmem() once the early boot process is
+ * done. Non-atomic initialization, single-pass.
+ */
+void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
+		unsigned long start_pfn)
+{
+	struct page *page;
+	unsigned long end_pfn = start_pfn + size;
+	unsigned long pfn;
+
+	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+		if (!early_pfn_valid(pfn))
+			continue;
+		page = pfn_to_page(pfn);
+		set_page_links(page, zone, nid, pfn);
+		init_page_count(page);
+		reset_page_mapcount(page);
+		SetPageReserved(page);
+		INIT_LIST_HEAD(&page->lru);
+#ifdef WANT_PAGE_VIRTUAL
+		/* The shift won't overflow because ZONE_NORMAL is below 4G. */
+		if (!is_highmem_idx(zone))
+			set_page_address(page, __va(pfn << PAGE_SHIFT));
+#endif
+	}
+}
+
+void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
+				unsigned long size)
+{
+	int order;
+	for (order = 0; order < MAX_ORDER ; order++) {
+		INIT_LIST_HEAD(&zone->free_area[order].free_list);
+		zone->free_area[order].nr_free = 0;
+	}
+}
+
+#define ZONETABLE_INDEX(x, zone_nr)	((x << ZONES_SHIFT) | zone_nr)
+void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
+		unsigned long size)
+{
+	unsigned long snum = pfn_to_section_nr(pfn);
+	unsigned long end = pfn_to_section_nr(pfn + size);
+
+	if (FLAGS_HAS_NODE)
+		zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
+	else
+		for (; snum <= end; snum++)
+			zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
+}
+
+static __meminit
+void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
+{
+	int i;
+	struct pglist_data *pgdat = zone->zone_pgdat;
+
+	/*
+	 * The per-page waitqueue mechanism uses hashed waitqueues
+	 * per zone.
+	 */
+	zone->wait_table_size = wait_table_size(zone_size_pages);
+	zone->wait_table_bits =	wait_table_bits(zone->wait_table_size);
+	zone->wait_table = (wait_queue_head_t *)
+		alloc_bootmem_node(pgdat, zone->wait_table_size
+					* sizeof(wait_queue_head_t));
+
+	for(i = 0; i < zone->wait_table_size; ++i)
+		init_waitqueue_head(zone->wait_table + i);
+}
+
+/*
+ * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
+ * to the value high for the pageset p.
+ */
+static void setup_pagelist_highmark(struct per_cpu_pageset *p,
+				unsigned long high)
+{
+	struct per_cpu_pages *pcp;
+
+	pcp = &p->pcp[0]; /* hot list */
+	pcp->high = high;
+	pcp->batch = max(1UL, high/4);
+	if ((high/4) > (PAGE_SHIFT * 8))
+		pcp->batch = PAGE_SHIFT * 8;
+}
+
+/*
+ * percpu_pagelist_fraction - changes the pcp->high for each zone on each
+ * cpu.  It is the fraction of total pages in each zone that a hot per cpu pagelist
+ * can have before it gets flushed back to buddy allocator.
+ */
+int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
+	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+	struct zone *zone;
+	unsigned int cpu;
+	int ret;
+
+	ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+	if (!write || (ret == -EINVAL))
+		return ret;
+	for_each_zone(zone) {
+		for_each_online_cpu(cpu) {
+			unsigned long  high;
+			high = zone->present_pages / percpu_pagelist_fraction;
+			setup_pagelist_highmark(zone_pcp(zone, cpu), high);
+		}
+	}
+	return 0;
+}
+
+static int __cpuinit zone_batchsize(struct zone *zone)
+{
+	int batch;
+
+	/*
+	 * The per-cpu-pages pools are set to around 1000th of the
+	 * size of the zone.  But no more than 1/2 of a meg.
+	 *
+	 * OK, so we don't know how big the cache is.  So guess.
+	 */
+	batch = zone->present_pages / 1024;
+	if (batch * PAGE_SIZE > 512 * 1024)
+		batch = (512 * 1024) / PAGE_SIZE;
+	batch /= 4;		/* We effectively *= 4 below */
+	if (batch < 1)
+		batch = 1;
+
+	/*
+	 * Clamp the batch to a 2^n - 1 value. Having a power
+	 * of 2 value was found to be more likely to have
+	 * suboptimal cache aliasing properties in some cases.
+	 *
+	 * For example if 2 tasks are alternately allocating
+	 * batches of pages, one task can end up with a lot
+	 * of pages of one half of the possible page colors
+	 * and the other with pages of the other colors.
+	 */
+	batch = (1 << (fls(batch + batch/2)-1)) - 1;
+
+	return batch;
+}
+
+inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
+{
+	struct per_cpu_pages *pcp;
+
+	memset(p, 0, sizeof(*p));
+
+	pcp = &p->pcp[0];		/* hot */
+	pcp->count = 0;
+	pcp->high = 6 * batch;
+	pcp->batch = max(1UL, 1 * batch);
+	INIT_LIST_HEAD(&pcp->list);
+
+	pcp = &p->pcp[1];		/* cold*/
+	pcp->count = 0;
+	pcp->high = 2 * batch;
+	pcp->batch = max(1UL, batch/2);
+	INIT_LIST_HEAD(&pcp->list);
+}
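/*
 * Editorial note, not part of the patch: assuming 4K pages, a 1GB zone
 * has 262144 present pages, so zone_batchsize() computes 262144/1024 =
 * 256, clamps it to the 512K ceiling (128 pages), divides by 4 to get
 * 32 and rounds it to the 2^n - 1 value 31. setup_pageset() then gives
 * the hot list a high watermark of 6*31 = 186 pages and the cold list
 * 2*31 = 62.
 */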
+
+#ifdef CONFIG_NUMA
+/*
+ * Boot pageset table. One per cpu which is going to be used for all
+ * zones and all nodes. The parameters will be set in such a way
+ * that an item put on a list will immediately be handed over to
+ * the buddy list. This is safe since pageset manipulation is done
+ * with interrupts disabled.
+ *
+ * Some NUMA counter updates may also be caught by the boot pagesets.
+ *
+ * The boot_pagesets must be kept even after bootup is complete for
+ * unused processors and/or zones. They do play a role for bootstrapping
+ * hotplugged processors.
+ *
+ * zoneinfo_show() and maybe other functions do
+ * not check if the processor is online before following the pageset pointer.
+ * Other parts of the kernel may not check if the zone is available.
+ */
+static struct per_cpu_pageset boot_pageset[NR_CPUS];
+
+/*
+ * Dynamically allocate memory for the
+ * per cpu pageset array in struct zone.
+ */
+static int __cpuinit process_zones(int cpu)
+{
+	struct zone *zone, *dzone;
+
+	for_each_zone(zone) {
+
+		zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
+					 GFP_KERNEL, cpu_to_node(cpu));
+		if (!zone_pcp(zone, cpu))
+			goto bad;
+
+		setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
+
+		if (percpu_pagelist_fraction)
+			setup_pagelist_highmark(zone_pcp(zone, cpu),
+			 	(zone->present_pages / percpu_pagelist_fraction));
+	}
+
+	return 0;
+bad:
+	for_each_zone(dzone) {
+		if (dzone == zone)
+			break;
+		kfree(zone_pcp(dzone, cpu));
+		zone_pcp(dzone, cpu) = NULL;
+	}
+	return -ENOMEM;
+}
+
+static inline void free_zone_pagesets(int cpu)
+{
+	struct zone *zone;
+
+	for_each_zone(zone) {
+		struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
+
+		zone_pcp(zone, cpu) = NULL;
+		kfree(pset);
+	}
+}
+
+static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
+		unsigned long action,
+		void *hcpu)
+{
+	int cpu = (long)hcpu;
+	int ret = NOTIFY_OK;
+
+	switch (action) {
+		case CPU_UP_PREPARE:
+			if (process_zones(cpu))
+				ret = NOTIFY_BAD;
+			break;
+		case CPU_UP_CANCELED:
+		case CPU_DEAD:
+			free_zone_pagesets(cpu);
+			break;
+		default:
+			break;
+	}
+	return ret;
+}
+
+static struct notifier_block pageset_notifier =
+	{ &pageset_cpuup_callback, NULL, 0 };
+
+void __init setup_per_cpu_pageset(void)
+{
+	int err;
+
+	/* Initialize per_cpu_pageset for cpu 0.
+	 * A cpuup callback will do this for every cpu
+	 * as it comes online
+	 */
+	err = process_zones(smp_processor_id());
+	BUG_ON(err);
+	register_cpu_notifier(&pageset_notifier);
+}
+#endif
+
+static __meminit void zone_pcp_init(struct zone *zone)
+{
+	int cpu;
+	unsigned long batch = zone_batchsize(zone);
+
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+#ifdef CONFIG_NUMA
+		/* Early boot. Slab allocator not functional yet */
+		zone_pcp(zone, cpu) = &boot_pageset[cpu];
+		setup_pageset(&boot_pageset[cpu],0);
+#else
+		setup_pageset(zone_pcp(zone,cpu), batch);
+#endif
+	}
+	if (zone->present_pages)
+		printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
+			zone->name, zone->present_pages, batch);
+}
+
+static __meminit void init_currently_empty_zone(struct zone *zone,
+		unsigned long zone_start_pfn, unsigned long size)
+{
+	struct pglist_data *pgdat = zone->zone_pgdat;
+
+	zone_wait_table_init(zone, size);
+	pgdat->nr_zones = zone_idx(zone) + 1;
+
+	zone->zone_start_pfn = zone_start_pfn;
+
+	memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
+
+	zone_init_free_lists(pgdat, zone, zone->spanned_pages);
+}
+
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+/* Note: nid == MAX_NUMNODES returns first region */
+static int __init first_active_region_index_in_nid(int nid)
+{
+	int i;
+	for (i = 0; early_node_map[i].end_pfn; i++) {
+		if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
+			return i;
+	}
+
+	return MAX_ACTIVE_REGIONS;
+}
+
+/* Note: nid == MAX_NUMNODES returns next region */
+static int __init next_active_region_index_in_nid(unsigned int index, int nid)
+{
+	for (index = index + 1; early_node_map[index].end_pfn; index++) {
+		if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
+			return index;
+	}
+
+	return MAX_ACTIVE_REGIONS;
+}
+
+#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
+int __init early_pfn_to_nid(unsigned long pfn)
+{
+	int i;
+
+	for (i = 0; early_node_map[i].end_pfn; i++) {
+		unsigned long start_pfn = early_node_map[i].start_pfn;
+		unsigned long end_pfn = early_node_map[i].end_pfn;
+
+		if ((start_pfn <= pfn) && (pfn < end_pfn))
+			return early_node_map[i].nid;
+	}
+
+	return -1;
+}
+#endif
+
+#define for_each_active_range_index_in_nid(i, nid) \
+	for (i = first_active_region_index_in_nid(nid); \
+				i != MAX_ACTIVE_REGIONS; \
+				i = next_active_region_index_in_nid(i, nid))
+
+void __init free_bootmem_with_active_regions(int nid,
+						unsigned long max_low_pfn)
+{
+	unsigned int i;
+	for_each_active_range_index_in_nid(i, nid) {
+		unsigned long size_pages = 0;
+		unsigned long end_pfn = early_node_map[i].end_pfn;
+		if (early_node_map[i].start_pfn >= max_low_pfn)
+			continue;
+
+		if (end_pfn > max_low_pfn)
+			end_pfn = max_low_pfn;
+
+		size_pages = end_pfn - early_node_map[i].start_pfn;
+		free_bootmem_node(NODE_DATA(early_node_map[i].nid),
+				PFN_PHYS(early_node_map[i].start_pfn),
+				PFN_PHYS(size_pages));
+	}
+}
+
+void __init sparse_memory_present_with_active_regions(int nid)
+{
+	unsigned int i;
+	for_each_active_range_index_in_nid(i, nid)
+		memory_present(early_node_map[i].nid,
+				early_node_map[i].start_pfn,
+				early_node_map[i].end_pfn);
+}
+
+void __init get_pfn_range_for_nid(unsigned int nid,
+			unsigned long *start_pfn, unsigned long *end_pfn)
+{
+	unsigned int i;
+	*start_pfn = -1UL;
+	*end_pfn = 0;
+
+	for_each_active_range_index_in_nid(i, nid) {
+		if (early_node_map[i].start_pfn < *start_pfn)
+			*start_pfn = early_node_map[i].start_pfn;
+
+		if (early_node_map[i].end_pfn > *end_pfn)
+			*end_pfn = early_node_map[i].end_pfn;
+	}
+
+	if (*start_pfn == -1UL) {
+		printk(KERN_WARNING "Node %u active with no memory\n", nid);
+		*start_pfn = 0;
+	}
+}
+
+unsigned long __init zone_present_pages_in_node(int nid,
+					unsigned long zone_type,
+					unsigned long *ignored)
+{
+	unsigned long node_start_pfn, node_end_pfn;
+	unsigned long zone_start_pfn, zone_end_pfn;
+
+	/* Get the start and end of the node and zone */
+	get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
+	zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
+	zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
+
+	/* Check that this node has pages within the zone's required range */
+	if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
+		return 0;
+
+	/* Move the zone boundaries inside the node if necessary */
+	if (zone_end_pfn > node_end_pfn)
+		zone_end_pfn = node_end_pfn;
+	if (zone_start_pfn < node_start_pfn)
+		zone_start_pfn = node_start_pfn;
+
+	/* Return the spanned pages */
+	return zone_end_pfn - zone_start_pfn;
+}
+
+unsigned long __init __absent_pages_in_range(int nid,
+				unsigned long range_start_pfn,
+				unsigned long range_end_pfn)
+{
+	int i = 0;
+	unsigned long prev_end_pfn = 0, hole_pages = 0;
+	unsigned long start_pfn;
+
+	/* Find the end_pfn of the first active range of pfns in the node */
+	i = first_active_region_index_in_nid(nid);
+	prev_end_pfn = early_node_map[i].start_pfn;
+
+	/* Find all holes for the zone within the node */
+	for (; i != MAX_ACTIVE_REGIONS;
+			i = next_active_region_index_in_nid(i, nid)) {
+
+		/* No need to continue if prev_end_pfn is outside the zone */
+		if (prev_end_pfn >= range_end_pfn)
+			break;
+
+		/* Make sure the end of the zone is not within the hole */
+		start_pfn = early_node_map[i].start_pfn;
+		if (start_pfn > range_end_pfn)
+			start_pfn = range_end_pfn;
+		if (prev_end_pfn < range_start_pfn)
+			prev_end_pfn = range_start_pfn;
+
+		/* Update the hole size count and move on */
+		if (start_pfn > range_start_pfn) {
+			BUG_ON(prev_end_pfn > start_pfn);
+			hole_pages += start_pfn - prev_end_pfn;
+		}
+		prev_end_pfn = early_node_map[i].end_pfn;
+	}
+
+	return hole_pages;
+}
+
+unsigned long __init absent_pages_in_range(unsigned long start_pfn,
+							unsigned long end_pfn)
+{
+	return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
+}
+
+unsigned long __init zone_absent_pages_in_node(int nid,
+					unsigned long zone_type,
+					unsigned long *ignored)
+{
+	return __absent_pages_in_range(nid,
+				arch_zone_lowest_possible_pfn[zone_type],
+				arch_zone_highest_possible_pfn[zone_type]);
+}
+#else
+static inline unsigned long zone_present_pages_in_node(int nid,
+					unsigned long zone_type,
+					unsigned long *zones_size)
+{
+	return zones_size[zone_type];
+}
+
+static inline unsigned long zone_absent_pages_in_node(int nid,
+						unsigned long zone_type,
+						unsigned long *zholes_size)
+{
+	if (!zholes_size)
+		return 0;
+
+	return zholes_size[zone_type];
+}
+#endif
+
+static void __init calculate_node_totalpages(struct pglist_data *pgdat,
+		unsigned long *zones_size, unsigned long *zholes_size)
+{
+	unsigned long realtotalpages, totalpages = 0;
+	int i;
+
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		totalpages += zone_present_pages_in_node(pgdat->node_id, i,
+								zones_size);
+	}
+	pgdat->node_spanned_pages = totalpages;
+
+	realtotalpages = totalpages;
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		realtotalpages -=
+			zone_absent_pages_in_node(pgdat->node_id, i, zholes_size);
+	}
+	pgdat->node_present_pages = realtotalpages;
+	printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
+							realtotalpages);
+}
+
+/*
+ * Set up the zone data structures:
+ *   - mark all pages reserved
+ *   - mark all memory queues empty
+ *   - clear the memory bitmaps
+ */
+static void __init free_area_init_core(struct pglist_data *pgdat,
+		unsigned long *zones_size, unsigned long *zholes_size)
+{
+	unsigned long j;
+	int nid = pgdat->node_id;
+	unsigned long zone_start_pfn = pgdat->node_start_pfn;
+
+	pgdat_resize_init(pgdat);
+	pgdat->nr_zones = 0;
+	init_waitqueue_head(&pgdat->kswapd_wait);
+	pgdat->kswapd_max_order = 0;
+
+	for (j = 0; j < MAX_NR_ZONES; j++) {
+		struct zone *zone = pgdat->node_zones + j;
+		unsigned long size, realsize;
+
+		size = zone_present_pages_in_node(nid, j, zones_size);
+		realsize = size - zone_absent_pages_in_node(nid, j,
+								zholes_size);
+		if (j < ZONE_HIGHMEM)
+			nr_kernel_pages += realsize;
+		nr_all_pages += realsize;
+
+		zone->spanned_pages = size;
+		zone->present_pages = realsize;
+		zone->name = zone_names[j];
+		spin_lock_init(&zone->lock);
+		spin_lock_init(&zone->lru_lock);
+		zone_seqlock_init(zone);
+		zone->zone_pgdat = pgdat;
+		zone->free_pages = 0;
+
+		zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
+
+		zone_pcp_init(zone);
+		INIT_LIST_HEAD(&zone->active_list);
+		INIT_LIST_HEAD(&zone->inactive_list);
+		zone->nr_scan_active = 0;
+		zone->nr_scan_inactive = 0;
+		zone->nr_active = 0;
+		zone->nr_inactive = 0;
+		atomic_set(&zone->reclaim_in_progress, 0);
+		if (!size)
+			continue;
+
+		zonetable_add(zone, nid, j, zone_start_pfn, size);
+		init_currently_empty_zone(zone, zone_start_pfn, size);
+		zone_start_pfn += size;
+	}
+}
+
+static void __init alloc_node_mem_map(struct pglist_data *pgdat)
+{
+	/* Skip empty nodes */
+	if (!pgdat->node_spanned_pages)
+		return;
+
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
+	/* ia64 gets its own node_mem_map, before this, without bootmem */
+	if (!pgdat->node_mem_map) {
+		unsigned long size;
+		struct page *map;
+
+		size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
+		map = alloc_remap(pgdat->node_id, size);
+		if (!map)
+			map = alloc_bootmem_node(pgdat, size);
+		pgdat->node_mem_map = map;
+	}
+#ifdef CONFIG_FLATMEM
+	/*
+	 * With no DISCONTIG, the global mem_map is just set as node 0's
+	 */
+	if (pgdat == NODE_DATA(0))
+		mem_map = NODE_DATA(0)->node_mem_map;
+#endif
+#endif /* CONFIG_FLAT_NODE_MEM_MAP */
+}
+
+void __init free_area_init_node(int nid, struct pglist_data *pgdat,
+		unsigned long *zones_size, unsigned long node_start_pfn,
+		unsigned long *zholes_size)
+{
+	pgdat->node_id = nid;
+	pgdat->node_start_pfn = node_start_pfn;
+	calculate_node_totalpages(pgdat, zones_size, zholes_size);
+
+	alloc_node_mem_map(pgdat);
+
+	free_area_init_core(pgdat, zones_size, zholes_size);
+}
+
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+void __init add_active_range(unsigned int nid, unsigned long start_pfn,
+						unsigned long end_pfn)
+{
+	unsigned int i;
+
+	/* Merge with existing active regions if possible */
+	for (i = 0; early_node_map[i].end_pfn; i++) {
+		if (early_node_map[i].nid != nid)
+			continue;
+
+		/* Skip if an existing region covers this new one */
+		if (start_pfn >= early_node_map[i].start_pfn &&
+				end_pfn <= early_node_map[i].end_pfn)
+			return;
+
+		/* Merge forward if suitable */
+		if (start_pfn <= early_node_map[i].end_pfn &&
+				end_pfn > early_node_map[i].end_pfn) {
+			early_node_map[i].end_pfn = end_pfn;
+			return;
+		}
+
+		/* Merge backward if suitable */
+		if (start_pfn < early_node_map[i].end_pfn &&
+				end_pfn >= early_node_map[i].start_pfn) {
+			early_node_map[i].start_pfn = start_pfn;
+			return;
+		}
+	}
+
+	/* Leave last entry NULL, we use range.end_pfn to terminate the walk */
+	if (i >= MAX_ACTIVE_REGIONS - 1) {
+		printk(KERN_ERR "Too many memory regions, truncating\n");
+		return;
+	}
+
+	early_node_map[i].nid = nid;
+	early_node_map[i].start_pfn = start_pfn;
+	early_node_map[i].end_pfn = end_pfn;
+}
+
+void __init remove_all_active_ranges(void)
+{
+	memset(early_node_map, 0, sizeof(early_node_map));
+}
+
+/* Compare two active node_active_regions */
+static int __init cmp_node_active_region(const void *a, const void *b)
+{
+	struct node_active_region *arange = (struct node_active_region *)a;
+	struct node_active_region *brange = (struct node_active_region *)b;
+
+	/* Done this way to avoid overflows */
+	if (arange->start_pfn > brange->start_pfn)
+		return 1;
+	if (arange->start_pfn < brange->start_pfn)
+		return -1;
+
+	return 0;
+}
+
+/* sort the node_map by start_pfn */
+static void __init sort_node_map(void)
+{
+	size_t num = 0;
+	while (early_node_map[num].end_pfn)
+		num++;
+
+	sort(early_node_map, num, sizeof(struct node_active_region),
+						cmp_node_active_region, NULL);
+}
+
+/* Find the lowest pfn for a node. This depends on a sorted early_node_map */
+unsigned long __init find_min_pfn_for_node(unsigned long nid)
+{
+	int i;
+
+	/* Assuming a sorted map, the first range found has the starting pfn */
+	for_each_active_range_index_in_nid(i, nid)
+		return early_node_map[i].start_pfn;
+
+	/* nid does not exist in early_node_map */
+	printk(KERN_WARNING "Could not find start_pfn for node %lu\n", nid);
+	return 0;
+}
+
+unsigned long __init find_min_pfn_with_active_regions(void)
+{
+	return find_min_pfn_for_node(MAX_NUMNODES);
+}
+
+unsigned long __init find_max_pfn_with_active_regions(void)
+{
+	int i;
+	unsigned long max_pfn = 0;
+
+	for (i = 0; early_node_map[i].end_pfn; i++)
+		max_pfn = max(max_pfn, early_node_map[i].end_pfn);
+
+	return max_pfn;
+}
+
+void __init free_area_init_nodes(unsigned long arch_max_dma_pfn,
+				unsigned long arch_max_dma32_pfn,
+				unsigned long arch_max_low_pfn,
+				unsigned long arch_max_high_pfn)
+{
+	unsigned long nid;
+	int zone_index;
+
+	/* Record where the zone boundaries are */
+	memset(arch_zone_lowest_possible_pfn, 0,
+				sizeof(arch_zone_lowest_possible_pfn));
+	memset(arch_zone_highest_possible_pfn, 0,
+				sizeof(arch_zone_highest_possible_pfn));
+	arch_zone_lowest_possible_pfn[ZONE_DMA] =
+					find_min_pfn_with_active_regions();
+	arch_zone_highest_possible_pfn[ZONE_DMA] = arch_max_dma_pfn;
+	arch_zone_highest_possible_pfn[ZONE_DMA32] = arch_max_dma32_pfn;
+	arch_zone_highest_possible_pfn[ZONE_NORMAL] = arch_max_low_pfn;
+	arch_zone_highest_possible_pfn[ZONE_HIGHMEM] = arch_max_high_pfn;
+	for (zone_index = 1; zone_index < MAX_NR_ZONES; zone_index++) {
+		arch_zone_lowest_possible_pfn[zone_index] = 
+			arch_zone_highest_possible_pfn[zone_index-1];
+	}
+
+	/* Regions in the early_node_map can be in any order */
+	sort_node_map();
+
+	for_each_online_node(nid) {
+		pg_data_t *pgdat = NODE_DATA(nid);
+		free_area_init_node(nid, pgdat, NULL,
+				find_min_pfn_for_node(nid), NULL);
+	}
+}
+#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
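
(Editorial aside, not part of the patch: free_area_init_nodes() above is
passed only the highest PFN of each zone; each zone's lowest possible
PFN is taken from the previous zone's highest. As a purely hypothetical
example with 4K pages, an architecture calling

	free_area_init_nodes(4096, 1048576, max_low_pfn, max_pfn);

would end up with arch_zone_lowest_possible_pfn[] holding
{ min_pfn, 4096, 1048576, max_low_pfn } and
arch_zone_highest_possible_pfn[] holding
{ 4096, 1048576, max_low_pfn, max_pfn }, where min_pfn is the value
returned by find_min_pfn_with_active_regions().)
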
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc2-105-ia64_use_init_nodes/mm/page_alloc.c linux-2.6.17-rc2-106-breakout_mem_init/mm/page_alloc.c
--- linux-2.6.17-rc2-105-ia64_use_init_nodes/mm/page_alloc.c	2006-04-24 15:49:25.000000000 +0100
+++ linux-2.6.17-rc2-106-breakout_mem_init/mm/page_alloc.c	2006-04-24 15:53:22.000000000 +0100
@@ -37,8 +37,6 @@
 #include <linux/nodemask.h>
 #include <linux/vmalloc.h>
 #include <linux/mempolicy.h>
-#include <linux/sort.h>
-#include <linux/pfn.h>
 
 #include <asm/tlbflush.h>
 #include "internal.h"
@@ -55,7 +53,6 @@ unsigned long totalram_pages __read_most
 unsigned long totalhigh_pages __read_mostly;
 unsigned long totalreserve_pages __read_mostly;
 long nr_swap_pages;
-int percpu_pagelist_fraction;
 
 static void __free_pages_ok(struct page *page, unsigned int order);
 
@@ -81,24 +78,11 @@ EXPORT_SYMBOL(totalram_pages);
 struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
 EXPORT_SYMBOL(zone_table);
 
-static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
 int min_free_kbytes = 1024;
 
 unsigned long __initdata nr_kernel_pages;
 unsigned long __initdata nr_all_pages;
 
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
-  #ifdef CONFIG_MAX_ACTIVE_REGIONS
-    #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
-  #else
-    #define MAX_ACTIVE_REGIONS (MAX_NR_ZONES * MAX_NUMNODES + 1)
-  #endif
-
-  struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS];
-  unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
-  unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
-#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
-
 #ifdef CONFIG_DEBUG_VM
 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
 {
@@ -1517,989 +1501,6 @@ void show_free_areas(void)
 	show_swap_cache_info();
 }
 
-/*
- * Builds allocation fallback zone lists.
- *
- * Add all populated zones of a node to the zonelist.
- */
-static int __init build_zonelists_node(pg_data_t *pgdat,
-			struct zonelist *zonelist, int nr_zones, int zone_type)
-{
-	struct zone *zone;
-
-	BUG_ON(zone_type > ZONE_HIGHMEM);
-
-	do {
-		zone = pgdat->node_zones + zone_type;
-		if (populated_zone(zone)) {
-#ifndef CONFIG_HIGHMEM
-			BUG_ON(zone_type > ZONE_NORMAL);
-#endif
-			zonelist->zones[nr_zones++] = zone;
-			check_highest_zone(zone_type);
-		}
-		zone_type--;
-
-	} while (zone_type >= 0);
-	return nr_zones;
-}
-
-static inline int highest_zone(int zone_bits)
-{
-	int res = ZONE_NORMAL;
-	if (zone_bits & (__force int)__GFP_HIGHMEM)
-		res = ZONE_HIGHMEM;
-	if (zone_bits & (__force int)__GFP_DMA32)
-		res = ZONE_DMA32;
-	if (zone_bits & (__force int)__GFP_DMA)
-		res = ZONE_DMA;
-	return res;
-}
-
-#ifdef CONFIG_NUMA
-#define MAX_NODE_LOAD (num_online_nodes())
-static int __initdata node_load[MAX_NUMNODES];
-/**
- * find_next_best_node - find the next node that should appear in a given node's fallback list
- * @node: node whose fallback list we're appending
- * @used_node_mask: nodemask_t of already used nodes
- *
- * We use a number of factors to determine which is the next node that should
- * appear on a given node's fallback list.  The node should not have appeared
- * already in @node's fallback list, and it should be the next closest node
- * according to the distance array (which contains arbitrary distance values
- * from each node to each node in the system), and should also prefer nodes
- * with no CPUs, since presumably they'll have very little allocation pressure
- * on them otherwise.
- * It returns -1 if no node is found.
- */
-static int __init find_next_best_node(int node, nodemask_t *used_node_mask)
-{
-	int n, val;
-	int min_val = INT_MAX;
-	int best_node = -1;
-
-	/* Use the local node if we haven't already */
-	if (!node_isset(node, *used_node_mask)) {
-		node_set(node, *used_node_mask);
-		return node;
-	}
-
-	for_each_online_node(n) {
-		cpumask_t tmp;
-
-		/* Don't want a node to appear more than once */
-		if (node_isset(n, *used_node_mask))
-			continue;
-
-		/* Use the distance array to find the distance */
-		val = node_distance(node, n);
-
-		/* Penalize nodes under us ("prefer the next node") */
-		val += (n < node);
-
-		/* Give preference to headless and unused nodes */
-		tmp = node_to_cpumask(n);
-		if (!cpus_empty(tmp))
-			val += PENALTY_FOR_NODE_WITH_CPUS;
-
-		/* Slight preference for less loaded node */
-		val *= (MAX_NODE_LOAD*MAX_NUMNODES);
-		val += node_load[n];
-
-		if (val < min_val) {
-			min_val = val;
-			best_node = n;
-		}
-	}
-
-	if (best_node >= 0)
-		node_set(best_node, *used_node_mask);
-
-	return best_node;
-}
-
-static void __init build_zonelists(pg_data_t *pgdat)
-{
-	int i, j, k, node, local_node;
-	int prev_node, load;
-	struct zonelist *zonelist;
-	nodemask_t used_mask;
-
-	/* initialize zonelists */
-	for (i = 0; i < GFP_ZONETYPES; i++) {
-		zonelist = pgdat->node_zonelists + i;
-		zonelist->zones[0] = NULL;
-	}
-
-	/* NUMA-aware ordering of nodes */
-	local_node = pgdat->node_id;
-	load = num_online_nodes();
-	prev_node = local_node;
-	nodes_clear(used_mask);
-	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
-		int distance = node_distance(local_node, node);
-
-		/*
-		 * If another node is sufficiently far away then it is better
-		 * to reclaim pages in a zone before going off node.
-		 */
-		if (distance > RECLAIM_DISTANCE)
-			zone_reclaim_mode = 1;
-
-		/*
-		 * We don't want to pressure a particular node.
-		 * So adding penalty to the first node in same
-		 * distance group to make it round-robin.
-		 */
-
-		if (distance != node_distance(local_node, prev_node))
-			node_load[node] += load;
-		prev_node = node;
-		load--;
-		for (i = 0; i < GFP_ZONETYPES; i++) {
-			zonelist = pgdat->node_zonelists + i;
-			for (j = 0; zonelist->zones[j] != NULL; j++);
-
-			k = highest_zone(i);
-
-	 		j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
-			zonelist->zones[j] = NULL;
-		}
-	}
-}
-
-#else	/* CONFIG_NUMA */
-
-static void __init build_zonelists(pg_data_t *pgdat)
-{
-	int i, j, k, node, local_node;
-
-	local_node = pgdat->node_id;
-	for (i = 0; i < GFP_ZONETYPES; i++) {
-		struct zonelist *zonelist;
-
-		zonelist = pgdat->node_zonelists + i;
-
-		j = 0;
-		k = highest_zone(i);
- 		j = build_zonelists_node(pgdat, zonelist, j, k);
- 		/*
- 		 * Now we build the zonelist so that it contains the zones
- 		 * of all the other nodes.
- 		 * We don't want to pressure a particular node, so when
- 		 * building the zones for node N, we make sure that the
- 		 * zones coming right after the local ones are those from
- 		 * node N+1 (modulo N)
- 		 */
-		for (node = local_node + 1; node < MAX_NUMNODES; node++) {
-			if (!node_online(node))
-				continue;
-			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
-		}
-		for (node = 0; node < local_node; node++) {
-			if (!node_online(node))
-				continue;
-			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
-		}
-
-		zonelist->zones[j] = NULL;
-	}
-}
-
-#endif	/* CONFIG_NUMA */
-
-void __init build_all_zonelists(void)
-{
-	int i;
-
-	for_each_online_node(i)
-		build_zonelists(NODE_DATA(i));
-	printk("Built %i zonelists\n", num_online_nodes());
-	cpuset_init_current_mems_allowed();
-}
-
-/*
- * Helper functions to size the waitqueue hash table.
- * Essentially these want to choose hash table sizes sufficiently
- * large so that collisions trying to wait on pages are rare.
- * But in fact, the number of active page waitqueues on typical
- * systems is ridiculously low, less than 200. So this is even
- * conservative, even though it seems large.
- *
- * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
- * waitqueues, i.e. the size of the waitq table given the number of pages.
- */
-#define PAGES_PER_WAITQUEUE	256
-
-static inline unsigned long wait_table_size(unsigned long pages)
-{
-	unsigned long size = 1;
-
-	pages /= PAGES_PER_WAITQUEUE;
-
-	while (size < pages)
-		size <<= 1;
-
-	/*
-	 * Once we have dozens or even hundreds of threads sleeping
-	 * on IO we've got bigger problems than wait queue collision.
-	 * Limit the size of the wait table to a reasonable size.
-	 */
-	size = min(size, 4096UL);
-
-	return max(size, 4UL);
-}
-
-/*
- * This is an integer logarithm so that shifts can be used later
- * to extract the more random high bits from the multiplicative
- * hash function before the remainder is taken.
- */
-static inline unsigned long wait_table_bits(unsigned long size)
-{
-	return ffz(~size);
-}
-
-#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
-
-/*
- * Initially all pages are reserved - free ones are freed
- * up by free_all_bootmem() once the early boot process is
- * done. Non-atomic initialization, single-pass.
- */
-void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
-		unsigned long start_pfn)
-{
-	struct page *page;
-	unsigned long end_pfn = start_pfn + size;
-	unsigned long pfn;
-
-	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
-		if (!early_pfn_valid(pfn))
-			continue;
-		page = pfn_to_page(pfn);
-		set_page_links(page, zone, nid, pfn);
-		init_page_count(page);
-		reset_page_mapcount(page);
-		SetPageReserved(page);
-		INIT_LIST_HEAD(&page->lru);
-#ifdef WANT_PAGE_VIRTUAL
-		/* The shift won't overflow because ZONE_NORMAL is below 4G. */
-		if (!is_highmem_idx(zone))
-			set_page_address(page, __va(pfn << PAGE_SHIFT));
-#endif
-	}
-}
-
-void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
-				unsigned long size)
-{
-	int order;
-	for (order = 0; order < MAX_ORDER ; order++) {
-		INIT_LIST_HEAD(&zone->free_area[order].free_list);
-		zone->free_area[order].nr_free = 0;
-	}
-}
-
-#define ZONETABLE_INDEX(x, zone_nr)	((x << ZONES_SHIFT) | zone_nr)
-void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
-		unsigned long size)
-{
-	unsigned long snum = pfn_to_section_nr(pfn);
-	unsigned long end = pfn_to_section_nr(pfn + size);
-
-	if (FLAGS_HAS_NODE)
-		zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
-	else
-		for (; snum <= end; snum++)
-			zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
-}
-
-#ifndef __HAVE_ARCH_MEMMAP_INIT
-#define memmap_init(size, nid, zone, start_pfn) \
-	memmap_init_zone((size), (nid), (zone), (start_pfn))
-#endif
-
-static int __cpuinit zone_batchsize(struct zone *zone)
-{
-	int batch;
-
-	/*
-	 * The per-cpu-pages pools are set to around 1000th of the
-	 * size of the zone.  But no more than 1/2 of a meg.
-	 *
-	 * OK, so we don't know how big the cache is.  So guess.
-	 */
-	batch = zone->present_pages / 1024;
-	if (batch * PAGE_SIZE > 512 * 1024)
-		batch = (512 * 1024) / PAGE_SIZE;
-	batch /= 4;		/* We effectively *= 4 below */
-	if (batch < 1)
-		batch = 1;
-
-	/*
-	 * Clamp the batch to a 2^n - 1 value. Having a power
-	 * of 2 value was found to be more likely to have
-	 * suboptimal cache aliasing properties in some cases.
-	 *
-	 * For example if 2 tasks are alternately allocating
-	 * batches of pages, one task can end up with a lot
-	 * of pages of one half of the possible page colors
-	 * and the other with pages of the other colors.
-	 */
-	batch = (1 << (fls(batch + batch/2)-1)) - 1;
-
-	return batch;
-}
-
-inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
-{
-	struct per_cpu_pages *pcp;
-
-	memset(p, 0, sizeof(*p));
-
-	pcp = &p->pcp[0];		/* hot */
-	pcp->count = 0;
-	pcp->high = 6 * batch;
-	pcp->batch = max(1UL, 1 * batch);
-	INIT_LIST_HEAD(&pcp->list);
-
-	pcp = &p->pcp[1];		/* cold*/
-	pcp->count = 0;
-	pcp->high = 2 * batch;
-	pcp->batch = max(1UL, batch/2);
-	INIT_LIST_HEAD(&pcp->list);
-}
-
-/*
- * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
- * to the value high for the pageset p.
- */
-
-static void setup_pagelist_highmark(struct per_cpu_pageset *p,
-				unsigned long high)
-{
-	struct per_cpu_pages *pcp;
-
-	pcp = &p->pcp[0]; /* hot list */
-	pcp->high = high;
-	pcp->batch = max(1UL, high/4);
-	if ((high/4) > (PAGE_SHIFT * 8))
-		pcp->batch = PAGE_SHIFT * 8;
-}
-
-
-#ifdef CONFIG_NUMA
-/*
- * Boot pageset table. One per cpu which is going to be used for all
- * zones and all nodes. The parameters will be set in such a way
- * that an item put on a list will immediately be handed over to
- * the buddy list. This is safe since pageset manipulation is done
- * with interrupts disabled.
- *
- * Some NUMA counter updates may also be caught by the boot pagesets.
- *
- * The boot_pagesets must be kept even after bootup is complete for
- * unused processors and/or zones. They do play a role for bootstrapping
- * hotplugged processors.
- *
- * zoneinfo_show() and maybe other functions do
- * not check if the processor is online before following the pageset pointer.
- * Other parts of the kernel may not check if the zone is available.
- */
-static struct per_cpu_pageset boot_pageset[NR_CPUS];
-
-/*
- * Dynamically allocate memory for the
- * per cpu pageset array in struct zone.
- */
-static int __cpuinit process_zones(int cpu)
-{
-	struct zone *zone, *dzone;
-
-	for_each_zone(zone) {
-
-		zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
-					 GFP_KERNEL, cpu_to_node(cpu));
-		if (!zone_pcp(zone, cpu))
-			goto bad;
-
-		setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
-
-		if (percpu_pagelist_fraction)
-			setup_pagelist_highmark(zone_pcp(zone, cpu),
-			 	(zone->present_pages / percpu_pagelist_fraction));
-	}
-
-	return 0;
-bad:
-	for_each_zone(dzone) {
-		if (dzone == zone)
-			break;
-		kfree(zone_pcp(dzone, cpu));
-		zone_pcp(dzone, cpu) = NULL;
-	}
-	return -ENOMEM;
-}
-
-static inline void free_zone_pagesets(int cpu)
-{
-	struct zone *zone;
-
-	for_each_zone(zone) {
-		struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
-
-		zone_pcp(zone, cpu) = NULL;
-		kfree(pset);
-	}
-}
-
-static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
-		unsigned long action,
-		void *hcpu)
-{
-	int cpu = (long)hcpu;
-	int ret = NOTIFY_OK;
-
-	switch (action) {
-		case CPU_UP_PREPARE:
-			if (process_zones(cpu))
-				ret = NOTIFY_BAD;
-			break;
-		case CPU_UP_CANCELED:
-		case CPU_DEAD:
-			free_zone_pagesets(cpu);
-			break;
-		default:
-			break;
-	}
-	return ret;
-}
-
-static struct notifier_block pageset_notifier =
-	{ &pageset_cpuup_callback, NULL, 0 };
-
-void __init setup_per_cpu_pageset(void)
-{
-	int err;
-
-	/* Initialize per_cpu_pageset for cpu 0.
-	 * A cpuup callback will do this for every cpu
-	 * as it comes online
-	 */
-	err = process_zones(smp_processor_id());
-	BUG_ON(err);
-	register_cpu_notifier(&pageset_notifier);
-}
-
-#endif
-
-static __meminit
-void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
-{
-	int i;
-	struct pglist_data *pgdat = zone->zone_pgdat;
-
-	/*
-	 * The per-page waitqueue mechanism uses hashed waitqueues
-	 * per zone.
-	 */
-	zone->wait_table_size = wait_table_size(zone_size_pages);
-	zone->wait_table_bits =	wait_table_bits(zone->wait_table_size);
-	zone->wait_table = (wait_queue_head_t *)
-		alloc_bootmem_node(pgdat, zone->wait_table_size
-					* sizeof(wait_queue_head_t));
-
-	for(i = 0; i < zone->wait_table_size; ++i)
-		init_waitqueue_head(zone->wait_table + i);
-}
-
-static __meminit void zone_pcp_init(struct zone *zone)
-{
-	int cpu;
-	unsigned long batch = zone_batchsize(zone);
-
-	for (cpu = 0; cpu < NR_CPUS; cpu++) {
-#ifdef CONFIG_NUMA
-		/* Early boot. Slab allocator not functional yet */
-		zone_pcp(zone, cpu) = &boot_pageset[cpu];
-		setup_pageset(&boot_pageset[cpu],0);
-#else
-		setup_pageset(zone_pcp(zone,cpu), batch);
-#endif
-	}
-	if (zone->present_pages)
-		printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
-			zone->name, zone->present_pages, batch);
-}
-
-static __meminit void init_currently_empty_zone(struct zone *zone,
-		unsigned long zone_start_pfn, unsigned long size)
-{
-	struct pglist_data *pgdat = zone->zone_pgdat;
-
-	zone_wait_table_init(zone, size);
-	pgdat->nr_zones = zone_idx(zone) + 1;
-
-	zone->zone_start_pfn = zone_start_pfn;
-
-	memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
-
-	zone_init_free_lists(pgdat, zone, zone->spanned_pages);
-}
-
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
-/* Note: nid == MAX_NUMNODES returns first region */
-static int __init first_active_region_index_in_nid(int nid)
-{
-	int i;
-	for (i = 0; early_node_map[i].end_pfn; i++) {
-		if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
-			return i;
-	}
-
-	return MAX_ACTIVE_REGIONS;
-}
-
-/* Note: nid == MAX_NUMNODES returns next region */
-static int __init next_active_region_index_in_nid(unsigned int index, int nid)
-{
-	for (index = index + 1; early_node_map[index].end_pfn; index++) {
-		if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
-			return index;
-	}
-
-	return MAX_ACTIVE_REGIONS;
-}
-
-#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
-int __init early_pfn_to_nid(unsigned long pfn)
-{
-	int i;
-
-	for (i = 0; early_node_map[i].end_pfn; i++) {
-		unsigned long start_pfn = early_node_map[i].start_pfn;
-		unsigned long end_pfn = early_node_map[i].end_pfn;
-
-		if ((start_pfn <= pfn) && (pfn < end_pfn))
-			return early_node_map[i].nid;
-	}
-
-	return -1;
-}
-#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
-
-#define for_each_active_range_index_in_nid(i, nid) \
-	for (i = first_active_region_index_in_nid(nid); \
-				i != MAX_ACTIVE_REGIONS; \
-				i = next_active_region_index_in_nid(i, nid))
-
-void __init free_bootmem_with_active_regions(int nid,
-						unsigned long max_low_pfn)
-{
-	unsigned int i;
-	for_each_active_range_index_in_nid(i, nid) {
-		unsigned long size_pages = 0;
-		unsigned long end_pfn = early_node_map[i].end_pfn;
-		if (early_node_map[i].start_pfn >= max_low_pfn)
-			continue;
-
-		if (end_pfn > max_low_pfn)
-			end_pfn = max_low_pfn;
-
-		size_pages = end_pfn - early_node_map[i].start_pfn;
-		free_bootmem_node(NODE_DATA(early_node_map[i].nid),
-				PFN_PHYS(early_node_map[i].start_pfn),
-				PFN_PHYS(size_pages));
-	}
-}
-
-void __init sparse_memory_present_with_active_regions(int nid)
-{
-	unsigned int i;
-	for_each_active_range_index_in_nid(i, nid)
-		memory_present(early_node_map[i].nid,
-				early_node_map[i].start_pfn,
-				early_node_map[i].end_pfn);
-}
-
-void __init get_pfn_range_for_nid(unsigned int nid,
-			unsigned long *start_pfn, unsigned long *end_pfn)
-{
-	unsigned int i;
-	*start_pfn = -1UL;
-	*end_pfn = 0;
-
-	for_each_active_range_index_in_nid(i, nid) {
-		if (early_node_map[i].start_pfn < *start_pfn)
-			*start_pfn = early_node_map[i].start_pfn;
-
-		if (early_node_map[i].end_pfn > *end_pfn)
-			*end_pfn = early_node_map[i].end_pfn;
-	}
-
-	if (*start_pfn == -1UL) {
-		printk(KERN_WARNING "Node %u active with no memory\n", nid);
-		*start_pfn = 0;
-	}
-}
-
-unsigned long __init zone_present_pages_in_node(int nid,
-					unsigned long zone_type,
-					unsigned long *ignored)
-{
-	unsigned long node_start_pfn, node_end_pfn;
-	unsigned long zone_start_pfn, zone_end_pfn;
-
-	/* Get the start and end of the node and zone */
-	get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
-	zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
-	zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
-
-	/* Check that this node has pages within the zone's required range */
-	if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
-		return 0;
-
-	/* Move the zone boundaries inside the node if necessary */
-	if (zone_end_pfn > node_end_pfn)
-		zone_end_pfn = node_end_pfn;
-	if (zone_start_pfn < node_start_pfn)
-		zone_start_pfn = node_start_pfn;
-
-	/* Return the spanned pages */
-	return zone_end_pfn - zone_start_pfn;
-}
-
-unsigned long __init __absent_pages_in_range(int nid,
-				unsigned long range_start_pfn,
-				unsigned long range_end_pfn)
-{
-	int i = 0;
-	unsigned long prev_end_pfn = 0, hole_pages = 0;
-	unsigned long start_pfn;
-
-	/* Find the end_pfn of the first active range of pfns in the node */
-	i = first_active_region_index_in_nid(nid);
-	prev_end_pfn = early_node_map[i].start_pfn;
-
-	/* Find all holes for the zone within the node */
-	for (; i != MAX_ACTIVE_REGIONS;
-			i = next_active_region_index_in_nid(i, nid)) {
-
-		/* No need to continue if prev_end_pfn is outside the zone */
-		if (prev_end_pfn >= range_end_pfn)
-			break;
-
-		/* Make sure the end of the zone is not within the hole */
-		start_pfn = early_node_map[i].start_pfn;
-		if (start_pfn > range_end_pfn)
-			start_pfn = range_end_pfn;
-		if (prev_end_pfn < range_start_pfn)
-			prev_end_pfn = range_start_pfn;
-
-		/* Update the hole size cound and move on */
-		if (start_pfn > range_start_pfn) {
-			BUG_ON(prev_end_pfn > start_pfn);
-			hole_pages += start_pfn - prev_end_pfn;
-		}
-		prev_end_pfn = early_node_map[i].end_pfn;
-	}
-
-	return hole_pages;
-}
-
-unsigned long __init absent_pages_in_range(unsigned long start_pfn,
-							unsigned long end_pfn)
-{
-	return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
-}
-
-unsigned long __init zone_absent_pages_in_node(int nid,
-					unsigned long zone_type,
-					unsigned long *ignored)
-{
-	return __absent_pages_in_range(nid,
-				arch_zone_lowest_possible_pfn[zone_type],
-				arch_zone_highest_possible_pfn[zone_type]);
-}
-#else
-static inline unsigned long zone_present_pages_in_node(int nid,
-					unsigned long zone_type,
-					unsigned long *zones_size)
-{
-	return zones_size[zone_type];
-}
-
-static inline unsigned long zone_absent_pages_in_node(int nid,
-						unsigned long zone_type,
-						unsigned long *zholes_size)
-{
-	if (!zholes_size)
-		return 0;
-
-	return zholes_size[zone_type];
-}
-#endif
-
-static void __init calculate_node_totalpages(struct pglist_data *pgdat,
-		unsigned long *zones_size, unsigned long *zholes_size)
-{
-	unsigned long realtotalpages, totalpages = 0;
-	int i;
-
-	for (i = 0; i < MAX_NR_ZONES; i++) {
-		totalpages += zone_present_pages_in_node(pgdat->node_id, i,
-								zones_size);
-	}
-	pgdat->node_spanned_pages = totalpages;
-
-	realtotalpages = totalpages;
-	for (i = 0; i < MAX_NR_ZONES; i++) {
-		realtotalpages -=
-			zone_absent_pages_in_node(pgdat->node_id, i, zholes_size);
-	}
-	pgdat->node_present_pages = realtotalpages;
-	printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
-							realtotalpages);
-}
-
-/*
- * Set up the zone data structures:
- *   - mark all pages reserved
- *   - mark all memory queues empty
- *   - clear the memory bitmaps
- */
-static void __init free_area_init_core(struct pglist_data *pgdat,
-		unsigned long *zones_size, unsigned long *zholes_size)
-{
-	unsigned long j;
-	int nid = pgdat->node_id;
-	unsigned long zone_start_pfn = pgdat->node_start_pfn;
-
-	pgdat_resize_init(pgdat);
-	pgdat->nr_zones = 0;
-	init_waitqueue_head(&pgdat->kswapd_wait);
-	pgdat->kswapd_max_order = 0;
-	
-	for (j = 0; j < MAX_NR_ZONES; j++) {
-		struct zone *zone = pgdat->node_zones + j;
-		unsigned long size, realsize;
-
-		size = zone_present_pages_in_node(nid, j, zones_size);
-		realsize = size - zone_absent_pages_in_node(nid, j,
-								zholes_size);
-		if (j < ZONE_HIGHMEM)
-			nr_kernel_pages += realsize;
-		nr_all_pages += realsize;
-
-		zone->spanned_pages = size;
-		zone->present_pages = realsize;
-		zone->name = zone_names[j];
-		spin_lock_init(&zone->lock);
-		spin_lock_init(&zone->lru_lock);
-		zone_seqlock_init(zone);
-		zone->zone_pgdat = pgdat;
-		zone->free_pages = 0;
-
-		zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
-
-		zone_pcp_init(zone);
-		INIT_LIST_HEAD(&zone->active_list);
-		INIT_LIST_HEAD(&zone->inactive_list);
-		zone->nr_scan_active = 0;
-		zone->nr_scan_inactive = 0;
-		zone->nr_active = 0;
-		zone->nr_inactive = 0;
-		atomic_set(&zone->reclaim_in_progress, 0);
-		if (!size)
-			continue;
-
-		zonetable_add(zone, nid, j, zone_start_pfn, size);
-		init_currently_empty_zone(zone, zone_start_pfn, size);
-		zone_start_pfn += size;
-	}
-}
-
-static void __init alloc_node_mem_map(struct pglist_data *pgdat)
-{
-	/* Skip empty nodes */
-	if (!pgdat->node_spanned_pages)
-		return;
-
-#ifdef CONFIG_FLAT_NODE_MEM_MAP
-	/* ia64 gets its own node_mem_map, before this, without bootmem */
-	if (!pgdat->node_mem_map) {
-		unsigned long size;
-		struct page *map;
-
-		size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
-		map = alloc_remap(pgdat->node_id, size);
-		if (!map)
-			map = alloc_bootmem_node(pgdat, size);
-		pgdat->node_mem_map = map;
-	}
-#ifdef CONFIG_FLATMEM
-	/*
-	 * With no DISCONTIG, the global mem_map is just set as node 0's
-	 */
-	if (pgdat == NODE_DATA(0))
-		mem_map = NODE_DATA(0)->node_mem_map;
-#endif
-#endif /* CONFIG_FLAT_NODE_MEM_MAP */
-}
-
-void __init free_area_init_node(int nid, struct pglist_data *pgdat,
-		unsigned long *zones_size, unsigned long node_start_pfn,
-		unsigned long *zholes_size)
-{
-	pgdat->node_id = nid;
-	pgdat->node_start_pfn = node_start_pfn;
-	calculate_node_totalpages(pgdat, zones_size, zholes_size);
-
-	alloc_node_mem_map(pgdat);
-
-	free_area_init_core(pgdat, zones_size, zholes_size);
-}
-
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
-void __init add_active_range(unsigned int nid, unsigned long start_pfn,
-						unsigned long end_pfn)
-{
-	unsigned int i;
-
-	/* Merge with existing active regions if possible */
-	for (i = 0; early_node_map[i].end_pfn; i++) {
-		if (early_node_map[i].nid != nid)
-			continue;
-
-		/* Skip if an existing region covers this new one */
-		if (start_pfn >= early_node_map[i].start_pfn &&
-				end_pfn <= early_node_map[i].end_pfn)
-			return;
-
-		/* Merge forward if suitable */
-		if (start_pfn <= early_node_map[i].end_pfn &&
-				end_pfn > early_node_map[i].end_pfn) {
-			early_node_map[i].end_pfn = end_pfn;
-			return;
-		}
-
-		/* Merge backward if suitable */
-		if (start_pfn < early_node_map[i].end_pfn &&
-				end_pfn >= early_node_map[i].start_pfn) {
-			early_node_map[i].start_pfn = start_pfn;
-			return;
-		}
-	}
-
-	/* Leave last entry NULL, we use range.end_pfn to terminate the walk */
-	if (i >= MAX_ACTIVE_REGIONS - 1) {
-		printk(KERN_ERR "Too many memory regions, truncating\n");
-		return;
-	}
-
-	early_node_map[i].nid = nid;
-	early_node_map[i].start_pfn = start_pfn;
-	early_node_map[i].end_pfn = end_pfn;
-}
-
-void __init remove_all_active_ranges()
-{
-	memset(early_node_map, 0, sizeof(early_node_map));
-}
-
-/* Compare two active node_active_regions */
-static int __init cmp_node_active_region(const void *a, const void *b)
-{
-	struct node_active_region *arange = (struct node_active_region *)a;
-	struct node_active_region *brange = (struct node_active_region *)b;
-
-	/* Done this way to avoid overflows */
-	if (arange->start_pfn > brange->start_pfn)
-		return 1;
-	if (arange->start_pfn < brange->start_pfn)
-		return -1;
-
-	return 0;
-}
-
-/* sort the node_map by start_pfn */
-static void __init sort_node_map(void)
-{
-	size_t num = 0;
-	while (early_node_map[num].end_pfn)
-		num++;
-
-	sort(early_node_map, num, sizeof(struct node_active_region),
-						cmp_node_active_region, NULL);
-}
-
-/* Find the lowest pfn for a node. This depends on a sorted early_node_map */
-unsigned long __init find_min_pfn_for_node(unsigned long nid)
-{
-	int i;
-
-	/* Assuming a sorted map, the first range found has the starting pfn */
-	for_each_active_range_index_in_nid(i, nid)
-		return early_node_map[i].start_pfn;
-
-	/* nid does not exist in early_node_map */
-	printk(KERN_WARNING "Could not find start_pfn for node %lu\n", nid);
-	return 0;
-}
-
-unsigned long __init find_min_pfn_with_active_regions(void)
-{
-	return find_min_pfn_for_node(MAX_NUMNODES);
-}
-
-unsigned long __init find_max_pfn_with_active_regions(void)
-{
-	int i;
-	unsigned long max_pfn = 0;
-
-	for (i = 0; early_node_map[i].end_pfn; i++)
-		max_pfn = max(max_pfn, early_node_map[i].end_pfn);
-
-	return max_pfn;
-}
-
-void __init free_area_init_nodes(unsigned long arch_max_dma_pfn,
-				unsigned long arch_max_dma32_pfn,
-				unsigned long arch_max_low_pfn,
-				unsigned long arch_max_high_pfn)
-{
-	unsigned long nid;
-	int zone_index;
-
-	/* Record where the zone boundaries are */
-	memset(arch_zone_lowest_possible_pfn, 0,
-				sizeof(arch_zone_lowest_possible_pfn));
-	memset(arch_zone_highest_possible_pfn, 0,
-				sizeof(arch_zone_highest_possible_pfn));
-	arch_zone_lowest_possible_pfn[ZONE_DMA] =
-					find_min_pfn_with_active_regions();
-	arch_zone_highest_possible_pfn[ZONE_DMA] = arch_max_dma_pfn;
-	arch_zone_highest_possible_pfn[ZONE_DMA32] = arch_max_dma32_pfn;
-	arch_zone_highest_possible_pfn[ZONE_NORMAL] = arch_max_low_pfn;
-	arch_zone_highest_possible_pfn[ZONE_HIGHMEM] = arch_max_high_pfn;
-	for (zone_index = 1; zone_index < MAX_NR_ZONES; zone_index++) {
-		arch_zone_lowest_possible_pfn[zone_index] =
-			arch_zone_highest_possible_pfn[zone_index-1];
-	}
-
-	/* Regions in the early_node_map can be in any order */
-	sort_node_map();
-
-	for_each_online_node(nid) {
-		pg_data_t *pgdat = NODE_DATA(nid);
-		free_area_init_node(nid, pgdat, NULL,
-				find_min_pfn_for_node(nid), NULL);
-	}
-}
-#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
-
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 static bootmem_data_t contig_bootmem_data;
 struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
@@ -3020,32 +2021,6 @@ int lowmem_reserve_ratio_sysctl_handler(
 	return 0;
 }
 
-/*
- * percpu_pagelist_fraction - changes the pcp->high for each zone on each
- * cpu.  It is the fraction of total pages in each zone that a hot per cpu pagelist
- * can have before it gets flushed back to buddy allocator.
- */
-
-int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
-	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
-{
-	struct zone *zone;
-	unsigned int cpu;
-	int ret;
-
-	ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
-	if (!write || (ret == -EINVAL))
-		return ret;
-	for_each_zone(zone) {
-		for_each_online_cpu(cpu) {
-			unsigned long  high;
-			high = zone->present_pages / percpu_pagelist_fraction;
-			setup_pagelist_highmark(zone_pcp(zone, cpu), high);
-		}
-	}
-	return 0;
-}
-
 __initdata int hashdist = HASHDIST_DEFAULT;
 
 #ifdef CONFIG_NUMA

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 0/7] Sizing zones and holes in an architecture independent manner V5
@ 2006-05-01 13:35 Mel Gorman
  2006-05-01 13:35 ` [PATCH 1/7] Introduce mechanism for registering active regions of memory Mel Gorman
                   ` (6 more replies)
  0 siblings, 7 replies; 11+ messages in thread
From: Mel Gorman @ 2006-05-01 13:35 UTC (permalink / raw)
  To: akpm, davej, tony.luck, linuxppc-dev, linux-kernel, bob.picco, ak,
	linux-mm
  Cc: Mel Gorman

This is V5 of the patchset to size zones and memory holes in
an architecture-independent manner. This has been rebased against
2.6.17-rc3-mm1 and, as there were no objections to release V4, I would
like to have it considered for merging. If there are merge conflicts with
later trees, let me know what to rebase against.

The reasons why I'd like to have this merged include:

 o Less architecture-specific code - particularly for x86 and ppc64
 o More maintainable. Changes to zone layout need only be made in one place
 o Zone-sizing and memory hole calculation is one less job that needs to be
   done for new architecture ports
 o With the architecture-independent representation, zone-based
   anti-fragmentation needs a lot less architecture-specific code, making it
   more portable between architectures. This will be important for future
   hugepage-availability work
 o Nigel Cunningham has stated that software suspend could potentially
   use the architecture-independent representation to discover what pages
   need to be saved during suspend

Changelog since V4
o Rebase to 2.6.17-rc3-mm1
o Calculate holes on x86 with SRAT correctly

Changelog since V3
o Rebase to 2.6.17-rc2
o Allow the active regions to be cleared. Needed by x86_64 when it decides
  the SRAT table is bad half way through the registering of active regions
o Fix for flatmem x86_64 machines booting

Changelog since V2
o Fix a bug where holes in lower zones get double counted
o Catch the case where a new range is registered that is within an existing range
o Catch the case where a zone boundary is within a hole
o Use the EFI map for registering ranges on x86_64+numa
o On IA64+NUMA, add the active ranges before rounding for granules
o On x86_64, remove e820_hole_size and e820_bootmem_free and use
  arch-independent equivalents
o On x86_64, remove the map walk in e820_end_of_ram()
o Rename memory_present_with_active_regions, name ambiguous
o Add absent_pages_in_range() for arches to call

Changelog since V1
o Correctly convert virtual and physical addresses to PFNs on ia64
o Correctly convert physical addresses to PFNs on older ppc
o When add_active_range() is called with overlapping pfn ranges, merge them
o When a zone boundary occurs within a memory hole, account correctly
o Minor whitespace damage cleanup
o Debugging patch temporarily included

At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate
zone sizes and holes in each architecture is very similar.  Some of this
zone and hole sizing code is difficult to read for no good reason. This
set of patches eliminates the similar-looking architecture-specific code.

The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas
have been discovered, free_area_init_nodes() is called to initialise
the pgdat and zones. The zone sizes and holes are then calculated in an
architecture independent manner.
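
As a rough sketch of the expected calling sequence (the PFN values below are
invented purely for illustration and are not taken from any of the patches),
a flat-memory architecture with a single node would do something like:

	/* One usable span on node 0: PFNs 0 to 0x40000 (1GiB with 4K pages) */
	add_active_range(0, 0, 0x40000);

	/* End PFNs for DMA, DMA32, Normal and HighMem (DMA32 collapsed to DMA) */
	free_area_init_nodes(0x1000, 0x1000, 0x38000, 0x40000);

In the per-architecture patches, these two calls replace the old code that
filled in the zones_size[] and zholes_size[] arrays.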

Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 128 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 142 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 94 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 57 arch-specific LOC removed

At this point, there is a reduction of 421 architecture-specific lines of code
and a net reduction of 25 lines. The arch-independent code is a lot easier
to read in comparison to some of the arch-specific stuff, particularly in
arch/i386/ .

For Patch 6, it was also noted that page_alloc.c has a *lot* of
initialisation code which makes the file harder to read than it needs to
be. Patch 6 creates a new file mem_init.c and moves a lot of initialisation
code from page_alloc.c to it. After the patch is applied, there is still a net
loss of 8 lines.

The patches have been successfully boot tested by me and verified that the
zones are the correct size on

o x86, flatmem with 1.5GiB of RAM
o x86, NUMAQ
o x86, NUMA, with SRAT
o x86 with SRAT CONFIG_NUMA=n
o PPC64, NUMA
o PPC64, CONFIG_NUMA=n
o Power, RS6000 (Had difficulty here with missing __udivdi3 symbol in pci_32.o)
o x86_64, NUMA with SRAT
o x86_64, NUMA with broken SRAT that falls back to k8topology discovery
o x86_64, ACPI_NUMA, ACPI_MEMORY_HOTPLUG && !SPARSEMEM to trigger the
  hotadd path without sparsemem fun in srat.c (SRAT broken on test machine and
  I'm pretty sure the machine does not support physical memory hotadd anyway
  so test may not have been effective other than being a compile test.)
o x86_64, CONFIG_NUMA=n
o x86_64, AMD64 desktop machine with flatmem

Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged
on IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 but there have been no
ia64 changes made between releases 3 and 5 of these patches.

There are differences in the zone sizes for x86_64 because the arch-specific
code accounts the kernel image and the starting mem_maps as memory holes,
while the architecture-independent code accounts that memory as present.

The net reduction seems small but the big benefit of this set of patches
is the reduction of 421 lines of architecture-specific code, some of
which is very hairy. There should be a greater net reduction when other
architectures use the same mechanisms for zone and hole sizing but I lack
the hardware to test on.

Comments?

Additional credit;
	Dave Hansen for the initial suggestion and comments on early patches
	Andy Whitcroft for reviewing early versions and catching numerous errors
	Tony Luck for testing and debugging on IA64
	Bob Picco for testing and fixing bugs related to pfn registration
	Jack Steiner and Yasunori for testing on IA64

 arch/i386/Kconfig           |    8 
 arch/i386/kernel/setup.c    |   19 
 arch/i386/kernel/srat.c     |  101 ---
 arch/i386/mm/discontig.c    |   65 --
 arch/ia64/Kconfig           |    3 
 arch/ia64/mm/contig.c       |   60 --
 arch/ia64/mm/discontig.c    |   41 -
 arch/ia64/mm/init.c         |   12 
 arch/powerpc/Kconfig        |   13 
 arch/powerpc/mm/mem.c       |   53 --
 arch/powerpc/mm/numa.c      |  157 ------
 arch/ppc/Kconfig            |    3 
 arch/ppc/mm/init.c          |   26 -
 arch/x86_64/Kconfig         |    3 
 arch/x86_64/kernel/e820.c   |  109 +---
 arch/x86_64/kernel/setup.c  |    7 
 arch/x86_64/mm/init.c       |   62 --
 arch/x86_64/mm/k8topology.c |    3 
 arch/x86_64/mm/numa.c       |   18 
 arch/x86_64/mm/srat.c       |   11 
 include/asm-ia64/meminit.h  |    1 
 include/asm-x86_64/e820.h   |    5 
 include/asm-x86_64/proto.h  |    2 
 include/linux/mm.h          |   34 +
 include/linux/mmzone.h      |   10 
 mm/Makefile                 |    2 
 mm/mem_init.c               | 1121 ++++++++++++++++++++++++++++++++++++++++++
 mm/page_alloc.c             |  750 -----------------------------
-- 
Mel Gorman
Part-time PhD Student                          Linux Technology Center
University of Limerick                         IBM Dublin Software Lab

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 1/7] Introduce mechanism for registering active regions of memory
  2006-05-01 13:35 [PATCH 0/7] Sizing zones and holes in an architecture independent manner V5 Mel Gorman
@ 2006-05-01 13:35 ` Mel Gorman
  2006-05-01 13:36 ` [PATCH 2/7] Have Power use add_active_range() and free_area_init_nodes() Mel Gorman
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 11+ messages in thread
From: Mel Gorman @ 2006-05-01 13:35 UTC (permalink / raw)
  To: akpm, davej, tony.luck, linux-mm, linux-kernel, bob.picco, ak,
	linuxppc-dev
  Cc: Mel Gorman


This patch defines the structure to represent an active range of page
frames within a node in an architecture independent manner. Architectures
are expected to register active ranges of PFNs using add_active_range(nid,
start_pfn, end_pfn) and call free_area_init_nodes() passing the end PFN of
each zone.
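
As a rough illustration only (the PFNs are invented and not taken from any
architecture), a node with a hole in its physical memory registers each
usable span separately and the core code accounts for the hole itself:

	/* Two usable spans on node 0, with a hole from 0x9f000 to 0x100000 */
	add_active_range(0, 0, 0x9f000);
	add_active_range(0, 0x100000, 0x140000);

	/*
	 * End PFNs for DMA, DMA32, Normal and HighMem.  The pages in the
	 * hole are counted by __absent_pages_in_range() and subtracted
	 * from the spanned pages when the zones are initialised.
	 */
	free_area_init_nodes(0x1000, 0x1000, 0x140000, 0x140000);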


 include/linux/mm.h     |   34 +++
 include/linux/mmzone.h |   10 -
 mm/page_alloc.c        |  402 +++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 421 insertions(+), 25 deletions(-)

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-clean/include/linux/mm.h linux-2.6.17-rc3-mm1-101-add_free_area_init_nodes/include/linux/mm.h
--- linux-2.6.17-rc3-mm1-clean/include/linux/mm.h	2006-05-01 11:37:01.000000000 +0100
+++ linux-2.6.17-rc3-mm1-101-add_free_area_init_nodes/include/linux/mm.h	2006-05-01 11:39:02.000000000 +0100
@@ -916,6 +916,40 @@ extern void free_area_init(unsigned long
 extern void free_area_init_node(int nid, pg_data_t *pgdat,
 	unsigned long * zones_size, unsigned long zone_start_pfn, 
 	unsigned long *zholes_size);
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+/*
+ * Any architecture that supports CONFIG_ARCH_POPULATES_NODE_MAP can
+ * initialise zone and hole information by
+ *
+ * for_all_memory_regions()
+ * 	add_active_range(nid, start, end)
+ * free_area_init_nodes(max_dma, max_dma32, max_low_pfn, max_pfn);
+ *
+ * Optionally, free_bootmem_with_active_regions() can be used to call
+ * free_bootmem_node() after active regions have been registered with
+ * add_active_range(). Similarly, sparse_memory_present_with_active_regions()
+ * calls memory_present() for active regions when SPARSEMEM is enabled
+ */
+extern void free_area_init_nodes(unsigned long max_dma_pfn,
+					unsigned long max_dma32_pfn,
+					unsigned long max_low_pfn,
+					unsigned long max_high_pfn);
+extern void add_active_range(unsigned int nid, unsigned long start_pfn,
+					unsigned long end_pfn);
+extern void shrink_active_range(unsigned int nid, unsigned long old_end_pfn,
+						unsigned long new_end_pfn);
+extern void remove_all_active_ranges(void);
+extern unsigned long absent_pages_in_range(unsigned long start_pfn,
+						unsigned long end_pfn);
+extern void get_pfn_range_for_nid(unsigned int nid,
+			unsigned long *start_pfn, unsigned long *end_pfn);
+extern unsigned long find_min_pfn_with_active_regions(void);
+extern unsigned long find_max_pfn_with_active_regions(void);
+extern int early_pfn_to_nid(unsigned long pfn);
+extern void free_bootmem_with_active_regions(int nid,
+						unsigned long max_low_pfn);
+extern void sparse_memory_present_with_active_regions(int nid);
+#endif
 extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long);
 extern void setup_per_zone_pages_min(void);
 extern void mem_init(void);
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-clean/include/linux/mmzone.h linux-2.6.17-rc3-mm1-101-add_free_area_init_nodes/include/linux/mmzone.h
--- linux-2.6.17-rc3-mm1-clean/include/linux/mmzone.h	2006-05-01 11:37:01.000000000 +0100
+++ linux-2.6.17-rc3-mm1-101-add_free_area_init_nodes/include/linux/mmzone.h	2006-05-01 11:39:02.000000000 +0100
@@ -271,6 +271,13 @@ struct zonelist {
 	struct zone *zones[MAX_NUMNODES * MAX_NR_ZONES + 1]; // NULL delimited
 };
 
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+struct node_active_region {
+	unsigned long start_pfn;
+	unsigned long end_pfn;
+	int nid;
+};
+#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
 
 /*
  * The pg_data_t structure is used in machines with CONFIG_DISCONTIGMEM
@@ -468,7 +475,8 @@ extern struct zone *next_zone(struct zon
 
 #endif
 
-#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
+#if !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) && \
+	!defined(CONFIG_ARCH_POPULATES_NODE_MAP)
 #define early_pfn_to_nid(nid)  (0UL)
 #endif
 
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-clean/mm/page_alloc.c linux-2.6.17-rc3-mm1-101-add_free_area_init_nodes/mm/page_alloc.c
--- linux-2.6.17-rc3-mm1-clean/mm/page_alloc.c	2006-05-01 11:37:01.000000000 +0100
+++ linux-2.6.17-rc3-mm1-101-add_free_area_init_nodes/mm/page_alloc.c	2006-05-01 11:39:02.000000000 +0100
@@ -38,6 +38,8 @@
 #include <linux/vmalloc.h>
 #include <linux/mempolicy.h>
 #include <linux/stop_machine.h>
+#include <linux/sort.h>
+#include <linux/pfn.h>
 
 #include <asm/tlbflush.h>
 #include "internal.h"
@@ -86,6 +88,18 @@ int min_free_kbytes = 1024;
 unsigned long __meminitdata nr_kernel_pages;
 unsigned long __meminitdata nr_all_pages;
 
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+  #ifdef CONFIG_MAX_ACTIVE_REGIONS
+    #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
+  #else
+    #define MAX_ACTIVE_REGIONS (MAX_NR_ZONES * MAX_NUMNODES + 1)
+  #endif
+
+  struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS];
+  unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
+  unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
+#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+
 #ifdef CONFIG_DEBUG_VM
 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
 {
@@ -1864,25 +1878,6 @@ static inline unsigned long wait_table_b
 
 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
 
-static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
-		unsigned long *zones_size, unsigned long *zholes_size)
-{
-	unsigned long realtotalpages, totalpages = 0;
-	int i;
-
-	for (i = 0; i < MAX_NR_ZONES; i++)
-		totalpages += zones_size[i];
-	pgdat->node_spanned_pages = totalpages;
-
-	realtotalpages = totalpages;
-	if (zholes_size)
-		for (i = 0; i < MAX_NR_ZONES; i++)
-			realtotalpages -= zholes_size[i];
-	pgdat->node_present_pages = realtotalpages;
-	printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
-}
-
-
 /*
  * Initially all pages are reserved - free ones are freed
  * up by free_all_bootmem() once the early boot process is
@@ -2200,6 +2195,215 @@ __meminit int init_currently_empty_zone(
 	return 0;
 }
 
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+/* Note: nid == MAX_NUMNODES returns first region */
+static int __init first_active_region_index_in_nid(int nid)
+{
+	int i;
+	for (i = 0; early_node_map[i].end_pfn; i++) {
+		if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
+			return i;
+	}
+
+	return MAX_ACTIVE_REGIONS;
+}
+
+/* Note: nid == MAX_NUMNODES returns next region */
+static int __init next_active_region_index_in_nid(unsigned int index, int nid)
+{
+	for (index = index + 1; early_node_map[index].end_pfn; index++) {
+		if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
+			return index;
+	}
+
+	return MAX_ACTIVE_REGIONS;
+}
+
+#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
+int __init early_pfn_to_nid(unsigned long pfn)
+{
+	int i;
+
+	for (i = 0; early_node_map[i].end_pfn; i++) {
+		unsigned long start_pfn = early_node_map[i].start_pfn;
+		unsigned long end_pfn = early_node_map[i].end_pfn;
+
+		if ((start_pfn <= pfn) && (pfn < end_pfn))
+			return early_node_map[i].nid;
+	}
+
+	return -1;
+}
+#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
+
+#define for_each_active_range_index_in_nid(i, nid) \
+	for (i = first_active_region_index_in_nid(nid); \
+				i != MAX_ACTIVE_REGIONS; \
+				i = next_active_region_index_in_nid(i, nid))
+
+void __init free_bootmem_with_active_regions(int nid,
+						unsigned long max_low_pfn)
+{
+	unsigned int i;
+	for_each_active_range_index_in_nid(i, nid) {
+		unsigned long size_pages = 0;
+		unsigned long end_pfn = early_node_map[i].end_pfn;
+		if (early_node_map[i].start_pfn >= max_low_pfn)
+			continue;
+
+		if (end_pfn > max_low_pfn)
+			end_pfn = max_low_pfn;
+
+		size_pages = end_pfn - early_node_map[i].start_pfn;
+		free_bootmem_node(NODE_DATA(early_node_map[i].nid),
+				PFN_PHYS(early_node_map[i].start_pfn),
+				size_pages << PAGE_SHIFT);
+	}
+}
+
+void __init sparse_memory_present_with_active_regions(int nid)
+{
+	unsigned int i;
+	for_each_active_range_index_in_nid(i, nid)
+		memory_present(early_node_map[i].nid,
+				early_node_map[i].start_pfn,
+				early_node_map[i].end_pfn);
+}
+
+void __init get_pfn_range_for_nid(unsigned int nid,
+			unsigned long *start_pfn, unsigned long *end_pfn)
+{
+	unsigned int i;
+	*start_pfn = -1UL;
+	*end_pfn = 0;
+
+	for_each_active_range_index_in_nid(i, nid) {
+		*start_pfn = min(*start_pfn, early_node_map[i].start_pfn);
+		*end_pfn = max(*end_pfn, early_node_map[i].end_pfn);
+	}
+
+	if (*start_pfn == -1UL) {
+		printk(KERN_WARNING "Node %u active with no memory\n", nid);
+		*start_pfn = 0;
+	}
+}
+
+unsigned long __init zone_present_pages_in_node(int nid,
+					unsigned long zone_type,
+					unsigned long *ignored)
+{
+	unsigned long node_start_pfn, node_end_pfn;
+	unsigned long zone_start_pfn, zone_end_pfn;
+
+	/* Get the start and end of the node and zone */
+	get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
+	zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
+	zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
+
+	/* Check that this node has pages within the zone's required range */
+	if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
+		return 0;
+
+	/* Move the zone boundaries inside the node if necessary */
+	zone_end_pfn = min(zone_end_pfn, node_end_pfn);
+	zone_start_pfn = max(zone_start_pfn, node_start_pfn);
+
+	/* Return the spanned pages */
+	return zone_end_pfn - zone_start_pfn;
+}
+
+unsigned long __init __absent_pages_in_range(int nid,
+				unsigned long range_start_pfn,
+				unsigned long range_end_pfn)
+{
+	int i = 0;
+	unsigned long prev_end_pfn = 0, hole_pages = 0;
+	unsigned long start_pfn;
+
+	/* Find the end_pfn of the first active range of pfns in the node */
+	i = first_active_region_index_in_nid(nid);
+	if (i == MAX_ACTIVE_REGIONS)
+		return 0;
+	prev_end_pfn = early_node_map[i].start_pfn;
+
+	/* Find all holes for the zone within the node */
+	for (; i != MAX_ACTIVE_REGIONS;
+			i = next_active_region_index_in_nid(i, nid)) {
+
+		/* No need to continue if prev_end_pfn is outside the zone */
+		if (prev_end_pfn >= range_end_pfn)
+			break;
+
+		/* Make sure the end of the zone is not within the hole */
+		start_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
+		prev_end_pfn = max(prev_end_pfn, range_start_pfn);
+
+		/* Update the hole size cound and move on */
+		if (start_pfn > range_start_pfn) {
+			BUG_ON(prev_end_pfn > start_pfn);
+			hole_pages += start_pfn - prev_end_pfn;
+		}
+		prev_end_pfn = early_node_map[i].end_pfn;
+	}
+
+	return hole_pages;
+}
+
+unsigned long __init absent_pages_in_range(unsigned long start_pfn,
+							unsigned long end_pfn)
+{
+	return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
+}
+
+unsigned long __init zone_absent_pages_in_node(int nid,
+					unsigned long zone_type,
+					unsigned long *ignored)
+{
+	return __absent_pages_in_range(nid,
+				arch_zone_lowest_possible_pfn[zone_type],
+				arch_zone_highest_possible_pfn[zone_type]);
+}
+#else
+static inline unsigned long zone_present_pages_in_node(int nid,
+					unsigned long zone_type,
+					unsigned long *zones_size)
+{
+	return zones_size[zone_type];
+}
+
+static inline unsigned long zone_absent_pages_in_node(int nid,
+						unsigned long zone_type,
+						unsigned long *zholes_size)
+{
+	if (!zholes_size)
+		return 0;
+
+	return zholes_size[zone_type];
+}
+#endif
+
+static void __init calculate_node_totalpages(struct pglist_data *pgdat,
+		unsigned long *zones_size, unsigned long *zholes_size)
+{
+	unsigned long realtotalpages, totalpages = 0;
+	int i;
+
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		totalpages += zone_present_pages_in_node(pgdat->node_id, i,
+								zones_size);
+	}
+	pgdat->node_spanned_pages = totalpages;
+
+	realtotalpages = totalpages;
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		realtotalpages -=
+			zone_absent_pages_in_node(pgdat->node_id, i, zholes_size);
+	}
+	pgdat->node_present_pages = realtotalpages;
+	printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
+							realtotalpages);
+}
+
 /*
  * Set up the zone data structures:
  *   - mark all pages reserved
@@ -2223,10 +2427,9 @@ static void __meminit free_area_init_cor
 		struct zone *zone = pgdat->node_zones + j;
 		unsigned long size, realsize;
 
-		realsize = size = zones_size[j];
-		if (zholes_size)
-			realsize -= zholes_size[j];
-
+		size = zone_present_pages_in_node(nid, j, zones_size);
+		realsize = size - zone_absent_pages_in_node(nid, j,
+								zholes_size);
 		if (j < ZONE_HIGHMEM)
 			nr_kernel_pages += realsize;
 		nr_all_pages += realsize;
@@ -2294,13 +2497,164 @@ void __meminit free_area_init_node(int n
 {
 	pgdat->node_id = nid;
 	pgdat->node_start_pfn = node_start_pfn;
-	calculate_zone_totalpages(pgdat, zones_size, zholes_size);
+	calculate_node_totalpages(pgdat, zones_size, zholes_size);
 
 	alloc_node_mem_map(pgdat);
 
 	free_area_init_core(pgdat, zones_size, zholes_size);
 }
 
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+void __init add_active_range(unsigned int nid, unsigned long start_pfn,
+						unsigned long end_pfn)
+{
+	unsigned int i;
+
+	/* Merge with existing active regions if possible */
+	for (i = 0; early_node_map[i].end_pfn; i++) {
+		if (early_node_map[i].nid != nid)
+			continue;
+
+		/* Skip if an existing region covers this new one */
+		if (start_pfn >= early_node_map[i].start_pfn &&
+				end_pfn <= early_node_map[i].end_pfn)
+			return;
+
+		/* Merge forward if suitable */
+		if (start_pfn <= early_node_map[i].end_pfn &&
+				end_pfn > early_node_map[i].end_pfn) {
+			early_node_map[i].end_pfn = end_pfn;
+			return;
+		}
+
+		/* Merge backward if suitable */
+		if (start_pfn < early_node_map[i].end_pfn &&
+				end_pfn >= early_node_map[i].start_pfn) {
+			early_node_map[i].start_pfn = start_pfn;
+			return;
+		}
+	}
+
+	/* Leave last entry NULL, we use range.end_pfn to terminate the walk */
+	if (i >= MAX_ACTIVE_REGIONS - 1) {
+		printk(KERN_ERR "Too many memory regions, truncating\n");
+		return;
+	}
+
+	early_node_map[i].nid = nid;
+	early_node_map[i].start_pfn = start_pfn;
+	early_node_map[i].end_pfn = end_pfn;
+}
+
+void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn,
+						unsigned long new_end_pfn)
+{
+	unsigned int i;
+
+	/* Find the old active region end and shrink */
+	for_each_active_range_index_in_nid(i, nid) {
+		if (early_node_map[i].end_pfn == old_end_pfn) {
+			early_node_map[i].end_pfn = new_end_pfn;
+			break;
+		}
+	}
+}
+
+void __init remove_all_active_ranges()
+{
+	memset(early_node_map, 0, sizeof(early_node_map));
+}
+
+/* Compare two active node_active_regions */
+static int __init cmp_node_active_region(const void *a, const void *b)
+{
+	struct node_active_region *arange = (struct node_active_region *)a;
+	struct node_active_region *brange = (struct node_active_region *)b;
+
+	/* Done this way to avoid overflows */
+	if (arange->start_pfn > brange->start_pfn)
+		return 1;
+	if (arange->start_pfn < brange->start_pfn)
+		return -1;
+
+	return 0;
+}
+
+/* sort the node_map by start_pfn */
+static void __init sort_node_map(void)
+{
+	size_t num = 0;
+	while (early_node_map[num].end_pfn)
+		num++;
+
+	sort(early_node_map, num, sizeof(struct node_active_region),
+						cmp_node_active_region, NULL);
+}
+
+/* Find the lowest pfn for a node. This depends on a sorted early_node_map */
+unsigned long __init find_min_pfn_for_node(unsigned long nid)
+{
+	int i;
+
+	/* Assuming a sorted map, the first range found has the starting pfn */
+	for_each_active_range_index_in_nid(i, nid)
+		return early_node_map[i].start_pfn;
+
+	printk(KERN_WARNING "Could not find start_pfn for node %lu\n", nid);
+	return 0;
+}
+
+unsigned long __init find_min_pfn_with_active_regions(void)
+{
+	return find_min_pfn_for_node(MAX_NUMNODES);
+}
+
+unsigned long __init find_max_pfn_with_active_regions(void)
+{
+	int i;
+	unsigned long max_pfn = 0;
+
+	for (i = 0; early_node_map[i].end_pfn; i++)
+		max_pfn = max(max_pfn, early_node_map[i].end_pfn);
+
+	return max_pfn;
+}
+
+void __init free_area_init_nodes(unsigned long arch_max_dma_pfn,
+				unsigned long arch_max_dma32_pfn,
+				unsigned long arch_max_low_pfn,
+				unsigned long arch_max_high_pfn)
+{
+	unsigned long nid;
+	int zone_index;
+
+	/* Record where the zone boundaries are */
+	memset(arch_zone_lowest_possible_pfn, 0,
+				sizeof(arch_zone_lowest_possible_pfn));
+	memset(arch_zone_highest_possible_pfn, 0,
+				sizeof(arch_zone_highest_possible_pfn));
+	arch_zone_lowest_possible_pfn[ZONE_DMA] =
+					find_min_pfn_with_active_regions();
+	arch_zone_highest_possible_pfn[ZONE_DMA] = arch_max_dma_pfn;
+	arch_zone_highest_possible_pfn[ZONE_DMA32] = arch_max_dma32_pfn;
+	arch_zone_highest_possible_pfn[ZONE_NORMAL] = arch_max_low_pfn;
+	arch_zone_highest_possible_pfn[ZONE_HIGHMEM] = arch_max_high_pfn;
+	for (zone_index = 1; zone_index < MAX_NR_ZONES; zone_index++) {
+		arch_zone_lowest_possible_pfn[zone_index] =
+			arch_zone_highest_possible_pfn[zone_index-1];
+	}
+
+	/* Regions in the early_node_map can be in any order */
+	sort_node_map();
+
+	for_each_online_node(nid) {
+		pg_data_t *pgdat = NODE_DATA(nid);
+		free_area_init_node(nid, pgdat, NULL,
+				find_min_pfn_for_node(nid), NULL);
+	}
+}
+#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 static bootmem_data_t contig_bootmem_data;
 struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 2/7] Have Power use add_active_range() and free_area_init_nodes()
  2006-05-01 13:35 [PATCH 0/7] Sizing zones and holes in an architecture independent manner V5 Mel Gorman
  2006-05-01 13:35 ` [PATCH 1/7] Introduce mechanism for registering active regions of memory Mel Gorman
@ 2006-05-01 13:36 ` Mel Gorman
  2006-05-01 13:36 ` [PATCH 3/7] Have x86 use add_active_range() and free_area_init_nodes Mel Gorman
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 11+ messages in thread
From: Mel Gorman @ 2006-05-01 13:36 UTC (permalink / raw)
  To: akpm, davej, tony.luck, linuxppc-dev, linux-kernel, bob.picco, ak,
	linux-mm
  Cc: Mel Gorman


Size zones and holes in an architecture independent manner for Power.

This has been boot tested on PPC64 with NUMA both enabled and disabled. It
has been compile tested for an older CHRP-based machine.


 powerpc/Kconfig   |   13 ++--
 powerpc/mm/mem.c  |   53 ++++++----------
 powerpc/mm/numa.c |  157 ++++---------------------------------------------
 ppc/Kconfig       |    3 
 ppc/mm/init.c     |   26 ++++----
 5 files changed, 62 insertions(+), 190 deletions(-)

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-101-add_free_area_init_nodes/arch/powerpc/Kconfig linux-2.6.17-rc3-mm1-102-powerpc_use_init_nodes/arch/powerpc/Kconfig
--- linux-2.6.17-rc3-mm1-101-add_free_area_init_nodes/arch/powerpc/Kconfig	2006-05-01 11:36:58.000000000 +0100
+++ linux-2.6.17-rc3-mm1-102-powerpc_use_init_nodes/arch/powerpc/Kconfig	2006-05-01 11:40:07.000000000 +0100
@@ -676,11 +676,16 @@ config ARCH_SPARSEMEM_DEFAULT
 	def_bool y
 	depends on SMP && PPC_PSERIES
 
-source "mm/Kconfig"
-
-config HAVE_ARCH_EARLY_PFN_TO_NID
+config ARCH_POPULATES_NODE_MAP
 	def_bool y
-	depends on NEED_MULTIPLE_NODES
+
+# Value of 256 is MAX_LMB_REGIONS * 2
+config MAX_ACTIVE_REGIONS
+	int
+	default 256
+	depends on ARCH_POPULATES_NODE_MAP
+
+source "mm/Kconfig"
 
 config ARCH_MEMORY_PROBE
 	def_bool y
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-101-add_free_area_init_nodes/arch/powerpc/mm/mem.c linux-2.6.17-rc3-mm1-102-powerpc_use_init_nodes/arch/powerpc/mm/mem.c
--- linux-2.6.17-rc3-mm1-101-add_free_area_init_nodes/arch/powerpc/mm/mem.c	2006-05-01 11:36:58.000000000 +0100
+++ linux-2.6.17-rc3-mm1-102-powerpc_use_init_nodes/arch/powerpc/mm/mem.c	2006-05-01 11:40:07.000000000 +0100
@@ -257,20 +257,22 @@ void __init do_init_bootmem(void)
 
 	boot_mapsize = init_bootmem(start >> PAGE_SHIFT, total_pages);
 
+	/* Add active regions with valid PFNs */
+	for (i = 0; i < lmb.memory.cnt; i++) {
+		unsigned long start_pfn, end_pfn;
+		start_pfn = lmb.memory.region[i].base >> PAGE_SHIFT;
+		end_pfn = start_pfn + lmb_size_pages(&lmb.memory, i);
+		add_active_range(0, start_pfn, end_pfn);
+	}
+
 	/* Add all physical memory to the bootmem map, mark each area
 	 * present.
 	 */
-	for (i = 0; i < lmb.memory.cnt; i++) {
-		unsigned long base = lmb.memory.region[i].base;
-		unsigned long size = lmb_size_bytes(&lmb.memory, i);
 #ifdef CONFIG_HIGHMEM
-		if (base >= total_lowmem)
-			continue;
-		if (base + size > total_lowmem)
-			size = total_lowmem - base;
+	free_bootmem_with_active_regions(0, total_lowmem >> PAGE_SHIFT);
+#else
+	free_bootmem_with_active_regions(0, max_pfn);
 #endif
-		free_bootmem(base, size);
-	}
 
 	/* reserve the sections we're already using */
 	for (i = 0; i < lmb.reserved.cnt; i++)
@@ -278,9 +280,8 @@ void __init do_init_bootmem(void)
 				lmb_size_bytes(&lmb.reserved, i));
 
 	/* XXX need to clip this if using highmem? */
-	for (i = 0; i < lmb.memory.cnt; i++)
-		memory_present(0, lmb_start_pfn(&lmb.memory, i),
-			       lmb_end_pfn(&lmb.memory, i));
+	sparse_memory_present_with_active_regions(0);
+
 	init_bootmem_done = 1;
 }
 
@@ -289,8 +290,6 @@ void __init do_init_bootmem(void)
  */
 void __init paging_init(void)
 {
-	unsigned long zones_size[MAX_NR_ZONES];
-	unsigned long zholes_size[MAX_NR_ZONES];
 	unsigned long total_ram = lmb_phys_mem_size();
 	unsigned long top_of_ram = lmb_end_of_DRAM();
 
@@ -308,26 +307,18 @@ void __init paging_init(void)
 	       top_of_ram, total_ram);
 	printk(KERN_DEBUG "Memory hole size: %ldMB\n",
 	       (top_of_ram - total_ram) >> 20);
-	/*
-	 * All pages are DMA-able so we put them all in the DMA zone.
-	 */
-	memset(zones_size, 0, sizeof(zones_size));
-	memset(zholes_size, 0, sizeof(zholes_size));
-
-	zones_size[ZONE_DMA] = top_of_ram >> PAGE_SHIFT;
-	zholes_size[ZONE_DMA] = (top_of_ram - total_ram) >> PAGE_SHIFT;
-
 #ifdef CONFIG_HIGHMEM
-	zones_size[ZONE_DMA] = total_lowmem >> PAGE_SHIFT;
-	zones_size[ZONE_HIGHMEM] = (total_memory - total_lowmem) >> PAGE_SHIFT;
-	zholes_size[ZONE_HIGHMEM] = (top_of_ram - total_ram) >> PAGE_SHIFT;
+	free_area_init_nodes(total_lowmem >> PAGE_SHIFT,
+				total_lowmem >> PAGE_SHIFT,
+				total_lowmem >> PAGE_SHIFT,
+				top_of_ram >> PAGE_SHIFT);
 #else
-	zones_size[ZONE_DMA] = top_of_ram >> PAGE_SHIFT;
-	zholes_size[ZONE_DMA] = (top_of_ram - total_ram) >> PAGE_SHIFT;
-#endif /* CONFIG_HIGHMEM */
+	free_area_init_nodes(top_of_ram >> PAGE_SHIFT,
+				top_of_ram >> PAGE_SHIFT,
+				top_of_ram >> PAGE_SHIFT,
+				top_of_ram >> PAGE_SHIFT);
+#endif
 
-	free_area_init_node(0, NODE_DATA(0), zones_size,
-			    __pa(PAGE_OFFSET) >> PAGE_SHIFT, zholes_size);
 }
 #endif /* ! CONFIG_NEED_MULTIPLE_NODES */
 
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-101-add_free_area_init_nodes/arch/powerpc/mm/numa.c linux-2.6.17-rc3-mm1-102-powerpc_use_init_nodes/arch/powerpc/mm/numa.c
--- linux-2.6.17-rc3-mm1-101-add_free_area_init_nodes/arch/powerpc/mm/numa.c	2006-05-01 11:36:58.000000000 +0100
+++ linux-2.6.17-rc3-mm1-102-powerpc_use_init_nodes/arch/powerpc/mm/numa.c	2006-05-01 11:40:07.000000000 +0100
@@ -39,96 +39,6 @@ static bootmem_data_t __initdata plat_no
 static int min_common_depth;
 static int n_mem_addr_cells, n_mem_size_cells;
 
-/*
- * We need somewhere to store start/end/node for each region until we have
- * allocated the real node_data structures.
- */
-#define MAX_REGIONS	(MAX_LMB_REGIONS*2)
-static struct {
-	unsigned long start_pfn;
-	unsigned long end_pfn;
-	int nid;
-} init_node_data[MAX_REGIONS] __initdata;
-
-int __init early_pfn_to_nid(unsigned long pfn)
-{
-	unsigned int i;
-
-	for (i = 0; init_node_data[i].end_pfn; i++) {
-		unsigned long start_pfn = init_node_data[i].start_pfn;
-		unsigned long end_pfn = init_node_data[i].end_pfn;
-
-		if ((start_pfn <= pfn) && (pfn < end_pfn))
-			return init_node_data[i].nid;
-	}
-
-	return -1;
-}
-
-void __init add_region(unsigned int nid, unsigned long start_pfn,
-		       unsigned long pages)
-{
-	unsigned int i;
-
-	dbg("add_region nid %d start_pfn 0x%lx pages 0x%lx\n",
-		nid, start_pfn, pages);
-
-	for (i = 0; init_node_data[i].end_pfn; i++) {
-		if (init_node_data[i].nid != nid)
-			continue;
-		if (init_node_data[i].end_pfn == start_pfn) {
-			init_node_data[i].end_pfn += pages;
-			return;
-		}
-		if (init_node_data[i].start_pfn == (start_pfn + pages)) {
-			init_node_data[i].start_pfn -= pages;
-			return;
-		}
-	}
-
-	/*
-	 * Leave last entry NULL so we dont iterate off the end (we use
-	 * entry.end_pfn to terminate the walk).
-	 */
-	if (i >= (MAX_REGIONS - 1)) {
-		printk(KERN_ERR "WARNING: too many memory regions in "
-				"numa code, truncating\n");
-		return;
-	}
-
-	init_node_data[i].start_pfn = start_pfn;
-	init_node_data[i].end_pfn = start_pfn + pages;
-	init_node_data[i].nid = nid;
-}
-
-/* We assume init_node_data has no overlapping regions */
-void __init get_region(unsigned int nid, unsigned long *start_pfn,
-		       unsigned long *end_pfn, unsigned long *pages_present)
-{
-	unsigned int i;
-
-	*start_pfn = -1UL;
-	*end_pfn = *pages_present = 0;
-
-	for (i = 0; init_node_data[i].end_pfn; i++) {
-		if (init_node_data[i].nid != nid)
-			continue;
-
-		*pages_present += init_node_data[i].end_pfn -
-			init_node_data[i].start_pfn;
-
-		if (init_node_data[i].start_pfn < *start_pfn)
-			*start_pfn = init_node_data[i].start_pfn;
-
-		if (init_node_data[i].end_pfn > *end_pfn)
-			*end_pfn = init_node_data[i].end_pfn;
-	}
-
-	/* We didnt find a matching region, return start/end as 0 */
-	if (*start_pfn == -1UL)
-		*start_pfn = 0;
-}
-
 static void __cpuinit map_cpu_to_node(int cpu, int node)
 {
 	numa_cpu_lookup_table[cpu] = node;
@@ -471,8 +381,8 @@ new_range:
 				continue;
 		}
 
-		add_region(nid, start >> PAGE_SHIFT,
-			   size >> PAGE_SHIFT);
+		add_active_range(nid, start >> PAGE_SHIFT,
+				(start >> PAGE_SHIFT) + (size >> PAGE_SHIFT));
 
 		if (--ranges)
 			goto new_range;
@@ -485,6 +395,7 @@ static void __init setup_nonnuma(void)
 {
 	unsigned long top_of_ram = lmb_end_of_DRAM();
 	unsigned long total_ram = lmb_phys_mem_size();
+	unsigned long start_pfn, end_pfn;
 	unsigned int i;
 
 	printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
@@ -492,9 +403,11 @@ static void __init setup_nonnuma(void)
 	printk(KERN_DEBUG "Memory hole size: %ldMB\n",
 	       (top_of_ram - total_ram) >> 20);
 
-	for (i = 0; i < lmb.memory.cnt; ++i)
-		add_region(0, lmb.memory.region[i].base >> PAGE_SHIFT,
-			   lmb_size_pages(&lmb.memory, i));
+	for (i = 0; i < lmb.memory.cnt; ++i) {
+		start_pfn = lmb.memory.region[i].base >> PAGE_SHIFT;
+		end_pfn = start_pfn + lmb_size_pages(&lmb.memory, i);
+		add_active_range(0, start_pfn, end_pfn);
+	}
 	node_set_online(0);
 }
 
@@ -632,11 +545,11 @@ void __init do_init_bootmem(void)
 			  (void *)(unsigned long)boot_cpuid);
 
 	for_each_online_node(nid) {
-		unsigned long start_pfn, end_pfn, pages_present;
+		unsigned long start_pfn, end_pfn;
 		unsigned long bootmem_paddr;
 		unsigned long bootmap_pages;
 
-		get_region(nid, &start_pfn, &end_pfn, &pages_present);
+		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
 
 		/* Allocate the node structure node local if possible */
 		NODE_DATA(nid) = careful_allocation(nid,
@@ -669,19 +582,7 @@ void __init do_init_bootmem(void)
 		init_bootmem_node(NODE_DATA(nid), bootmem_paddr >> PAGE_SHIFT,
 				  start_pfn, end_pfn);
 
-		/* Add free regions on this node */
-		for (i = 0; init_node_data[i].end_pfn; i++) {
-			unsigned long start, end;
-
-			if (init_node_data[i].nid != nid)
-				continue;
-
-			start = init_node_data[i].start_pfn << PAGE_SHIFT;
-			end = init_node_data[i].end_pfn << PAGE_SHIFT;
-
-			dbg("free_bootmem %lx %lx\n", start, end - start);
-  			free_bootmem_node(NODE_DATA(nid), start, end - start);
-		}
+		free_bootmem_with_active_regions(nid, end_pfn);
 
 		/* Mark reserved regions on this node */
 		for (i = 0; i < lmb.reserved.cnt; i++) {
@@ -712,44 +613,14 @@ void __init do_init_bootmem(void)
 			}
 		}
 
-		/* Add regions into sparsemem */
-		for (i = 0; init_node_data[i].end_pfn; i++) {
-			unsigned long start, end;
-
-			if (init_node_data[i].nid != nid)
-				continue;
-
-			start = init_node_data[i].start_pfn;
-			end = init_node_data[i].end_pfn;
-
-			memory_present(nid, start, end);
-		}
+		sparse_memory_present_with_active_regions(nid);
 	}
 }
 
 void __init paging_init(void)
 {
-	unsigned long zones_size[MAX_NR_ZONES];
-	unsigned long zholes_size[MAX_NR_ZONES];
-	int nid;
-
-	memset(zones_size, 0, sizeof(zones_size));
-	memset(zholes_size, 0, sizeof(zholes_size));
-
-	for_each_online_node(nid) {
-		unsigned long start_pfn, end_pfn, pages_present;
-
-		get_region(nid, &start_pfn, &end_pfn, &pages_present);
-
-		zones_size[ZONE_DMA] = end_pfn - start_pfn;
-		zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] - pages_present;
-
-		dbg("free_area_init node %d %lx %lx (hole: %lx)\n", nid,
-		    zones_size[ZONE_DMA], start_pfn, zholes_size[ZONE_DMA]);
-
-		free_area_init_node(nid, NODE_DATA(nid), zones_size, start_pfn,
-				    zholes_size);
-	}
+	unsigned long end_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
+	free_area_init_nodes(end_pfn, end_pfn, end_pfn, end_pfn);
 }
 
 static int __init early_numa(char *p)
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-101-add_free_area_init_nodes/arch/ppc/Kconfig linux-2.6.17-rc3-mm1-102-powerpc_use_init_nodes/arch/ppc/Kconfig
--- linux-2.6.17-rc3-mm1-101-add_free_area_init_nodes/arch/ppc/Kconfig	2006-04-27 03:19:25.000000000 +0100
+++ linux-2.6.17-rc3-mm1-102-powerpc_use_init_nodes/arch/ppc/Kconfig	2006-05-01 11:40:07.000000000 +0100
@@ -949,6 +949,9 @@ config NR_CPUS
 config HIGHMEM
 	bool "High memory support"
 
+config ARCH_POPULATES_NODE_MAP
+	def_bool y
+
 source kernel/Kconfig.hz
 source kernel/Kconfig.preempt
 source "mm/Kconfig"
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-101-add_free_area_init_nodes/arch/ppc/mm/init.c linux-2.6.17-rc3-mm1-102-powerpc_use_init_nodes/arch/ppc/mm/init.c
--- linux-2.6.17-rc3-mm1-101-add_free_area_init_nodes/arch/ppc/mm/init.c	2006-04-27 03:19:25.000000000 +0100
+++ linux-2.6.17-rc3-mm1-102-powerpc_use_init_nodes/arch/ppc/mm/init.c	2006-05-01 11:40:07.000000000 +0100
@@ -359,8 +359,7 @@ void __init do_init_bootmem(void)
  */
 void __init paging_init(void)
 {
-	unsigned long zones_size[MAX_NR_ZONES], i;
-
+	unsigned long start_pfn, end_pfn;
 #ifdef CONFIG_HIGHMEM
 	map_page(PKMAP_BASE, 0, 0);	/* XXX gross */
 	pkmap_page_table = pte_offset_kernel(pmd_offset(pgd_offset_k
@@ -370,19 +369,22 @@ void __init paging_init(void)
 			(KMAP_FIX_BEGIN), KMAP_FIX_BEGIN), KMAP_FIX_BEGIN);
 	kmap_prot = PAGE_KERNEL;
 #endif /* CONFIG_HIGHMEM */
-
-	/*
-	 * All pages are DMA-able so we put them all in the DMA zone.
-	 */
-	zones_size[ZONE_DMA] = total_lowmem >> PAGE_SHIFT;
-	for (i = 1; i < MAX_NR_ZONES; i++)
-		zones_size[i] = 0;
+	/* All pages are DMA-able so we put them all in the DMA zone. */
+	start_pfn = __pa(PAGE_OFFSET) >> PAGE_SHIFT;
+	end_pfn = start_pfn + (total_memory >> PAGE_SHIFT);
+	add_active_range(0, start_pfn, end_pfn);
 
 #ifdef CONFIG_HIGHMEM
-	zones_size[ZONE_HIGHMEM] = (total_memory - total_lowmem) >> PAGE_SHIFT;
+	free_area_init_nodes(total_lowmem >> PAGE_SHIFT,
+				total_lowmem >> PAGE_SHIFT,
+				total_lowmem >> PAGE_SHIFT,
+				total_memory >> PAGE_SHIFT);
+#else
+	free_area_init_nodes(total_memory >> PAGE_SHIFT,
+				total_memory >> PAGE_SHIFT,
+				total_memory >> PAGE_SHIFT,
+				total_memory >> PAGE_SHIFT);
 #endif /* CONFIG_HIGHMEM */
-
-	free_area_init(zones_size);
 }
 
 void __init mem_init(void)

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 3/7] Have x86 use add_active_range() and free_area_init_nodes
  2006-05-01 13:35 [PATCH 0/7] Sizing zones and holes in an architecture independent manner V5 Mel Gorman
  2006-05-01 13:35 ` [PATCH 1/7] Introduce mechanism for registering active regions of memory Mel Gorman
  2006-05-01 13:36 ` [PATCH 2/7] Have Power use add_active_range() and free_area_init_nodes() Mel Gorman
@ 2006-05-01 13:36 ` Mel Gorman
  2006-05-01 13:36 ` [PATCH 4/7] Have x86_64 " Mel Gorman
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 11+ messages in thread
From: Mel Gorman @ 2006-05-01 13:36 UTC (permalink / raw)
  To: akpm, davej, tony.luck, linux-mm, linux-kernel, bob.picco, ak,
	linuxppc-dev
  Cc: Mel Gorman


Size zones and holes in an architecture independent manner for x86.

This has been boot tested on;

x86 with 4 CPUs, flatmem
x86 on NUMAQ

It needs to be boot tested on an x86 machine that uses SRAT.


 Kconfig        |    8 +---
 kernel/setup.c |   19 +++------
 kernel/srat.c  |  100 +---------------------------------------------------
 mm/discontig.c |   65 +++++++--------------------------
 4 files changed, 25 insertions(+), 167 deletions(-)

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-102-powerpc_use_init_nodes/arch/i386/Kconfig linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/arch/i386/Kconfig
--- linux-2.6.17-rc3-mm1-102-powerpc_use_init_nodes/arch/i386/Kconfig	2006-05-01 11:36:54.000000000 +0100
+++ linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/arch/i386/Kconfig	2006-05-01 11:41:15.000000000 +0100
@@ -577,12 +577,10 @@ config ARCH_SELECT_MEMORY_MODEL
 	def_bool y
 	depends on ARCH_SPARSEMEM_ENABLE
 
-source "mm/Kconfig"
+config ARCH_POPULATES_NODE_MAP
+	def_bool y
 
-config HAVE_ARCH_EARLY_PFN_TO_NID
-	bool
-	default y
-	depends on NUMA
+source "mm/Kconfig"
 
 config HIGHPTE
 	bool "Allocate 3rd-level pagetables from highmem"
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-102-powerpc_use_init_nodes/arch/i386/kernel/setup.c linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/arch/i386/kernel/setup.c
--- linux-2.6.17-rc3-mm1-102-powerpc_use_init_nodes/arch/i386/kernel/setup.c	2006-05-01 11:36:54.000000000 +0100
+++ linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/arch/i386/kernel/setup.c	2006-05-01 11:41:15.000000000 +0100
@@ -1207,22 +1207,15 @@ static unsigned long __init setup_memory
 
 void __init zone_sizes_init(void)
 {
-	unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
-	unsigned int max_dma, low;
+	unsigned int max_dma;
+#ifndef CONFIG_HIGHMEM
+	unsigned long highend_pfn = max_low_pfn;
+#endif
 
 	max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
-	low = max_low_pfn;
 
-	if (low < max_dma)
-		zones_size[ZONE_DMA] = low;
-	else {
-		zones_size[ZONE_DMA] = max_dma;
-		zones_size[ZONE_NORMAL] = low - max_dma;
-#ifdef CONFIG_HIGHMEM
-		zones_size[ZONE_HIGHMEM] = highend_pfn - low;
-#endif
-	}
-	free_area_init(zones_size);
+	add_active_range(0, 0, highend_pfn);
+	free_area_init_nodes(max_dma, max_dma, max_low_pfn, highend_pfn);
 }
 #else
 extern unsigned long __init setup_memory(void);
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-102-powerpc_use_init_nodes/arch/i386/kernel/srat.c linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/arch/i386/kernel/srat.c
--- linux-2.6.17-rc3-mm1-102-powerpc_use_init_nodes/arch/i386/kernel/srat.c	2006-05-01 11:36:54.000000000 +0100
+++ linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/arch/i386/kernel/srat.c	2006-05-01 11:41:15.000000000 +0100
@@ -55,8 +55,6 @@ struct node_memory_chunk_s {
 static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS];
 
 static int num_memory_chunks;		/* total number of memory chunks */
-static int zholes_size_init;
-static unsigned long zholes_size[MAX_NUMNODES * MAX_NR_ZONES];
 
 extern void * boot_ioremap(unsigned long, unsigned long);
 
@@ -136,50 +134,6 @@ static void __init parse_memory_affinity
 		 "enabled and removable" : "enabled" ) );
 }
 
-#if MAX_NR_ZONES != 4
-#error "MAX_NR_ZONES != 4, chunk_to_zone requires review"
-#endif
-/* Take a chunk of pages from page frame cstart to cend and count the number
- * of pages in each zone, returned via zones[].
- */
-static __init void chunk_to_zones(unsigned long cstart, unsigned long cend, 
-		unsigned long *zones)
-{
-	unsigned long max_dma;
-	extern unsigned long max_low_pfn;
-
-	int z;
-	unsigned long rend;
-
-	/* FIXME: MAX_DMA_ADDRESS and max_low_pfn are trying to provide
-	 * similarly scoped information and should be handled in a consistant
-	 * manner.
-	 */
-	max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
-
-	/* Split the hole into the zones in which it falls.  Repeatedly
-	 * take the segment in which the remaining hole starts, round it
-	 * to the end of that zone.
-	 */
-	memset(zones, 0, MAX_NR_ZONES * sizeof(long));
-	while (cstart < cend) {
-		if (cstart < max_dma) {
-			z = ZONE_DMA;
-			rend = (cend < max_dma)? cend : max_dma;
-
-		} else if (cstart < max_low_pfn) {
-			z = ZONE_NORMAL;
-			rend = (cend < max_low_pfn)? cend : max_low_pfn;
-
-		} else {
-			z = ZONE_HIGHMEM;
-			rend = cend;
-		}
-		zones[z] += rend - cstart;
-		cstart = rend;
-	}
-}
-
 /*
  * The SRAT table always lists ascending addresses, so can always
  * assume that the first "start" address that you see is the real
@@ -224,7 +178,6 @@ static int __init acpi20_parse_srat(stru
 
 	memset(pxm_bitmap, 0, sizeof(pxm_bitmap));	/* init proximity domain bitmap */
 	memset(node_memory_chunk, 0, sizeof(node_memory_chunk));
-	memset(zholes_size, 0, sizeof(zholes_size));
 
 	num_memory_chunks = 0;
 	while (p < end) {
@@ -288,6 +241,7 @@ static int __init acpi20_parse_srat(stru
 		printk("chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
 		       j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
 		node_read_chunk(chunk->nid, chunk);
+		add_active_range(chunk->nid, chunk->start_pfn, chunk->end_pfn);
 	}
  
 	for_each_online_node(nid) {
@@ -396,57 +350,7 @@ int __init get_memcfg_from_srat(void)
 		return acpi20_parse_srat((struct acpi_table_srat *)header);
 	}
 out_err:
+	remove_all_active_ranges();
 	printk("failed to get NUMA memory information from SRAT table\n");
 	return 0;
 }
-
-/* For each node run the memory list to determine whether there are
- * any memory holes.  For each hole determine which ZONE they fall
- * into.
- *
- * NOTE#1: this requires knowledge of the zone boundries and so
- * _cannot_ be performed before those are calculated in setup_memory.
- * 
- * NOTE#2: we rely on the fact that the memory chunks are ordered by
- * start pfn number during setup.
- */
-static void __init get_zholes_init(void)
-{
-	int nid;
-	int c;
-	int first;
-	unsigned long end = 0;
-
-	for_each_online_node(nid) {
-		first = 1;
-		for (c = 0; c < num_memory_chunks; c++){
-			if (node_memory_chunk[c].nid == nid) {
-				if (first) {
-					end = node_memory_chunk[c].end_pfn;
-					first = 0;
-
-				} else {
-					/* Record any gap between this chunk
-					 * and the previous chunk on this node
-					 * against the zones it spans.
-					 */
-					chunk_to_zones(end,
-						node_memory_chunk[c].start_pfn,
-						&zholes_size[nid * MAX_NR_ZONES]);
-				}
-			}
-		}
-	}
-}
-
-unsigned long * __init get_zholes_size(int nid)
-{
-	if (!zholes_size_init) {
-		zholes_size_init++;
-		get_zholes_init();
-	}
-	if (nid >= MAX_NUMNODES || !node_online(nid))
-		printk("%s: nid = %d is invalid/offline. num_online_nodes = %d",
-		       __FUNCTION__, nid, num_online_nodes());
-	return &zholes_size[nid * MAX_NR_ZONES];
-}
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-102-powerpc_use_init_nodes/arch/i386/mm/discontig.c linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/arch/i386/mm/discontig.c
--- linux-2.6.17-rc3-mm1-102-powerpc_use_init_nodes/arch/i386/mm/discontig.c	2006-04-27 03:19:25.000000000 +0100
+++ linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/arch/i386/mm/discontig.c	2006-05-01 11:41:15.000000000 +0100
@@ -157,21 +157,6 @@ static void __init find_max_pfn_node(int
 		BUG();
 }
 
-/* Find the owning node for a pfn. */
-int early_pfn_to_nid(unsigned long pfn)
-{
-	int nid;
-
-	for_each_node(nid) {
-		if (node_end_pfn[nid] == 0)
-			break;
-		if (node_start_pfn[nid] <= pfn && node_end_pfn[nid] >= pfn)
-			return nid;
-	}
-
-	return 0;
-}
-
 /* 
  * Allocate memory for the pg_data_t for this node via a crude pre-bootmem
  * method.  For node zero take this from the bottom of memory, for
@@ -227,6 +212,8 @@ static unsigned long calculate_numa_rema
 	unsigned long pfn;
 
 	for_each_online_node(nid) {
+		unsigned old_end_pfn = node_end_pfn[nid];
+
 		/*
 		 * The acpi/srat node info can show hot-add memroy zones
 		 * where memory could be added but not currently present.
@@ -276,6 +263,7 @@ static unsigned long calculate_numa_rema
 
 		node_end_pfn[nid] -= size;
 		node_remap_start_pfn[nid] = node_end_pfn[nid];
+		shrink_active_range(nid, old_end_pfn, node_end_pfn[nid]);
 	}
 	printk("Reserving total of %ld pages for numa KVA remap\n",
 			reserve_pages);
@@ -352,45 +340,20 @@ unsigned long __init setup_memory(void)
 void __init zone_sizes_init(void)
 {
 	int nid;
+	unsigned long max_dma_pfn;
 
-
-	for_each_online_node(nid) {
-		unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
-		unsigned long *zholes_size;
-		unsigned int max_dma;
-
-		unsigned long low = max_low_pfn;
-		unsigned long start = node_start_pfn[nid];
-		unsigned long high = node_end_pfn[nid];
-
-		max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
-
-		if (node_has_online_mem(nid)){
-			if (start > low) {
-#ifdef CONFIG_HIGHMEM
-				BUG_ON(start > high);
-				zones_size[ZONE_HIGHMEM] = high - start;
-#endif
-			} else {
-				if (low < max_dma)
-					zones_size[ZONE_DMA] = low;
-				else {
-					BUG_ON(max_dma > low);
-					BUG_ON(low > high);
-					zones_size[ZONE_DMA] = max_dma;
-					zones_size[ZONE_NORMAL] = low - max_dma;
-#ifdef CONFIG_HIGHMEM
-					zones_size[ZONE_HIGHMEM] = high - low;
-#endif
-				}
-			}
+	/* If SRAT has not registered memory, register it now */
+	if (find_max_pfn_with_active_regions() == 0) {
+		for_each_online_node(nid) {
+			if (node_has_online_mem(nid))
+				add_active_range(nid, node_start_pfn[nid],
+							node_end_pfn[nid]);
 		}
-
-		zholes_size = get_zholes_size(nid);
-
-		free_area_init_node(nid, NODE_DATA(nid), zones_size, start,
-				zholes_size);
 	}
+
+	max_dma_pfn = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
+	free_area_init_nodes(max_dma_pfn, max_dma_pfn,
+						max_low_pfn, highend_pfn);
 	return;
 }
 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 4/7] Have x86_64 use add_active_range() and free_area_init_nodes
  2006-05-01 13:35 [PATCH 0/7] Sizing zones and holes in an architecture independent manner V5 Mel Gorman
                   ` (2 preceding siblings ...)
  2006-05-01 13:36 ` [PATCH 3/7] Have x86 use add_active_range() and free_area_init_nodes Mel Gorman
@ 2006-05-01 13:36 ` Mel Gorman
  2006-05-01 13:37 ` [PATCH 5/7] Have ia64 " Mel Gorman
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 11+ messages in thread
From: Mel Gorman @ 2006-05-01 13:36 UTC (permalink / raw)
  To: akpm, davej, tony.luck, linuxppc-dev, linux-kernel, bob.picco, ak,
	linux-mm
  Cc: Mel Gorman


Size zones and holes in an architecture independent manner for x86_64.

This has only been boot-tested on an x86_64 with NUMA and SRAT.
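
For context, the arch-side contract is small: each contiguous block of present
RAM is registered against its node with add_active_range(), and
free_area_init_nodes() is then handed the highest possible PFN of each zone,
from which the generic code derives zone sizes and holes itself. A rough sketch
of that shape follows; the example_* symbols are placeholders, not names from
this patch, and only the two calls shown are part of the new interface.

/*
 * Sketch only: the arch-side usage pattern this series expects.
 * All example_* names are hypothetical placeholders.
 */
static unsigned long example_max_dma_pfn;	/* assumed end of ZONE_DMA */
static unsigned long example_max_dma32_pfn;	/* assumed end of ZONE_DMA32 */
static unsigned long example_max_low_pfn;	/* assumed end of ZONE_NORMAL */
static unsigned long example_max_pfn;		/* assumed end of memory */

static void __init example_zone_sizes_init(void)
{
	/* One call per contiguous block of present RAM on each node */
	add_active_range(0, 0, example_max_pfn);

	/* Pass zone boundaries only; sizes and holes are derived generically */
	free_area_init_nodes(example_max_dma_pfn, example_max_dma32_pfn,
			     example_max_low_pfn, example_max_pfn);
}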


 arch/x86_64/Kconfig         |    3 +
 arch/x86_64/kernel/e820.c   |  109 ++++++++++-----------------------------
 arch/x86_64/kernel/setup.c  |    7 ++
 arch/x86_64/mm/init.c       |   62 +---------------------
 arch/x86_64/mm/k8topology.c |    3 +
 arch/x86_64/mm/numa.c       |   18 +++---
 arch/x86_64/mm/srat.c       |   11 ++-
 include/asm-x86_64/e820.h   |    5 -
 include/asm-x86_64/proto.h  |    2 
 9 files changed, 63 insertions(+), 157 deletions(-)

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/arch/x86_64/Kconfig linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/x86_64/Kconfig
--- linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/arch/x86_64/Kconfig	2006-05-01 11:36:58.000000000 +0100
+++ linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/x86_64/Kconfig	2006-05-01 11:42:26.000000000 +0100
@@ -73,6 +73,9 @@ config ARCH_MAY_HAVE_PC_FDC
 	bool
 	default y
 
+config ARCH_POPULATES_NODE_MAP
+	def_bool y
+
 config DMI
 	bool
 	default y
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/arch/x86_64/kernel/e820.c linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/x86_64/kernel/e820.c
--- linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/arch/x86_64/kernel/e820.c	2006-04-27 03:19:25.000000000 +0100
+++ linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/x86_64/kernel/e820.c	2006-05-01 11:42:26.000000000 +0100
@@ -18,6 +18,7 @@
 #include <linux/string.h>
 #include <linux/kexec.h>
 #include <linux/module.h>
+#include <linux/mm.h>
 
 #include <asm/page.h>
 #include <asm/e820.h>
@@ -155,58 +156,14 @@ unsigned long __init find_e820_area(unsi
 	return -1UL;		
 } 
 
-/* 
- * Free bootmem based on the e820 table for a node.
- */
-void __init e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end)
-{
-	int i;
-	for (i = 0; i < e820.nr_map; i++) {
-		struct e820entry *ei = &e820.map[i]; 
-		unsigned long last, addr;
-
-		if (ei->type != E820_RAM || 
-		    ei->addr+ei->size <= start || 
-		    ei->addr >= end)
-			continue;
-
-		addr = round_up(ei->addr, PAGE_SIZE);
-		if (addr < start) 
-			addr = start;
-
-		last = round_down(ei->addr + ei->size, PAGE_SIZE); 
-		if (last >= end)
-			last = end; 
-
-		if (last > addr && last-addr >= PAGE_SIZE)
-			free_bootmem_node(pgdat, addr, last-addr);
-	}
-}
-
 /*
  * Find the highest page frame number we have available
  */
 unsigned long __init e820_end_of_ram(void)
 {
-	int i;
 	unsigned long end_pfn = 0;
 	
-	for (i = 0; i < e820.nr_map; i++) {
-		struct e820entry *ei = &e820.map[i]; 
-		unsigned long start, end;
-
-		start = round_up(ei->addr, PAGE_SIZE); 
-		end = round_down(ei->addr + ei->size, PAGE_SIZE); 
-		if (start >= end)
-			continue;
-		if (ei->type == E820_RAM) { 
-		if (end > end_pfn<<PAGE_SHIFT)
-			end_pfn = end>>PAGE_SHIFT;
-		} else { 
-			if (end > end_pfn_map<<PAGE_SHIFT) 
-				end_pfn_map = end>>PAGE_SHIFT;
-		} 
-	}
+	end_pfn = find_max_pfn_with_active_regions();
 
 	if (end_pfn > end_pfn_map) 
 		end_pfn_map = end_pfn;
@@ -220,40 +177,6 @@ unsigned long __init e820_end_of_ram(voi
 	return end_pfn;	
 }
 
-/* 
- * Compute how much memory is missing in a range.
- * Unlike the other functions in this file the arguments are in page numbers.
- */
-unsigned long __init
-e820_hole_size(unsigned long start_pfn, unsigned long end_pfn)
-{
-	unsigned long ram = 0;
-	unsigned long start = start_pfn << PAGE_SHIFT;
-	unsigned long end = end_pfn << PAGE_SHIFT;
-	int i;
-	for (i = 0; i < e820.nr_map; i++) {
-		struct e820entry *ei = &e820.map[i];
-		unsigned long last, addr;
-
-		if (ei->type != E820_RAM ||
-		    ei->addr+ei->size <= start ||
-		    ei->addr >= end)
-			continue;
-
-		addr = round_up(ei->addr, PAGE_SIZE);
-		if (addr < start)
-			addr = start;
-
-		last = round_down(ei->addr + ei->size, PAGE_SIZE);
-		if (last >= end)
-			last = end;
-
-		if (last > addr)
-			ram += last - addr;
-	}
-	return ((end - start) - ram) >> PAGE_SHIFT;
-}
-
 /*
  * Mark e820 reserved areas as busy for the resource manager.
  */
@@ -288,6 +211,34 @@ void __init e820_reserve_resources(void)
 	}
 }
 
+/* Walk the e820 map and register active regions within a node */
+void __init
+e820_register_active_regions(int nid, unsigned long start_pfn,
+							unsigned long end_pfn)
+{
+	int i;
+	unsigned long ei_startpfn, ei_endpfn;
+	for (i = 0; i < e820.nr_map; i++) {
+		struct e820entry *ei = &e820.map[i];
+		ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT;
+		ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE)
+								>> PAGE_SHIFT;
+		/* Skip if map is outside the node */
+		if (ei->type != E820_RAM ||
+				ei_endpfn <= start_pfn ||
+				ei_startpfn >= end_pfn)
+			continue;
+
+		/* Check for overlaps */
+		if (ei_startpfn < start_pfn)
+			ei_startpfn = start_pfn;
+		if (ei_endpfn > end_pfn)
+			ei_endpfn = end_pfn;
+
+		add_active_range(nid, ei_startpfn, ei_endpfn);
+	}
+}
+
 /* 
  * Add a memory region to the kernel e820 map.
  */ 
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/arch/x86_64/kernel/setup.c linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/x86_64/kernel/setup.c
--- linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/arch/x86_64/kernel/setup.c	2006-05-01 11:36:58.000000000 +0100
+++ linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/x86_64/kernel/setup.c	2006-05-01 11:42:26.000000000 +0100
@@ -475,7 +475,8 @@ contig_initmem_init(unsigned long start_
 	if (bootmap == -1L)
 		panic("Cannot find bootmem map of size %ld\n",bootmap_size);
 	bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
-	e820_bootmem_free(NODE_DATA(0), 0, end_pfn << PAGE_SHIFT);
+	e820_register_active_regions(0, start_pfn, end_pfn);
+	free_bootmem_with_active_regions(0, end_pfn);
 	reserve_bootmem(bootmap, bootmap_size);
 } 
 #endif
@@ -645,6 +646,7 @@ void __init setup_arch(char **cmdline_p)
 
 	early_identify_cpu(&boot_cpu_data);
 
+	e820_register_active_regions(0, 0, -1UL);
 	/*
 	 * partially used pages are not usable - thus
 	 * we are rounding upwards:
@@ -668,6 +670,9 @@ void __init setup_arch(char **cmdline_p)
 	acpi_boot_table_init();
 #endif
 
+	/* Remove active ranges so rediscovery with NUMA-awareness happens */
+	remove_all_active_ranges();
+
 #ifdef CONFIG_ACPI_NUMA
 	/*
 	 * Parse SRAT to discover nodes.
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/arch/x86_64/mm/init.c linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/x86_64/mm/init.c
--- linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/arch/x86_64/mm/init.c	2006-05-01 11:36:58.000000000 +0100
+++ linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/x86_64/mm/init.c	2006-05-01 11:42:26.000000000 +0100
@@ -406,69 +406,12 @@ void __cpuinit zap_low_mappings(int cpu)
 	__flush_tlb_all();
 }
 
-/* Compute zone sizes for the DMA and DMA32 zones in a node. */
-__init void
-size_zones(unsigned long *z, unsigned long *h,
-	   unsigned long start_pfn, unsigned long end_pfn)
-{
- 	int i;
- 	unsigned long w;
-
- 	for (i = 0; i < MAX_NR_ZONES; i++)
- 		z[i] = 0;
-
- 	if (start_pfn < MAX_DMA_PFN)
- 		z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
- 	if (start_pfn < MAX_DMA32_PFN) {
- 		unsigned long dma32_pfn = MAX_DMA32_PFN;
- 		if (dma32_pfn > end_pfn)
- 			dma32_pfn = end_pfn;
- 		z[ZONE_DMA32] = dma32_pfn - start_pfn;
- 	}
- 	z[ZONE_NORMAL] = end_pfn - start_pfn;
-
- 	/* Remove lower zones from higher ones. */
- 	w = 0;
- 	for (i = 0; i < MAX_NR_ZONES; i++) {
- 		if (z[i])
- 			z[i] -= w;
- 	        w += z[i];
-	}
-
-	/* Compute holes */
-	w = start_pfn;
-	for (i = 0; i < MAX_NR_ZONES; i++) {
-		unsigned long s = w;
-		w += z[i];
-		h[i] = e820_hole_size(s, w);
-	}
-
-	/* Add the space pace needed for mem_map to the holes too. */
-	for (i = 0; i < MAX_NR_ZONES; i++)
-		h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE;
-
-	/* The 16MB DMA zone has the kernel and other misc mappings.
- 	   Account them too */
-	if (h[ZONE_DMA]) {
-		h[ZONE_DMA] += dma_reserve;
-		if (h[ZONE_DMA] >= z[ZONE_DMA]) {
-			printk(KERN_WARNING
-				"Kernel too large and filling up ZONE_DMA?\n");
-			h[ZONE_DMA] = z[ZONE_DMA];
-		}
-	}
-}
-
 #ifndef CONFIG_NUMA
 void __init paging_init(void)
 {
-	unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];
-
 	memory_present(0, 0, end_pfn);
 	sparse_init();
-	size_zones(zones, holes, 0, end_pfn);
-	free_area_init_node(0, NODE_DATA(0), zones,
-			    __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
+	free_area_init_nodes(MAX_DMA_PFN, MAX_DMA32_PFN, end_pfn, end_pfn);
 }
 #endif
 
@@ -620,7 +563,8 @@ void __init mem_init(void)
 #else
 	totalram_pages = free_all_bootmem();
 #endif
-	reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn);
+	reservedpages = end_pfn - totalram_pages -
+					absent_pages_in_range(0, end_pfn);
 
 	after_bootmem = 1;
 
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/arch/x86_64/mm/k8topology.c linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/x86_64/mm/k8topology.c
--- linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/arch/x86_64/mm/k8topology.c	2006-04-27 03:19:25.000000000 +0100
+++ linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/x86_64/mm/k8topology.c	2006-05-01 11:42:26.000000000 +0100
@@ -146,6 +146,9 @@ int __init k8_scan_nodes(unsigned long s
 		
 		nodes[nodeid].start = base; 
 		nodes[nodeid].end = limit;
+		e820_register_active_regions(nodeid,
+				nodes[nodeid].start >> PAGE_SHIFT,
+				nodes[nodeid].end >> PAGE_SHIFT);
 
 		prevbase = base;
 
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/arch/x86_64/mm/numa.c linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/x86_64/mm/numa.c
--- linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/arch/x86_64/mm/numa.c	2006-04-27 03:19:25.000000000 +0100
+++ linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/x86_64/mm/numa.c	2006-05-01 11:42:26.000000000 +0100
@@ -161,7 +161,7 @@ void __init setup_node_bootmem(int nodei
 					 bootmap_start >> PAGE_SHIFT, 
 					 start_pfn, end_pfn); 
 
-	e820_bootmem_free(NODE_DATA(nodeid), start, end);
+	free_bootmem_with_active_regions(nodeid, end);
 
 	reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); 
 	reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
@@ -175,13 +175,11 @@ void __init setup_node_bootmem(int nodei
 void __init setup_node_zones(int nodeid)
 { 
 	unsigned long start_pfn, end_pfn, memmapsize, limit;
-	unsigned long zones[MAX_NR_ZONES];
-	unsigned long holes[MAX_NR_ZONES];
 
  	start_pfn = node_start_pfn(nodeid);
  	end_pfn = node_end_pfn(nodeid);
 
-	Dprintk(KERN_INFO "Setting up node %d %lx-%lx\n",
+	Dprintk(KERN_INFO "Setting up memmap for node %d %lx-%lx\n",
 		nodeid, start_pfn, end_pfn);
 
 	/* Try to allocate mem_map at end to not fill up precious <4GB
@@ -195,10 +193,6 @@ void __init setup_node_zones(int nodeid)
 				round_down(limit - memmapsize, PAGE_SIZE), 
 				limit);
 #endif
-
-	size_zones(zones, holes, start_pfn, end_pfn);
-	free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
-			    start_pfn, holes);
 } 
 
 void __init numa_init_array(void)
@@ -259,8 +253,11 @@ static int numa_emulation(unsigned long 
  		printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
  		return -1;
  	}
- 	for_each_online_node(i)
+ 	for_each_online_node(i) {
+		e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
+						nodes[i].end >> PAGE_SHIFT);
  		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+	}
  	numa_init_array();
  	return 0;
 }
@@ -299,6 +296,7 @@ void __init numa_initmem_init(unsigned l
 	for (i = 0; i < NR_CPUS; i++)
 		numa_set_node(i, 0);
 	node_to_cpumask[0] = cpumask_of_cpu(0);
+	e820_register_active_regions(0, start_pfn, end_pfn);
 	setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
 }
 
@@ -346,6 +344,8 @@ void __init paging_init(void)
 	for_each_online_node(i) {
 		setup_node_zones(i); 
 	}
+
+	free_area_init_nodes(MAX_DMA_PFN, MAX_DMA32_PFN, end_pfn, end_pfn);
 } 
 
 /* [numa=off] */
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/arch/x86_64/mm/srat.c linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/x86_64/mm/srat.c
--- linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/arch/x86_64/mm/srat.c	2006-05-01 11:36:58.000000000 +0100
+++ linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/x86_64/mm/srat.c	2006-05-01 11:42:26.000000000 +0100
@@ -87,6 +87,7 @@ static __init void bad_srat(void)
 		apicid_to_node[i] = NUMA_NO_NODE;
 	for (i = 0; i < MAX_NUMNODES; i++)
 		nodes_add[i].start = nodes[i].end = 0;
+	remove_all_active_ranges();
 }
 
 static __init inline int srat_disabled(void)
@@ -168,7 +169,7 @@ static int hotadd_enough_memory(struct b
 
 	if (mem < 0)
 		return 0;
-	allowed = (end_pfn - e820_hole_size(0, end_pfn)) * PAGE_SIZE;
+	allowed = (end_pfn - absent_pages_in_range(0, end_pfn)) * PAGE_SIZE;
 	allowed = (allowed / 100) * hotadd_percent;
 	if (allocated + mem > allowed) {
 		/* Give them at least part of their hotadd memory upto hotadd_percent
@@ -216,7 +217,7 @@ static int reserve_hotadd(int node, unsi
 	}
 
 	/* This check might be a bit too strict, but I'm keeping it for now. */
-	if (e820_hole_size(s_pfn, e_pfn) != e_pfn - s_pfn) {
+	if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) {
 		printk(KERN_ERR "SRAT: Hotplug area has existing memory\n");
 		return -1;
 	}
@@ -310,6 +311,8 @@ acpi_numa_memory_affinity_init(struct ac
 
 	printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
 	       nd->start, nd->end);
+	e820_register_active_regions(node, nd->start >> PAGE_SHIFT,
+						nd->end >> PAGE_SHIFT);
 
 #ifdef RESERVE_HOTADD
  	if (ma->flags.hot_pluggable && reserve_hotadd(node, start, end) < 0) {
@@ -334,13 +337,13 @@ static int nodes_cover_memory(void)
 		unsigned long s = nodes[i].start >> PAGE_SHIFT;
 		unsigned long e = nodes[i].end >> PAGE_SHIFT;
 		pxmram += e - s;
-		pxmram -= e820_hole_size(s, e);
+		pxmram -= absent_pages_in_range(s, e);
 		pxmram -= nodes_add[i].end - nodes_add[i].start;
 		if ((long)pxmram < 0)
 			pxmram = 0;
 	}
 
-	e820ram = end_pfn - e820_hole_size(0, end_pfn);
+	e820ram = end_pfn - absent_pages_in_range(0, end_pfn);
 	/* We seem to lose 3 pages somewhere. Allow a bit of slack. */
 	if ((long)(e820ram - pxmram) >= 1*1024*1024) {
 		printk(KERN_ERR
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/include/asm-x86_64/e820.h linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/include/asm-x86_64/e820.h
--- linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/include/asm-x86_64/e820.h	2006-04-27 03:19:25.000000000 +0100
+++ linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/include/asm-x86_64/e820.h	2006-05-01 11:42:26.000000000 +0100
@@ -50,10 +50,9 @@ extern void e820_print_map(char *who);
 extern int e820_any_mapped(unsigned long start, unsigned long end, unsigned type);
 extern int e820_all_mapped(unsigned long start, unsigned long end, unsigned type);
 
-extern void e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end);
 extern void e820_setup_gap(void);
-extern unsigned long e820_hole_size(unsigned long start_pfn,
-				    unsigned long end_pfn);
+extern void e820_register_active_regions(int nid,
+				unsigned long start_pfn, unsigned long end_pfn);
 
 extern void __init parse_memopt(char *p, char **end);
 extern void __init parse_memmapopt(char *p, char **end);
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/include/asm-x86_64/proto.h linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/include/asm-x86_64/proto.h
--- linux-2.6.17-rc3-mm1-103-x86_use_init_nodes/include/asm-x86_64/proto.h	2006-05-01 11:37:01.000000000 +0100
+++ linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/include/asm-x86_64/proto.h	2006-05-01 11:42:26.000000000 +0100
@@ -24,8 +24,6 @@ extern void mtrr_bp_init(void);
 #define mtrr_bp_init() do {} while (0)
 #endif
 extern void init_memory_mapping(unsigned long start, unsigned long end);
-extern void size_zones(unsigned long *z, unsigned long *h,
-			unsigned long start_pfn, unsigned long end_pfn);
 
 extern void system_call(void); 
 extern int kernel_syscall(void);

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 5/7] Have ia64 use add_active_range() and free_area_init_nodes
  2006-05-01 13:35 [PATCH 0/7] Sizing zones and holes in an architecture independent manner V5 Mel Gorman
                   ` (3 preceding siblings ...)
  2006-05-01 13:36 ` [PATCH 4/7] Have x86_64 " Mel Gorman
@ 2006-05-01 13:37 ` Mel Gorman
  2006-05-01 13:37 ` [PATCH 6/7] Break out memory initialisation code from page_alloc.c to mem_init.c Mel Gorman
  2006-05-01 13:37 ` [PATCH 7/7] Print out debugging information during initialisation Mel Gorman
  6 siblings, 0 replies; 11+ messages in thread
From: Mel Gorman @ 2006-05-01 13:37 UTC (permalink / raw)
  To: akpm, davej, tony.luck, linux-mm, linux-kernel, bob.picco, ak,
	linuxppc-dev
  Cc: Mel Gorman


Size zones and holes in an architecture independent manner for ia64.

This has only been compile-tested due to lack of a suitable test machine.
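
For context, the only ia64-specific glue needed is a memory-map walker callback
that converts the addresses handed back by efi_memmap_walk() into page frame
numbers before registering them. The rough sketch below mirrors
register_active_ranges() in the diff that follows; the example_ name is a
placeholder, and the (u64 start, u64 end, void *arg) convention is assumed to
match the existing walkers.

/* Sketch only: feeds the generic zone-sizing code from an EFI memmap walk.
 * 'arg' is assumed to carry the node id, as in the patch below. */
static int __init example_register_ranges(u64 start, u64 end, void *arg)
{
	unsigned long nid = *(unsigned long *)arg;

	/* The walker passes virtual addresses; convert to page frames */
	add_active_range(nid, __pa(start) >> PAGE_SHIFT,
			 __pa(end) >> PAGE_SHIFT);
	return 0;
}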


 arch/ia64/Kconfig          |    3 ++
 arch/ia64/mm/contig.c      |   60 +++++-----------------------------------
 arch/ia64/mm/discontig.c   |   41 ++++-----------------------
 arch/ia64/mm/init.c        |   12 ++++++++
 include/asm-ia64/meminit.h |    1 
 5 files changed, 30 insertions(+), 87 deletions(-)

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/ia64/Kconfig linux-2.6.17-rc3-mm1-105-ia64_use_init_nodes/arch/ia64/Kconfig
--- linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/ia64/Kconfig	2006-05-01 11:36:54.000000000 +0100
+++ linux-2.6.17-rc3-mm1-105-ia64_use_init_nodes/arch/ia64/Kconfig	2006-05-01 11:43:32.000000000 +0100
@@ -353,6 +353,9 @@ config NODES_SHIFT
 	  MAX_NUMNODES will be 2^(This value).
 	  If in doubt, use the default.
 
+config ARCH_POPULATES_NODE_MAP
+	def_bool y
+
 # VIRTUAL_MEM_MAP and FLAT_NODE_MEM_MAP are functionally equivalent.
 # VIRTUAL_MEM_MAP has been retained for historical reasons.
 config VIRTUAL_MEM_MAP
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/ia64/mm/contig.c linux-2.6.17-rc3-mm1-105-ia64_use_init_nodes/arch/ia64/mm/contig.c
--- linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/ia64/mm/contig.c	2006-04-27 03:19:25.000000000 +0100
+++ linux-2.6.17-rc3-mm1-105-ia64_use_init_nodes/arch/ia64/mm/contig.c	2006-05-01 11:43:32.000000000 +0100
@@ -26,10 +26,6 @@
 #include <asm/sections.h>
 #include <asm/mca.h>
 
-#ifdef CONFIG_VIRTUAL_MEM_MAP
-static unsigned long num_dma_physpages;
-#endif
-
 /**
  * show_mem - display a memory statistics summary
  *
@@ -212,18 +208,6 @@ count_pages (u64 start, u64 end, void *a
 	return 0;
 }
 
-#ifdef CONFIG_VIRTUAL_MEM_MAP
-static int
-count_dma_pages (u64 start, u64 end, void *arg)
-{
-	unsigned long *count = arg;
-
-	if (start < MAX_DMA_ADDRESS)
-		*count += (min(end, MAX_DMA_ADDRESS) - start) >> PAGE_SHIFT;
-	return 0;
-}
-#endif
-
 /*
  * Set up the page tables.
  */
@@ -232,47 +216,24 @@ void __init
 paging_init (void)
 {
 	unsigned long max_dma;
-	unsigned long zones_size[MAX_NR_ZONES];
 #ifdef CONFIG_VIRTUAL_MEM_MAP
-	unsigned long zholes_size[MAX_NR_ZONES];
+	unsigned long nid = 0;
 	unsigned long max_gap;
 #endif
 
-	/* initialize mem_map[] */
-
-	memset(zones_size, 0, sizeof(zones_size));
-
 	num_physpages = 0;
 	efi_memmap_walk(count_pages, &num_physpages);
 
 	max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT;
 
 #ifdef CONFIG_VIRTUAL_MEM_MAP
-	memset(zholes_size, 0, sizeof(zholes_size));
-
-	num_dma_physpages = 0;
-	efi_memmap_walk(count_dma_pages, &num_dma_physpages);
-
-	if (max_low_pfn < max_dma) {
-		zones_size[ZONE_DMA] = max_low_pfn;
-		zholes_size[ZONE_DMA] = max_low_pfn - num_dma_physpages;
-	} else {
-		zones_size[ZONE_DMA] = max_dma;
-		zholes_size[ZONE_DMA] = max_dma - num_dma_physpages;
-		if (num_physpages > num_dma_physpages) {
-			zones_size[ZONE_NORMAL] = max_low_pfn - max_dma;
-			zholes_size[ZONE_NORMAL] =
-				((max_low_pfn - max_dma) -
-				 (num_physpages - num_dma_physpages));
-		}
-	}
-
 	max_gap = 0;
+	efi_memmap_walk(register_active_ranges, &nid);
 	efi_memmap_walk(find_largest_hole, (u64 *)&max_gap);
 	if (max_gap < LARGE_GAP) {
 		vmem_map = (struct page *) 0;
-		free_area_init_node(0, NODE_DATA(0), zones_size, 0,
-				    zholes_size);
+		free_area_init_nodes(max_dma, max_dma,
+				max_low_pfn, max_low_pfn);
 	} else {
 		unsigned long map_size;
 
@@ -284,19 +245,14 @@ paging_init (void)
 		efi_memmap_walk(create_mem_map_page_table, NULL);
 
 		NODE_DATA(0)->node_mem_map = vmem_map;
-		free_area_init_node(0, NODE_DATA(0), zones_size,
-				    0, zholes_size);
+		free_area_init_nodes(max_dma, max_dma,
+				max_low_pfn, max_low_pfn);
 
 		printk("Virtual mem_map starts at 0x%p\n", mem_map);
 	}
 #else /* !CONFIG_VIRTUAL_MEM_MAP */
-	if (max_low_pfn < max_dma)
-		zones_size[ZONE_DMA] = max_low_pfn;
-	else {
-		zones_size[ZONE_DMA] = max_dma;
-		zones_size[ZONE_NORMAL] = max_low_pfn - max_dma;
-	}
-	free_area_init(zones_size);
+	add_active_range(0, 0, max_low_pfn);
+	free_area_init_nodes(max_dma, max_dma, max_low_pfn, max_low_pfn);
 #endif /* !CONFIG_VIRTUAL_MEM_MAP */
 	zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page));
 }
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/ia64/mm/discontig.c linux-2.6.17-rc3-mm1-105-ia64_use_init_nodes/arch/ia64/mm/discontig.c
--- linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/ia64/mm/discontig.c	2006-04-27 03:19:25.000000000 +0100
+++ linux-2.6.17-rc3-mm1-105-ia64_use_init_nodes/arch/ia64/mm/discontig.c	2006-05-01 11:43:32.000000000 +0100
@@ -700,6 +700,7 @@ static __init int count_node_pages(unsig
 {
 	unsigned long end = start + len;
 
+	add_active_range(node, start >> PAGE_SHIFT, end >> PAGE_SHIFT);
 	mem_data[node].num_physpages += len >> PAGE_SHIFT;
 	if (start <= __pa(MAX_DMA_ADDRESS))
 		mem_data[node].num_dma_physpages +=
@@ -724,9 +725,8 @@ static __init int count_node_pages(unsig
 void __init paging_init(void)
 {
 	unsigned long max_dma;
-	unsigned long zones_size[MAX_NR_ZONES];
-	unsigned long zholes_size[MAX_NR_ZONES];
 	unsigned long pfn_offset = 0;
+	unsigned long max_pfn = 0;
 	int node;
 
 	max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT;
@@ -743,46 +743,17 @@ void __init paging_init(void)
 #endif
 
 	for_each_online_node(node) {
-		memset(zones_size, 0, sizeof(zones_size));
-		memset(zholes_size, 0, sizeof(zholes_size));
-
 		num_physpages += mem_data[node].num_physpages;
-
-		if (mem_data[node].min_pfn >= max_dma) {
-			/* All of this node's memory is above ZONE_DMA */
-			zones_size[ZONE_NORMAL] = mem_data[node].max_pfn -
-				mem_data[node].min_pfn;
-			zholes_size[ZONE_NORMAL] = mem_data[node].max_pfn -
-				mem_data[node].min_pfn -
-				mem_data[node].num_physpages;
-		} else if (mem_data[node].max_pfn < max_dma) {
-			/* All of this node's memory is in ZONE_DMA */
-			zones_size[ZONE_DMA] = mem_data[node].max_pfn -
-				mem_data[node].min_pfn;
-			zholes_size[ZONE_DMA] = mem_data[node].max_pfn -
-				mem_data[node].min_pfn -
-				mem_data[node].num_dma_physpages;
-		} else {
-			/* This node has memory in both zones */
-			zones_size[ZONE_DMA] = max_dma -
-				mem_data[node].min_pfn;
-			zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] -
-				mem_data[node].num_dma_physpages;
-			zones_size[ZONE_NORMAL] = mem_data[node].max_pfn -
-				max_dma;
-			zholes_size[ZONE_NORMAL] = zones_size[ZONE_NORMAL] -
-				(mem_data[node].num_physpages -
-				 mem_data[node].num_dma_physpages);
-		}
-
 		pfn_offset = mem_data[node].min_pfn;
 
 #ifdef CONFIG_VIRTUAL_MEM_MAP
 		NODE_DATA(node)->node_mem_map = vmem_map + pfn_offset;
 #endif
-		free_area_init_node(node, NODE_DATA(node), zones_size,
-				    pfn_offset, zholes_size);
+		if (mem_data[node].max_pfn > max_pfn)
+			max_pfn = mem_data[node].max_pfn;
 	}
 
+	free_area_init_nodes(max_dma, max_dma, max_pfn, max_pfn);
+
 	zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page));
 }
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/ia64/mm/init.c linux-2.6.17-rc3-mm1-105-ia64_use_init_nodes/arch/ia64/mm/init.c
--- linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/arch/ia64/mm/init.c	2006-05-01 11:36:54.000000000 +0100
+++ linux-2.6.17-rc3-mm1-105-ia64_use_init_nodes/arch/ia64/mm/init.c	2006-05-01 11:43:32.000000000 +0100
@@ -539,6 +539,18 @@ find_largest_hole (u64 start, u64 end, v
 	last_end = end;
 	return 0;
 }
+
+int __init
+register_active_ranges(u64 start, u64 end, void *nid)
+{
+	BUG_ON(nid == NULL);
+	BUG_ON(*(unsigned long *)nid >= MAX_NUMNODES);
+
+	add_active_range(*(unsigned long *)nid,
+				__pa(start) >> PAGE_SHIFT,
+				__pa(end) >> PAGE_SHIFT);
+	return 0;
+}
 #endif /* CONFIG_VIRTUAL_MEM_MAP */
 
 static int __init
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/include/asm-ia64/meminit.h linux-2.6.17-rc3-mm1-105-ia64_use_init_nodes/include/asm-ia64/meminit.h
--- linux-2.6.17-rc3-mm1-104-x86_64_use_init_nodes/include/asm-ia64/meminit.h	2006-04-27 03:19:25.000000000 +0100
+++ linux-2.6.17-rc3-mm1-105-ia64_use_init_nodes/include/asm-ia64/meminit.h	2006-05-01 11:43:32.000000000 +0100
@@ -56,6 +56,7 @@ extern void efi_memmap_init(unsigned lon
   extern unsigned long vmalloc_end;
   extern struct page *vmem_map;
   extern int find_largest_hole (u64 start, u64 end, void *arg);
+  extern int register_active_ranges (u64 start, u64 end, void *arg);
   extern int create_mem_map_page_table (u64 start, u64 end, void *arg);
 #endif
 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 6/7] Break out memory initialisation code from page_alloc.c to mem_init.c
  2006-05-01 13:35 [PATCH 0/7] Sizing zones and holes in an architecture independent manner V5 Mel Gorman
                   ` (4 preceding siblings ...)
  2006-05-01 13:37 ` [PATCH 5/7] Have ia64 " Mel Gorman
@ 2006-05-01 13:37 ` Mel Gorman
  2006-05-01 13:37 ` [PATCH 7/7] Print out debugging information during initialisation Mel Gorman
  6 siblings, 0 replies; 11+ messages in thread
From: Mel Gorman @ 2006-05-01 13:37 UTC (permalink / raw)
  To: akpm, davej, tony.luck, linuxppc-dev, linux-kernel, bob.picco, ak,
	linux-mm
  Cc: Mel Gorman


page_alloc.c contains a large amount of memory initialisation code. This patch
breaks out the initialisation code to a separate file to make page_alloc.c
a bit easier to read.


 Makefile     |    2 
 mem_init.c   | 1121 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 page_alloc.c | 1104 -----------------------------------------------------
 3 files changed, 1122 insertions(+), 1105 deletions(-)

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-105-ia64_use_init_nodes/mm/Makefile linux-2.6.17-rc3-mm1-106-breakout_mem_init/mm/Makefile
--- linux-2.6.17-rc3-mm1-105-ia64_use_init_nodes/mm/Makefile	2006-05-01 11:37:01.000000000 +0100
+++ linux-2.6.17-rc3-mm1-106-breakout_mem_init/mm/Makefile	2006-05-01 11:44:35.000000000 +0100
@@ -8,7 +8,7 @@ mmu-$(CONFIG_MMU)	:= fremap.o highmem.o 
 			   vmalloc.o
 
 obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
-			   page_alloc.o page-writeback.o pdflush.o \
+			   page_alloc.o mem_init.o page-writeback.o pdflush.o \
 			   readahead.o swap.o truncate.o vmscan.o \
 			   prio_tree.o util.o mmzone.o $(mmu-y)
 
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-105-ia64_use_init_nodes/mm/mem_init.c linux-2.6.17-rc3-mm1-106-breakout_mem_init/mm/mem_init.c
--- linux-2.6.17-rc3-mm1-105-ia64_use_init_nodes/mm/mem_init.c	2006-05-01 11:51:50.000000000 +0100
+++ linux-2.6.17-rc3-mm1-106-breakout_mem_init/mm/mem_init.c	2006-05-01 11:51:50.000000000 +0100
@@ -0,0 +1,1121 @@
+/*
+ * mm/mem_init.c
+ * Initialises the architecture independent view of memory. pgdats, zones, etc
+ *
+ *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *  Copyright (C) 1995, Stephen Tweedie
+ *  Copyright (C) July 1999, Gerhard Wichert, Siemens AG
+ *  Copyright (C) 1999, Ingo Molnar, Red Hat
+ *  Copyright (C) 1999, 2000, Kanoj Sarcar, SGI
+ *  Copyright (C) Sept 2000, Martin J. Bligh
+ *	(lots of bits borrowed from Ingo Molnar & Andrew Morton)
+ *  Copyright (C) Apr 2006, Mel Gorman, IBM
+ *	(lots of bits taken from architecture-specific code)
+ */
+#include <linux/config.h>
+#include <linux/sort.h>
+#include <linux/pfn.h>
+#include <linux/mm.h>
+#include <linux/bootmem.h>
+#include <linux/cpuset.h>
+#include <linux/mempolicy.h>
+#include <linux/sysctl.h>
+#include <linux/swap.h>
+#include <linux/cpu.h>
+#include <linux/stop_machine.h>
+
+static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
+int percpu_pagelist_fraction;
+
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+  #ifdef CONFIG_MAX_ACTIVE_REGIONS
+    #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
+  #else
+    #define MAX_ACTIVE_REGIONS (MAX_NR_ZONES * MAX_NUMNODES + 1)
+  #endif
+
+  struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS];
+  unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
+  unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
+#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+
+/*
+ * Builds allocation fallback zone lists.
+ *
+ * Add all populated zones of a node to the zonelist.
+ */
+static int __meminit build_zonelists_node(pg_data_t *pgdat,
+			struct zonelist *zonelist, int nr_zones, int zone_type)
+{
+	struct zone *zone;
+
+	BUG_ON(zone_type > ZONE_HIGHMEM);
+
+	do {
+		zone = pgdat->node_zones + zone_type;
+		if (populated_zone(zone)) {
+#ifndef CONFIG_HIGHMEM
+			BUG_ON(zone_type > ZONE_NORMAL);
+#endif
+			zonelist->zones[nr_zones++] = zone;
+			check_highest_zone(zone_type);
+		}
+		zone_type--;
+
+	} while (zone_type >= 0);
+	return nr_zones;
+}
+
+static inline int highest_zone(int zone_bits)
+{
+	int res = ZONE_NORMAL;
+	if (zone_bits & (__force int)__GFP_HIGHMEM)
+		res = ZONE_HIGHMEM;
+	if (zone_bits & (__force int)__GFP_DMA32)
+		res = ZONE_DMA32;
+	if (zone_bits & (__force int)__GFP_DMA)
+		res = ZONE_DMA;
+	return res;
+}
+
+#ifdef CONFIG_NUMA
+#define MAX_NODE_LOAD (num_online_nodes())
+static int __meminitdata node_load[MAX_NUMNODES];
+/**
+ * find_next_best_node - find the next node that should appear in a given node's fallback list
+ * @node: node whose fallback list we're appending
+ * @used_node_mask: nodemask_t of already used nodes
+ *
+ * We use a number of factors to determine which is the next node that should
+ * appear on a given node's fallback list.  The node should not have appeared
+ * already in @node's fallback list, and it should be the next closest node
+ * according to the distance array (which contains arbitrary distance values
+ * from each node to each node in the system), and should also prefer nodes
+ * with no CPUs, since presumably they'll have very little allocation pressure
+ * on them otherwise.
+ * It returns -1 if no node is found.
+ */
+static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
+{
+	int n, val;
+	int min_val = INT_MAX;
+	int best_node = -1;
+
+	/* Use the local node if we haven't already */
+	if (!node_isset(node, *used_node_mask)) {
+		node_set(node, *used_node_mask);
+		return node;
+	}
+
+	for_each_online_node(n) {
+		cpumask_t tmp;
+
+		/* Don't want a node to appear more than once */
+		if (node_isset(n, *used_node_mask))
+			continue;
+
+		/* Use the distance array to find the distance */
+		val = node_distance(node, n);
+
+		/* Penalize nodes under us ("prefer the next node") */
+		val += (n < node);
+
+		/* Give preference to headless and unused nodes */
+		tmp = node_to_cpumask(n);
+		if (!cpus_empty(tmp))
+			val += PENALTY_FOR_NODE_WITH_CPUS;
+
+		/* Slight preference for less loaded node */
+		val *= (MAX_NODE_LOAD*MAX_NUMNODES);
+		val += node_load[n];
+
+		if (val < min_val) {
+			min_val = val;
+			best_node = n;
+		}
+	}
+
+	if (best_node >= 0)
+		node_set(best_node, *used_node_mask);
+
+	return best_node;
+}
+
+static void __meminit build_zonelists(pg_data_t *pgdat)
+{
+	int i, j, k, node, local_node;
+	int prev_node, load;
+	struct zonelist *zonelist;
+	nodemask_t used_mask;
+
+	/* initialize zonelists */
+	for (i = 0; i < GFP_ZONETYPES; i++) {
+		zonelist = pgdat->node_zonelists + i;
+		zonelist->zones[0] = NULL;
+	}
+
+	/* NUMA-aware ordering of nodes */
+	local_node = pgdat->node_id;
+	load = num_online_nodes();
+	prev_node = local_node;
+	nodes_clear(used_mask);
+	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
+		int distance = node_distance(local_node, node);
+
+		/*
+		 * If another node is sufficiently far away then it is better
+		 * to reclaim pages in a zone before going off node.
+		 */
+		if (distance > RECLAIM_DISTANCE)
+			zone_reclaim_mode = 1;
+
+		/*
+		 * We don't want to pressure a particular node.
+		 * So adding penalty to the first node in same
+		 * distance group to make it round-robin.
+		 */
+
+		if (distance != node_distance(local_node, prev_node))
+			node_load[node] += load;
+		prev_node = node;
+		load--;
+		for (i = 0; i < GFP_ZONETYPES; i++) {
+			zonelist = pgdat->node_zonelists + i;
+			for (j = 0; zonelist->zones[j] != NULL; j++);
+
+			k = highest_zone(i);
+
+	 		j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
+			zonelist->zones[j] = NULL;
+		}
+	}
+}
+
+#else	/* CONFIG_NUMA */
+
+static void __meminit build_zonelists(pg_data_t *pgdat)
+{
+	int i, j, k, node, local_node;
+
+	local_node = pgdat->node_id;
+	for (i = 0; i < GFP_ZONETYPES; i++) {
+		struct zonelist *zonelist;
+
+		zonelist = pgdat->node_zonelists + i;
+
+		j = 0;
+		k = highest_zone(i);
+ 		j = build_zonelists_node(pgdat, zonelist, j, k);
+ 		/*
+ 		 * Now we build the zonelist so that it contains the zones
+ 		 * of all the other nodes.
+ 		 * We don't want to pressure a particular node, so when
+ 		 * building the zones for node N, we make sure that the
+ 		 * zones coming right after the local ones are those from
+ 		 * node N+1 (modulo N)
+ 		 */
+		for (node = local_node + 1; node < MAX_NUMNODES; node++) {
+			if (!node_online(node))
+				continue;
+			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
+		}
+		for (node = 0; node < local_node; node++) {
+			if (!node_online(node))
+				continue;
+			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
+		}
+
+		zonelist->zones[j] = NULL;
+	}
+}
+
+#endif	/* CONFIG_NUMA */
+
+/* return values int ....just for stop_machine_run() */
+static int __meminit __build_all_zonelists(void *dummy)
+{
+	int nid;
+	for_each_online_node(nid)
+		build_zonelists(NODE_DATA(nid));
+	return 0;
+}
+
+void __meminit build_all_zonelists(void)
+{
+	if (system_state == SYSTEM_BOOTING) {
+		__build_all_zonelists(0);
+		cpuset_init_current_mems_allowed();
+	} else {
+		/* we have to stop all cpus to guarantee there is no user
+		   of zonelist */
+		stop_machine_run(__build_all_zonelists, NULL, NR_CPUS);
+		/* cpuset refresh routine should be here */
+	}
+
+	printk("Built %i zonelists\n", num_online_nodes());
+
+}
+
+/*
+ * Helper functions to size the waitqueue hash table.
+ * Essentially these want to choose hash table sizes sufficiently
+ * large so that collisions trying to wait on pages are rare.
+ * But in fact, the number of active page waitqueues on typical
+ * systems is ridiculously low, less than 200. So this is even
+ * conservative, even though it seems large.
+ *
+ * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
+ * waitqueues, i.e. the size of the waitq table given the number of pages.
+ */
+#define PAGES_PER_WAITQUEUE	256
+
+#ifndef CONFIG_MEMORY_HOTPLUG
+static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
+{
+	unsigned long size = 1;
+
+	pages /= PAGES_PER_WAITQUEUE;
+
+	while (size < pages)
+		size <<= 1;
+
+	/*
+	 * Once we have dozens or even hundreds of threads sleeping
+	 * on IO we've got bigger problems than wait queue collision.
+	 * Limit the size of the wait table to a reasonable size.
+	 */
+	size = min(size, 4096UL);
+
+	return max(size, 4UL);
+}
+#else
+/*
+ * A zone's size might be changed by hot-add, so it is not possible to determine
+ * a suitable size for its wait_table.  So we use the maximum size now.
+ *
+ * The max wait table size = 4096 x sizeof(wait_queue_head_t).   ie:
+ *
+ *    i386 (preemption config)    : 4096 x 16 = 64Kbyte.
+ *    ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
+ *    ia64, x86-64 (preemption)   : 4096 x 24 = 96Kbyte.
+ *
+ * The maximum entries are prepared when a zone's memory is (512K + 256) pages
+ * or more by the traditional way. (See above).  It equals:
+ *
+ *    i386, x86-64, powerpc(4K page size) : =  ( 2G + 1M)byte.
+ *    ia64(16K page size)                 : =  ( 8G + 4M)byte.
+ *    powerpc (64K page size)             : =  (32G +16M)byte.
+ */
+static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
+{
+	return 4096UL;
+}
+#endif
+
+/*
+ * This is an integer logarithm so that shifts can be used later
+ * to extract the more random high bits from the multiplicative
+ * hash function before the remainder is taken.
+ */
+static inline unsigned long wait_table_bits(unsigned long size)
+{
+	return ffz(~size);
+}
+
+#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
+
+#ifndef __HAVE_ARCH_MEMMAP_INIT
+#define memmap_init(size, nid, zone, start_pfn) \
+	memmap_init_zone((size), (nid), (zone), (start_pfn))
+#endif
+
+/*
+ * Initially all pages are reserved - free ones are freed
+ * up by free_all_bootmem() once the early boot process is
+ * done. Non-atomic initialization, single-pass.
+ */
+void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
+		unsigned long start_pfn)
+{
+	struct page *page;
+	unsigned long end_pfn = start_pfn + size;
+	unsigned long pfn;
+
+	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+		if (!early_pfn_valid(pfn))
+			continue;
+		page = pfn_to_page(pfn);
+		set_page_links(page, zone, nid, pfn);
+		init_page_count(page);
+		reset_page_mapcount(page);
+		SetPageReserved(page);
+		INIT_LIST_HEAD(&page->lru);
+#ifdef WANT_PAGE_VIRTUAL
+		/* The shift won't overflow because ZONE_NORMAL is below 4G. */
+		if (!is_highmem_idx(zone))
+			set_page_address(page, __va(pfn << PAGE_SHIFT));
+#endif
+	}
+}
+
+void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
+				unsigned long size)
+{
+	int order;
+	for (order = 0; order < MAX_ORDER ; order++) {
+		INIT_LIST_HEAD(&zone->free_area[order].free_list);
+		zone->free_area[order].nr_free = 0;
+	}
+}
+
+#define ZONETABLE_INDEX(x, zone_nr)	((x << ZONES_SHIFT) | zone_nr)
+void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
+		unsigned long size)
+{
+	unsigned long snum = pfn_to_section_nr(pfn);
+	unsigned long end = pfn_to_section_nr(pfn + size);
+
+	if (FLAGS_HAS_NODE)
+		zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
+	else
+		for (; snum <= end; snum++)
+			zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
+}
+
+static __meminit
+int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
+{
+	int i;
+	struct pglist_data *pgdat = zone->zone_pgdat;
+	size_t alloc_size;
+
+	/*
+	 * The per-page waitqueue mechanism uses hashed waitqueues
+	 * per zone.
+	 */
+	zone->wait_table_hash_nr_entries =
+		 wait_table_hash_nr_entries(zone_size_pages);
+	zone->wait_table_bits =
+		wait_table_bits(zone->wait_table_hash_nr_entries);
+	alloc_size = zone->wait_table_hash_nr_entries
+					* sizeof(wait_queue_head_t);
+
+ 	if (system_state == SYSTEM_BOOTING) {
+		zone->wait_table = (wait_queue_head_t *)
+			alloc_bootmem_node(pgdat, alloc_size);
+	} else {
+		/*
+		 * This case means that a zone whose size was 0 gets new memory
+		 * via memory hot-add.
+		 * But it may be the case that a new node was hot-added.  In
+		 * this case vmalloc() will not be able to use this new node's
+		 * memory - this wait_table must be initialized to use this new
+		 * node itself as well.
+		 * To use this new node's memory, further consideration will be
+		 * necessary.
+		 */
+		zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size);
+	}
+	if (!zone->wait_table)
+		return -ENOMEM;
+
+	for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
+		init_waitqueue_head(zone->wait_table + i);
+
+	return 0;
+}
+
+/*
+ * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
+ * to the value high for the pageset p.
+ */
+static void setup_pagelist_highmark(struct per_cpu_pageset *p,
+				unsigned long high)
+{
+	struct per_cpu_pages *pcp;
+
+	pcp = &p->pcp[0]; /* hot list */
+	pcp->high = high;
+	pcp->batch = max(1UL, high/4);
+	if ((high/4) > (PAGE_SHIFT * 8))
+		pcp->batch = PAGE_SHIFT * 8;
+}
+
+/*
+ * percpu_pagelist_fraction - changes the pcp->high for each zone on each
+ * cpu.  It is the fraction of total pages in each zone that a hot per cpu pagelist
+ * can have before it gets flushed back to buddy allocator.
+ */
+int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
+	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+	struct zone *zone;
+	unsigned int cpu;
+	int ret;
+
+	ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+	if (!write || (ret == -EINVAL))
+		return ret;
+	for_each_zone(zone) {
+		for_each_online_cpu(cpu) {
+			unsigned long  high;
+			high = zone->present_pages / percpu_pagelist_fraction;
+			setup_pagelist_highmark(zone_pcp(zone, cpu), high);
+		}
+	}
+	return 0;
+}
+
+static int __cpuinit zone_batchsize(struct zone *zone)
+{
+	int batch;
+
+	/*
+	 * The per-cpu-pages pools are set to around 1000th of the
+	 * size of the zone.  But no more than 1/2 of a meg.
+	 *
+	 * OK, so we don't know how big the cache is.  So guess.
+	 */
+	batch = zone->present_pages / 1024;
+	if (batch * PAGE_SIZE > 512 * 1024)
+		batch = (512 * 1024) / PAGE_SIZE;
+	batch /= 4;		/* We effectively *= 4 below */
+	if (batch < 1)
+		batch = 1;
+
+	/*
+	 * Clamp the batch to a 2^n - 1 value. Having a power
+	 * of 2 value was found to be more likely to have
+	 * suboptimal cache aliasing properties in some cases.
+	 *
+	 * For example if 2 tasks are alternately allocating
+	 * batches of pages, one task can end up with a lot
+	 * of pages of one half of the possible page colors
+	 * and the other with pages of the other colors.
+	 */
+	batch = (1 << (fls(batch + batch/2)-1)) - 1;
+
+	return batch;
+}
+
+inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
+{
+	struct per_cpu_pages *pcp;
+
+	memset(p, 0, sizeof(*p));
+
+	pcp = &p->pcp[0];		/* hot */
+	pcp->count = 0;
+	pcp->high = 6 * batch;
+	pcp->batch = max(1UL, 1 * batch);
+	INIT_LIST_HEAD(&pcp->list);
+
+	pcp = &p->pcp[1];		/* cold*/
+	pcp->count = 0;
+	pcp->high = 2 * batch;
+	pcp->batch = max(1UL, batch/2);
+	INIT_LIST_HEAD(&pcp->list);
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * Boot pageset table. One per cpu which is going to be used for all
+ * zones and all nodes. The parameters will be set in such a way
+ * that an item put on a list will immediately be handed over to
+ * the buddy list. This is safe since pageset manipulation is done
+ * with interrupts disabled.
+ *
+ * Some NUMA counter updates may also be caught by the boot pagesets.
+ *
+ * The boot_pagesets must be kept even after bootup is complete for
+ * unused processors and/or zones. They do play a role for bootstrapping
+ * hotplugged processors.
+ *
+ * zoneinfo_show() and maybe other functions do
+ * not check if the processor is online before following the pageset pointer.
+ * Other parts of the kernel may not check if the zone is available.
+ */
+static struct per_cpu_pageset boot_pageset[NR_CPUS];
+
+/*
+ * Dynamically allocate memory for the
+ * per cpu pageset array in struct zone.
+ */
+static int __cpuinit process_zones(int cpu)
+{
+	struct zone *zone, *dzone;
+
+	for_each_zone(zone) {
+
+		zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
+					 GFP_KERNEL, cpu_to_node(cpu));
+		if (!zone_pcp(zone, cpu))
+			goto bad;
+
+		setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
+
+		if (percpu_pagelist_fraction)
+			setup_pagelist_highmark(zone_pcp(zone, cpu),
+			 	(zone->present_pages / percpu_pagelist_fraction));
+	}
+
+	return 0;
+bad:
+	for_each_zone(dzone) {
+		if (dzone == zone)
+			break;
+		kfree(zone_pcp(dzone, cpu));
+		zone_pcp(dzone, cpu) = NULL;
+	}
+	return -ENOMEM;
+}
+
+static inline void free_zone_pagesets(int cpu)
+{
+	struct zone *zone;
+
+	for_each_zone(zone) {
+		struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
+
+		zone_pcp(zone, cpu) = NULL;
+		kfree(pset);
+	}
+}
+
+static int pageset_cpuup_callback(struct notifier_block *nfb,
+		unsigned long action,
+		void *hcpu)
+{
+	int cpu = (long)hcpu;
+	int ret = NOTIFY_OK;
+
+	switch (action) {
+		case CPU_UP_PREPARE:
+			if (process_zones(cpu))
+				ret = NOTIFY_BAD;
+			break;
+		case CPU_UP_CANCELED:
+		case CPU_DEAD:
+			free_zone_pagesets(cpu);
+			break;
+		default:
+			break;
+	}
+	return ret;
+}
+
+static struct notifier_block pageset_notifier =
+	{ &pageset_cpuup_callback, NULL, 0 };
+
+void __init setup_per_cpu_pageset(void)
+{
+	int err;
+
+	/* Initialize per_cpu_pageset for cpu 0.
+	 * A cpuup callback will do this for every cpu
+	 * as it comes online
+	 */
+	err = process_zones(smp_processor_id());
+	BUG_ON(err);
+	register_cpu_notifier(&pageset_notifier);
+}
+#endif
+
+static __meminit void zone_pcp_init(struct zone *zone)
+{
+	int cpu;
+	unsigned long batch = zone_batchsize(zone);
+
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+#ifdef CONFIG_NUMA
+		/* Early boot. Slab allocator not functional yet */
+		zone_pcp(zone, cpu) = &boot_pageset[cpu];
+		setup_pageset(&boot_pageset[cpu],0);
+#else
+		setup_pageset(zone_pcp(zone,cpu), batch);
+#endif
+	}
+	if (zone->present_pages)
+		printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
+			zone->name, zone->present_pages, batch);
+}
+
+__meminit int init_currently_empty_zone(struct zone *zone,
+					unsigned long zone_start_pfn,
+					unsigned long size)
+{
+	struct pglist_data *pgdat = zone->zone_pgdat;
+	int ret;
+	ret = zone_wait_table_init(zone, size);
+	if (ret)
+		return ret;
+	pgdat->nr_zones = zone_idx(zone) + 1;
+
+	zone->zone_start_pfn = zone_start_pfn;
+
+	memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
+
+	zone_init_free_lists(pgdat, zone, zone->spanned_pages);
+
+	return 0;
+}
+
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+/* Note: nid == MAX_NUMNODES returns first region */
+static int __init first_active_region_index_in_nid(int nid)
+{
+	int i;
+	for (i = 0; early_node_map[i].end_pfn; i++) {
+		if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
+			return i;
+	}
+
+	return MAX_ACTIVE_REGIONS;
+}
+
+/* Note: nid == MAX_NUMNODES returns next region */
+static int __init next_active_region_index_in_nid(unsigned int index, int nid)
+{
+	for (index = index + 1; early_node_map[index].end_pfn; index++) {
+		if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
+			return index;
+	}
+
+	return MAX_ACTIVE_REGIONS;
+}
+
+#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
+int __init early_pfn_to_nid(unsigned long pfn)
+{
+	int i;
+
+	for (i = 0; early_node_map[i].end_pfn; i++) {
+		unsigned long start_pfn = early_node_map[i].start_pfn;
+		unsigned long end_pfn = early_node_map[i].end_pfn;
+
+		if ((start_pfn <= pfn) && (pfn < end_pfn))
+			return early_node_map[i].nid;
+	}
+
+	return -1;
+}
+#endif
+
+#define for_each_active_range_index_in_nid(i, nid) \
+	for (i = first_active_region_index_in_nid(nid); \
+				i != MAX_ACTIVE_REGIONS; \
+				i = next_active_region_index_in_nid(i, nid))
+
+void __init free_bootmem_with_active_regions(int nid,
+						unsigned long max_low_pfn)
+{
+	unsigned int i;
+	for_each_active_range_index_in_nid(i, nid) {
+		unsigned long size_pages = 0;
+		unsigned long end_pfn = early_node_map[i].end_pfn;
+		if (early_node_map[i].start_pfn >= max_low_pfn)
+			continue;
+
+		if (end_pfn > max_low_pfn)
+			end_pfn = max_low_pfn;
+
+		size_pages = end_pfn - early_node_map[i].start_pfn;
+		free_bootmem_node(NODE_DATA(early_node_map[i].nid),
+				PFN_PHYS(early_node_map[i].start_pfn),
+				size_pages << PAGE_SHIFT);
+	}
+}
+
+void __init sparse_memory_present_with_active_regions(int nid)
+{
+	unsigned int i;
+	for_each_active_range_index_in_nid(i, nid)
+		memory_present(early_node_map[i].nid,
+				early_node_map[i].start_pfn,
+				early_node_map[i].end_pfn);
+}
+
+void __init get_pfn_range_for_nid(unsigned int nid,
+			unsigned long *start_pfn, unsigned long *end_pfn)
+{
+	unsigned int i;
+	*start_pfn = -1UL;
+	*end_pfn = 0;
+
+	for_each_active_range_index_in_nid(i, nid) {
+		*start_pfn = min(*start_pfn, early_node_map[i].start_pfn);
+		*end_pfn = max(*end_pfn, early_node_map[i].end_pfn);
+	}
+
+	if (*start_pfn == -1UL) {
+		printk(KERN_WARNING "Node %u active with no memory\n", nid);
+		*start_pfn = 0;
+	}
+}
+
+unsigned long __init zone_present_pages_in_node(int nid,
+					unsigned long zone_type,
+					unsigned long *ignored)
+{
+	unsigned long node_start_pfn, node_end_pfn;
+	unsigned long zone_start_pfn, zone_end_pfn;
+
+	/* Get the start and end of the node and zone */
+	get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
+	zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
+	zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
+
+	/* Check that this node has pages within the zone's required range */
+	if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
+		return 0;
+
+	/* Move the zone boundaries inside the node if necessary */
+	zone_end_pfn = min(zone_end_pfn, node_end_pfn);
+	zone_start_pfn = max(zone_start_pfn, node_start_pfn);
+
+	/* Return the spanned pages */
+	return zone_end_pfn - zone_start_pfn;
+}
+
+unsigned long __init __absent_pages_in_range(int nid,
+				unsigned long range_start_pfn,
+				unsigned long range_end_pfn)
+{
+	int i = 0;
+	unsigned long prev_end_pfn = 0, hole_pages = 0;
+	unsigned long start_pfn;
+
+	/* Find the end_pfn of the first active range of pfns in the node */
+	i = first_active_region_index_in_nid(nid);
+	if (i == MAX_ACTIVE_REGIONS)
+		return 0;
+	prev_end_pfn = early_node_map[i].start_pfn;
+
+	/* Find all holes for the zone within the node */
+	for (; i != MAX_ACTIVE_REGIONS;
+			i = next_active_region_index_in_nid(i, nid)) {
+
+		/* No need to continue if prev_end_pfn is outside the zone */
+		if (prev_end_pfn >= range_end_pfn)
+			break;
+
+		/* Make sure the end of the zone is not within the hole */
+		start_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
+		prev_end_pfn = max(prev_end_pfn, range_start_pfn);
+
+		/* Update the hole size count and move on */
+		if (start_pfn > range_start_pfn) {
+			BUG_ON(prev_end_pfn > start_pfn);
+			hole_pages += start_pfn - prev_end_pfn;
+		}
+		prev_end_pfn = early_node_map[i].end_pfn;
+	}
+
+	return hole_pages;
+}
+
+unsigned long __init absent_pages_in_range(unsigned long start_pfn,
+							unsigned long end_pfn)
+{
+	return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
+}
+
+unsigned long __init zone_absent_pages_in_node(int nid,
+					unsigned long zone_type,
+					unsigned long *ignored)
+{
+	return __absent_pages_in_range(nid,
+				arch_zone_lowest_possible_pfn[zone_type],
+				arch_zone_highest_possible_pfn[zone_type]);
+}
+#else
+static inline unsigned long zone_present_pages_in_node(int nid,
+					unsigned long zone_type,
+					unsigned long *zones_size)
+{
+	return zones_size[zone_type];
+}
+
+static inline unsigned long zone_absent_pages_in_node(int nid,
+						unsigned long zone_type,
+						unsigned long *zholes_size)
+{
+	if (!zholes_size)
+		return 0;
+
+	return zholes_size[zone_type];
+}
+#endif
+
+static void __init calculate_node_totalpages(struct pglist_data *pgdat,
+		unsigned long *zones_size, unsigned long *zholes_size)
+{
+	unsigned long realtotalpages, totalpages = 0;
+	int i;
+
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		totalpages += zone_present_pages_in_node(pgdat->node_id, i,
+								zones_size);
+	}
+	pgdat->node_spanned_pages = totalpages;
+
+	realtotalpages = totalpages;
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		realtotalpages -=
+			zone_absent_pages_in_node(pgdat->node_id, i, zholes_size);
+	}
+	pgdat->node_present_pages = realtotalpages;
+	printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
+							realtotalpages);
+}
+
+/*
+ * Set up the zone data structures:
+ *   - mark all pages reserved
+ *   - mark all memory queues empty
+ *   - clear the memory bitmaps
+ */
+static void __meminit free_area_init_core(struct pglist_data *pgdat,
+		unsigned long *zones_size, unsigned long *zholes_size)
+{
+	unsigned long j;
+	int nid = pgdat->node_id;
+	unsigned long zone_start_pfn = pgdat->node_start_pfn;
+	int ret;
+
+	pgdat_resize_init(pgdat);
+	pgdat->nr_zones = 0;
+	init_waitqueue_head(&pgdat->kswapd_wait);
+	pgdat->kswapd_max_order = 0;
+
+	for (j = 0; j < MAX_NR_ZONES; j++) {
+		struct zone *zone = pgdat->node_zones + j;
+		unsigned long size, realsize;
+
+		size = zone_present_pages_in_node(nid, j, zones_size);
+		realsize = size - zone_absent_pages_in_node(nid, j,
+								zholes_size);
+		if (j < ZONE_HIGHMEM)
+			nr_kernel_pages += realsize;
+		nr_all_pages += realsize;
+
+		zone->spanned_pages = size;
+		zone->present_pages = realsize;
+		zone->name = zone_names[j];
+		spin_lock_init(&zone->lock);
+		spin_lock_init(&zone->lru_lock);
+		zone_seqlock_init(zone);
+		zone->zone_pgdat = pgdat;
+		zone->free_pages = 0;
+
+		zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
+
+		zone_pcp_init(zone);
+		INIT_LIST_HEAD(&zone->active_list);
+		INIT_LIST_HEAD(&zone->inactive_list);
+		zone->nr_scan_active = 0;
+		zone->nr_scan_inactive = 0;
+		zone->nr_active = 0;
+		zone->nr_inactive = 0;
+		atomic_set(&zone->reclaim_in_progress, 0);
+		if (!size)
+			continue;
+
+		zonetable_add(zone, nid, j, zone_start_pfn, size);
+		ret = init_currently_empty_zone(zone, zone_start_pfn, size);
+		BUG_ON(ret);
+		zone_start_pfn += size;
+	}
+}
+
+static void __init alloc_node_mem_map(struct pglist_data *pgdat)
+{
+	/* Skip empty nodes */
+	if (!pgdat->node_spanned_pages)
+		return;
+
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
+	/* ia64 gets its own node_mem_map, before this, without bootmem */
+	if (!pgdat->node_mem_map) {
+		unsigned long size;
+		struct page *map;
+
+		size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
+		map = alloc_remap(pgdat->node_id, size);
+		if (!map)
+			map = alloc_bootmem_node(pgdat, size);
+		pgdat->node_mem_map = map;
+	}
+#ifdef CONFIG_FLATMEM
+	/*
+	 * With no DISCONTIG, the global mem_map is just set as node 0's
+	 */
+	if (pgdat == NODE_DATA(0))
+		mem_map = NODE_DATA(0)->node_mem_map;
+#endif
+#endif /* CONFIG_FLAT_NODE_MEM_MAP */
+}
+
+void __meminit free_area_init_node(int nid, struct pglist_data *pgdat,
+		unsigned long *zones_size, unsigned long node_start_pfn,
+		unsigned long *zholes_size)
+{
+	pgdat->node_id = nid;
+	pgdat->node_start_pfn = node_start_pfn;
+	calculate_node_totalpages(pgdat, zones_size, zholes_size);
+
+	alloc_node_mem_map(pgdat);
+
+	free_area_init_core(pgdat, zones_size, zholes_size);
+}
+
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+void __init add_active_range(unsigned int nid, unsigned long start_pfn,
+						unsigned long end_pfn)
+{
+	unsigned int i;
+
+	/* Merge with existing active regions if possible */
+	for (i = 0; early_node_map[i].end_pfn; i++) {
+		if (early_node_map[i].nid != nid)
+			continue;
+
+		/* Skip if an existing region covers this new one */
+		if (start_pfn >= early_node_map[i].start_pfn &&
+				end_pfn <= early_node_map[i].end_pfn)
+			return;
+
+		/* Merge forward if suitable */
+		if (start_pfn <= early_node_map[i].end_pfn &&
+				end_pfn > early_node_map[i].end_pfn) {
+			early_node_map[i].end_pfn = end_pfn;
+			return;
+		}
+
+		/* Merge backward if suitable */
+		if (start_pfn < early_node_map[i].end_pfn &&
+				end_pfn >= early_node_map[i].start_pfn) {
+			early_node_map[i].start_pfn = start_pfn;
+			return;
+		}
+	}
+
+	/* Leave last entry NULL, we use range.end_pfn to terminate the walk */
+	if (i >= MAX_ACTIVE_REGIONS - 1) {
+		printk(KERN_ERR "Too many memory regions, truncating\n");
+		return;
+	}
+
+	early_node_map[i].nid = nid;
+	early_node_map[i].start_pfn = start_pfn;
+	early_node_map[i].end_pfn = end_pfn;
+}
+
+void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn,
+						unsigned long new_end_pfn)
+{
+	unsigned int i;
+
+	/* Find the old active region end and shrink */
+	for_each_active_range_index_in_nid(i, nid) {
+		if (early_node_map[i].end_pfn == old_end_pfn) {
+			early_node_map[i].end_pfn = new_end_pfn;
+			break;
+		}
+	}
+}
+
+void __init remove_all_active_ranges()
+{
+	memset(early_node_map, 0, sizeof(early_node_map));
+}
+
+/* Compare two active node_active_regions */
+static int __init cmp_node_active_region(const void *a, const void *b)
+{
+	struct node_active_region *arange = (struct node_active_region *)a;
+	struct node_active_region *brange = (struct node_active_region *)b;
+
+	/* Done this way to avoid overflows */
+	if (arange->start_pfn > brange->start_pfn)
+		return 1;
+	if (arange->start_pfn < brange->start_pfn)
+		return -1;
+
+	return 0;
+}
+
+/* sort the node_map by start_pfn */
+static void __init sort_node_map(void)
+{
+	size_t num = 0;
+	while (early_node_map[num].end_pfn)
+		num++;
+
+	sort(early_node_map, num, sizeof(struct node_active_region),
+						cmp_node_active_region, NULL);
+}
+
+/* Find the lowest pfn for a node. This depends on a sorted early_node_map */
+unsigned long __init find_min_pfn_for_node(unsigned long nid)
+{
+	int i;
+
+	/* Assuming a sorted map, the first range found has the starting pfn */
+	for_each_active_range_index_in_nid(i, nid)
+		return early_node_map[i].start_pfn;
+
+	printk(KERN_WARNING "Could not find start_pfn for node %lu\n", nid);
+	return 0;
+}
+
+unsigned long __init find_min_pfn_with_active_regions(void)
+{
+	return find_min_pfn_for_node(MAX_NUMNODES);
+}
+
+unsigned long __init find_max_pfn_with_active_regions(void)
+{
+	int i;
+	unsigned long max_pfn = 0;
+
+	for (i = 0; early_node_map[i].end_pfn; i++)
+		max_pfn = max(max_pfn, early_node_map[i].end_pfn);
+
+	return max_pfn;
+}
+
+void __init free_area_init_nodes(unsigned long arch_max_dma_pfn,
+				unsigned long arch_max_dma32_pfn,
+				unsigned long arch_max_low_pfn,
+				unsigned long arch_max_high_pfn)
+{
+	unsigned long nid;
+	int zone_index;
+
+	/* Record where the zone boundaries are */
+	memset(arch_zone_lowest_possible_pfn, 0,
+				sizeof(arch_zone_lowest_possible_pfn));
+	memset(arch_zone_highest_possible_pfn, 0,
+				sizeof(arch_zone_highest_possible_pfn));
+	arch_zone_lowest_possible_pfn[ZONE_DMA] =
+					find_min_pfn_with_active_regions();
+	arch_zone_highest_possible_pfn[ZONE_DMA] = arch_max_dma_pfn;
+	arch_zone_highest_possible_pfn[ZONE_DMA32] = arch_max_dma32_pfn;
+	arch_zone_highest_possible_pfn[ZONE_NORMAL] = arch_max_low_pfn;
+	arch_zone_highest_possible_pfn[ZONE_HIGHMEM] = arch_max_high_pfn;
+	for (zone_index = 1; zone_index < MAX_NR_ZONES; zone_index++) {
+		arch_zone_lowest_possible_pfn[zone_index] =
+			arch_zone_highest_possible_pfn[zone_index-1];
+	}
+
+	/* Regions in the early_node_map can be in any order */
+	sort_node_map();
+
+	for_each_online_node(nid) {
+		pg_data_t *pgdat = NODE_DATA(nid);
+		free_area_init_node(nid, pgdat, NULL,
+				find_min_pfn_for_node(nid), NULL);
+	}
+}
+#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-105-ia64_use_init_nodes/mm/page_alloc.c linux-2.6.17-rc3-mm1-106-breakout_mem_init/mm/page_alloc.c
--- linux-2.6.17-rc3-mm1-105-ia64_use_init_nodes/mm/page_alloc.c	2006-05-01 11:39:02.000000000 +0100
+++ linux-2.6.17-rc3-mm1-106-breakout_mem_init/mm/page_alloc.c	2006-05-01 11:44:35.000000000 +0100
@@ -38,8 +38,6 @@
 #include <linux/vmalloc.h>
 #include <linux/mempolicy.h>
 #include <linux/stop_machine.h>
-#include <linux/sort.h>
-#include <linux/pfn.h>
 
 #include <asm/tlbflush.h>
 #include "internal.h"
@@ -56,7 +54,6 @@ unsigned long totalram_pages __read_most
 unsigned long totalhigh_pages __read_mostly;
 unsigned long totalreserve_pages __read_mostly;
 long nr_swap_pages;
-int percpu_pagelist_fraction;
 
 static void __free_pages_ok(struct page *page, unsigned int order);
 
@@ -82,24 +79,11 @@ EXPORT_SYMBOL(totalram_pages);
 struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
 EXPORT_SYMBOL(zone_table);
 
-static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
 int min_free_kbytes = 1024;
 
 unsigned long __meminitdata nr_kernel_pages;
 unsigned long __meminitdata nr_all_pages;
 
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
-  #ifdef CONFIG_MAX_ACTIVE_REGIONS
-    #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
-  #else
-    #define MAX_ACTIVE_REGIONS (MAX_NR_ZONES * MAX_NUMNODES + 1)
-  #endif
-
-  struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS];
-  unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
-  unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
-#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
-
 #ifdef CONFIG_DEBUG_VM
 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
 {
@@ -1593,1068 +1577,6 @@ void show_free_areas(void)
 	show_swap_cache_info();
 }
 
-/*
- * Builds allocation fallback zone lists.
- *
- * Add all populated zones of a node to the zonelist.
- */
-static int __meminit build_zonelists_node(pg_data_t *pgdat,
-			struct zonelist *zonelist, int nr_zones, int zone_type)
-{
-	struct zone *zone;
-
-	BUG_ON(zone_type > ZONE_HIGHMEM);
-
-	do {
-		zone = pgdat->node_zones + zone_type;
-		if (populated_zone(zone)) {
-#ifndef CONFIG_HIGHMEM
-			BUG_ON(zone_type > ZONE_NORMAL);
-#endif
-			zonelist->zones[nr_zones++] = zone;
-			check_highest_zone(zone_type);
-		}
-		zone_type--;
-
-	} while (zone_type >= 0);
-	return nr_zones;
-}
-
-static inline int highest_zone(int zone_bits)
-{
-	int res = ZONE_NORMAL;
-	if (zone_bits & (__force int)__GFP_HIGHMEM)
-		res = ZONE_HIGHMEM;
-	if (zone_bits & (__force int)__GFP_DMA32)
-		res = ZONE_DMA32;
-	if (zone_bits & (__force int)__GFP_DMA)
-		res = ZONE_DMA;
-	return res;
-}
-
-#ifdef CONFIG_NUMA
-#define MAX_NODE_LOAD (num_online_nodes())
-static int __meminitdata node_load[MAX_NUMNODES];
-/**
- * find_next_best_node - find the next node that should appear in a given node's fallback list
- * @node: node whose fallback list we're appending
- * @used_node_mask: nodemask_t of already used nodes
- *
- * We use a number of factors to determine which is the next node that should
- * appear on a given node's fallback list.  The node should not have appeared
- * already in @node's fallback list, and it should be the next closest node
- * according to the distance array (which contains arbitrary distance values
- * from each node to each node in the system), and should also prefer nodes
- * with no CPUs, since presumably they'll have very little allocation pressure
- * on them otherwise.
- * It returns -1 if no node is found.
- */
-static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
-{
-	int n, val;
-	int min_val = INT_MAX;
-	int best_node = -1;
-
-	/* Use the local node if we haven't already */
-	if (!node_isset(node, *used_node_mask)) {
-		node_set(node, *used_node_mask);
-		return node;
-	}
-
-	for_each_online_node(n) {
-		cpumask_t tmp;
-
-		/* Don't want a node to appear more than once */
-		if (node_isset(n, *used_node_mask))
-			continue;
-
-		/* Use the distance array to find the distance */
-		val = node_distance(node, n);
-
-		/* Penalize nodes under us ("prefer the next node") */
-		val += (n < node);
-
-		/* Give preference to headless and unused nodes */
-		tmp = node_to_cpumask(n);
-		if (!cpus_empty(tmp))
-			val += PENALTY_FOR_NODE_WITH_CPUS;
-
-		/* Slight preference for less loaded node */
-		val *= (MAX_NODE_LOAD*MAX_NUMNODES);
-		val += node_load[n];
-
-		if (val < min_val) {
-			min_val = val;
-			best_node = n;
-		}
-	}
-
-	if (best_node >= 0)
-		node_set(best_node, *used_node_mask);
-
-	return best_node;
-}
-
-static void __meminit build_zonelists(pg_data_t *pgdat)
-{
-	int i, j, k, node, local_node;
-	int prev_node, load;
-	struct zonelist *zonelist;
-	nodemask_t used_mask;
-
-	/* initialize zonelists */
-	for (i = 0; i < GFP_ZONETYPES; i++) {
-		zonelist = pgdat->node_zonelists + i;
-		zonelist->zones[0] = NULL;
-	}
-
-	/* NUMA-aware ordering of nodes */
-	local_node = pgdat->node_id;
-	load = num_online_nodes();
-	prev_node = local_node;
-	nodes_clear(used_mask);
-	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
-		int distance = node_distance(local_node, node);
-
-		/*
-		 * If another node is sufficiently far away then it is better
-		 * to reclaim pages in a zone before going off node.
-		 */
-		if (distance > RECLAIM_DISTANCE)
-			zone_reclaim_mode = 1;
-
-		/*
-		 * We don't want to pressure a particular node.
-		 * So adding penalty to the first node in same
-		 * distance group to make it round-robin.
-		 */
-
-		if (distance != node_distance(local_node, prev_node))
-			node_load[node] += load;
-		prev_node = node;
-		load--;
-		for (i = 0; i < GFP_ZONETYPES; i++) {
-			zonelist = pgdat->node_zonelists + i;
-			for (j = 0; zonelist->zones[j] != NULL; j++);
-
-			k = highest_zone(i);
-
-	 		j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
-			zonelist->zones[j] = NULL;
-		}
-	}
-}
-
-#else	/* CONFIG_NUMA */
-
-static void __meminit build_zonelists(pg_data_t *pgdat)
-{
-	int i, j, k, node, local_node;
-
-	local_node = pgdat->node_id;
-	for (i = 0; i < GFP_ZONETYPES; i++) {
-		struct zonelist *zonelist;
-
-		zonelist = pgdat->node_zonelists + i;
-
-		j = 0;
-		k = highest_zone(i);
- 		j = build_zonelists_node(pgdat, zonelist, j, k);
- 		/*
- 		 * Now we build the zonelist so that it contains the zones
- 		 * of all the other nodes.
- 		 * We don't want to pressure a particular node, so when
- 		 * building the zones for node N, we make sure that the
- 		 * zones coming right after the local ones are those from
- 		 * node N+1 (modulo N)
- 		 */
-		for (node = local_node + 1; node < MAX_NUMNODES; node++) {
-			if (!node_online(node))
-				continue;
-			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
-		}
-		for (node = 0; node < local_node; node++) {
-			if (!node_online(node))
-				continue;
-			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
-		}
-
-		zonelist->zones[j] = NULL;
-	}
-}
-
-#endif	/* CONFIG_NUMA */
-
-/* return values int ....just for stop_machine_run() */
-static int __meminit __build_all_zonelists(void *dummy)
-{
-	int nid;
-	for_each_online_node(nid)
-		build_zonelists(NODE_DATA(nid));
-	return 0;
-}
-
-void __meminit build_all_zonelists(void)
-{
-	if (system_state == SYSTEM_BOOTING) {
-		__build_all_zonelists(0);
-		cpuset_init_current_mems_allowed();
-	} else {
-		/* we have to stop all cpus to guaranntee there is no user
-		   of zonelist */
-		stop_machine_run(__build_all_zonelists, NULL, NR_CPUS);
-		/* cpuset refresh routine should be here */
-	}
-
-	printk("Built %i zonelists\n", num_online_nodes());
-
-}
-
-/*
- * Helper functions to size the waitqueue hash table.
- * Essentially these want to choose hash table sizes sufficiently
- * large so that collisions trying to wait on pages are rare.
- * But in fact, the number of active page waitqueues on typical
- * systems is ridiculously low, less than 200. So this is even
- * conservative, even though it seems large.
- *
- * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
- * waitqueues, i.e. the size of the waitq table given the number of pages.
- */
-#define PAGES_PER_WAITQUEUE	256
-
-#ifndef CONFIG_MEMORY_HOTPLUG
-static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
-{
-	unsigned long size = 1;
-
-	pages /= PAGES_PER_WAITQUEUE;
-
-	while (size < pages)
-		size <<= 1;
-
-	/*
-	 * Once we have dozens or even hundreds of threads sleeping
-	 * on IO we've got bigger problems than wait queue collision.
-	 * Limit the size of the wait table to a reasonable size.
-	 */
-	size = min(size, 4096UL);
-
-	return max(size, 4UL);
-}
-#else
-/*
- * A zone's size might be changed by hot-add, so it is not possible to determine
- * a suitable size for its wait_table.  So we use the maximum size now.
- *
- * The max wait table size = 4096 x sizeof(wait_queue_head_t).   ie:
- *
- *    i386 (preemption config)    : 4096 x 16 = 64Kbyte.
- *    ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
- *    ia64, x86-64 (preemption)   : 4096 x 24 = 96Kbyte.
- *
- * The maximum entries are prepared when a zone's memory is (512K + 256) pages
- * or more by the traditional way. (See above).  It equals:
- *
- *    i386, x86-64, powerpc(4K page size) : =  ( 2G + 1M)byte.
- *    ia64(16K page size)                 : =  ( 8G + 4M)byte.
- *    powerpc (64K page size)             : =  (32G +16M)byte.
- */
-static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
-{
-	return 4096UL;
-}
-#endif
-
-/*
- * This is an integer logarithm so that shifts can be used later
- * to extract the more random high bits from the multiplicative
- * hash function before the remainder is taken.
- */
-static inline unsigned long wait_table_bits(unsigned long size)
-{
-	return ffz(~size);
-}
-
-#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
-
-/*
- * Initially all pages are reserved - free ones are freed
- * up by free_all_bootmem() once the early boot process is
- * done. Non-atomic initialization, single-pass.
- */
-void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
-		unsigned long start_pfn)
-{
-	struct page *page;
-	unsigned long end_pfn = start_pfn + size;
-	unsigned long pfn;
-
-	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
-		if (!early_pfn_valid(pfn))
-			continue;
-		page = pfn_to_page(pfn);
-		set_page_links(page, zone, nid, pfn);
-		init_page_count(page);
-		reset_page_mapcount(page);
-		SetPageReserved(page);
-		INIT_LIST_HEAD(&page->lru);
-#ifdef WANT_PAGE_VIRTUAL
-		/* The shift won't overflow because ZONE_NORMAL is below 4G. */
-		if (!is_highmem_idx(zone))
-			set_page_address(page, __va(pfn << PAGE_SHIFT));
-#endif
-#ifdef CONFIG_PAGE_OWNER
-		page->order = -1;
-#endif
-	}
-}
-
-void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
-				unsigned long size)
-{
-	int order;
-	for (order = 0; order < MAX_ORDER ; order++) {
-		INIT_LIST_HEAD(&zone->free_area[order].free_list);
-		zone->free_area[order].nr_free = 0;
-	}
-}
-
-#define ZONETABLE_INDEX(x, zone_nr)	((x << ZONES_SHIFT) | zone_nr)
-void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
-		unsigned long size)
-{
-	unsigned long snum = pfn_to_section_nr(pfn);
-	unsigned long end = pfn_to_section_nr(pfn + size);
-
-	if (FLAGS_HAS_NODE)
-		zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
-	else
-		for (; snum <= end; snum++)
-			zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
-}
-
-#ifndef __HAVE_ARCH_MEMMAP_INIT
-#define memmap_init(size, nid, zone, start_pfn) \
-	memmap_init_zone((size), (nid), (zone), (start_pfn))
-#endif
-
-static int __cpuinit zone_batchsize(struct zone *zone)
-{
-	int batch;
-
-	/*
-	 * The per-cpu-pages pools are set to around 1000th of the
-	 * size of the zone.  But no more than 1/2 of a meg.
-	 *
-	 * OK, so we don't know how big the cache is.  So guess.
-	 */
-	batch = zone->present_pages / 1024;
-	if (batch * PAGE_SIZE > 512 * 1024)
-		batch = (512 * 1024) / PAGE_SIZE;
-	batch /= 4;		/* We effectively *= 4 below */
-	if (batch < 1)
-		batch = 1;
-
-	/*
-	 * Clamp the batch to a 2^n - 1 value. Having a power
-	 * of 2 value was found to be more likely to have
-	 * suboptimal cache aliasing properties in some cases.
-	 *
-	 * For example if 2 tasks are alternately allocating
-	 * batches of pages, one task can end up with a lot
-	 * of pages of one half of the possible page colors
-	 * and the other with pages of the other colors.
-	 */
-	batch = (1 << (fls(batch + batch/2)-1)) - 1;
-
-	return batch;
-}
-
-inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
-{
-	struct per_cpu_pages *pcp;
-
-	memset(p, 0, sizeof(*p));
-
-	pcp = &p->pcp[0];		/* hot */
-	pcp->count = 0;
-	pcp->high = 6 * batch;
-	pcp->batch = max(1UL, 1 * batch);
-	INIT_LIST_HEAD(&pcp->list);
-
-	pcp = &p->pcp[1];		/* cold*/
-	pcp->count = 0;
-	pcp->high = 2 * batch;
-	pcp->batch = max(1UL, batch/2);
-	INIT_LIST_HEAD(&pcp->list);
-}
-
-/*
- * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
- * to the value high for the pageset p.
- */
-
-static void setup_pagelist_highmark(struct per_cpu_pageset *p,
-				unsigned long high)
-{
-	struct per_cpu_pages *pcp;
-
-	pcp = &p->pcp[0]; /* hot list */
-	pcp->high = high;
-	pcp->batch = max(1UL, high/4);
-	if ((high/4) > (PAGE_SHIFT * 8))
-		pcp->batch = PAGE_SHIFT * 8;
-}
-
-
-#ifdef CONFIG_NUMA
-/*
- * Boot pageset table. One per cpu which is going to be used for all
- * zones and all nodes. The parameters will be set in such a way
- * that an item put on a list will immediately be handed over to
- * the buddy list. This is safe since pageset manipulation is done
- * with interrupts disabled.
- *
- * Some NUMA counter updates may also be caught by the boot pagesets.
- *
- * The boot_pagesets must be kept even after bootup is complete for
- * unused processors and/or zones. They do play a role for bootstrapping
- * hotplugged processors.
- *
- * zoneinfo_show() and maybe other functions do
- * not check if the processor is online before following the pageset pointer.
- * Other parts of the kernel may not check if the zone is available.
- */
-static struct per_cpu_pageset boot_pageset[NR_CPUS];
-
-/*
- * Dynamically allocate memory for the
- * per cpu pageset array in struct zone.
- */
-static int __cpuinit process_zones(int cpu)
-{
-	struct zone *zone, *dzone;
-
-	for_each_zone(zone) {
-
-		zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
-					 GFP_KERNEL, cpu_to_node(cpu));
-		if (!zone_pcp(zone, cpu))
-			goto bad;
-
-		setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
-
-		if (percpu_pagelist_fraction)
-			setup_pagelist_highmark(zone_pcp(zone, cpu),
-			 	(zone->present_pages / percpu_pagelist_fraction));
-	}
-
-	return 0;
-bad:
-	for_each_zone(dzone) {
-		if (dzone == zone)
-			break;
-		kfree(zone_pcp(dzone, cpu));
-		zone_pcp(dzone, cpu) = NULL;
-	}
-	return -ENOMEM;
-}
-
-static inline void free_zone_pagesets(int cpu)
-{
-	struct zone *zone;
-
-	for_each_zone(zone) {
-		struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
-
-		zone_pcp(zone, cpu) = NULL;
-		kfree(pset);
-	}
-}
-
-static int pageset_cpuup_callback(struct notifier_block *nfb,
-		unsigned long action,
-		void *hcpu)
-{
-	int cpu = (long)hcpu;
-	int ret = NOTIFY_OK;
-
-	switch (action) {
-		case CPU_UP_PREPARE:
-			if (process_zones(cpu))
-				ret = NOTIFY_BAD;
-			break;
-		case CPU_UP_CANCELED:
-		case CPU_DEAD:
-			free_zone_pagesets(cpu);
-			break;
-		default:
-			break;
-	}
-	return ret;
-}
-
-static struct notifier_block pageset_notifier =
-	{ &pageset_cpuup_callback, NULL, 0 };
-
-void __init setup_per_cpu_pageset(void)
-{
-	int err;
-
-	/* Initialize per_cpu_pageset for cpu 0.
-	 * A cpuup callback will do this for every cpu
-	 * as it comes online
-	 */
-	err = process_zones(smp_processor_id());
-	BUG_ON(err);
-	register_cpu_notifier(&pageset_notifier);
-}
-
-#endif
-
-static __meminit
-int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
-{
-	int i;
-	struct pglist_data *pgdat = zone->zone_pgdat;
-	size_t alloc_size;
-
-	/*
-	 * The per-page waitqueue mechanism uses hashed waitqueues
-	 * per zone.
-	 */
-	zone->wait_table_hash_nr_entries =
-		 wait_table_hash_nr_entries(zone_size_pages);
-	zone->wait_table_bits =
-		wait_table_bits(zone->wait_table_hash_nr_entries);
-	alloc_size = zone->wait_table_hash_nr_entries
-					* sizeof(wait_queue_head_t);
-
- 	if (system_state == SYSTEM_BOOTING) {
-		zone->wait_table = (wait_queue_head_t *)
-			alloc_bootmem_node(pgdat, alloc_size);
-	} else {
-		/*
-		 * This case means that a zone whose size was 0 gets new memory
-		 * via memory hot-add.
-		 * But it may be the case that a new node was hot-added.  In
-		 * this case vmalloc() will not be able to use this new node's
-		 * memory - this wait_table must be initialized to use this new
-		 * node itself as well.
-		 * To use this new node's memory, further consideration will be
-		 * necessary.
-		 */
-		zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size);
-	}
-	if (!zone->wait_table)
-		return -ENOMEM;
-
-	for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
-		init_waitqueue_head(zone->wait_table + i);
-
-	return 0;
-}
-
-static __meminit void zone_pcp_init(struct zone *zone)
-{
-	int cpu;
-	unsigned long batch = zone_batchsize(zone);
-
-	for (cpu = 0; cpu < NR_CPUS; cpu++) {
-#ifdef CONFIG_NUMA
-		/* Early boot. Slab allocator not functional yet */
-		zone_pcp(zone, cpu) = &boot_pageset[cpu];
-		setup_pageset(&boot_pageset[cpu],0);
-#else
-		setup_pageset(zone_pcp(zone,cpu), batch);
-#endif
-	}
-	if (zone->present_pages)
-		printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
-			zone->name, zone->present_pages, batch);
-}
-
-__meminit int init_currently_empty_zone(struct zone *zone,
-					unsigned long zone_start_pfn,
-					unsigned long size)
-{
-	struct pglist_data *pgdat = zone->zone_pgdat;
-	int ret;
-	ret = zone_wait_table_init(zone, size);
-	if (ret)
-		return ret;
-	pgdat->nr_zones = zone_idx(zone) + 1;
-
-	zone->zone_start_pfn = zone_start_pfn;
-
-	memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
-
-	zone_init_free_lists(pgdat, zone, zone->spanned_pages);
-
-	return 0;
-}
-
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
-/* Note: nid == MAX_NUMNODES returns first region */
-static int __init first_active_region_index_in_nid(int nid)
-{
-	int i;
-	for (i = 0; early_node_map[i].end_pfn; i++) {
-		if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
-			return i;
-	}
-
-	return MAX_ACTIVE_REGIONS;
-}
-
-/* Note: nid == MAX_NUMNODES returns next region */
-static int __init next_active_region_index_in_nid(unsigned int index, int nid)
-{
-	for (index = index + 1; early_node_map[index].end_pfn; index++) {
-		if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
-			return index;
-	}
-
-	return MAX_ACTIVE_REGIONS;
-}
-
-#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
-int __init early_pfn_to_nid(unsigned long pfn)
-{
-	int i;
-
-	for (i = 0; early_node_map[i].end_pfn; i++) {
-		unsigned long start_pfn = early_node_map[i].start_pfn;
-		unsigned long end_pfn = early_node_map[i].end_pfn;
-
-		if ((start_pfn <= pfn) && (pfn < end_pfn))
-			return early_node_map[i].nid;
-	}
-
-	return -1;
-}
-#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
-
-#define for_each_active_range_index_in_nid(i, nid) \
-	for (i = first_active_region_index_in_nid(nid); \
-				i != MAX_ACTIVE_REGIONS; \
-				i = next_active_region_index_in_nid(i, nid))
-
-void __init free_bootmem_with_active_regions(int nid,
-						unsigned long max_low_pfn)
-{
-	unsigned int i;
-	for_each_active_range_index_in_nid(i, nid) {
-		unsigned long size_pages = 0;
-		unsigned long end_pfn = early_node_map[i].end_pfn;
-		if (early_node_map[i].start_pfn >= max_low_pfn)
-			continue;
-
-		if (end_pfn > max_low_pfn)
-			end_pfn = max_low_pfn;
-
-		size_pages = end_pfn - early_node_map[i].start_pfn;
-		free_bootmem_node(NODE_DATA(early_node_map[i].nid),
-				PFN_PHYS(early_node_map[i].start_pfn),
-				size_pages << PAGE_SHIFT);
-	}
-}
-
-void __init sparse_memory_present_with_active_regions(int nid)
-{
-	unsigned int i;
-	for_each_active_range_index_in_nid(i, nid)
-		memory_present(early_node_map[i].nid,
-				early_node_map[i].start_pfn,
-				early_node_map[i].end_pfn);
-}
-
-void __init get_pfn_range_for_nid(unsigned int nid,
-			unsigned long *start_pfn, unsigned long *end_pfn)
-{
-	unsigned int i;
-	*start_pfn = -1UL;
-	*end_pfn = 0;
-
-	for_each_active_range_index_in_nid(i, nid) {
-		*start_pfn = min(*start_pfn, early_node_map[i].start_pfn);
-		*end_pfn = max(*end_pfn, early_node_map[i].end_pfn);
-	}
-
-	if (*start_pfn == -1UL) {
-		printk(KERN_WARNING "Node %u active with no memory\n", nid);
-		*start_pfn = 0;
-	}
-}
-
-unsigned long __init zone_present_pages_in_node(int nid,
-					unsigned long zone_type,
-					unsigned long *ignored)
-{
-	unsigned long node_start_pfn, node_end_pfn;
-	unsigned long zone_start_pfn, zone_end_pfn;
-
-	/* Get the start and end of the node and zone */
-	get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
-	zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
-	zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
-
-	/* Check that this node has pages within the zone's required range */
-	if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
-		return 0;
-
-	/* Move the zone boundaries inside the node if necessary */
-	zone_end_pfn = min(zone_end_pfn, node_end_pfn);
-	zone_start_pfn = max(zone_start_pfn, node_start_pfn);
-
-	/* Return the spanned pages */
-	return zone_end_pfn - zone_start_pfn;
-}
-
-unsigned long __init __absent_pages_in_range(int nid,
-				unsigned long range_start_pfn,
-				unsigned long range_end_pfn)
-{
-	int i = 0;
-	unsigned long prev_end_pfn = 0, hole_pages = 0;
-	unsigned long start_pfn;
-
-	/* Find the end_pfn of the first active range of pfns in the node */
-	i = first_active_region_index_in_nid(nid);
-	if (i == MAX_ACTIVE_REGIONS)
-		return 0;
-	prev_end_pfn = early_node_map[i].start_pfn;
-
-	/* Find all holes for the zone within the node */
-	for (; i != MAX_ACTIVE_REGIONS;
-			i = next_active_region_index_in_nid(i, nid)) {
-
-		/* No need to continue if prev_end_pfn is outside the zone */
-		if (prev_end_pfn >= range_end_pfn)
-			break;
-
-		/* Make sure the end of the zone is not within the hole */
-		start_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
-		prev_end_pfn = max(prev_end_pfn, range_start_pfn);
-
-		/* Update the hole size cound and move on */
-		if (start_pfn > range_start_pfn) {
-			BUG_ON(prev_end_pfn > start_pfn);
-			hole_pages += start_pfn - prev_end_pfn;
-		}
-		prev_end_pfn = early_node_map[i].end_pfn;
-	}
-
-	return hole_pages;
-}
-
-unsigned long __init absent_pages_in_range(unsigned long start_pfn,
-							unsigned long end_pfn)
-{
-	return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
-}
-
-unsigned long __init zone_absent_pages_in_node(int nid,
-					unsigned long zone_type,
-					unsigned long *ignored)
-{
-	return __absent_pages_in_range(nid,
-				arch_zone_lowest_possible_pfn[zone_type],
-				arch_zone_highest_possible_pfn[zone_type]);
-}
-#else
-static inline unsigned long zone_present_pages_in_node(int nid,
-					unsigned long zone_type,
-					unsigned long *zones_size)
-{
-	return zones_size[zone_type];
-}
-
-static inline unsigned long zone_absent_pages_in_node(int nid,
-						unsigned long zone_type,
-						unsigned long *zholes_size)
-{
-	if (!zholes_size)
-		return 0;
-
-	return zholes_size[zone_type];
-}
-#endif
-
-static void __init calculate_node_totalpages(struct pglist_data *pgdat,
-		unsigned long *zones_size, unsigned long *zholes_size)
-{
-	unsigned long realtotalpages, totalpages = 0;
-	int i;
-
-	for (i = 0; i < MAX_NR_ZONES; i++) {
-		totalpages += zone_present_pages_in_node(pgdat->node_id, i,
-								zones_size);
-	}
-	pgdat->node_spanned_pages = totalpages;
-
-	realtotalpages = totalpages;
-	for (i = 0; i < MAX_NR_ZONES; i++) {
-		realtotalpages -=
-			zone_absent_pages_in_node(pgdat->node_id, i, zholes_size);
-	}
-	pgdat->node_present_pages = realtotalpages;
-	printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
-							realtotalpages);
-}
-
-/*
- * Set up the zone data structures:
- *   - mark all pages reserved
- *   - mark all memory queues empty
- *   - clear the memory bitmaps
- */
-static void __meminit free_area_init_core(struct pglist_data *pgdat,
-		unsigned long *zones_size, unsigned long *zholes_size)
-{
-	unsigned long j;
-	int nid = pgdat->node_id;
-	unsigned long zone_start_pfn = pgdat->node_start_pfn;
-	int ret;
-
-	pgdat_resize_init(pgdat);
-	pgdat->nr_zones = 0;
-	init_waitqueue_head(&pgdat->kswapd_wait);
-	pgdat->kswapd_max_order = 0;
-	
-	for (j = 0; j < MAX_NR_ZONES; j++) {
-		struct zone *zone = pgdat->node_zones + j;
-		unsigned long size, realsize;
-
-		size = zone_present_pages_in_node(nid, j, zones_size);
-		realsize = size - zone_absent_pages_in_node(nid, j,
-								zholes_size);
-		if (j < ZONE_HIGHMEM)
-			nr_kernel_pages += realsize;
-		nr_all_pages += realsize;
-
-		zone->spanned_pages = size;
-		zone->present_pages = realsize;
-		zone->name = zone_names[j];
-		spin_lock_init(&zone->lock);
-		spin_lock_init(&zone->lru_lock);
-		zone_seqlock_init(zone);
-		zone->zone_pgdat = pgdat;
-		zone->free_pages = 0;
-
-		zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
-
-		zone_pcp_init(zone);
-		INIT_LIST_HEAD(&zone->active_list);
-		INIT_LIST_HEAD(&zone->inactive_list);
-		zone->nr_scan_active = 0;
-		zone->nr_scan_inactive = 0;
-		zone->nr_active = 0;
-		zone->nr_inactive = 0;
-		atomic_set(&zone->reclaim_in_progress, 0);
-		if (!size)
-			continue;
-
-		zonetable_add(zone, nid, j, zone_start_pfn, size);
-		ret = init_currently_empty_zone(zone, zone_start_pfn, size);
-		BUG_ON(ret);
-		zone_start_pfn += size;
-	}
-}
-
-static void __init alloc_node_mem_map(struct pglist_data *pgdat)
-{
-	/* Skip empty nodes */
-	if (!pgdat->node_spanned_pages)
-		return;
-
-#ifdef CONFIG_FLAT_NODE_MEM_MAP
-	/* ia64 gets its own node_mem_map, before this, without bootmem */
-	if (!pgdat->node_mem_map) {
-		unsigned long size;
-		struct page *map;
-
-		size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
-		map = alloc_remap(pgdat->node_id, size);
-		if (!map)
-			map = alloc_bootmem_node(pgdat, size);
-		pgdat->node_mem_map = map;
-	}
-#ifdef CONFIG_FLATMEM
-	/*
-	 * With no DISCONTIG, the global mem_map is just set as node 0's
-	 */
-	if (pgdat == NODE_DATA(0))
-		mem_map = NODE_DATA(0)->node_mem_map;
-#endif
-#endif /* CONFIG_FLAT_NODE_MEM_MAP */
-}
-
-void __meminit free_area_init_node(int nid, struct pglist_data *pgdat,
-		unsigned long *zones_size, unsigned long node_start_pfn,
-		unsigned long *zholes_size)
-{
-	pgdat->node_id = nid;
-	pgdat->node_start_pfn = node_start_pfn;
-	calculate_node_totalpages(pgdat, zones_size, zholes_size);
-
-	alloc_node_mem_map(pgdat);
-
-	free_area_init_core(pgdat, zones_size, zholes_size);
-}
-
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
-void __init add_active_range(unsigned int nid, unsigned long start_pfn,
-						unsigned long end_pfn)
-{
-	unsigned int i;
-
-	/* Merge with existing active regions if possible */
-	for (i = 0; early_node_map[i].end_pfn; i++) {
-		if (early_node_map[i].nid != nid)
-			continue;
-
-		/* Skip if an existing region covers this new one */
-		if (start_pfn >= early_node_map[i].start_pfn &&
-				end_pfn <= early_node_map[i].end_pfn)
-			return;
-
-		/* Merge forward if suitable */
-		if (start_pfn <= early_node_map[i].end_pfn &&
-				end_pfn > early_node_map[i].end_pfn) {
-			early_node_map[i].end_pfn = end_pfn;
-			return;
-		}
-
-		/* Merge backward if suitable */
-		if (start_pfn < early_node_map[i].end_pfn &&
-				end_pfn >= early_node_map[i].start_pfn) {
-			early_node_map[i].start_pfn = start_pfn;
-			return;
-		}
-	}
-
-	/* Leave last entry NULL, we use range.end_pfn to terminate the walk */
-	if (i >= MAX_ACTIVE_REGIONS - 1) {
-		printk(KERN_ERR "Too many memory regions, truncating\n");
-		return;
-	}
-
-	early_node_map[i].nid = nid;
-	early_node_map[i].start_pfn = start_pfn;
-	early_node_map[i].end_pfn = end_pfn;
-}
-
-void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn,
-						unsigned long new_end_pfn)
-{
-	unsigned int i;
-
-	/* Find the old active region end and shrink */
-	for_each_active_range_index_in_nid(i, nid) {
-		if (early_node_map[i].end_pfn == old_end_pfn) {
-			early_node_map[i].end_pfn = new_end_pfn;
-			break;
-		}
-	}
-}
-
-void __init remove_all_active_ranges()
-{
-	memset(early_node_map, 0, sizeof(early_node_map));
-}
-
-/* Compare two active node_active_regions */
-static int __init cmp_node_active_region(const void *a, const void *b)
-{
-	struct node_active_region *arange = (struct node_active_region *)a;
-	struct node_active_region *brange = (struct node_active_region *)b;
-
-	/* Done this way to avoid overflows */
-	if (arange->start_pfn > brange->start_pfn)
-		return 1;
-	if (arange->start_pfn < brange->start_pfn)
-		return -1;
-
-	return 0;
-}
-
-/* sort the node_map by start_pfn */
-static void __init sort_node_map(void)
-{
-	size_t num = 0;
-	while (early_node_map[num].end_pfn)
-		num++;
-
-	sort(early_node_map, num, sizeof(struct node_active_region),
-						cmp_node_active_region, NULL);
-}
-
-/* Find the lowest pfn for a node. This depends on a sorted early_node_map */
-unsigned long __init find_min_pfn_for_node(unsigned long nid)
-{
-	int i;
-
-	/* Assuming a sorted map, the first range found has the starting pfn */
-	for_each_active_range_index_in_nid(i, nid)
-		return early_node_map[i].start_pfn;
-
-	printk(KERN_WARNING "Could not find start_pfn for node %lu\n", nid);
-	return 0;
-}
-
-unsigned long __init find_min_pfn_with_active_regions(void)
-{
-	return find_min_pfn_for_node(MAX_NUMNODES);
-}
-
-unsigned long __init find_max_pfn_with_active_regions(void)
-{
-	int i;
-	unsigned long max_pfn = 0;
-
-	for (i = 0; early_node_map[i].end_pfn; i++)
-		max_pfn = max(max_pfn, early_node_map[i].end_pfn);
-
-	return max_pfn;
-}
-
-void __init free_area_init_nodes(unsigned long arch_max_dma_pfn,
-				unsigned long arch_max_dma32_pfn,
-				unsigned long arch_max_low_pfn,
-				unsigned long arch_max_high_pfn)
-{
-	unsigned long nid;
-	int zone_index;
-
-	/* Record where the zone boundaries are */
-	memset(arch_zone_lowest_possible_pfn, 0,
-				sizeof(arch_zone_lowest_possible_pfn));
-	memset(arch_zone_highest_possible_pfn, 0,
-				sizeof(arch_zone_highest_possible_pfn));
-	arch_zone_lowest_possible_pfn[ZONE_DMA] =
-					find_min_pfn_with_active_regions();
-	arch_zone_highest_possible_pfn[ZONE_DMA] = arch_max_dma_pfn;
-	arch_zone_highest_possible_pfn[ZONE_DMA32] = arch_max_dma32_pfn;
-	arch_zone_highest_possible_pfn[ZONE_NORMAL] = arch_max_low_pfn;
-	arch_zone_highest_possible_pfn[ZONE_HIGHMEM] = arch_max_high_pfn;
-	for (zone_index = 1; zone_index < MAX_NR_ZONES; zone_index++) {
-		arch_zone_lowest_possible_pfn[zone_index] =
-			arch_zone_highest_possible_pfn[zone_index-1];
-	}
-
-	/* Regions in the early_node_map can be in any order */
-	sort_node_map();
-
-	for_each_online_node(nid) {
-		pg_data_t *pgdat = NODE_DATA(nid);
-		free_area_init_node(nid, pgdat, NULL,
-				find_min_pfn_for_node(nid), NULL);
-	}
-}
-#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
-
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 static bootmem_data_t contig_bootmem_data;
 struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
@@ -3175,32 +2097,6 @@ int lowmem_reserve_ratio_sysctl_handler(
 	return 0;
 }
 
-/*
- * percpu_pagelist_fraction - changes the pcp->high for each zone on each
- * cpu.  It is the fraction of total pages in each zone that a hot per cpu pagelist
- * can have before it gets flushed back to buddy allocator.
- */
-
-int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
-	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
-{
-	struct zone *zone;
-	unsigned int cpu;
-	int ret;
-
-	ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
-	if (!write || (ret == -EINVAL))
-		return ret;
-	for_each_zone(zone) {
-		for_each_online_cpu(cpu) {
-			unsigned long  high;
-			high = zone->present_pages / percpu_pagelist_fraction;
-			setup_pagelist_highmark(zone_pcp(zone, cpu), high);
-		}
-	}
-	return 0;
-}
-
 __initdata int hashdist = HASHDIST_DEFAULT;
 
 #ifdef CONFIG_NUMA


* [PATCH 7/7] Print out debugging information during initialisation
  2006-05-01 13:35 [PATCH 0/7] Sizing zones and holes in an architecture independent manner V5 Mel Gorman
                   ` (5 preceding siblings ...)
  2006-05-01 13:37 ` [PATCH 6/7] Break out memory initialisation code from page_alloc.c to mem_init.c Mel Gorman
@ 2006-05-01 13:37 ` Mel Gorman
  6 siblings, 0 replies; 11+ messages in thread
From: Mel Gorman @ 2006-05-01 13:37 UTC (permalink / raw)
  To: akpm, davej, tony.luck, linux-mm, linux-kernel, bob.picco, ak,
	linuxppc-dev
  Cc: Mel Gorman


The zone and hole sizing code is new, and unexpected problems showed up
during early releases on machines that were not covered by the pre-release
tests. This patch prints out useful debugging information when those
unexpected situations occur.

It is not expected that this patch become a permanent part of the set.
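
For reference, the per-call diagnostics added below ("Existing", "Merging
forward", "Merging backwards", "New") instrument the add_active_range()
merge logic that patch 6/7 moved into mem_init.c. The following is a
minimal, standalone userspace sketch of that logic with equivalent
diagnostics; the mock_* names, the array size and the main() harness are
illustrative only and are not part of the patch:

/*
 * Userspace sketch (not kernel code) of the active-region merge logic,
 * printing the same kind of per-call diagnostic as the patched
 * add_active_range().
 */
#include <stdio.h>
#include <string.h>

#define MOCK_MAX_ACTIVE_REGIONS 8

struct mock_region {
	int nid;
	unsigned long start_pfn;
	unsigned long end_pfn;	/* zero end_pfn terminates the walk */
};

static struct mock_region mock_early_node_map[MOCK_MAX_ACTIVE_REGIONS];

static void mock_add_active_range(int nid, unsigned long start_pfn,
				  unsigned long end_pfn)
{
	unsigned int i;

	printf("add_active_range(%d, %lu, %lu): ", nid, start_pfn, end_pfn);

	/* Merge with an existing region for this node if possible */
	for (i = 0; mock_early_node_map[i].end_pfn; i++) {
		if (mock_early_node_map[i].nid != nid)
			continue;

		/* Covered entirely by an existing region */
		if (start_pfn >= mock_early_node_map[i].start_pfn &&
		    end_pfn <= mock_early_node_map[i].end_pfn) {
			printf("Existing\n");
			return;
		}

		/* Extend an existing region forward */
		if (start_pfn <= mock_early_node_map[i].end_pfn &&
		    end_pfn > mock_early_node_map[i].end_pfn) {
			printf("Merging forward\n");
			mock_early_node_map[i].end_pfn = end_pfn;
			return;
		}

		/* Extend an existing region backward */
		if (start_pfn < mock_early_node_map[i].end_pfn &&
		    end_pfn >= mock_early_node_map[i].start_pfn) {
			printf("Merging backwards\n");
			mock_early_node_map[i].start_pfn = start_pfn;
			return;
		}
	}

	/* Leave the last entry unused so the zero end_pfn ends the walk */
	if (i >= MOCK_MAX_ACTIVE_REGIONS - 1) {
		printf("Too many memory regions, truncating\n");
		return;
	}

	printf("New\n");
	mock_early_node_map[i].nid = nid;
	mock_early_node_map[i].start_pfn = start_pfn;
	mock_early_node_map[i].end_pfn = end_pfn;
}

int main(void)
{
	memset(mock_early_node_map, 0, sizeof(mock_early_node_map));
	mock_add_active_range(0, 0, 1024);	/* prints "New" */
	mock_add_active_range(0, 1024, 2048);	/* prints "Merging forward" */
	mock_add_active_range(0, 512, 768);	/* prints "Existing" */
	return 0;
}

Built with an ordinary C compiler, the harness prints one diagnostic per
call, roughly the same per-call trace that the patched add_active_range()
emits at boot.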


 mem_init.c |   62 ++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 files changed, 58 insertions(+), 4 deletions(-)

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc3-mm1-106-breakout_mem_init/mm/mem_init.c linux-2.6.17-rc3-mm1-107-debug/mm/mem_init.c
--- linux-2.6.17-rc3-mm1-106-breakout_mem_init/mm/mem_init.c	2006-05-01 11:51:50.000000000 +0100
+++ linux-2.6.17-rc3-mm1-107-debug/mm/mem_init.c	2006-05-01 11:51:50.000000000 +0100
@@ -341,6 +341,9 @@ void __meminit memmap_init_zone(unsigned
 	unsigned long end_pfn = start_pfn + size;
 	unsigned long pfn;
 
+	printk("memmap_init_zone(size %lu, nid %d, zone %lu, start_pfn %lu)\n",
+						size, nid, zone, start_pfn);
+
 	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
 		if (!early_pfn_valid(pfn))
 			continue;
@@ -661,6 +664,7 @@ __meminit int init_currently_empty_zone(
 }
 
 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+
 /* Note: nid == MAX_NUMNODES returns first region */
 static int __init first_active_region_index_in_nid(int nid)
 {
@@ -713,13 +717,24 @@ void __init free_bootmem_with_active_reg
 	for_each_active_range_index_in_nid(i, nid) {
 		unsigned long size_pages = 0;
 		unsigned long end_pfn = early_node_map[i].end_pfn;
-		if (early_node_map[i].start_pfn >= max_low_pfn)
+		if (early_node_map[i].start_pfn >= max_low_pfn) {
+			printk("start_pfn %lu >= %lu\n", early_node_map[i].start_pfn,
+								max_low_pfn);
 			continue;
+		}
 
-		if (end_pfn > max_low_pfn)
+		if (end_pfn > max_low_pfn) {
+			printk("end_pfn %lu going back to %lu\n", early_node_map[i].end_pfn,
+									max_low_pfn);
 			end_pfn = max_low_pfn;
+		}
 
 		size_pages = end_pfn - early_node_map[i].start_pfn;
+		printk("free_bootmem_node(%d, %lu, %lu) :::: pfn ranges (%d, %lu, %lu)\n",
+				early_node_map[i].nid,
+				PFN_PHYS(early_node_map[i].start_pfn),
+				PFN_PHYS(size_pages),
+				early_node_map[i].nid, early_node_map[i].start_pfn, end_pfn);
 		free_bootmem_node(NODE_DATA(early_node_map[i].nid),
 				PFN_PHYS(early_node_map[i].start_pfn),
 				size_pages << PAGE_SHIFT);
@@ -729,10 +744,15 @@ void __init free_bootmem_with_active_reg
 void __init sparse_memory_present_with_active_regions(int nid)
 {
 	unsigned int i;
-	for_each_active_range_index_in_nid(i, nid)
+	for_each_active_range_index_in_nid(i, nid) {
+		printk("memory_present(%d, %lu, %lu)\n",
+			early_node_map[i].nid,
+			early_node_map[i].start_pfn,
+			early_node_map[i].end_pfn);
 		memory_present(early_node_map[i].nid,
 				early_node_map[i].start_pfn,
 				early_node_map[i].end_pfn);
+	}
 }
 
 void __init get_pfn_range_for_nid(unsigned int nid,
@@ -785,6 +805,8 @@ unsigned long __init __absent_pages_in_r
 	unsigned long prev_end_pfn = 0, hole_pages = 0;
 	unsigned long start_pfn;
 
+	printk("__absent_pages_in_range(%d, %lu, %lu) = ", nid,
+					range_start_pfn, range_end_pfn);
 	/* Find the end_pfn of the first active range of pfns in the node */
 	i = first_active_region_index_in_nid(nid);
 	if (i == MAX_ACTIVE_REGIONS)
@@ -811,6 +833,8 @@ unsigned long __init __absent_pages_in_r
 		prev_end_pfn = early_node_map[i].end_pfn;
 	}
 
+	printk("%lu\n", hole_pages);
+
 	return hole_pages;
 }
 
@@ -975,6 +999,9 @@ void __init add_active_range(unsigned in
 {
 	unsigned int i;
 
+	printk("add_active_range(%d, %lu, %lu): ",
+			nid, start_pfn, end_pfn);
+
 	/* Merge with existing active regions if possible */
 	for (i = 0; early_node_map[i].end_pfn; i++) {
 		if (early_node_map[i].nid != nid)
@@ -982,12 +1009,15 @@ void __init add_active_range(unsigned in
 
 		/* Skip if an existing region covers this new one */
 		if (start_pfn >= early_node_map[i].start_pfn &&
-				end_pfn <= early_node_map[i].end_pfn)
+				end_pfn <= early_node_map[i].end_pfn) {
+			printk("Existing\n");
 			return;
+		}
 
 		/* Merge forward if suitable */
 		if (start_pfn <= early_node_map[i].end_pfn &&
 				end_pfn > early_node_map[i].end_pfn) {
+			printk("Merging forward\n");
 			early_node_map[i].end_pfn = end_pfn;
 			return;
 		}
@@ -995,6 +1025,7 @@ void __init add_active_range(unsigned in
 		/* Merge backward if suitable */
 		if (start_pfn < early_node_map[i].end_pfn &&
 				end_pfn >= early_node_map[i].start_pfn) {
+			printk("Merging backwards\n");
 			early_node_map[i].start_pfn = start_pfn;
 			return;
 		}
@@ -1006,6 +1037,7 @@ void __init add_active_range(unsigned in
 		return;
 	}
 
+	printk("New\n");
 	early_node_map[i].nid = nid;
 	early_node_map[i].start_pfn = start_pfn;
 	early_node_map[i].end_pfn = end_pfn;
@@ -1017,16 +1049,22 @@ void __init shrink_active_range(unsigned
 	unsigned int i;
 
 	/* Find the old active region end and shrink */
+	printk("Shrinking %u from %lu to %lu: ",
+					nid, old_end_pfn, new_end_pfn);
 	for_each_active_range_index_in_nid(i, nid) {
 		if (early_node_map[i].end_pfn == old_end_pfn) {
+			printk("Done\n");
 			early_node_map[i].end_pfn = new_end_pfn;
 			break;
 		}
 	}
+
+	printk("Not found\n");
 }
 
 void __init remove_all_active_ranges()
 {
+	printk("remove_all_active_ranges()\n");
 	memset(early_node_map, 0, sizeof(early_node_map));
 }
 
@@ -1054,6 +1092,14 @@ static void __init sort_node_map(void)
 
 	sort(early_node_map, num, sizeof(struct node_active_region),
 						cmp_node_active_region, NULL);
+
+	printk("Dumping sorted node map\n");
+	for (num = 0; early_node_map[num].end_pfn; num++) {
+		printk("entry %lu: %d  %lu -> %lu\n", num,
+				early_node_map[num].nid,
+				early_node_map[num].start_pfn,
+				early_node_map[num].end_pfn);
+	}
 }
 
 /* Find the lowest pfn for a node. This depends on a sorted early_node_map */
@@ -1069,6 +1115,7 @@ unsigned long __init find_min_pfn_for_no
 	return 0;
 }
 
+
 unsigned long __init find_min_pfn_with_active_regions(void)
 {
 	return find_min_pfn_for_node(MAX_NUMNODES);
@@ -1082,6 +1129,7 @@ unsigned long __init find_max_pfn_with_a
 	for (i = 0; early_node_map[i].end_pfn; i++)
 		max_pfn = max(max_pfn, early_node_map[i].end_pfn);
 
+	printk("find_max_pfn_with_active_regions() == %lu\n", max_pfn);
 	return max_pfn;
 }
 
@@ -1093,6 +1141,10 @@ void __init free_area_init_nodes(unsigne
 	unsigned long nid;
 	int zone_index;
 
+	printk("free_area_init_nodes(%lu, %lu, %lu, %lu)\n",
+			arch_max_dma_pfn, arch_max_dma32_pfn,
+			arch_max_low_pfn, arch_max_high_pfn);
+
 	/* Record where the zone boundaries are */
 	memset(arch_zone_lowest_possible_pfn, 0,
 				sizeof(arch_zone_lowest_possible_pfn));
@@ -1109,6 +1161,8 @@ void __init free_area_init_nodes(unsigne
 			arch_zone_highest_possible_pfn[zone_index-1];
 	}
 
+	printk("free_area_init_nodes(): find_min_pfn = %lu\n", find_min_pfn_with_active_regions());
+
 	/* Regions in the early_node_map can be in any order */
 	sort_node_map();
 


Thread overview: 11+ messages
2006-05-01 13:35 [PATCH 0/7] Sizing zones and holes in an architecture independent manner V5 Mel Gorman
2006-05-01 13:35 ` [PATCH 1/7] Introduce mechanism for registering active regions of memory Mel Gorman
2006-05-01 13:36 ` [PATCH 2/7] Have Power use add_active_range() and free_area_init_nodes() Mel Gorman
2006-05-01 13:36 ` [PATCH 3/7] Have x86 use add_active_range() and free_area_init_nodes Mel Gorman
2006-05-01 13:36 ` [PATCH 4/7] Have x86_64 " Mel Gorman
2006-05-01 13:37 ` [PATCH 5/7] Have ia64 " Mel Gorman
2006-05-01 13:37 ` [PATCH 6/7] Break out memory initialisation code from page_alloc.c to mem_init.c Mel Gorman
2006-05-01 13:37 ` [PATCH 7/7] Print out debugging information during initialisation Mel Gorman
  -- strict thread matches above, loose matches on Subject: below --
2006-04-24 20:20 [PATCH 0/7] [RFC] Sizing zones and holes in an architecture independent manner V4 Mel Gorman
2006-04-24 20:22 ` [PATCH 6/7] Break out memory initialisation code from page_alloc.c to mem_init.c Mel Gorman
2006-04-18 13:00 [PATCH 0/7] [RFC] Sizing zones and holes in an architecture independent manner V3 Mel Gorman
2006-04-18 13:02 ` [PATCH 6/7] Break out memory initialisation code from page_alloc.c to mem_init.c Mel Gorman
2006-04-12 23:20 [PATCH 0/7] [RFC] Sizing zones and holes in an architecture independent manner V2 Mel Gorman
2006-04-12 23:22 ` [PATCH 6/7] Break out memory initialisation code from page_alloc.c to mem_init.c Mel Gorman
