All of lore.kernel.org
 help / color / mirror / Atom feed
From: Mark Gross <mgross@linux.intel.com>
To: David Rientjes <rientjes@google.com>
Cc: linux-mm@kvack.org, mark.gross@intel.com,
	linux-pm@lists.osdl.org,
	Andrew Morton <akpm@linux-foundation.org>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	neelam.chandwani@intel.com
Subject: Re: [RFC] [PATCH] Power Managed memory base enabling
Date: Fri, 9 Mar 2007 12:53:44 -0800	[thread overview]
Message-ID: <20070309205344.GA16777@linux.intel.com> (raw)
In-Reply-To: <Pine.LNX.4.64.0703061838390.13314@chino.kir.corp.google.com>

On Tue, Mar 06, 2007 at 06:40:36PM -0800, David Rientjes wrote:
> On Tue, 6 Mar 2007, Mark Gross wrote:
> 
> > Let me give your idea a spin and get back to you. 
> > 
> 
> Something like the following might be a little better.

Thanks!  I've got things cleaned up and working with as many of your
ideas as I could get working.  I liked many of the changes you offered
in the patch you sent to me off list.  

One thing I found was your patch didn't use the SLIT data in computing
the nearest non PM node, and I had to be careful about the difference
between the PM memory PXM bitmap and node id's.  After I accounted for
the not_to_pxm mapping things started working for me.

BTW re basing to 2.6.21rc3mm2, resulted in one 4k allocation in my
PM-zones.  I'll be looking for where that allocation is coming from
after I get this post finished.

--mgross

Singed-off-by: Mark Gross <mark.gross@intel.com>

diff -urN -X linux-2.6.21rc3mm2/Documentation/dontdiff linux-2.6.21rc3mm2/arch/x86_64/mm/numa.c linux-2.6.21rc3mm2-monroe/arch/x86_64/mm/numa.c
--- linux-2.6.21rc3mm2/arch/x86_64/mm/numa.c	2007-03-08 11:14:19.000000000 -0800
+++ linux-2.6.21rc3mm2-monroe/arch/x86_64/mm/numa.c	2007-03-09 10:23:25.000000000 -0800
@@ -155,19 +155,47 @@
 }
 #endif
 
+/* we need a place to save the next start address to use for each node because
+ * we need to allocate the pgdata and bootmem for power managed memory in
+ * non-power managed nodes.  We do this by saving off where we can start
+ * allocating in the nodes and updating them as the boot up proceeds.
+ */
+static unsigned long bootmem_start[MAX_NUMNODES];
+
+
 static void * __init
 early_node_mem(int nodeid, unsigned long start, unsigned long end,
 	      unsigned long size)
 {
-	unsigned long mem = find_e820_area(start, end, size);
+	unsigned long mem;
 	void *ptr;
-	if (mem != -1L)
+	int nid;
+	
+	if (bootmem_start[nodeid] < start) {
+		bootmem_start[nodeid] = start;
+	}
+
+	mem = -1L;
+	nid = nearest_non_pm_node(nodeid);
+	if (nid != nodeid) {
+		if (!node_online(nid))
+			return NULL;
+
+		end = (NODE_DATA(nid)->node_start_pfn +
+			NODE_DATA(nid)->node_spanned_pages)
+				<< PAGE_SHIFT;
+	}
+	mem = find_e820_area(bootmem_start[nid], end, size);
+	if (mem!= -1L) {
+		/* now increment bootmem_start for next call */
+		bootmem_start[nid] = round_up(mem + size, PAGE_SIZE);
 		return __va(mem);
+	}
 	ptr = __alloc_bootmem_nopanic(size,
 				SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS));
 	if (ptr == 0) {
 		printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
-			size, nodeid);
+			size, nid);
 		return NULL;
 	}
 	return ptr;
@@ -179,6 +207,7 @@
 	unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start; 
 	unsigned long nodedata_phys;
 	void *bootmap;
+	int non_pm_node = nearest_non_pm_node(nodeid);
 	const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
 
 	start = round_up(start, ZONE_ALIGN); 
@@ -218,8 +247,8 @@
 
 	free_bootmem_with_active_regions(nodeid, end);
 
-	reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); 
-	reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
+	reserve_bootmem_node(NODE_DATA(non_pm_node), nodedata_phys, pgdat_size);
+	reserve_bootmem_node(NODE_DATA(non_pm_node), bootmap_start, bootmap_pages<<PAGE_SHIFT);
 #ifdef CONFIG_ACPI_NUMA
 	srat_reserve_add_area(nodeid);
 #endif
@@ -230,8 +259,9 @@
 void __init setup_node_zones(int nodeid)
 { 
 	unsigned long start_pfn, end_pfn, memmapsize, limit;
+	int non_pm_node = nearest_non_pm_node(nodeid);
 
- 	start_pfn = node_start_pfn(nodeid);
+	start_pfn = node_start_pfn(nodeid);
  	end_pfn = node_end_pfn(nodeid);
 
 	Dprintk(KERN_INFO "Setting up memmap for node %d %lx-%lx\n",
@@ -242,11 +272,11 @@
 	memmapsize = sizeof(struct page) * (end_pfn-start_pfn);
 	limit = end_pfn << PAGE_SHIFT;
 #ifdef CONFIG_FLAT_NODE_MEM_MAP
-	NODE_DATA(nodeid)->node_mem_map = 
-		__alloc_bootmem_core(NODE_DATA(nodeid)->bdata, 
-				memmapsize, SMP_CACHE_BYTES, 
-				round_down(limit - memmapsize, PAGE_SIZE), 
-				limit);
+	NODE_DATA(nodeid)->node_mem_map =
+		__alloc_bootmem_core(NODE_DATA(non_pm_node)->bdata,
+			memmapsize, SMP_CACHE_BYTES,
+			round_down(limit - memmapsize, PAGE_SIZE),
+			limit);
 	printk(KERN_DEBUG "Node %d memmap at 0x%p size %lu first pfn 0x%p\n",
 			nodeid, NODE_DATA(nodeid)->node_mem_map,
 			memmapsize, NODE_DATA(nodeid)->node_mem_map);
@@ -265,7 +295,8 @@
 	for (i = 0; i < NR_CPUS; i++) {
 		if (cpu_to_node[i] != NUMA_NO_NODE)
 			continue;
- 		numa_set_node(i, rr);
+		numa_set_node(i,nearest_non_pm_node(rr));
+		//numa_set_node(i, rr);
 		rr = next_node(rr, node_online_map);
 		if (rr == MAX_NUMNODES)
 			rr = first_node(node_online_map);
diff -urN -X linux-2.6.21rc3mm2/Documentation/dontdiff linux-2.6.21rc3mm2/arch/x86_64/mm/srat.c linux-2.6.21rc3mm2-monroe/arch/x86_64/mm/srat.c
--- linux-2.6.21rc3mm2/arch/x86_64/mm/srat.c	2007-03-08 11:14:19.000000000 -0800
+++ linux-2.6.21rc3mm2-monroe/arch/x86_64/mm/srat.c	2007-03-09 11:00:51.000000000 -0800
@@ -27,6 +27,7 @@
 
 static nodemask_t nodes_parsed __initdata;
 static struct bootnode nodes_add[MAX_NUMNODES];
+static nodemask_t pm_nodes __read_mostly;
 static int found_add_area __initdata;
 int hotadd_percent __initdata = 0;
 
@@ -34,6 +35,9 @@
    from BIOS bugs. */
 #define NODE_MIN_SIZE (4*1024*1024)
 
+/* ACPI bit to represent power management node */
+#define POWER_MANAGEMENT_ACPI_BIT	(1 << 31)
+
 static __init int setup_node(int pxm)
 {
 	return acpi_map_pxm_to_node(pxm);
@@ -298,7 +302,10 @@
 		return;
 	start = ma->base_address;
 	end = start + ma->length;
-	pxm = ma->proximity_domain;
+	pxm = ma->proximity_domain & ~POWER_MANAGEMENT_ACPI_BIT;
+	if (ma->proximity_domain & POWER_MANAGEMENT_ACPI_BIT)
+		node_set(pxm, pm_nodes);
+
 	node = setup_node(pxm);
 	if (node < 0) {
 		printk(KERN_ERR "SRAT: Too many proximity domains.\n");
@@ -486,3 +493,35 @@
 }
 EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
 
+int __power_managed_node(int nid)
+{
+	return node_isset(node_to_pxm(nid), pm_nodes);
+}
+
+int __power_managed_memory_present(void)
+{
+	return !nodes_empty(pm_nodes);
+}
+
+int __nearest_non_pm_node(int nid)
+{
+	int i, dist, closest, temp;
+	
+	if (!__power_managed_node(nid))
+		return nid;
+	dist = closest= 255;
+	for_each_node(i) {
+		if (__power_managed_node(i))
+			continue;
+
+		if (i != nid) {
+			temp = __node_distance(nid, i );
+			if (temp < dist) {
+				closest = i;
+				dist = temp;
+			}
+		}
+	}
+	BUG_ON(closest == 255);
+	return closest;
+}
diff -urN -X linux-2.6.21rc3mm2/Documentation/dontdiff linux-2.6.21rc3mm2/include/asm-x86_64/topology.h linux-2.6.21rc3mm2-monroe/include/asm-x86_64/topology.h
--- linux-2.6.21rc3mm2/include/asm-x86_64/topology.h	2007-03-08 11:14:20.000000000 -0800
+++ linux-2.6.21rc3mm2-monroe/include/asm-x86_64/topology.h	2007-03-09 10:23:25.000000000 -0800
@@ -18,6 +18,13 @@
 /* #else fallback version */
 #endif
 
+extern int __power_managed_node(int);
+extern int __power_managed_memory_present(void);
+extern int __nearest_non_pm_node(int);
+#define power_managed_node(nid)		__power_managed_node(nid)
+#define power_managed_memory_present()	__power_managed_memory_present()
+#define nearest_non_pm_node(nid)	__nearest_non_pm_node(nid)
+
 #define cpu_to_node(cpu)		(cpu_to_node[cpu])
 #define parent_node(node)		(node)
 #define node_to_first_cpu(node) 	(first_cpu(node_to_cpumask[node]))
diff -urN -X linux-2.6.21rc3mm2/Documentation/dontdiff linux-2.6.21rc3mm2/include/linux/topology.h linux-2.6.21rc3mm2-monroe/include/linux/topology.h
--- linux-2.6.21rc3mm2/include/linux/topology.h	2007-03-08 11:14:08.000000000 -0800
+++ linux-2.6.21rc3mm2-monroe/include/linux/topology.h	2007-03-09 10:23:25.000000000 -0800
@@ -67,6 +67,24 @@
 #ifndef PENALTY_FOR_NODE_WITH_CPUS
 #define PENALTY_FOR_NODE_WITH_CPUS	(1)
 #endif
+#ifndef power_managed_node
+static inline int power_managed_node(int nid)
+{
+	return 0;
+}
+#endif
+#ifndef power_managed_memory_present
+static inline int power_managed_memory_present(void)
+{
+	return 0;
+}
+#endif
+#ifndef nearest_non_pm_node
+static inline int nearest_non_pm_node(int nid)
+{
+	return nid;
+}
+#endif
 
 /*
  * Below are the 3 major initializers used in building sched_domains:
diff -urN -X linux-2.6.21rc3mm2/Documentation/dontdiff linux-2.6.21rc3mm2/mm/bootmem.c linux-2.6.21rc3mm2-monroe/mm/bootmem.c
--- linux-2.6.21rc3mm2/mm/bootmem.c	2007-02-04 10:44:54.000000000 -0800
+++ linux-2.6.21rc3mm2-monroe/mm/bootmem.c	2007-03-09 10:23:25.000000000 -0800
@@ -417,11 +417,14 @@
 void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
 				      unsigned long goal)
 {
-	bootmem_data_t *bdata;
 	void *ptr;
+	int i;
 
-	list_for_each_entry(bdata, &bdata_list, list) {
-		ptr = __alloc_bootmem_core(bdata, size, align, goal, 0);
+	for_each_online_node(i) {
+		if (power_managed_node(i))
+			continue;
+		ptr = __alloc_bootmem_core(NODE_DATA(i)->bdata, size,
+					align, goal, 0);
 		if (ptr)
 			return ptr;
 	}
@@ -463,12 +466,14 @@
 void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
 				  unsigned long goal)
 {
-	bootmem_data_t *bdata;
 	void *ptr;
+	int i;
 
-	list_for_each_entry(bdata, &bdata_list, list) {
-		ptr = __alloc_bootmem_core(bdata, size, align, goal,
-						ARCH_LOW_ADDRESS_LIMIT);
+	for_each_online_node(i) {
+		if (power_managed_node(i))
+			continue;
+		ptr = __alloc_bootmem_core(NODE_DATA(i)->bdata, size, align,
+					goal, ARCH_LOW_ADDRESS_LIMIT);
 		if (ptr)
 			return ptr;
 	}
diff -urN -X linux-2.6.21rc3mm2/Documentation/dontdiff linux-2.6.21rc3mm2/mm/mempolicy.c linux-2.6.21rc3mm2-monroe/mm/mempolicy.c
--- linux-2.6.21rc3mm2/mm/mempolicy.c	2007-03-08 11:14:20.000000000 -0800
+++ linux-2.6.21rc3mm2-monroe/mm/mempolicy.c	2007-03-09 10:23:25.000000000 -0800
@@ -1609,8 +1609,13 @@
 	/* Set interleaving policy for system init. This way not all
 	   the data structures allocated at system boot end up in node zero. */
 
-	if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
-		printk("numa_policy_init: interleaving failed\n");
+	if (power_managed_memory_present()) {
+		if (do_set_mempolicy(MPOL_DEFAULT, &node_online_map))
+			printk("numa_policy_init: default failed\n");
+	} else {
+		if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
+			printk("numa_policy_init: interleaving failed\n");
+	}
 }
 
 /* Reset policy of current process to default */
diff -urN -X linux-2.6.21rc3mm2/Documentation/dontdiff linux-2.6.21rc3mm2/mm/page_alloc.c linux-2.6.21rc3mm2-monroe/mm/page_alloc.c
--- linux-2.6.21rc3mm2/mm/page_alloc.c	2007-03-08 11:14:20.000000000 -0800
+++ linux-2.6.21rc3mm2-monroe/mm/page_alloc.c	2007-03-09 10:23:25.000000000 -0800
@@ -2600,8 +2600,10 @@
 					* sizeof(wait_queue_head_t);
 
  	if (system_state == SYSTEM_BOOTING) {
+		int nid = nearest_non_pm_node(pgdat->node_id);
+		
 		zone->wait_table = (wait_queue_head_t *)
-			alloc_bootmem_node(pgdat, alloc_size);
+			alloc_bootmem_node(NODE_DATA(nid), alloc_size);
 	} else {
 		/*
 		 * This case means that a zone whose size was 0 gets new memory
@@ -3215,8 +3217,11 @@
 		end = ALIGN(end, MAX_ORDER_NR_PAGES);
 		size =  (end - start) * sizeof(struct page);
 		map = alloc_remap(pgdat->node_id, size);
-		if (!map)
-			map = alloc_bootmem_node(pgdat, size);
+		if (!map) {
+			int nid = nearest_non_pm_node(pgdat->node_id);
+
+			map = alloc_bootmem_node(NODE_DATA(nid), size);
+		}
 		pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
 		printk(KERN_DEBUG
 			"Node %d memmap at 0x%p size %lu first pfn 0x%p\n",
diff -urN -X linux-2.6.21rc3mm2/Documentation/dontdiff linux-2.6.21rc3mm2/mm/slab.c linux-2.6.21rc3mm2-monroe/mm/slab.c
--- linux-2.6.21rc3mm2/mm/slab.c	2007-03-08 11:14:20.000000000 -0800
+++ linux-2.6.21rc3mm2-monroe/mm/slab.c	2007-03-09 10:23:25.000000000 -0800
@@ -3399,6 +3399,7 @@
 	if (unlikely(nodeid == -1))
 		nodeid = numa_node_id();
 
+	nodeid = nearest_non_pm_node(nodeid);
 	if (unlikely(!cachep->nodelists[nodeid])) {
 		/* Node not bootstrapped yet */
 		ptr = fallback_alloc(cachep, flags);
@@ -3672,6 +3673,7 @@
 #ifdef CONFIG_NUMA
 void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 {
+	nodeid = nearest_non_pm_node(nodeid);
 	return __cache_alloc_node(cachep, flags, nodeid,
 			__builtin_return_address(0));
 }
diff -urN -X linux-2.6.21rc3mm2/Documentation/dontdiff linux-2.6.21rc3mm2/mm/sparse.c linux-2.6.21rc3mm2-monroe/mm/sparse.c
--- linux-2.6.21rc3mm2/mm/sparse.c	2007-02-04 10:44:54.000000000 -0800
+++ linux-2.6.21rc3mm2-monroe/mm/sparse.c	2007-03-09 10:23:25.000000000 -0800
@@ -49,7 +49,8 @@
 	struct mem_section *section = NULL;
 	unsigned long array_size = SECTIONS_PER_ROOT *
 				   sizeof(struct mem_section);
-
+	
+	nid = nearest_non_pm_node(nid);
 	if (slab_is_available())
 		section = kmalloc_node(array_size, GFP_KERNEL, nid);
 	else
@@ -215,6 +216,7 @@
 	struct mem_section *ms = __nr_to_section(pnum);
 	int nid = sparse_early_nid(ms);
 
+	nid = nearest_non_pm_node(nid);
 	map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION);
 	if (map)
 		return map;

WARNING: multiple messages have this Message-ID (diff)
From: Mark Gross <mgross@linux.intel.com>
To: David Rientjes <rientjes@google.com>
Cc: linux-mm@kvack.org, linux-pm@lists.osdl.org,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Andrew Morton <akpm@linux-foundation.org>,
	mark.gross@intel.com, neelam.chandwani@intel.com
Subject: Re: [RFC] [PATCH] Power Managed memory base enabling
Date: Fri, 9 Mar 2007 12:53:44 -0800	[thread overview]
Message-ID: <20070309205344.GA16777@linux.intel.com> (raw)
In-Reply-To: <Pine.LNX.4.64.0703061838390.13314@chino.kir.corp.google.com>

On Tue, Mar 06, 2007 at 06:40:36PM -0800, David Rientjes wrote:
> On Tue, 6 Mar 2007, Mark Gross wrote:
> 
> > Let me give your idea a spin and get back to you. 
> > 
> 
> Something like the following might be a little better.

Thanks!  I've got things cleaned up and working with as many of your
ideas as I could get working.  I liked many of the changes you offered
in the patch you sent to me off list.  

One thing I found was your patch didn't use the SLIT data in computing
the nearest non PM node, and I had to be careful about the difference
between the PM memory PXM bitmap and node id's.  After I accounted for
the not_to_pxm mapping things started working for me.

BTW re basing to 2.6.21rc3mm2, resulted in one 4k allocation in my
PM-zones.  I'll be looking for where that allocation is coming from
after I get this post finished.

--mgross

Singed-off-by: Mark Gross <mark.gross@intel.com>

diff -urN -X linux-2.6.21rc3mm2/Documentation/dontdiff linux-2.6.21rc3mm2/arch/x86_64/mm/numa.c linux-2.6.21rc3mm2-monroe/arch/x86_64/mm/numa.c
--- linux-2.6.21rc3mm2/arch/x86_64/mm/numa.c	2007-03-08 11:14:19.000000000 -0800
+++ linux-2.6.21rc3mm2-monroe/arch/x86_64/mm/numa.c	2007-03-09 10:23:25.000000000 -0800
@@ -155,19 +155,47 @@
 }
 #endif
 
+/* we need a place to save the next start address to use for each node because
+ * we need to allocate the pgdata and bootmem for power managed memory in
+ * non-power managed nodes.  We do this by saving off where we can start
+ * allocating in the nodes and updating them as the boot up proceeds.
+ */
+static unsigned long bootmem_start[MAX_NUMNODES];
+
+
 static void * __init
 early_node_mem(int nodeid, unsigned long start, unsigned long end,
 	      unsigned long size)
 {
-	unsigned long mem = find_e820_area(start, end, size);
+	unsigned long mem;
 	void *ptr;
-	if (mem != -1L)
+	int nid;
+	
+	if (bootmem_start[nodeid] < start) {
+		bootmem_start[nodeid] = start;
+	}
+
+	mem = -1L;
+	nid = nearest_non_pm_node(nodeid);
+	if (nid != nodeid) {
+		if (!node_online(nid))
+			return NULL;
+
+		end = (NODE_DATA(nid)->node_start_pfn +
+			NODE_DATA(nid)->node_spanned_pages)
+				<< PAGE_SHIFT;
+	}
+	mem = find_e820_area(bootmem_start[nid], end, size);
+	if (mem!= -1L) {
+		/* now increment bootmem_start for next call */
+		bootmem_start[nid] = round_up(mem + size, PAGE_SIZE);
 		return __va(mem);
+	}
 	ptr = __alloc_bootmem_nopanic(size,
 				SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS));
 	if (ptr == 0) {
 		printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
-			size, nodeid);
+			size, nid);
 		return NULL;
 	}
 	return ptr;
@@ -179,6 +207,7 @@
 	unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start; 
 	unsigned long nodedata_phys;
 	void *bootmap;
+	int non_pm_node = nearest_non_pm_node(nodeid);
 	const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
 
 	start = round_up(start, ZONE_ALIGN); 
@@ -218,8 +247,8 @@
 
 	free_bootmem_with_active_regions(nodeid, end);
 
-	reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); 
-	reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
+	reserve_bootmem_node(NODE_DATA(non_pm_node), nodedata_phys, pgdat_size);
+	reserve_bootmem_node(NODE_DATA(non_pm_node), bootmap_start, bootmap_pages<<PAGE_SHIFT);
 #ifdef CONFIG_ACPI_NUMA
 	srat_reserve_add_area(nodeid);
 #endif
@@ -230,8 +259,9 @@
 void __init setup_node_zones(int nodeid)
 { 
 	unsigned long start_pfn, end_pfn, memmapsize, limit;
+	int non_pm_node = nearest_non_pm_node(nodeid);
 
- 	start_pfn = node_start_pfn(nodeid);
+	start_pfn = node_start_pfn(nodeid);
  	end_pfn = node_end_pfn(nodeid);
 
 	Dprintk(KERN_INFO "Setting up memmap for node %d %lx-%lx\n",
@@ -242,11 +272,11 @@
 	memmapsize = sizeof(struct page) * (end_pfn-start_pfn);
 	limit = end_pfn << PAGE_SHIFT;
 #ifdef CONFIG_FLAT_NODE_MEM_MAP
-	NODE_DATA(nodeid)->node_mem_map = 
-		__alloc_bootmem_core(NODE_DATA(nodeid)->bdata, 
-				memmapsize, SMP_CACHE_BYTES, 
-				round_down(limit - memmapsize, PAGE_SIZE), 
-				limit);
+	NODE_DATA(nodeid)->node_mem_map =
+		__alloc_bootmem_core(NODE_DATA(non_pm_node)->bdata,
+			memmapsize, SMP_CACHE_BYTES,
+			round_down(limit - memmapsize, PAGE_SIZE),
+			limit);
 	printk(KERN_DEBUG "Node %d memmap at 0x%p size %lu first pfn 0x%p\n",
 			nodeid, NODE_DATA(nodeid)->node_mem_map,
 			memmapsize, NODE_DATA(nodeid)->node_mem_map);
@@ -265,7 +295,8 @@
 	for (i = 0; i < NR_CPUS; i++) {
 		if (cpu_to_node[i] != NUMA_NO_NODE)
 			continue;
- 		numa_set_node(i, rr);
+		numa_set_node(i,nearest_non_pm_node(rr));
+		//numa_set_node(i, rr);
 		rr = next_node(rr, node_online_map);
 		if (rr == MAX_NUMNODES)
 			rr = first_node(node_online_map);
diff -urN -X linux-2.6.21rc3mm2/Documentation/dontdiff linux-2.6.21rc3mm2/arch/x86_64/mm/srat.c linux-2.6.21rc3mm2-monroe/arch/x86_64/mm/srat.c
--- linux-2.6.21rc3mm2/arch/x86_64/mm/srat.c	2007-03-08 11:14:19.000000000 -0800
+++ linux-2.6.21rc3mm2-monroe/arch/x86_64/mm/srat.c	2007-03-09 11:00:51.000000000 -0800
@@ -27,6 +27,7 @@
 
 static nodemask_t nodes_parsed __initdata;
 static struct bootnode nodes_add[MAX_NUMNODES];
+static nodemask_t pm_nodes __read_mostly;
 static int found_add_area __initdata;
 int hotadd_percent __initdata = 0;
 
@@ -34,6 +35,9 @@
    from BIOS bugs. */
 #define NODE_MIN_SIZE (4*1024*1024)
 
+/* ACPI bit to represent power management node */
+#define POWER_MANAGEMENT_ACPI_BIT	(1 << 31)
+
 static __init int setup_node(int pxm)
 {
 	return acpi_map_pxm_to_node(pxm);
@@ -298,7 +302,10 @@
 		return;
 	start = ma->base_address;
 	end = start + ma->length;
-	pxm = ma->proximity_domain;
+	pxm = ma->proximity_domain & ~POWER_MANAGEMENT_ACPI_BIT;
+	if (ma->proximity_domain & POWER_MANAGEMENT_ACPI_BIT)
+		node_set(pxm, pm_nodes);
+
 	node = setup_node(pxm);
 	if (node < 0) {
 		printk(KERN_ERR "SRAT: Too many proximity domains.\n");
@@ -486,3 +493,35 @@
 }
 EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
 
+int __power_managed_node(int nid)
+{
+	return node_isset(node_to_pxm(nid), pm_nodes);
+}
+
+int __power_managed_memory_present(void)
+{
+	return !nodes_empty(pm_nodes);
+}
+
+int __nearest_non_pm_node(int nid)
+{
+	int i, dist, closest, temp;
+	
+	if (!__power_managed_node(nid))
+		return nid;
+	dist = closest= 255;
+	for_each_node(i) {
+		if (__power_managed_node(i))
+			continue;
+
+		if (i != nid) {
+			temp = __node_distance(nid, i );
+			if (temp < dist) {
+				closest = i;
+				dist = temp;
+			}
+		}
+	}
+	BUG_ON(closest == 255);
+	return closest;
+}
diff -urN -X linux-2.6.21rc3mm2/Documentation/dontdiff linux-2.6.21rc3mm2/include/asm-x86_64/topology.h linux-2.6.21rc3mm2-monroe/include/asm-x86_64/topology.h
--- linux-2.6.21rc3mm2/include/asm-x86_64/topology.h	2007-03-08 11:14:20.000000000 -0800
+++ linux-2.6.21rc3mm2-monroe/include/asm-x86_64/topology.h	2007-03-09 10:23:25.000000000 -0800
@@ -18,6 +18,13 @@
 /* #else fallback version */
 #endif
 
+extern int __power_managed_node(int);
+extern int __power_managed_memory_present(void);
+extern int __nearest_non_pm_node(int);
+#define power_managed_node(nid)		__power_managed_node(nid)
+#define power_managed_memory_present()	__power_managed_memory_present()
+#define nearest_non_pm_node(nid)	__nearest_non_pm_node(nid)
+
 #define cpu_to_node(cpu)		(cpu_to_node[cpu])
 #define parent_node(node)		(node)
 #define node_to_first_cpu(node) 	(first_cpu(node_to_cpumask[node]))
diff -urN -X linux-2.6.21rc3mm2/Documentation/dontdiff linux-2.6.21rc3mm2/include/linux/topology.h linux-2.6.21rc3mm2-monroe/include/linux/topology.h
--- linux-2.6.21rc3mm2/include/linux/topology.h	2007-03-08 11:14:08.000000000 -0800
+++ linux-2.6.21rc3mm2-monroe/include/linux/topology.h	2007-03-09 10:23:25.000000000 -0800
@@ -67,6 +67,24 @@
 #ifndef PENALTY_FOR_NODE_WITH_CPUS
 #define PENALTY_FOR_NODE_WITH_CPUS	(1)
 #endif
+#ifndef power_managed_node
+static inline int power_managed_node(int nid)
+{
+	return 0;
+}
+#endif
+#ifndef power_managed_memory_present
+static inline int power_managed_memory_present(void)
+{
+	return 0;
+}
+#endif
+#ifndef nearest_non_pm_node
+static inline int nearest_non_pm_node(int nid)
+{
+	return nid;
+}
+#endif
 
 /*
  * Below are the 3 major initializers used in building sched_domains:
diff -urN -X linux-2.6.21rc3mm2/Documentation/dontdiff linux-2.6.21rc3mm2/mm/bootmem.c linux-2.6.21rc3mm2-monroe/mm/bootmem.c
--- linux-2.6.21rc3mm2/mm/bootmem.c	2007-02-04 10:44:54.000000000 -0800
+++ linux-2.6.21rc3mm2-monroe/mm/bootmem.c	2007-03-09 10:23:25.000000000 -0800
@@ -417,11 +417,14 @@
 void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
 				      unsigned long goal)
 {
-	bootmem_data_t *bdata;
 	void *ptr;
+	int i;
 
-	list_for_each_entry(bdata, &bdata_list, list) {
-		ptr = __alloc_bootmem_core(bdata, size, align, goal, 0);
+	for_each_online_node(i) {
+		if (power_managed_node(i))
+			continue;
+		ptr = __alloc_bootmem_core(NODE_DATA(i)->bdata, size,
+					align, goal, 0);
 		if (ptr)
 			return ptr;
 	}
@@ -463,12 +466,14 @@
 void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
 				  unsigned long goal)
 {
-	bootmem_data_t *bdata;
 	void *ptr;
+	int i;
 
-	list_for_each_entry(bdata, &bdata_list, list) {
-		ptr = __alloc_bootmem_core(bdata, size, align, goal,
-						ARCH_LOW_ADDRESS_LIMIT);
+	for_each_online_node(i) {
+		if (power_managed_node(i))
+			continue;
+		ptr = __alloc_bootmem_core(NODE_DATA(i)->bdata, size, align,
+					goal, ARCH_LOW_ADDRESS_LIMIT);
 		if (ptr)
 			return ptr;
 	}
diff -urN -X linux-2.6.21rc3mm2/Documentation/dontdiff linux-2.6.21rc3mm2/mm/mempolicy.c linux-2.6.21rc3mm2-monroe/mm/mempolicy.c
--- linux-2.6.21rc3mm2/mm/mempolicy.c	2007-03-08 11:14:20.000000000 -0800
+++ linux-2.6.21rc3mm2-monroe/mm/mempolicy.c	2007-03-09 10:23:25.000000000 -0800
@@ -1609,8 +1609,13 @@
 	/* Set interleaving policy for system init. This way not all
 	   the data structures allocated at system boot end up in node zero. */
 
-	if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
-		printk("numa_policy_init: interleaving failed\n");
+	if (power_managed_memory_present()) {
+		if (do_set_mempolicy(MPOL_DEFAULT, &node_online_map))
+			printk("numa_policy_init: default failed\n");
+	} else {
+		if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
+			printk("numa_policy_init: interleaving failed\n");
+	}
 }
 
 /* Reset policy of current process to default */
diff -urN -X linux-2.6.21rc3mm2/Documentation/dontdiff linux-2.6.21rc3mm2/mm/page_alloc.c linux-2.6.21rc3mm2-monroe/mm/page_alloc.c
--- linux-2.6.21rc3mm2/mm/page_alloc.c	2007-03-08 11:14:20.000000000 -0800
+++ linux-2.6.21rc3mm2-monroe/mm/page_alloc.c	2007-03-09 10:23:25.000000000 -0800
@@ -2600,8 +2600,10 @@
 					* sizeof(wait_queue_head_t);
 
  	if (system_state == SYSTEM_BOOTING) {
+		int nid = nearest_non_pm_node(pgdat->node_id);
+		
 		zone->wait_table = (wait_queue_head_t *)
-			alloc_bootmem_node(pgdat, alloc_size);
+			alloc_bootmem_node(NODE_DATA(nid), alloc_size);
 	} else {
 		/*
 		 * This case means that a zone whose size was 0 gets new memory
@@ -3215,8 +3217,11 @@
 		end = ALIGN(end, MAX_ORDER_NR_PAGES);
 		size =  (end - start) * sizeof(struct page);
 		map = alloc_remap(pgdat->node_id, size);
-		if (!map)
-			map = alloc_bootmem_node(pgdat, size);
+		if (!map) {
+			int nid = nearest_non_pm_node(pgdat->node_id);
+
+			map = alloc_bootmem_node(NODE_DATA(nid), size);
+		}
 		pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
 		printk(KERN_DEBUG
 			"Node %d memmap at 0x%p size %lu first pfn 0x%p\n",
diff -urN -X linux-2.6.21rc3mm2/Documentation/dontdiff linux-2.6.21rc3mm2/mm/slab.c linux-2.6.21rc3mm2-monroe/mm/slab.c
--- linux-2.6.21rc3mm2/mm/slab.c	2007-03-08 11:14:20.000000000 -0800
+++ linux-2.6.21rc3mm2-monroe/mm/slab.c	2007-03-09 10:23:25.000000000 -0800
@@ -3399,6 +3399,7 @@
 	if (unlikely(nodeid == -1))
 		nodeid = numa_node_id();
 
+	nodeid = nearest_non_pm_node(nodeid);
 	if (unlikely(!cachep->nodelists[nodeid])) {
 		/* Node not bootstrapped yet */
 		ptr = fallback_alloc(cachep, flags);
@@ -3672,6 +3673,7 @@
 #ifdef CONFIG_NUMA
 void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 {
+	nodeid = nearest_non_pm_node(nodeid);
 	return __cache_alloc_node(cachep, flags, nodeid,
 			__builtin_return_address(0));
 }
diff -urN -X linux-2.6.21rc3mm2/Documentation/dontdiff linux-2.6.21rc3mm2/mm/sparse.c linux-2.6.21rc3mm2-monroe/mm/sparse.c
--- linux-2.6.21rc3mm2/mm/sparse.c	2007-02-04 10:44:54.000000000 -0800
+++ linux-2.6.21rc3mm2-monroe/mm/sparse.c	2007-03-09 10:23:25.000000000 -0800
@@ -49,7 +49,8 @@
 	struct mem_section *section = NULL;
 	unsigned long array_size = SECTIONS_PER_ROOT *
 				   sizeof(struct mem_section);
-
+	
+	nid = nearest_non_pm_node(nid);
 	if (slab_is_available())
 		section = kmalloc_node(array_size, GFP_KERNEL, nid);
 	else
@@ -215,6 +216,7 @@
 	struct mem_section *ms = __nr_to_section(pnum);
 	int nid = sparse_early_nid(ms);
 
+	nid = nearest_non_pm_node(nid);
 	map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION);
 	if (map)
 		return map;

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  reply	other threads:[~2007-03-09 20:53 UTC|newest]

Thread overview: 26+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-03-05 18:18 [RFC] [PATCH] Power Managed memory base enabling Mark Gross
2007-03-05 18:18 ` Mark Gross
2007-03-06  1:26 ` KAMEZAWA Hiroyuki
2007-03-06  1:26   ` KAMEZAWA Hiroyuki
2007-03-06 15:54   ` Mark Gross
2007-03-06 15:54     ` Mark Gross
2007-03-06 15:09 ` David Rientjes
2007-03-06 15:09   ` David Rientjes
2007-03-06 16:47   ` Mark Gross
2007-03-06 16:47     ` Mark Gross
2007-03-06 17:12     ` David Rientjes
2007-03-06 17:12       ` David Rientjes
2007-03-06 17:20       ` Mark Gross
2007-03-06 17:20         ` Mark Gross
2007-03-06 17:33         ` David Rientjes
2007-03-06 17:33           ` David Rientjes
2007-03-07  2:40     ` David Rientjes
2007-03-07  2:40       ` David Rientjes
2007-03-09 20:53       ` Mark Gross [this message]
2007-03-09 20:53         ` Mark Gross
2007-03-09 21:27         ` David Rientjes
2007-03-09 21:27           ` David Rientjes
2007-03-09 21:26           ` Mark Gross
2007-03-09 21:26             ` Mark Gross
2007-03-26 12:48 ` Pavel Machek
2007-03-26 12:48   ` [linux-pm] " Pavel Machek

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20070309205344.GA16777@linux.intel.com \
    --to=mgross@linux.intel.com \
    --cc=akpm@linux-foundation.org \
    --cc=linux-mm@kvack.org \
    --cc=linux-pm@lists.osdl.org \
    --cc=mark.gross@intel.com \
    --cc=neelam.chandwani@intel.com \
    --cc=rientjes@google.com \
    --cc=torvalds@linux-foundation.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.