public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] __alloc_pages - NUMA and lower zone protection
@ 2004-02-13 18:32 Martin Hicks
  2004-02-14  0:01 ` Nick Piggin
  2004-02-17 22:58 ` Andrew Morton
  0 siblings, 2 replies; 5+ messages in thread
From: Martin Hicks @ 2004-02-13 18:32 UTC (permalink / raw)
  To: linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1010 bytes --]


Hi,

There is a problem with the current __alloc pages on a machine with many
nodes.  As we go down the zones[] list, we may move onto other nodes.
Each time we go to the next zone we protect these zones by doing
"min += local_low".

This is quite appropriate on a machine with one node, but wrong on
machines with other nodes.  To illustrate, here is an example.  On a
256 node Altix machine, a request on node 0 for 2MB requires just over
600MB of free memory on the 256th node in order to fullfil the "min"
requirements if all other nodes are low on memory.  This could leave
73GB of memory unallocated across all nodes.

This patch keeps the same semantics for lower_zone_protection, but only
provides protection for higher priority zones in the same node.

The patch seems to do the right thing on my non-NUMA zx1 ia64 machine
(which has ZONE_DMA and ZONE_NORMAL) as well as the multi-node Altix.

thanks
mh

-- 
Martin Hicks                Wild Open Source Inc.
mort@wildopensource.com     613-266-2296

[-- Attachment #2: page_alloc-6.patch --]
[-- Type: text/plain, Size: 3521 bytes --]

===== include/linux/mmzone.h 1.51 vs edited =====
--- 1.51/include/linux/mmzone.h	Wed Feb  4 00:35:17 2004
+++ edited/include/linux/mmzone.h	Wed Feb 11 15:17:40 2004
@@ -70,6 +70,7 @@
 	spinlock_t		lock;
 	unsigned long		free_pages;
 	unsigned long		pages_min, pages_low, pages_high;
+	unsigned long		zone_type;
 
 	ZONE_PADDING(_pad1_)
 
===== mm/page_alloc.c 1.184 vs edited =====
--- 1.184/mm/page_alloc.c	Wed Feb  4 00:35:18 2004
+++ edited/mm/page_alloc.c	Wed Feb 11 15:48:43 2004
@@ -41,6 +41,7 @@
 int nr_swap_pages;
 int numnodes = 1;
 int sysctl_lower_zone_protection = 0;
+static int max_zone;	/* Highest zone number that contains pages */
 
 EXPORT_SYMBOL(totalram_pages);
 EXPORT_SYMBOL(nr_swap_pages);
@@ -557,27 +558,24 @@
 		return NULL;
 
 	/* Go through the zonelist once, looking for a zone with enough free */
-	min = 1UL << order;
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *z = zones[i];
-		unsigned long local_low;
+		unsigned long local_low = z->pages_low;
 
 		/*
 		 * This is the fabled 'incremental min'. We let real-time tasks
 		 * dip their real-time paws a little deeper into reserves.
 		 */
-		local_low = z->pages_low;
 		if (rt_task(p))
 			local_low >>= 1;
-		min += local_low;
-
+		min = (1UL << order) + local_low;
+		min += local_low * sysctl_lower_zone_protection * (max_zone - z->zone_type);
 		if (z->free_pages >= min ||
 				(!wait && z->free_pages >= z->pages_high)) {
 			page = buffered_rmqueue(z, order, cold);
 			if (page)
-		       		goto got_pg;
+				goto got_pg;
 		}
-		min += z->pages_low * sysctl_lower_zone_protection;
 	}
 
 	/* we're somewhat low on memory, failed to find what we needed */
@@ -585,24 +583,23 @@
 		wakeup_kswapd(zones[i]);
 
 	/* Go through the zonelist again, taking __GFP_HIGH into account */
-	min = 1UL << order;
 	for (i = 0; zones[i] != NULL; i++) {
-		unsigned long local_min;
 		struct zone *z = zones[i];
+		unsigned long local_min = z->pages_min;
 
-		local_min = z->pages_min;
 		if (gfp_mask & __GFP_HIGH)
 			local_min >>= 2;
 		if (rt_task(p))
 			local_min >>= 1;
-		min += local_min;
+		min = (1UL << order) + local_min;
+		min += local_min * sysctl_lower_zone_protection * (max_zone - z->zone_type);
+
 		if (z->free_pages >= min ||
 				(!wait && z->free_pages >= z->pages_high)) {
 			page = buffered_rmqueue(z, order, cold);
 			if (page)
 				goto got_pg;
 		}
-		min += local_min * sysctl_lower_zone_protection;
 	}
 
 	/* here we're in the low on memory slow path */
@@ -634,18 +631,17 @@
 	p->flags &= ~PF_MEMALLOC;
 
 	/* go through the zonelist yet one more time */
-	min = 1UL << order;
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *z = zones[i];
 
-		min += z->pages_min;
+		min = (1UL << order) + z->pages_min;
+		min += z->pages_min * sysctl_lower_zone_protection * (max_zone - z->zone_type);
 		if (z->free_pages >= min ||
 				(!wait && z->free_pages >= z->pages_high)) {
 			page = buffered_rmqueue(z, order, cold);
 			if (page)
 				goto got_pg;
 		}
-		min += z->pages_low * sysctl_lower_zone_protection;
 	}
 
 	/*
@@ -1107,7 +1103,11 @@
  			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
  
 		zonelist->zones[j++] = NULL;
-	} 
+
+		if (pgdat->node_zones[i].present_pages > 0)
+			if (i > max_zone)
+				max_zone = i;
+       }
 }
 
 void __init build_all_zonelists(void)
@@ -1246,6 +1246,7 @@
 		spin_lock_init(&zone->lru_lock);
 		zone->zone_pgdat = pgdat;
 		zone->free_pages = 0;
+		zone->zone_type = j;
 
 		zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
 

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2004-02-18 16:19 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2004-02-13 18:32 [PATCH] __alloc_pages - NUMA and lower zone protection Martin Hicks
2004-02-14  0:01 ` Nick Piggin
2004-02-14  2:17   ` Martin Hicks
2004-02-17 22:58 ` Andrew Morton
2004-02-18 16:19   ` Martin Hicks

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox