[PATCH] Configurable reclaim batch size

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

* [PATCH] Configurable reclaim batch size
@ 2007-09-14 22:23 Christoph Lameter
  2007-09-14 23:20 ` Peter Zijlstra
  0 siblings, 1 reply; 7+ messages in thread
From: Christoph Lameter @ 2007-09-14 22:23 UTC (permalink / raw)
  To: linux-mm; +Cc: linux-kernel

This patch makes the basic reclaim unit (the reclaim batch size) used in
vmscan.c configurable. As memory sizes increase, so will the frequency of
running reclaim. Configuring a larger reclaim unit will reduce the number of
times reclaim has to be entered and reduce the number of times that the
zone locks have to be taken.

Signed-off-by: Christoph Lameter <clameter@sgi.com>

---
 include/linux/mmzone.h |    1 +
 kernel/sysctl.c        |    8 ++++++++
 mm/vmscan.c            |   41 +++++++++++++++++++++--------------------
 3 files changed, 30 insertions(+), 20 deletions(-)

Index: linux-2.6/mm/vmscan.c
===================================================================
--- linux-2.6.orig/mm/vmscan.c	2007-09-12 18:21:28.000000000 -0700
+++ linux-2.6/mm/vmscan.c	2007-09-12 18:31:13.000000000 -0700
@@ -57,11 +57,11 @@ struct scan_control {
 	/* Can pages be swapped as part of reclaim? */
 	int may_swap;
 
-	/* This context's SWAP_CLUSTER_MAX. If freeing memory for
-	 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
+	/* This context's  reclaim batch size. If freeing memory for
+	 * suspend, we effectively ignore reclaim_batch.
 	 * In this context, it doesn't matter that we scan the
 	 * whole list at once. */
-	int swap_cluster_max;
+	int reclaim_batch;
 
 	int swappiness;
 
@@ -105,6 +105,7 @@ struct scan_control {
  */
 int vm_swappiness = 60;
 long vm_total_pages;	/* The total number of pages which the VM controls */
+int sysctl_reclaim_batch = SWAP_CLUSTER_MAX;
 
 static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
@@ -159,7 +160,7 @@ unsigned long shrink_slab(unsigned long 
 	unsigned long ret = 0;
 
 	if (scanned == 0)
-		scanned = SWAP_CLUSTER_MAX;
+		scanned = sysctl_reclaim_batch;
 
 	if (!down_read_trylock(&shrinker_rwsem))
 		return 1;	/* Assume we'll be able to shrink next time */
@@ -338,7 +339,7 @@ static pageout_t pageout(struct page *pa
 		int res;
 		struct writeback_control wbc = {
 			.sync_mode = WB_SYNC_NONE,
-			.nr_to_write = SWAP_CLUSTER_MAX,
+			.nr_to_write = sysctl_reclaim_batch,
 			.range_start = 0,
 			.range_end = LLONG_MAX,
 			.nonblocking = 1,
@@ -801,7 +802,7 @@ static unsigned long shrink_inactive_lis
 		unsigned long nr_freed;
 		unsigned long nr_active;
 
-		nr_taken = isolate_lru_pages(sc->swap_cluster_max,
+		nr_taken = isolate_lru_pages(sc->reclaim_batch,
 			     &zone->inactive_list,
 			     &page_list, &nr_scan, sc->order,
 			     (sc->order > PAGE_ALLOC_COSTLY_ORDER)?
@@ -1076,7 +1077,7 @@ static unsigned long shrink_zone(int pri
 	zone->nr_scan_active +=
 		(zone_page_state(zone, NR_ACTIVE) >> priority) + 1;
 	nr_active = zone->nr_scan_active;
-	if (nr_active >= sc->swap_cluster_max)
+	if (nr_active >= sc->reclaim_batch)
 		zone->nr_scan_active = 0;
 	else
 		nr_active = 0;
@@ -1084,7 +1085,7 @@ static unsigned long shrink_zone(int pri
 	zone->nr_scan_inactive +=
 		(zone_page_state(zone, NR_INACTIVE) >> priority) + 1;
 	nr_inactive = zone->nr_scan_inactive;
-	if (nr_inactive >= sc->swap_cluster_max)
+	if (nr_inactive >= sc->reclaim_batch)
 		zone->nr_scan_inactive = 0;
 	else
 		nr_inactive = 0;
@@ -1092,14 +1093,14 @@ static unsigned long shrink_zone(int pri
 	while (nr_active || nr_inactive) {
 		if (nr_active) {
 			nr_to_scan = min(nr_active,
-					(unsigned long)sc->swap_cluster_max);
+					(unsigned long)sc->reclaim_batch);
 			nr_active -= nr_to_scan;
 			shrink_active_list(nr_to_scan, zone, sc, priority);
 		}
 
 		if (nr_inactive) {
 			nr_to_scan = min(nr_inactive,
-					(unsigned long)sc->swap_cluster_max);
+					(unsigned long)sc->reclaim_batch);
 			nr_inactive -= nr_to_scan;
 			nr_reclaimed += shrink_inactive_list(nr_to_scan, zone,
 								sc);
@@ -1181,7 +1182,7 @@ unsigned long try_to_free_pages(struct z
 	struct scan_control sc = {
 		.gfp_mask = gfp_mask,
 		.may_writepage = !laptop_mode,
-		.swap_cluster_max = SWAP_CLUSTER_MAX,
+		.reclaim_batch = sysctl_reclaim_batch,
 		.may_swap = 1,
 		.swappiness = vm_swappiness,
 		.order = order,
@@ -1210,7 +1211,7 @@ unsigned long try_to_free_pages(struct z
 			reclaim_state->reclaimed_slab = 0;
 		}
 		total_scanned += sc.nr_scanned;
-		if (nr_reclaimed >= sc.swap_cluster_max) {
+		if (nr_reclaimed >= sc.reclaim_batch) {
 			ret = 1;
 			goto out;
 		}
@@ -1222,8 +1223,8 @@ unsigned long try_to_free_pages(struct z
 		 * that's undesirable in laptop mode, where we *want* lumpy
 		 * writeout.  So in laptop mode, write out the whole world.
 		 */
-		if (total_scanned > sc.swap_cluster_max +
-					sc.swap_cluster_max / 2) {
+		if (total_scanned > sc.reclaim_batch +
+					sc.reclaim_batch / 2) {
 			wakeup_pdflush(laptop_mode ? 0 : total_scanned);
 			sc.may_writepage = 1;
 		}
@@ -1288,7 +1289,7 @@ static unsigned long balance_pgdat(pg_da
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
 		.may_swap = 1,
-		.swap_cluster_max = SWAP_CLUSTER_MAX,
+		.reclaim_batch = sysctl_reclaim_batch,
 		.swappiness = vm_swappiness,
 		.order = order,
 	};
@@ -1388,7 +1389,7 @@ loop_again:
 			 * the reclaim ratio is low, start doing writepage
 			 * even in laptop mode
 			 */
-			if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
+			if (total_scanned > sysctl_reclaim_batch * 2 &&
 			    total_scanned > nr_reclaimed + nr_reclaimed / 2)
 				sc.may_writepage = 1;
 		}
@@ -1407,7 +1408,7 @@ loop_again:
 		 * matches the direct reclaim path behaviour in terms of impact
 		 * on zone->*_priority.
 		 */
-		if (nr_reclaimed >= SWAP_CLUSTER_MAX)
+		if (nr_reclaimed >= sysctl_reclaim_batch)
 			break;
 	}
 out:
@@ -1600,7 +1601,7 @@ unsigned long shrink_all_memory(unsigned
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
 		.may_swap = 0,
-		.swap_cluster_max = nr_pages,
+		.reclaim_batch = nr_pages,
 		.may_writepage = 1,
 		.swappiness = vm_swappiness,
 	};
@@ -1782,8 +1783,8 @@ static int __zone_reclaim(struct zone *z
 	struct scan_control sc = {
 		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
 		.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
-		.swap_cluster_max = max_t(unsigned long, nr_pages,
-					SWAP_CLUSTER_MAX),
+		.reclaim_batch = max_t(unsigned long, nr_pages,
+					sysctl_reclaim_batch),
 		.gfp_mask = gfp_mask,
 		.swappiness = vm_swappiness,
 	};
Index: linux-2.6/include/linux/mmzone.h
===================================================================
--- linux-2.6.orig/include/linux/mmzone.h	2007-09-12 18:28:58.000000000 -0700
+++ linux-2.6/include/linux/mmzone.h	2007-09-12 18:29:42.000000000 -0700
@@ -607,6 +607,7 @@ int sysctl_min_unmapped_ratio_sysctl_han
 int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
 			struct file *, void __user *, size_t *, loff_t *);
 
+extern int sysctl_reclaim_batch;
 extern int numa_zonelist_order_handler(struct ctl_table *, int,
 			struct file *, void __user *, size_t *, loff_t *);
 extern char numa_zonelist_order[];
Index: linux-2.6/kernel/sysctl.c
===================================================================
--- linux-2.6.orig/kernel/sysctl.c	2007-09-12 18:27:12.000000000 -0700
+++ linux-2.6/kernel/sysctl.c	2007-09-12 18:28:48.000000000 -0700
@@ -900,6 +900,14 @@ static ctl_table vm_table[] = {
 		.strategy	= &sysctl_intvec,
 	},
 	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "reclaim_batch",
+		.data		= &sysctl_reclaim_batch,
+		.maxlen		= sizeof(sysctl_reclaim_batch),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
 		.ctl_name	= VM_DROP_PAGECACHE,
 		.procname	= "drop_caches",
 		.data		= &sysctl_drop_caches,

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] Configurable reclaim batch size
  2007-09-14 22:23 [PATCH] Configurable reclaim batch size Christoph Lameter
@ 2007-09-14 23:20 ` Peter Zijlstra
  2007-09-17 17:54   ` Christoph Lameter
  0 siblings, 1 reply; 7+ messages in thread
From: Peter Zijlstra @ 2007-09-14 23:20 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: linux-mm, linux-kernel

On Fri, 2007-09-14 at 15:23 -0700, Christoph Lameter wrote:
> This patch allows a configuration of the basic reclaim unit for reclaim in 
> vmscan.c. As memory sizes increase so will the frequency of running 
> reclaim. Configuring the reclaim unit higher will reduce the number of 
> times reclaim has to be entered and reduce the number of times that the 
> zone locks have to be taken.

It increases the lock hold times though. Otoh it might work out with the
lock placement.

Do you have any numbers that show this is worthwhile?



--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] Configurable reclaim batch size
  2007-09-14 23:20 ` Peter Zijlstra
@ 2007-09-17 17:54   ` Christoph Lameter
  2007-09-17 19:56     ` Peter Zijlstra
  0 siblings, 1 reply; 7+ messages in thread
From: Christoph Lameter @ 2007-09-17 17:54 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: linux-mm, linux-kernel

On Sat, 15 Sep 2007, Peter Zijlstra wrote:

> It increases the lock hold times though. Otoh it might work out with the
> lock placement.

Yeah may be good for NUMA.

> Do you have any numbers that show this is worthwhile?

Tried to run AIM7 but the improvements are in the noise. I need a test
that really does large memory allocation and stresses the LRU. I could 
code something up but then Lee's patch addresses some of the same issues.
Is there any standard test that shows LRU handling regressions?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] Configurable reclaim batch size
  2007-09-17 17:54   ` Christoph Lameter
@ 2007-09-17 19:56     ` Peter Zijlstra
  2007-09-17 20:05       ` Christoph Lameter
  2007-09-17 20:48       ` Balbir Singh
  0 siblings, 2 replies; 7+ messages in thread
From: Peter Zijlstra @ 2007-09-17 19:56 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: linux-mm, linux-kernel, Mel Gorman

On Mon, 17 Sep 2007 10:54:59 -0700 (PDT) Christoph Lameter
<clameter@sgi.com> wrote:

> On Sat, 15 Sep 2007, Peter Zijlstra wrote:
> 
> > It increases the lock hold times though. Otoh it might work out with the
> > lock placement.
> 
> Yeah may be good for NUMA.

Might, I'd just like a _little_ justification for an extra tunable.

> > Do you have any numbers that show this is worthwhile?
> 
> Tried to run AIM7 but the improvements are in the noise. I need a test
> that really does large memory allocation and stresses the LRU. I could 
> code something up but then Lee's patch addresses some of the same issues.
> Is there any standard test that shows LRU handling regressions?

hehe, I wish. I was just hoping you'd done this patch as a result of an
actual problem and not a hunch.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] Configurable reclaim batch size
  2007-09-17 19:56     ` Peter Zijlstra
@ 2007-09-17 20:05       ` Christoph Lameter
  2007-09-17 20:48       ` Balbir Singh
  1 sibling, 0 replies; 7+ messages in thread
From: Christoph Lameter @ 2007-09-17 20:05 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: linux-mm, linux-kernel, Mel Gorman

On Mon, 17 Sep 2007, Peter Zijlstra wrote:

> > Tried to run AIM7 but the improvements are in the noise. I need a test
> > that really does large memory allocation and stresses the LRU. I could 
> > code something up but then Lee's patch addresses some of the same issues.
> > Is there any standard test that shows LRU handling regressions?
> 
> hehe, I wish. I was just hoping you'd done this patch as a result of an
> actual problem and not a hunch.

It was Andrew's hunch. I'd rather see Lee's approach go forward because 
I think it has the potential of solving the issue in a more general way. 
If I get some spare time with a problem system then I will test the patch.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] Configurable reclaim batch size
  2007-09-17 19:56     ` Peter Zijlstra
  2007-09-17 20:05       ` Christoph Lameter
@ 2007-09-17 20:48       ` Balbir Singh
  2007-09-17 21:24         ` Christoph Lameter
  1 sibling, 1 reply; 7+ messages in thread
From: Balbir Singh @ 2007-09-17 20:48 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Christoph Lameter, linux-mm, linux-kernel, Mel Gorman

Peter Zijlstra wrote:
> On Mon, 17 Sep 2007 10:54:59 -0700 (PDT) Christoph Lameter
> <clameter@sgi.com> wrote:
> 
>> On Sat, 15 Sep 2007, Peter Zijlstra wrote:
>>
>>> It increases the lock hold times though. Otoh it might work out with the
>>> lock placement.
>> Yeah may be good for NUMA.
> 
> Might, I'd just like a _little_ justification for an extra tunable.
> 
>>> Do you have any numbers that show this is worthwhile?
>> Tried to run AIM7 but the improvements are in the noise. I need a test
>> that really does large memory allocation and stresses the LRU. I could 
>> code something up but then Lee's patch addresses some of the same issues.
>> Is there any standard test that shows LRU handling regressions?
> 
> hehe, I wish. I was just hoping you'd done this patch as a result of an
> actual problem and not a hunch.

Please do let me know if someone finds a good standard test for it or a
way to stress reclaim. I've heard AIM7 come up often, but never been
able to push it much. I should retry.

-- 
	Warm Regards,
	Balbir Singh
	Linux Technology Center
	IBM, ISTL

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] Configurable reclaim batch size
  2007-09-17 20:48       ` Balbir Singh
@ 2007-09-17 21:24         ` Christoph Lameter
  0 siblings, 0 replies; 7+ messages in thread
From: Christoph Lameter @ 2007-09-17 21:24 UTC (permalink / raw)
  To: Balbir Singh; +Cc: Peter Zijlstra, linux-mm, linux-kernel, Mel Gorman

On Tue, 18 Sep 2007, Balbir Singh wrote:

> Please do let me know if someone finds a good standard test for it or a
> way to stress reclaim. I've heard AIM7 come up often, but never been
> able to push it much. I should retry.

AIM7 does small computing loads reflecting an earlier time. I wish there 
was something better reflecting large computing loads of today. The tests 
that I know of require MPI and other libraries and are not that suitable 
for kernel hackers.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2007-09-17 21:24 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-09-14 22:23 [PATCH] Configurable reclaim batch size Christoph Lameter
2007-09-14 23:20 ` Peter Zijlstra
2007-09-17 17:54   ` Christoph Lameter
2007-09-17 19:56     ` Peter Zijlstra
2007-09-17 20:05       ` Christoph Lameter
2007-09-17 20:48       ` Balbir Singh
2007-09-17 21:24         ` Christoph Lameter

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).