public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Con Kolivas <kernel@kolivas.org>
To: Rik van Riel <riel@redhat.com>
Cc: ck kernel mailing list <ck@vds.kolivas.org>,
	linux kernel mailing list <linux-kernel@vger.kernel.org>
Subject: Re: 2.6.8.1-ck4
Date: Thu, 26 Aug 2004 10:10:44 +1000	[thread overview]
Message-ID: <412D2A84.7010705@kolivas.org> (raw)
In-Reply-To: <Pine.LNX.4.44.0408251621160.5145-100000@chimarrao.boston.redhat.com>


[-- Attachment #1.1: Type: text/plain, Size: 1525 bytes --]

Rik van Riel wrote:
> On Sun, 22 Aug 2004, Con Kolivas wrote:
> 
> 
>>Added since 2.6.8.1-ck3:
>>+mapped_watermark.diff
> 
> 
> Sounds like a bad idea for file servers ;)
> 
> Wouldn't it be better to lazily move these cached pages to
> a "cached" list like the BSDs have, and reclaim it immediately
> when the memory is needed for something else ?
> 
> It should be easy enough to keep the cached data around and
> still have the cache pages easily reclaimable.

Sounds like a good idea. Would this leave us with large buddies though? 
Also, with file caching it tends to do very little, so the RAM does indeed 
still fill up. Furthermore, setting mapped to 0 basically disables it 
anyway so that shouldn't be an issue. Currently file servers are 
recommended to tune the "swappiness" value to 100 in the same manner.

This is what the buddy state of the machine looks like after it has been 
running for days with this patch in situ.

cat /proc/buddyinfo
Node 0, zone      DMA     15     14     10     11     13     11      9 
     5      3      1      0
Node 0, zone   Normal  15778  17800   6885    722     14      1      3 
     0      0      1      0

Just FYI here's the current version of the patch.

Cheers,
Con

  include/linux/mmzone.h |    3 +
  include/linux/swap.h   |    2 -
  include/linux/sysctl.h |    2 -
  kernel/sysctl.c        |    8 ++--
  mm/page_alloc.c        |    5 ++
  mm/vmscan.c            |   98 
+++++++++++++++++++++++--------------------------
  6 files changed, 59 insertions(+), 59 deletions(-)

[-- Attachment #1.2: mapped_watermark2.diff --]
[-- Type: text/x-patch, Size: 10039 bytes --]

Index: linux-2.6.8.1/include/linux/mmzone.h
===================================================================
--- linux-2.6.8.1.orig/include/linux/mmzone.h	2004-08-15 14:08:19.000000000 +1000
+++ linux-2.6.8.1/include/linux/mmzone.h	2004-08-26 10:01:38.153695699 +1000
@@ -125,7 +125,7 @@ struct zone {
 	 */
 	spinlock_t		lock;
 	unsigned long		free_pages;
-	unsigned long		pages_min, pages_low, pages_high;
+	unsigned long		pages_min, pages_low, pages_high, pages_unmapped;
 	/*
 	 * protection[] is a pre-calculated number of extra pages that must be
 	 * available in a zone in order for __alloc_pages() to allocate memory
@@ -276,6 +276,7 @@ typedef struct pglist_data {
 	struct pglist_data *pgdat_next;
 	wait_queue_head_t       kswapd_wait;
 	struct task_struct *kswapd;
+	int maplimit;
 } pg_data_t;
 
 #define node_present_pages(nid)	(NODE_DATA(nid)->node_present_pages)
Index: linux-2.6.8.1/include/linux/swap.h
===================================================================
--- linux-2.6.8.1.orig/include/linux/swap.h	2004-08-15 14:08:19.000000000 +1000
+++ linux-2.6.8.1/include/linux/swap.h	2004-08-26 10:01:38.154695545 +1000
@@ -174,7 +174,7 @@ extern void swap_setup(void);
 /* linux/mm/vmscan.c */
 extern int try_to_free_pages(struct zone **, unsigned int, unsigned int);
 extern int shrink_all_memory(int);
-extern int vm_swappiness;
+extern int vm_mapped;
 
 #ifdef CONFIG_MMU
 /* linux/mm/shmem.c */
Index: linux-2.6.8.1/include/linux/sysctl.h
===================================================================
--- linux-2.6.8.1.orig/include/linux/sysctl.h	2004-08-15 14:08:19.000000000 +1000
+++ linux-2.6.8.1/include/linux/sysctl.h	2004-08-26 10:01:38.155695390 +1000
@@ -157,7 +157,7 @@ enum
 	VM_OVERCOMMIT_RATIO=16, /* percent of RAM to allow overcommit in */
 	VM_PAGEBUF=17,		/* struct: Control pagebuf parameters */
 	VM_HUGETLB_PAGES=18,	/* int: Number of available Huge Pages */
-	VM_SWAPPINESS=19,	/* Tendency to steal mapped memory */
+	VM_MAPPED=19,		/* percent mapped min while evicting cache */
 	VM_LOWER_ZONE_PROTECTION=20,/* Amount of protection of lower zones */
 	VM_MIN_FREE_KBYTES=21,	/* Minimum free kilobytes to maintain */
 	VM_MAX_MAP_COUNT=22,	/* int: Maximum number of mmaps/address-space */
Index: linux-2.6.8.1/kernel/sysctl.c
===================================================================
--- linux-2.6.8.1.orig/kernel/sysctl.c	2004-08-15 14:08:19.000000000 +1000
+++ linux-2.6.8.1/kernel/sysctl.c	2004-08-26 10:01:38.156695236 +1000
@@ -701,10 +701,10 @@ static ctl_table vm_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
-		.ctl_name	= VM_SWAPPINESS,
-		.procname	= "swappiness",
-		.data		= &vm_swappiness,
-		.maxlen		= sizeof(vm_swappiness),
+		.ctl_name	= VM_MAPPED,
+		.procname	= "mapped",
+		.data		= &vm_mapped,
+		.maxlen		= sizeof(vm_mapped),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec_minmax,
 		.strategy	= &sysctl_intvec,
Index: linux-2.6.8.1/mm/page_alloc.c
===================================================================
--- linux-2.6.8.1.orig/mm/page_alloc.c	2004-08-15 14:08:19.000000000 +1000
+++ linux-2.6.8.1/mm/page_alloc.c	2004-08-26 10:01:42.205069628 +1000
@@ -628,6 +628,10 @@ __alloc_pages(unsigned int gfp_mask, uns
 		 */
 		if (rt_task(p))
 			min -= z->pages_low >> 1;
+		else if (vm_mapped && wait && 
+			z->free_pages < z->pages_unmapped &&
+			z->free_pages > z->pages_low)
+				wakeup_kswapd(z);
 
 		if (z->free_pages >= min ||
 				(!wait && z->free_pages >= z->pages_high)) {
@@ -1905,6 +1909,7 @@ static void setup_per_zone_pages_min(voi
 
 		zone->pages_low = zone->pages_min * 2;
 		zone->pages_high = zone->pages_min * 3;
+		zone->pages_unmapped = zone->present_pages / 3;
 		spin_unlock_irqrestore(&zone->lru_lock, flags);
 	}
 }
Index: linux-2.6.8.1/mm/vmscan.c
===================================================================
--- linux-2.6.8.1.orig/mm/vmscan.c	2004-08-15 14:08:19.000000000 +1000
+++ linux-2.6.8.1/mm/vmscan.c	2004-08-26 10:01:42.207069319 +1000
@@ -115,10 +115,7 @@ struct shrinker {
 #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
 #endif
 
-/*
- * From 0 .. 100.  Higher means more swappy.
- */
-int vm_swappiness = 60;
+int vm_mapped = 66;
 static long total_memory;
 
 static LIST_HEAD(shrinker_list);
@@ -338,7 +335,8 @@ static pageout_t pageout(struct page *pa
 /*
  * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed
  */
-static int shrink_list(struct list_head *page_list, struct scan_control *sc)
+static int shrink_list(struct list_head *page_list, struct scan_control *sc,
+			int maplimit)
 {
 	LIST_HEAD(ret_pages);
 	struct pagevec freed_pvec;
@@ -366,6 +364,8 @@ static int shrink_list(struct list_head 
 			goto keep_locked;
 
 		sc->nr_scanned++;
+		if (maplimit && page_mapped(page))
+			goto keep_locked;
 		/* Double the slab pressure for mapped and swapcache pages */
 		if (page_mapped(page) || PageSwapCache(page))
 			sc->nr_scanned++;
@@ -543,6 +543,7 @@ static void shrink_cache(struct zone *zo
 	LIST_HEAD(page_list);
 	struct pagevec pvec;
 	int max_scan = sc->nr_to_scan;
+	int maplimit = 0;
 
 	pagevec_init(&pvec, 1);
 
@@ -584,11 +585,12 @@ static void shrink_cache(struct zone *zo
 			goto done;
 
 		max_scan -= nr_scan;
-		if (current_is_kswapd())
+		if (current_is_kswapd()) {
 			mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
-		else
+			maplimit = zone->zone_pgdat->maplimit;
+		} else
 			mod_page_state_zone(zone, pgscan_direct, nr_scan);
-		nr_freed = shrink_list(&page_list, sc);
+		nr_freed = shrink_list(&page_list, sc, maplimit);
 		if (current_is_kswapd())
 			mod_page_state(kswapd_steal, nr_freed);
 		mod_page_state_zone(zone, pgsteal, nr_freed);
@@ -648,10 +650,6 @@ refill_inactive_zone(struct zone *zone, 
 	LIST_HEAD(l_active);	/* Pages to go onto the active_list */
 	struct page *page;
 	struct pagevec pvec;
-	int reclaim_mapped = 0;
-	long mapped_ratio;
-	long distress;
-	long swap_tendency;
 
 	lru_add_drain();
 	pgmoved = 0;
@@ -681,42 +679,11 @@ refill_inactive_zone(struct zone *zone, 
 	zone->nr_active -= pgmoved;
 	spin_unlock_irq(&zone->lru_lock);
 
-	/*
-	 * `distress' is a measure of how much trouble we're having reclaiming
-	 * pages.  0 -> no problems.  100 -> great trouble.
-	 */
-	distress = 100 >> zone->prev_priority;
-
-	/*
-	 * The point of this algorithm is to decide when to start reclaiming
-	 * mapped memory instead of just pagecache.  Work out how much memory
-	 * is mapped.
-	 */
-	mapped_ratio = (sc->nr_mapped * 100) / total_memory;
-
-	/*
-	 * Now decide how much we really want to unmap some pages.  The mapped
-	 * ratio is downgraded - just because there's a lot of mapped memory
-	 * doesn't necessarily mean that page reclaim isn't succeeding.
-	 *
-	 * The distress ratio is important - we don't want to start going oom.
-	 *
-	 * A 100% value of vm_swappiness overrides this algorithm altogether.
-	 */
-	swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
-
-	/*
-	 * Now use this metric to decide whether to start moving mapped memory
-	 * onto the inactive list.
-	 */
-	if (swap_tendency >= 100)
-		reclaim_mapped = 1;
-
 	while (!list_empty(&l_hold)) {
 		page = lru_to_page(&l_hold);
 		list_del(&page->lru);
 		if (page_mapped(page)) {
-			if (!reclaim_mapped) {
+			if (zone->zone_pgdat->maplimit) {
 				list_add(&page->lru, &l_active);
 				continue;
 			}
@@ -981,7 +948,7 @@ out:
  * the page allocator fallback scheme to ensure that aging of pages is balanced
  * across the zones.
  */
-static int balance_pgdat(pg_data_t *pgdat, int nr_pages)
+static int balance_pgdat(pg_data_t *pgdat, int nr_pages, int maplimit)
 {
 	int to_free = nr_pages;
 	int priority;
@@ -994,6 +961,20 @@ static int balance_pgdat(pg_data_t *pgda
 	sc.may_writepage = 0;
 	sc.nr_mapped = read_page_state(nr_mapped);
 
+	/*
+	 * Sanity check to ensure we don't have a stale maplimit set
+	 * and are calling balance_pgdat for a different reason.
+	 */
+	if (nr_pages)
+		maplimit = 0;
+	/*
+	 * kswapd does a light balance_pgdat() when there is less than 1/3
+	 * ram free provided there is less than vm_mapped % of that ram 
+	 * mapped.
+	 */
+	if (maplimit && sc.nr_mapped * 100 / total_memory > vm_mapped)
+		return 0;
+
 	inc_page_state(pageoutrun);
 
 	for (i = 0; i < pgdat->nr_zones; i++) {
@@ -1007,6 +988,12 @@ static int balance_pgdat(pg_data_t *pgda
 		int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
 		unsigned long lru_pages = 0;
 
+		/*
+		 * Only do low priority scanning if we're here due to
+		 * mapped watermark.
+		 */
+		if (maplimit && priority < DEF_PRIORITY)
+			goto out;
 		if (nr_pages == 0) {
 			/*
 			 * Scan in the highmem->dma direction for the highest
@@ -1019,10 +1006,13 @@ static int balance_pgdat(pg_data_t *pgda
 						priority != DEF_PRIORITY)
 					continue;
 
-				if (zone->free_pages <= zone->pages_high) {
-					end_zone = i;
-					goto scan;
+				if (zone->free_pages <= zone->pages_high ||
+					(maplimit && zone->free_pages <= 
+					zone->pages_unmapped)) {
+						end_zone = i;
+						goto scan;
 				}
+
 			}
 			goto out;
 		} else {
@@ -1148,7 +1138,7 @@ static int kswapd(void *p)
 		schedule();
 		finish_wait(&pgdat->kswapd_wait, &wait);
 
-		balance_pgdat(pgdat, 0);
+		balance_pgdat(pgdat, 0, pgdat->maplimit);
 	}
 	return 0;
 }
@@ -1158,11 +1148,15 @@ static int kswapd(void *p)
  */
 void wakeup_kswapd(struct zone *zone)
 {
+	if (zone->free_pages > zone->pages_unmapped)
+		goto out;
 	if (zone->free_pages > zone->pages_low)
-		return;
+		zone->zone_pgdat->maplimit = 1;
 	if (!waitqueue_active(&zone->zone_pgdat->kswapd_wait))
-		return;
+		goto out;
 	wake_up_interruptible(&zone->zone_pgdat->kswapd_wait);
+out:
+	zone->zone_pgdat->maplimit = 0;
 }
 
 #ifdef CONFIG_PM
@@ -1182,7 +1176,7 @@ int shrink_all_memory(int nr_pages)
 	current->reclaim_state = &reclaim_state;
 	for_each_pgdat(pgdat) {
 		int freed;
-		freed = balance_pgdat(pgdat, nr_to_free);
+		freed = balance_pgdat(pgdat, nr_to_free, 0);
 		ret += freed;
 		nr_to_free -= freed;
 		if (nr_to_free <= 0)

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 256 bytes --]

      parent reply	other threads:[~2004-08-26  0:12 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2004-08-22 11:17 2.6.8.1-ck4 Con Kolivas
2004-08-22 18:15 ` 2.6.8.1-ck4 Hans Reiser
2004-08-22 21:37   ` 2.6.8.1-ck4 Con Kolivas
2004-08-23 17:04 ` 2.6.8.1-ck4 Joshua Schmidlkofer
2004-08-23 17:19   ` 2.6.8.1-ck4 Prakash K. Cheemplavam
2004-08-23 17:31     ` 2.6.8.1-ck4 Joshua Schmidlkofer
2004-08-23 21:48     ` 2.6.8.1-ck4 Con Kolivas
2004-08-23 23:34       ` 2.6.8.1-ck4 Con Kolivas
2004-08-24  9:28         ` 2.6.8.1-ck4 Prakash K. Cheemplavam
2004-08-24  9:43           ` 2.6.8.1-ck4 Con Kolivas
2004-08-24  9:54             ` 2.6.8.1-ck4 Prakash K. Cheemplavam
2004-08-25 20:22 ` 2.6.8.1-ck4 Rik van Riel
2004-08-25 20:50   ` 2.6.8.1-ck4 William Lee Irwin III
2004-08-25 20:56     ` 2.6.8.1-ck4 Rik van Riel
2004-08-25 21:02       ` 2.6.8.1-ck4 William Lee Irwin III
2004-08-26  0:10   ` Con Kolivas [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=412D2A84.7010705@kolivas.org \
    --to=kernel@kolivas.org \
    --cc=ck@vds.kolivas.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=riel@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox