[PATCH--] Re: Linux VM/IO balancing (fwd to linux-mm?) (fwd)

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Roger Larsson <roger.larsson@norran.net>
To: Rik van Riel <riel@conectiva.com.br>
Cc: linux-mm@kvack.org, Matthew Dillon <dillon@apollo.backplane.com>
Subject: [PATCH--] Re: Linux VM/IO balancing (fwd to linux-mm?) (fwd)
Date: Tue, 23 May 2000 17:29:25 +0200	[thread overview]
Message-ID: <392AA3D5.FD6B5399@norran.net> (raw)
In-Reply-To: Pine.LNX.4.21.0005230934240.19121-100000@duckman.distro.conectiva

[-- Attachment #1: Type: text/plain, Size: 975 bytes --]

From: Matthew Dillon <dillon@apollo.backplane.com>
>     The algorithm is a *modified* LRU.  Lets say you decide on a weighting
>     betweeen 0 and 10.  When a page is first allocated (either to the
>     buffer cache or for anonymous memory) its statistical weight is
>     set to the middle (5).  If the page is used often the statistical 
>     weight slowly rises to its maximum (10).  If the page remains idle
>     (or was just used once) the statistical weight slowly drops to its
>     minimum (0).


My patches has been approaching this a while... [slowly...]
The currently included patch adds has divided lru in four lists [0..3].
New pages are added at level 1.
Scan is performed - and referenced pages are moved up.

Pages are moved down due to list balancing, but I have been playing with
other ideas.

These patches should be a good continuation point.
Patches are against pre9-3 with Quintela applied.

/RogerL

--
Home page:
  http://www.norran.net/nra02596/

[-- Attachment #2: patch-2.3.99-pre9.3-q-shrink_mmap.3 --]
[-- Type: text/plain, Size: 11223 bytes --]

diff -aur linux-2.3.99-pre9.3-q/include/linux/mm.h linux/include/linux/mm.h
--- linux-2.3.99-pre9.3-q/include/linux/mm.h	Fri May 12 21:16:14 2000
+++ linux/include/linux/mm.h	Mon May 22 23:30:15 2000
@@ -15,7 +15,7 @@
 extern unsigned long num_physpages;
 extern void * high_memory;
 extern int page_cluster;
-extern struct list_head lru_cache;
+extern struct list_head * new_lru_cache;
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -456,6 +456,8 @@
 extern void remove_inode_page(struct page *);
 extern unsigned long page_unuse(struct page *);
 extern int shrink_mmap(int, int);
+extern int age_mmap(void);
+extern void lru_cache_init(void);
 extern void truncate_inode_pages(struct address_space *, loff_t);
 
 /* generic vm_area_ops exported for stackable file systems */
diff -aur linux-2.3.99-pre9.3-q/include/linux/swap.h linux/include/linux/swap.h
--- linux-2.3.99-pre9.3-q/include/linux/swap.h	Fri May 12 21:16:13 2000
+++ linux/include/linux/swap.h	Mon May 22 23:30:19 2000
@@ -166,7 +166,7 @@
 #define	lru_cache_add(page)			\
 do {						\
 	spin_lock(&pagemap_lru_lock);		\
-	list_add(&(page)->lru, &lru_cache);	\
+	list_add(&(page)->lru, new_lru_cache);	\
 	nr_lru_pages++;				\
 	spin_unlock(&pagemap_lru_lock);		\
 } while (0)
diff -aur linux-2.3.99-pre9.3-q/mm/filemap.c linux/mm/filemap.c
--- linux-2.3.99-pre9.3-q/mm/filemap.c	Mon May 22 22:25:51 2000
+++ linux/mm/filemap.c	Tue May 23 16:36:41 2000
@@ -44,7 +44,10 @@
 atomic_t page_cache_size = ATOMIC_INIT(0);
 unsigned int page_hash_bits;
 struct page **page_hash_table;
-struct list_head lru_cache;
+
+#define NR_OF_LRU_LISTS 4
+static struct list_head lru_cache[NR_OF_LRU_LISTS]; // pages removed from low
+struct list_head * new_lru_cache = &lru_cache[1];   // here goes new pages
 
 static spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
 /*
@@ -249,20 +252,19 @@
  * before doing sync writes.  We can only do sync writes if we can
  * wait for IO (__GFP_IO set).
  */
-int shrink_mmap(int priority, int gfp_mask)
+static unsigned long shrink_mmap_referenced_moved; /* debug only? */
+
+static inline int
+ _shrink_mmap(struct list_head *lru_head,
+	      int gfp_mask, int *toscan, int *toskipwait)
 {
-	int ret = 0, count, nr_dirty;
-	struct list_head * page_lru;
+        int ret = 0, count = *toscan, nr_dirty = *toskipwait;
+
+	struct list_head * page_lru = lru_head;
 	struct page * page = NULL;
-	
-	count = nr_lru_pages / (priority + 1);
-	nr_dirty = priority;
 
-	/* we need pagemap_lru_lock for list_del() ... subtle code below */
-	spin_lock(&pagemap_lru_lock);
-	while (count > 0 && (page_lru = lru_cache.prev) != &lru_cache) {
+	while (count > 0 && (page_lru = page_lru->prev) != lru_head) {
 		page = list_entry(page_lru, struct page, lru);
-		list_del(page_lru);
 
 		count--;
 		if (PageTestandClearReferenced(page))
@@ -273,10 +275,12 @@
 		 * immediate tell are untouchable..
 		 */
 		if (!page->buffers && page_count(page) > 1)
-			goto dispose_continue;
+		  continue;
 
+		/* Lock this lru page, reentrant
+		 * will be disposed correctly when unlocked */
 		if (TryLockPage(page))
-			goto dispose_continue;
+			continue;
 
 		/* Release the pagemap_lru lock even if the page is not yet
 		   queued in any lru queue since we have just locked down
@@ -352,26 +356,171 @@
 		spin_lock(&pagemap_lru_lock);
 		UnlockPage(page);
 		page_cache_release(page);
+		continue;
+
 dispose_continue:
-		list_add(page_lru, &lru_cache);
-	}
-	goto out;
+		/* have the pagemap_lru_lock, lru cannot change */
+		/* Move it to top of this list with referenced cleared
+		 * to give a chance for progress even when running without
+		 * kswapd...
+		 */
+		{
+		  struct list_head * page_lru_to_move = page_lru; 
+		  page_lru = page_lru->next; /* continues with page_lru.prev */
+		  list_del(page_lru_to_move);
+		  list_add(page_lru_to_move, lru_head);
+		  shrink_mmap_referenced_moved++;
+		}
+		continue;
 
 made_inode_progress:
-	page_cache_release(page);
+		page_cache_release(page);
 made_buffer_progress:
-	UnlockPage(page);
-	page_cache_release(page);
-	ret = 1;
+		/* like to have the lru lock before UnlockPage */
+		spin_lock(&pagemap_lru_lock);
+
+		UnlockPage(page);
+
+		/* lru manipulation needs the spin lock */
+		{
+		  struct list_head * page_lru_to_free = page_lru; 
+		  page_lru = page_lru->next; /* continues with page_lru.prev */
+		  list_del(page_lru_to_free);
+		}
+
+		/* nr_lru_pages needs the spinlock */
+		page_cache_release(page);
+		ret++;
+		nr_lru_pages--;
+
+	}
+
+	*toskipwait = nr_dirty;
+	*toscan = count;
+	return ret;
+}
+
+int shrink_mmap(int priority, int gfp_mask)
+{
+	int ret = 0, count, ix, nr_dirty;
+	
+	count = nr_lru_pages >> priority;
+	nr_dirty = 1 << priority; /* magic number */
+
+	/* we need pagemap_lru_lock for list_del() ... subtle code below */
+	/* NOTE: we could have a lock per list... */
 	spin_lock(&pagemap_lru_lock);
-	/* nr_lru_pages needs the spinlock */
-	nr_lru_pages--;
 
-out:
+	for (ix = 0; ix < NR_OF_LRU_LISTS; ix ++)
+	  ret += _shrink_mmap(&lru_cache[ix], gfp_mask, &count, &nr_dirty);
+
 	spin_unlock(&pagemap_lru_lock);
 
 	return ret;
 }
+
+/* called with pagemap_lru_lock locked! */
+
+struct age_pyramid
+{
+  int number_of;
+  int moved_up;
+  int moved_down;
+};
+
+static inline void
+ _age_mmap_list(struct list_head *page_lru_head,
+		struct list_head *dispose_up,
+		struct list_head *dispose_down,
+		struct age_pyramid *statistics)
+{
+  struct list_head * page_lru = page_lru_head;
+  struct list_head * dispose = NULL;
+  struct page * page;
+
+  int balance = nr_lru_pages / NR_OF_LRU_LISTS;
+
+  statistics->number_of = 0;
+  statistics->moved_up = 0;
+  statistics->moved_down = 0;
+
+  while ((page_lru = page_lru->prev) != page_lru_head) {
+    page = list_entry(page_lru, struct page, lru);
+
+    balance--;
+      
+    if (PageTestandClearReferenced(page)) {
+      dispose = dispose_up;
+      statistics->moved_up++;
+    }
+    else if (balance < 0) {
+      dispose = dispose_down;
+      statistics->moved_down++;
+    }
+    else {
+      statistics->number_of++;
+    }
+
+    /* dispose the page */
+    if (dispose)
+    {
+      struct list_head * page_lru_to_move = page_lru; 
+      page_lru = page_lru->next; /* continues with page_lru.prev */
+      list_del(page_lru_to_move);
+      list_add(page_lru_to_move, dispose);
+    }
+  }
+}
+
+int age_mmap(void)
+{
+	LIST_HEAD(referenced);
+
+	struct list_head * dispose_up;
+
+	int moved_pre, ix;
+	struct age_pyramid stat[NR_OF_LRU_LISTS];
+
+	spin_lock(&pagemap_lru_lock);
+
+	/* Scanned in shrink_mmap ? */
+	moved_pre = shrink_mmap_referenced_moved;
+	shrink_mmap_referenced_moved = 0;
+
+	/* Scan new list first */
+	dispose_up = &referenced;
+	for (ix = NR_OF_LRU_LISTS - 1; ix > 0; ix--) {
+	  _age_mmap_list(&lru_cache[ix],
+			 dispose_up,
+			 &lru_cache[ix - 1],
+			 &stat[ix]);
+	  dispose_up = &lru_cache[ix];
+	}
+	_age_mmap_list(&lru_cache[ix], dispose_up, NULL, &stat[ix]);
+
+	/* referenced pages in top list go top of lru_cache (same list) */
+	list_splice(&referenced, &lru_cache[NR_OF_LRU_LISTS - 1]);
+
+	spin_unlock(&pagemap_lru_lock);
+
+	printk(KERN_DEBUG "age_mmap: [%4d %4d<%4d>%4d %4d<%4d>%4d %4d<%4d>%4d / %6d]\n",
+	       moved_pre,
+	       stat[0].moved_down, stat[0].number_of, stat[0].moved_up,
+	       stat[1].moved_down, stat[1].number_of, stat[1].moved_up,
+	       stat[2].moved_down, stat[2].number_of, stat[2].moved_up,
+	       nr_lru_pages);
+
+	return 0; /* until it is known what to return... */
+}
+ 
+
+void lru_cache_init(void)
+{
+  int ix;
+  for (ix = 0; ix < NR_OF_LRU_LISTS; ix++)
+	INIT_LIST_HEAD(&lru_cache[ix]);
+}
+
 
 static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
 {
diff -aur linux-2.3.99-pre9.3-q/mm/page_alloc.c linux/mm/page_alloc.c
--- linux-2.3.99-pre9.3-q/mm/page_alloc.c	Fri May 12 20:21:20 2000
+++ linux/mm/page_alloc.c	Mon May 22 22:37:55 2000
@@ -497,7 +497,8 @@
 	freepages.min += i;
 	freepages.low += i * 2;
 	freepages.high += i * 3;
-	memlist_init(&lru_cache);
+
+	lru_cache_init();
 
 	/*
 	 * Some architectures (with lots of mem and discontinous memory
diff -aur linux-2.3.99-pre9.3-q/mm/vmscan.c linux/mm/vmscan.c
--- linux-2.3.99-pre9.3-q/mm/vmscan.c	Mon May 22 22:25:51 2000
+++ linux/mm/vmscan.c	Tue May 23 00:11:23 2000
@@ -363,7 +363,7 @@
 	 * Think of swap_cnt as a "shadow rss" - it tells us which process
 	 * we want to page out (always try largest first).
 	 */
-	counter = (nr_threads << 2) >> (priority >> 2);
+	counter = (nr_threads << 1) >> (priority >> 1);
 	if (counter < 1)
 		counter = 1;
 
@@ -435,17 +435,15 @@
 {
 	int priority;
 	int count = FREE_COUNT;
-	int swap_count;
 
 	/* Always trim SLAB caches when memory gets low. */
 	kmem_cache_reap(gfp_mask);
 
-	priority = 64;
+	priority = 6;
 	do {
-		while (shrink_mmap(priority, gfp_mask)) {
-			if (!--count)
-				goto done;
-		}
+	        count -= shrink_mmap(priority, gfp_mask);
+		if (count <= 0)
+		  goto done;
 
 
 		/* Try to get rid of some shared memory pages.. */
@@ -472,18 +470,20 @@
 		 * put in the swap cache), so we must not count this
 		 * as a "count" success.
 		 */
-		swap_count = SWAP_COUNT;
-		while (swap_out(priority, gfp_mask))
-			if (--swap_count < 0)
-				break;
+		{
+			int swap_count = SWAP_COUNT;
+			while (swap_out(priority, gfp_mask))
+				if (--swap_count < 0)
+					break;
+		}
 
 	} while (--priority >= 0);
 
 	/* Always end on a shrink_mmap.. */
-	while (shrink_mmap(0, gfp_mask)) {
-		if (!--count)
-			goto done;
-	}
+	count -= shrink_mmap(0, gfp_mask);
+	if (count <= 0)
+	  goto done;
+
 	/* We return 1 if we are freed some page */
 	return (count != FREE_COUNT);
 
@@ -543,15 +543,52 @@
 				if (!zone->size || !zone->zone_wake_kswapd)
 					continue;
 				if (zone->free_pages < zone->pages_low)
-					something_to_do = 1;
-				do_try_to_free_pages(GFP_KSWAPD);
+					something_to_do += 100;
+
+				something_to_do += 1;
+
+				printk(KERN_DEBUG
+				       "kswapd: low memory zone: %s (%ld)\n",
+				       zone->name, zone->free_pages
+				       );
 			}
 			pgdat = pgdat->node_next;
 		} while (pgdat);
 
-		if (!something_to_do) {
+		if (something_to_do) {
+		  do_try_to_free_pages(GFP_KSWAPD);
+		  run_task_queue(&tq_disk);
+		  printk(KERN_DEBUG "kswapd: to do %d - done\n", something_to_do);
+		}
+
+		/* Always sleep - give requestors a chance to use
+		 * freed pages. Even low prio ones - they might be
+		 * the only ones.
+		 */
+
+		/* Do not depend on kswapd.. it should be able to sleep
+		 * sleep time is possible to trim:
+		 * - function of pages aged?
+		 * - function of something_to_do?
+		 * - free_pages?
+		 * right now 1s periods, wakeup possible */
+		if (something_to_do < MAX_NR_ZONES/2 + 1)
+		{
+			static long aging_time = 0L;
+			if (aging_time == 0L) {
+			  aging_time = 1L*HZ;
+			}
+
 			tsk->state = TASK_INTERRUPTIBLE;
-			interruptible_sleep_on(&kswapd_wait);
+			aging_time = interruptible_sleep_on_timeout(&kswapd_wait, 1*HZ);
+
+			/* age after wakeup, slept at least one jiffie or have
+			 * been waken up - a lot might have happened.
+			 */
+			(void)age_mmap();
+		}
+		else if (tsk->need_resched) {
+		        schedule();
 		}
 	}
 }

next prev parent reply	other threads:[~2000-05-23 15:29 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2000-05-23 12:35 Linux VM/IO balancing (fwd to linux-mm?) (fwd) Rik van Riel
2000-05-23 15:29 ` Roger Larsson [this message]
2000-05-23 16:51   ` [PATCH--] " Chuck Lever
2000-05-23 19:56     ` Roger Larsson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=392AA3D5.FD6B5399@norran.net \
    --to=roger.larsson@norran.net \
    --cc=dillon@apollo.backplane.com \
    --cc=linux-mm@kvack.org \
    --cc=riel@conectiva.com.br \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.