public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Rik van Riel <riel@redhat.com>
To: linux-kernel@vger.kernel.org
Cc: Andrew Morton <akpm@linux-foundation.org>,
	Lee Schermerhorn <lee.schermerhorn@hp.com>,
	Kosaki Motohiro <kosaki.motohiro@jp.fujitsu.com>,
	Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Subject: [PATCH -mm 13/16] No Reclaim LRU Infrastructure
Date: Fri, 23 May 2008 15:55:19 -0400	[thread overview]
Message-ID: <20080523195535.715196843@redhat.com> (raw)
In-Reply-To: 20080523195506.084894989@redhat.com

[-- Attachment #1: rvr-11-lts-noreclaim-lru-infrastructure.patch --]
[-- Type: text/plain, Size: 27450 bytes --]

From: Lee Schermerhorn <Lee.Schermerhorn@hp.com>

Infrastructure to manage pages excluded from reclaim--i.e., hidden
from vmscan.  Based on a patch by Larry Woodman of Red Hat. Reworked
to maintain "nonreclaimable" pages on a separate per-zone LRU list,
to "hide" them from vmscan.  A separate noreclaim pagevec is provided
for shrink_active_list() to move nonreclaimable pages to the noreclaim
list without overburdening the zone lru_lock.

Pages on the noreclaim list have both PG_noreclaim and PG_lru set.
Thus, PG_noreclaim is analogous to and mutually exclusive with
PG_active--it specifies which LRU list the page is on.  

The noreclaim infrastructure is enabled by a new mm Kconfig option
[CONFIG_]NORECLAIM_LRU.

A new function 'page_reclaimable(page, vma)' in vmscan.c tests whether
or not a page is reclaimable.  Subsequent patches will add the various
!reclaimable tests.  We'll want to keep these tests light-weight for
use in shrink_active_list() and, possibly, the fault path.

Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Rik van Riel <riel@redhat.com>

---
V3 -> V6:
+ remove lru_cache_add_active_or_noreclaim().  Only used by
  optional patch to cull nonreclaimable pages in fault path.
  Will add back to that patch.
+ misc cleanup pointed out by review of V5

V1 -> V3:
+ rebase to 23-mm1 atop RvR's split LRU series
+ define NR_NORECLAIM and LRU_NORECLAIM to avoid errors when not
  configured.

V1 -> V2:
+  handle review comments -- various typos and errors.
+  extract "putback_all_noreclaim_pages()" into a separate patch
   and rework as "scan_all_zones_noreclaim_pages()".

 include/linux/mm_inline.h  |   13 ++-
 include/linux/mmzone.h     |   24 ++++++
 include/linux/page-flags.h |   13 +++
 include/linux/pagevec.h    |    1 
 include/linux/swap.h       |   12 +++
 mm/Kconfig                 |   10 ++
 mm/internal.h              |   26 +++++++
 mm/mempolicy.c             |    2 
 mm/migrate.c               |   68 ++++++++++++-------
 mm/page_alloc.c            |    9 ++
 mm/swap.c                  |   50 ++++++++++++--
 mm/vmscan.c                |  156 ++++++++++++++++++++++++++++++++++++++++-----
 12 files changed, 329 insertions(+), 55 deletions(-)

Index: linux-2.6.26-rc2-mm1/mm/Kconfig
===================================================================
--- linux-2.6.26-rc2-mm1.orig/mm/Kconfig	2008-05-23 14:50:58.000000000 -0400
+++ linux-2.6.26-rc2-mm1/mm/Kconfig	2008-05-23 14:55:22.000000000 -0400
@@ -205,3 +205,13 @@ config NR_QUICK
 config VIRT_TO_BUS
 	def_bool y
 	depends on !ARCH_NO_VIRT_TO_BUS
+
+config NORECLAIM_LRU
+	bool "Add LRU list to track non-reclaimable pages (EXPERIMENTAL, 64BIT only)"
+	depends on EXPERIMENTAL && 64BIT
+	help
+	  Supports tracking of non-reclaimable pages off the [in]active lists
+	  to avoid excessive reclaim overhead on large memory systems.  Pages
+	  may be non-reclaimable because:  they are locked into memory, they
+	  are anonymous pages for which no swap space exists, or they are anon
+	  pages that are expensive to unmap [long anon_vma "related vma" list.]
Index: linux-2.6.26-rc2-mm1/include/linux/page-flags.h
===================================================================
--- linux-2.6.26-rc2-mm1.orig/include/linux/page-flags.h	2008-05-23 14:53:41.000000000 -0400
+++ linux-2.6.26-rc2-mm1/include/linux/page-flags.h	2008-05-23 15:00:00.000000000 -0400
@@ -94,6 +94,9 @@ enum pageflags {
 	PG_reclaim,		/* To be reclaimed asap */
 	PG_buddy,		/* Page is free, on buddy lists */
 	PG_swapbacked,		/* Page is backed by RAM/swap */
+#ifdef CONFIG_NORECLAIM_LRU
+	PG_noreclaim,		/* Page is "non-reclaimable"  */
+#endif
 #ifdef CONFIG_IA64_UNCACHED_ALLOCATOR
 	PG_uncached,		/* Page has been mapped as uncached */
 #endif
@@ -167,6 +170,7 @@ PAGEFLAG(Referenced, referenced) TESTCLE
 PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty)
 PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru)
 PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active)
+	TESTCLEARFLAG(Active, active)
 __PAGEFLAG(Slab, slab)
 PAGEFLAG(Checked, owner_priv_1)		/* Used by some filesystems */
 PAGEFLAG(Pinned, owner_priv_1) TESTSCFLAG(Pinned, owner_priv_1) /* Xen */
@@ -203,6 +207,15 @@ PAGEFLAG(SwapCache, swapcache)
 PAGEFLAG_FALSE(SwapCache)
 #endif
 
+#ifdef CONFIG_NORECLAIM_LRU
+PAGEFLAG(Noreclaim, noreclaim) __CLEARPAGEFLAG(Noreclaim, noreclaim)
+	TESTCLEARFLAG(Noreclaim, noreclaim)
+#else
+PAGEFLAG_FALSE(Noreclaim) TESTCLEARFLAG_FALSE(Noreclaim)
+	SETPAGEFLAG_NOOP(Noreclaim) CLEARPAGEFLAG_NOOP(Noreclaim)
+	__CLEARPAGEFLAG_NOOP(Noreclaim)
+#endif
+
 #ifdef CONFIG_IA64_UNCACHED_ALLOCATOR
 PAGEFLAG(Uncached, uncached)
 #else
Index: linux-2.6.26-rc2-mm1/include/linux/mmzone.h
===================================================================
--- linux-2.6.26-rc2-mm1.orig/include/linux/mmzone.h	2008-05-23 14:50:58.000000000 -0400
+++ linux-2.6.26-rc2-mm1/include/linux/mmzone.h	2008-05-23 14:55:22.000000000 -0400
@@ -85,6 +85,11 @@ enum zone_stat_item {
 	NR_ACTIVE_ANON,		/*  "     "     "   "       "           */
 	NR_INACTIVE_FILE,	/*  "     "     "   "       "           */
 	NR_ACTIVE_FILE,		/*  "     "     "   "       "           */
+#ifdef CONFIG_NORECLAIM_LRU
+	NR_NORECLAIM,	/*  "     "     "   "       "         */
+#else
+	NR_NORECLAIM = NR_ACTIVE_FILE, /* avoid compiler errors in dead code */
+#endif
 	NR_ANON_PAGES,	/* Mapped anonymous pages */
 	NR_FILE_MAPPED,	/* pagecache pages mapped into pagetables.
 			   only modified from process context */
@@ -124,10 +129,18 @@ enum lru_list {
 	LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE,
 	LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
 	LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE,
-	NR_LRU_LISTS };
+#ifdef CONFIG_NORECLAIM_LRU
+	LRU_NORECLAIM,
+#else
+	LRU_NORECLAIM = LRU_ACTIVE_FILE, /* avoid compiler errors in dead code */
+#endif
+	NR_LRU_LISTS
+};
 
 #define for_each_lru(l) for (l = 0; l < NR_LRU_LISTS; l++)
 
+#define for_each_reclaimable_lru(l) for (l = 0; l <= LRU_ACTIVE_FILE; l++)
+
 static inline int is_file_lru(enum lru_list l)
 {
 	return (l == LRU_INACTIVE_FILE || l == LRU_ACTIVE_FILE);
@@ -138,6 +151,15 @@ static inline int is_active_lru(enum lru
 	return (l == LRU_ACTIVE_ANON || l == LRU_ACTIVE_FILE);
 }
 
+static inline int is_noreclaim_lru(enum lru_list l)
+{
+#ifdef CONFIG_NORECLAIM_LRU
+	return (l == LRU_NORECLAIM);
+#else
+	return 0;
+#endif
+}
+
 enum lru_list page_lru(struct page *page);
 
 struct per_cpu_pages {
Index: linux-2.6.26-rc2-mm1/mm/page_alloc.c
===================================================================
--- linux-2.6.26-rc2-mm1.orig/mm/page_alloc.c	2008-05-23 14:50:58.000000000 -0400
+++ linux-2.6.26-rc2-mm1/mm/page_alloc.c	2008-05-23 15:14:04.000000000 -0400
@@ -256,6 +256,9 @@ static void bad_page(struct page *page)
 			1 << PG_private |
 			1 << PG_locked	|
 			1 << PG_active	|
+#ifdef CONFIG_NORECLAIM_LRU
+			1 << PG_noreclaim	|
+#endif
 			1 << PG_dirty	|
 			1 << PG_reclaim |
 			1 << PG_slab    |
@@ -491,6 +494,9 @@ static inline int free_pages_check(struc
 			1 << PG_swapcache |
 			1 << PG_writeback |
 			1 << PG_reserved |
+#ifdef CONFIG_NORECLAIM_LRU
+			1 << PG_noreclaim |
+#endif
 			1 << PG_buddy ))))
 		bad_page(page);
 	if (PageDirty(page))
@@ -642,6 +648,9 @@ static int prep_new_page(struct page *pa
 			1 << PG_private	|
 			1 << PG_locked	|
 			1 << PG_active	|
+#ifdef CONFIG_NORECLAIM_LRU
+			1 << PG_noreclaim	|
+#endif
 			1 << PG_dirty	|
 			1 << PG_slab    |
 			1 << PG_swapcache |
Index: linux-2.6.26-rc2-mm1/include/linux/mm_inline.h
===================================================================
--- linux-2.6.26-rc2-mm1.orig/include/linux/mm_inline.h	2008-05-23 14:50:58.000000000 -0400
+++ linux-2.6.26-rc2-mm1/include/linux/mm_inline.h	2008-05-23 15:03:55.000000000 -0400
@@ -89,11 +89,16 @@ del_page_from_lru(struct zone *zone, str
 	enum lru_list l = LRU_INACTIVE_ANON;
 
 	list_del(&page->lru);
-	if (PageActive(page)) {
-		__ClearPageActive(page);
-		l += LRU_ACTIVE;
+	if (PageNoreclaim(page)) {
+		__ClearPageNoreclaim(page);
+		l = LRU_NORECLAIM;
+	} else {
+		 if (PageActive(page)) {
+			__ClearPageActive(page);
+			l += LRU_ACTIVE;
+		}
+		l += page_file_cache(page);
 	}
-	l += page_file_cache(page);
 	__dec_zone_state(zone, NR_INACTIVE_ANON + l);
 }
 
Index: linux-2.6.26-rc2-mm1/include/linux/swap.h
===================================================================
--- linux-2.6.26-rc2-mm1.orig/include/linux/swap.h	2008-05-23 14:50:58.000000000 -0400
+++ linux-2.6.26-rc2-mm1/include/linux/swap.h	2008-05-23 15:14:03.000000000 -0400
@@ -180,6 +180,8 @@ extern int lru_add_drain_all(void);
 extern void rotate_reclaimable_page(struct page *page);
 extern void swap_setup(void);
 
+extern void add_page_to_noreclaim_list(struct page *page);
+
 /**
  * lru_cache_add: add a page to the page lists
  * @page: the page to add
@@ -228,6 +230,16 @@ static inline int zone_reclaim(struct zo
 }
 #endif
 
+#ifdef CONFIG_NORECLAIM_LRU
+extern int page_reclaimable(struct page *page, struct vm_area_struct *vma);
+#else
+static inline int page_reclaimable(struct page *page,
+						struct vm_area_struct *vma)
+{
+	return 1;
+}
+#endif
+
 extern int kswapd_run(int nid);
 
 #ifdef CONFIG_MMU
Index: linux-2.6.26-rc2-mm1/include/linux/pagevec.h
===================================================================
--- linux-2.6.26-rc2-mm1.orig/include/linux/pagevec.h	2008-05-23 14:50:58.000000000 -0400
+++ linux-2.6.26-rc2-mm1/include/linux/pagevec.h	2008-05-23 15:03:55.000000000 -0400
@@ -101,7 +101,6 @@ static inline void __pagevec_lru_add_act
 	____pagevec_lru_add(pvec, LRU_ACTIVE_FILE);
 }
 
-
 static inline void pagevec_lru_add_file(struct pagevec *pvec)
 {
 	if (pagevec_count(pvec))
Index: linux-2.6.26-rc2-mm1/mm/swap.c
===================================================================
--- linux-2.6.26-rc2-mm1.orig/mm/swap.c	2008-05-23 14:50:58.000000000 -0400
+++ linux-2.6.26-rc2-mm1/mm/swap.c	2008-05-23 15:03:55.000000000 -0400
@@ -106,9 +106,13 @@ enum lru_list page_lru(struct page *page
 {
 	enum lru_list lru = LRU_BASE;
 
-	if (PageActive(page))
-		lru += LRU_ACTIVE;
-	lru += page_file_cache(page);
+	if (PageNoreclaim(page))
+		lru = LRU_NORECLAIM;
+	else {
+		if (PageActive(page))
+			lru += LRU_ACTIVE;
+		lru += page_file_cache(page);
+	}
 
 	return lru;
 }
@@ -133,7 +137,8 @@ static void pagevec_move_tail(struct pag
 			zone = pagezone;
 			spin_lock(&zone->lru_lock);
 		}
-		if (PageLRU(page) && !PageActive(page)) {
+		if (PageLRU(page) && !PageActive(page) &&
+					!PageNoreclaim(page)) {
 			int lru = page_file_cache(page);
 			list_move_tail(&page->lru, &zone->list[lru]);
 			pgmoved++;
@@ -154,7 +159,7 @@ static void pagevec_move_tail(struct pag
 void  rotate_reclaimable_page(struct page *page)
 {
 	if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
-	    PageLRU(page)) {
+	    !PageNoreclaim(page) && PageLRU(page)) {
 		struct pagevec *pvec;
 		unsigned long flags;
 
@@ -175,7 +180,7 @@ void activate_page(struct page *page)
 	struct zone *zone = page_zone(page);
 
 	spin_lock_irq(&zone->lru_lock);
-	if (PageLRU(page) && !PageActive(page)) {
+	if (PageLRU(page) && !PageActive(page) && !PageNoreclaim(page)) {
 		int file = page_file_cache(page);
 		int lru = LRU_BASE + file;
 		del_page_from_lru_list(zone, page, lru);
@@ -207,7 +212,8 @@ void activate_page(struct page *page)
  */
 void mark_page_accessed(struct page *page)
 {
-	if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) {
+	if (!PageActive(page) && !PageNoreclaim(page) &&
+			PageReferenced(page) && PageLRU(page)) {
 		activate_page(page);
 		ClearPageReferenced(page);
 	} else if (!PageReferenced(page)) {
@@ -235,13 +241,38 @@ void __lru_cache_add(struct page *page, 
 void lru_cache_add_lru(struct page *page, enum lru_list lru)
 {
 	if (PageActive(page)) {
+		VM_BUG_ON(PageNoreclaim(page));
 		ClearPageActive(page);
+	} else if (PageNoreclaim(page)) {
+		VM_BUG_ON(PageActive(page));
+		ClearPageNoreclaim(page);
 	}
 
-	VM_BUG_ON(PageLRU(page) || PageActive(page));
+	VM_BUG_ON(PageLRU(page) || PageActive(page) || PageNoreclaim(page));
 	__lru_cache_add(page, lru);
 }
 
+/**
+ * add_page_to_noreclaim_list
+ * @page:  the page to be added to the noreclaim list
+ *
+ * Add page directly to its zone's noreclaim list.  To avoid races with
+ * tasks that might be making the page reclaimable while it's not on the
+ * lru, we want to add the page while it's locked or otherwise "invisible"
+ * to other tasks.  This is difficult to do when using the pagevec cache,
+ * so bypass that.
+ */
+void add_page_to_noreclaim_list(struct page *page)
+{
+	struct zone *zone = page_zone(page);
+
+	spin_lock_irq(&zone->lru_lock);
+	SetPageNoreclaim(page);
+	SetPageLRU(page);
+	add_page_to_lru_list(zone, page, LRU_NORECLAIM);
+	spin_unlock_irq(&zone->lru_lock);
+}
+
 /*
  * Drain pages out of the cpu's pagevecs.
  * Either "cpu" is the current CPU, and preemption has already been
@@ -339,6 +370,7 @@ void release_pages(struct page **pages, 
 
 		if (PageLRU(page)) {
 			struct zone *pagezone = page_zone(page);
+
 			if (pagezone != zone) {
 				if (zone)
 					spin_unlock_irqrestore(&zone->lru_lock,
@@ -415,6 +447,7 @@ void ____pagevec_lru_add(struct pagevec 
 {
 	int i;
 	struct zone *zone = NULL;
+	VM_BUG_ON(is_noreclaim_lru(lru));
 
 	for (i = 0; i < pagevec_count(pvec); i++) {
 		struct page *page = pvec->pages[i];
@@ -426,6 +459,7 @@ void ____pagevec_lru_add(struct pagevec 
 			zone = pagezone;
 			spin_lock_irq(&zone->lru_lock);
 		}
+		VM_BUG_ON(PageActive(page) || PageNoreclaim(page));
 		VM_BUG_ON(PageLRU(page));
 		SetPageLRU(page);
 		if (is_active_lru(lru))
Index: linux-2.6.26-rc2-mm1/mm/migrate.c
===================================================================
--- linux-2.6.26-rc2-mm1.orig/mm/migrate.c	2008-05-23 14:50:58.000000000 -0400
+++ linux-2.6.26-rc2-mm1/mm/migrate.c	2008-05-23 15:19:11.000000000 -0400
@@ -53,14 +53,9 @@ int migrate_prep(void)
 	return 0;
 }
 
-static inline void move_to_lru(struct page *page)
-{
-	lru_cache_add_lru(page, page_lru(page));
-	put_page(page);
-}
-
 /*
- * Add isolated pages on the list back to the LRU.
+ * Add isolated pages on the list back to the LRU under page lock
+ * to avoid leaking reclaimable pages back onto noreclaim list.
  *
  * returns the number of pages put back.
  */
@@ -72,7 +67,9 @@ int putback_lru_pages(struct list_head *
 
 	list_for_each_entry_safe(page, page2, l, lru) {
 		list_del(&page->lru);
-		move_to_lru(page);
+		lock_page(page);
+		if (putback_lru_page(page))
+			unlock_page(page);
 		count++;
 	}
 	return count;
@@ -340,8 +337,11 @@ static void migrate_page_copy(struct pag
 		SetPageReferenced(newpage);
 	if (PageUptodate(page))
 		SetPageUptodate(newpage);
-	if (PageActive(page))
+	if (TestClearPageActive(page)) {
+		VM_BUG_ON(PageNoreclaim(page));
 		SetPageActive(newpage);
+	} else
+		noreclaim_migrate_page(newpage, page);
 	if (PageChecked(page))
 		SetPageChecked(newpage);
 	if (PageMappedToDisk(page))
@@ -362,7 +362,6 @@ static void migrate_page_copy(struct pag
 #ifdef CONFIG_SWAP
 	ClearPageSwapCache(page);
 #endif
-	ClearPageActive(page);
 	ClearPagePrivate(page);
 	set_page_private(page, 0);
 	page->mapping = NULL;
@@ -541,10 +540,15 @@ static int fallback_migrate_page(struct 
  *
  * The new page will have replaced the old page if this function
  * is successful.
+ *
+ * Return value:
+ *   < 0 - error code
+ *  == 0 - success
  */
 static int move_to_new_page(struct page *newpage, struct page *page)
 {
 	struct address_space *mapping;
+	int unlock = 1;
 	int rc;
 
 	/*
@@ -579,10 +583,16 @@ static int move_to_new_page(struct page 
 
 	if (!rc) {
 		remove_migration_ptes(page, newpage);
+		/*
+		 * Put back on LRU while holding page locked to
+		 * handle potential race with, e.g., munlock()
+		 */
+		unlock = putback_lru_page(newpage);
 	} else
 		newpage->mapping = NULL;
 
-	unlock_page(newpage);
+	if (unlock)
+		unlock_page(newpage);
 
 	return rc;
 }
@@ -599,18 +609,19 @@ static int unmap_and_move(new_page_t get
 	struct page *newpage = get_new_page(page, private, &result);
 	int rcu_locked = 0;
 	int charge = 0;
+	int unlock = 1;
 
 	if (!newpage)
 		return -ENOMEM;
 
 	if (page_count(page) == 1)
 		/* page was freed from under us. So we are done. */
-		goto move_newpage;
+		goto end_migration;
 
 	charge = mem_cgroup_prepare_migration(page, newpage);
 	if (charge == -ENOMEM) {
 		rc = -ENOMEM;
-		goto move_newpage;
+		goto end_migration;
 	}
 	/* prepare cgroup just returns 0 or -ENOMEM */
 	BUG_ON(charge);
@@ -618,7 +629,7 @@ static int unmap_and_move(new_page_t get
 	rc = -EAGAIN;
 	if (TestSetPageLocked(page)) {
 		if (!force)
-			goto move_newpage;
+			goto end_migration;
 		lock_page(page);
 	}
 
@@ -680,8 +691,6 @@ rcu_unlock:
 
 unlock:
 
-	unlock_page(page);
-
 	if (rc != -EAGAIN) {
  		/*
  		 * A page that has been migrated has all references
@@ -690,17 +699,30 @@ unlock:
  		 * restored.
  		 */
  		list_del(&page->lru);
- 		move_to_lru(page);
+		if (!page->mapping) {
+			VM_BUG_ON(page_count(page) != 1);
+			unlock_page(page);
+			put_page(page);		/* just free the old page */
+			goto end_migration;
+		} else
+			unlock = putback_lru_page(page);
 	}
 
-move_newpage:
+	if (unlock)
+		unlock_page(page);
+
+end_migration:
 	if (!charge)
 		mem_cgroup_end_migration(newpage);
-	/*
-	 * Move the new page to the LRU. If migration was not successful
-	 * then this will free the page.
-	 */
-	move_to_lru(newpage);
+
+	if (!newpage->mapping) {
+		/*
+		 * Migration failed or was never attempted.
+		 * Free the newpage.
+		 */
+		VM_BUG_ON(page_count(newpage) != 1);
+		put_page(newpage);
+	}
 	if (result) {
 		if (rc)
 			*result = rc;
Index: linux-2.6.26-rc2-mm1/mm/vmscan.c
===================================================================
--- linux-2.6.26-rc2-mm1.orig/mm/vmscan.c	2008-05-23 14:50:58.000000000 -0400
+++ linux-2.6.26-rc2-mm1/mm/vmscan.c	2008-05-23 15:19:11.000000000 -0400
@@ -437,6 +437,70 @@ cannot_free:
 	return 0;
 }
 
+/**
+ * putback_lru_page
+ * @page: page to be put back to appropriate lru list
+ *
+ * Add previously isolated @page to appropriate LRU list.
+ * Page may still be non-reclaimable for other reasons.
+ *
+ * lru_lock must not be held, interrupts must be enabled.
+ * Must be called with page locked.
+ *
+ * return 1 if page still locked [not truncated], else 0
+ */
+int putback_lru_page(struct page *page)
+{
+	int lru;
+	int ret = 1;
+
+	VM_BUG_ON(!PageLocked(page));
+	VM_BUG_ON(PageLRU(page));
+
+	lru = !!TestClearPageActive(page);
+	ClearPageNoreclaim(page);	/* for page_reclaimable() */
+
+	if (unlikely(!page->mapping)) {
+		/*
+		 * page truncated.  drop lock as put_page() will
+		 * free the page.
+		 */
+		VM_BUG_ON(page_count(page) != 1);
+		unlock_page(page);
+		ret = 0;
+	} else if (page_reclaimable(page, NULL)) {
+		/*
+		 * For reclaimable pages, we can use the cache.
+		 * In event of a race, worst case is we end up with a
+		 * non-reclaimable page on [in]active list.
+		 * We know how to handle that.
+		 */
+		lru_cache_add_lru(page, lru + page_file_cache(page));
+	} else {
+		/*
+		 * Put non-reclaimable pages directly on zone's noreclaim
+		 * list.
+		 */
+		add_page_to_noreclaim_list(page);
+	}
+
+	put_page(page);		/* drop ref from isolate */
+	return ret;		/* ret => "page still locked" */
+}
+
+/*
+ * Cull page that shrink_*_list() has detected to be non-reclaimable
+ * under page lock to close races with other tasks that might be making
+ * the page reclaimable.  Avoid stranding a reclaimable page on the
+ * noreclaim list.
+ */
+static inline void cull_nonreclaimable_page(struct page *page)
+{
+	lock_page(page);
+	if (putback_lru_page(page))
+		unlock_page(page);
+}
+
 /*
  * shrink_page_list() returns the number of reclaimed pages
  */
@@ -470,6 +534,12 @@ static unsigned long shrink_page_list(st
 
 		sc->nr_scanned++;
 
+		if (unlikely(!page_reclaimable(page, NULL))) {
+			if (putback_lru_page(page))
+				unlock_page(page);
+			continue;
+		}
+
 		if (!sc->may_swap && page_mapped(page))
 			goto keep_locked;
 
@@ -566,7 +636,7 @@ static unsigned long shrink_page_list(st
 		 * possible for a page to have PageDirty set, but it is actually
 		 * clean (all its buffers are clean).  This happens if the
 		 * buffers were written out directly, with submit_bh(). ext3
-		 * will do this, as well as the blockdev mapping. 
+		 * will do this, as well as the blockdev mapping.
 		 * try_to_release_page() will discover that cleanness and will
 		 * drop the buffers and mark the page clean - it can be freed.
 		 *
@@ -598,6 +668,7 @@ activate_locked:
 		/* Not a candidate for swapping, so reclaim swap space. */
 		if (PageSwapCache(page) && vm_swap_full())
 			remove_exclusive_swap_page_ref(page);
+		VM_BUG_ON(PageActive(page));
 		SetPageActive(page);
 		pgactivate++;
 keep_locked:
@@ -647,6 +718,14 @@ int __isolate_lru_page(struct page *page
 	if (mode != ISOLATE_BOTH && (!page_file_cache(page) != !file))
 		return ret;
 
+	/*
+	 * Non-reclaimable pages shouldn't make it onto either the active
+	 * or the inactive list. However, when doing lumpy reclaim of
+	 * higher order pages we can still run into them.
+	 */
+	if (PageNoreclaim(page))
+		return ret;
+
 	ret = -EBUSY;
 	if (likely(get_page_unless_zero(page))) {
 		/*
@@ -758,7 +837,7 @@ static unsigned long isolate_lru_pages(u
 				/* else it is being freed elsewhere */
 				list_move(&cursor_page->lru, src);
 			default:
-				break;
+				break;	/* ! on LRU or wrong list */
 			}
 		}
 	}
@@ -818,8 +897,9 @@ static unsigned long clear_active_flags(
  * Returns -EBUSY if the page was not on an LRU list.
  *
  * The returned page will have PageLRU() cleared.  If it was found on
- * the active list, it will have PageActive set.  That flag may need
- * to be cleared by the caller before letting the page go.
+ * the active list, it will have PageActive set.  If it was found on
+ * the noreclaim list, it will have the PageNoreclaim bit set. That flag
+ * may need to be cleared by the caller before letting the page go.
  *
  * The vmstat statistic corresponding to the list on which the page was
  * found will be decremented.
@@ -844,7 +924,13 @@ int isolate_lru_page(struct page *page)
 			ret = 0;
 			ClearPageLRU(page);
 
+			/* Calculate the LRU list for normal pages ... */
 			lru += page_file_cache(page) + !!PageActive(page);
+
+			/* ... except NoReclaim, which has its own list. */
+			if (PageNoreclaim(page))
+				lru = LRU_NORECLAIM;
+
 			del_page_from_lru_list(zone, page, lru);
 		}
 		spin_unlock_irq(&zone->lru_lock);
@@ -959,18 +1045,25 @@ static unsigned long shrink_inactive_lis
 			int lru = LRU_BASE;
 			page = lru_to_page(&page_list);
 			VM_BUG_ON(PageLRU(page));
-			SetPageLRU(page);
 			list_del(&page->lru);
-			if (page_file_cache(page))
-				lru += LRU_FILE;
-			if (scan_global_lru(sc)) {
+			if (unlikely(!page_reclaimable(page, NULL))) {
+				spin_unlock_irq(&zone->lru_lock);
+				cull_nonreclaimable_page(page);
+				spin_lock_irq(&zone->lru_lock);
+				continue;
+			} else {
 				if (page_file_cache(page))
-					zone->recent_rotated_file++;
-				else
-					zone->recent_rotated_anon++;
+					lru += LRU_FILE;
+				if (scan_global_lru(sc)) {
+					if (page_file_cache(page))
+						zone->recent_rotated_file++;
+					else
+						zone->recent_rotated_anon++;
+				}
+				if (PageActive(page))
+					lru += LRU_ACTIVE;
 			}
-			if (PageActive(page))
-				lru += LRU_ACTIVE;
+			SetPageLRU(page);
 			add_page_to_lru_list(zone, page, lru);
 			if (!pagevec_add(&pvec, page)) {
 				spin_unlock_irq(&zone->lru_lock);
@@ -1064,6 +1157,12 @@ static void shrink_active_list(unsigned 
 		cond_resched();
 		page = lru_to_page(&l_hold);
 		list_del(&page->lru);
+
+		if (unlikely(!page_reclaimable(page, NULL))) {
+			cull_nonreclaimable_page(page);
+			continue;
+		}
+
 		if (page_referenced(page, 0, sc->mem_cgroup) && file) {
 			/* Referenced file pages stay active. */
 			list_add(&page->lru, &l_active);
@@ -1271,7 +1370,7 @@ static unsigned long shrink_zone(int pri
 
 	get_scan_ratio(zone, sc, percent);
 
-	for_each_lru(l) {
+	for_each_reclaimable_lru(l) {
 		if (scan_global_lru(sc)) {
 			int file = is_file_lru(l);
 			int scan;
@@ -1302,7 +1401,7 @@ static unsigned long shrink_zone(int pri
 
 	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
 					nr[LRU_INACTIVE_FILE]) {
-		for_each_lru(l) {
+		for_each_reclaimable_lru(l) {
 			if (nr[l]) {
 				nr_to_scan = min(nr[l],
 					(unsigned long)sc->swap_cluster_max);
@@ -1853,8 +1952,8 @@ static unsigned long shrink_all_zones(un
 		if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY)
 			continue;
 
-		for_each_lru(l) {
-			/* For pass = 0 we don't shrink the active list */
+		for_each_reclaimable_lru(l) {
+			/* For pass = 0, we don't shrink the active list */
 			if (pass == 0 &&
 				(l == LRU_ACTIVE_ANON || l == LRU_ACTIVE_FILE))
 				continue;
@@ -2191,3 +2290,26 @@ int zone_reclaim(struct zone *zone, gfp_
 	return ret;
 }
 #endif
+
+#ifdef CONFIG_NORECLAIM_LRU
+/*
+ * page_reclaimable - test whether a page is reclaimable
+ * @page: the page to test
+ * @vma: the VMA in which the page is or will be mapped, may be NULL
+ *
+ * Test whether page is reclaimable--i.e., should be placed on active/inactive
+ * lists vs noreclaim list.
+ *
+ * Reasons page might not be reclaimable:
+ * TODO - later patches
+ */
+int page_reclaimable(struct page *page, struct vm_area_struct *vma)
+{
+
+	VM_BUG_ON(PageNoreclaim(page));
+
+	/* TODO:  test page [!]reclaimable conditions */
+
+	return 1;
+}
+#endif
Index: linux-2.6.26-rc2-mm1/mm/mempolicy.c
===================================================================
--- linux-2.6.26-rc2-mm1.orig/mm/mempolicy.c	2008-05-23 14:50:58.000000000 -0400
+++ linux-2.6.26-rc2-mm1/mm/mempolicy.c	2008-05-23 14:55:22.000000000 -0400
@@ -2199,7 +2199,7 @@ static void gather_stats(struct page *pa
 	if (PageSwapCache(page))
 		md->swapcache++;
 
-	if (PageActive(page))
+	if (PageActive(page) || PageNoreclaim(page))
 		md->active++;
 
 	if (PageWriteback(page))
Index: linux-2.6.26-rc2-mm1/mm/internal.h
===================================================================
--- linux-2.6.26-rc2-mm1.orig/mm/internal.h	2008-05-23 14:21:32.000000000 -0400
+++ linux-2.6.26-rc2-mm1/mm/internal.h	2008-05-23 15:19:11.000000000 -0400
@@ -34,8 +34,15 @@ static inline void __put_page(struct pag
 	atomic_dec(&page->_count);
 }
 
+/*
+ * in mm/vmscan.c:
+ */
 extern int isolate_lru_page(struct page *page);
+extern int putback_lru_page(struct page *page);
 
+/*
+ * in mm/page_alloc.c
+ */
 extern void __free_pages_bootmem(struct page *page, unsigned int order);
 
 /*
@@ -49,6 +56,25 @@ static inline unsigned long page_order(s
 	return page_private(page);
 }
 
+#ifdef CONFIG_NORECLAIM_LRU
+/*
+ * noreclaim_migrate_page() called only from migrate_page_copy() to
+ * migrate noreclaim flag to new page.
+ * Note that the old page has been isolated from the LRU lists at this
+ * point so we don't need to worry about LRU statistics.
+ */
+static inline void noreclaim_migrate_page(struct page *new, struct page *old)
+{
+	if (TestClearPageNoreclaim(old))
+		SetPageNoreclaim(new);
+}
+#else
+static inline void noreclaim_migrate_page(struct page *new, struct page *old)
+{
+}
+#endif
+
+
 /*
  * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node,
  * so all functions starting at paging_init should be marked __init

-- 
All Rights Reversed


  parent reply	other threads:[~2008-05-23 22:07 UTC|newest]

Thread overview: 39+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2008-05-23 19:55 [PATCH -mm 00/16] VM pageout scalability improvements (V8) Rik van Riel
2008-05-23 19:55 ` [PATCH -mm 01/16] move isolate_lru_page() to vmscan.c Rik van Riel
2008-05-23 19:55 ` [PATCH -mm 02/16] Use an indexed array for LRU variables Rik van Riel
2008-05-27 16:54   ` Lee Schermerhorn
2008-05-27 17:03     ` Rik van Riel
2008-05-23 19:55 ` [PATCH -mm 03/16] use an array for the LRU pagevecs Rik van Riel
2008-05-23 19:55 ` [PATCH -mm 04/16] free swap space on swap-in/activation Rik van Riel
2008-05-28  9:08   ` Daisuke Nishimura
2008-05-23 19:55 ` [PATCH -mm 05/16] define page_file_cache() function Rik van Riel
2008-05-23 19:55 ` [PATCH -mm 06/16] split LRU lists into anon & file sets Rik van Riel
2008-05-23 19:55 ` [PATCH -mm 07/16] second chance replacement for anonymous pages Rik van Riel
2008-05-28  5:36   ` Daisuke Nishimura
2008-05-28 13:39     ` Rik van Riel
2008-05-28 15:42       ` Daisuke Nishimura
2008-05-28 16:08         ` Rik van Riel
2008-05-28 11:03   ` KOSAKI Motohiro
2008-05-28 13:43     ` Rik van Riel
2008-05-23 19:55 ` [PATCH -mm 08/16] add some sanity checks to get_scan_ratio Rik van Riel
2008-05-23 19:55 ` [PATCH -mm 09/16] fix pagecache reclaim referenced bit check Rik van Riel
2008-05-23 19:55 ` [PATCH -mm 10/16] add newly swapped in pages to the inactive list Rik van Riel
2008-05-23 19:55 ` [PATCH -mm 11/16] more aggressively use lumpy reclaim Rik van Riel
2008-05-23 19:55 ` [PATCH -mm 12/16] pageflag helpers for configed-out flags Rik van Riel
2008-05-23 19:55 ` Rik van Riel [this message]
2008-05-23 19:55 ` [PATCH -mm 14/16] Non-reclaimable page statistics Rik van Riel
2008-05-23 19:55 ` [PATCH -mm 15/16] ramfs pages are non-reclaimable Rik van Riel
2008-05-23 19:55 ` [PATCH -mm 16/16] SHM_LOCKED pages are nonreclaimable Rik van Riel
2008-05-26 18:24 ` [PATCH -mm 00/16] VM pageout scalability improvements (V8) Balbir Singh
2008-05-26 19:33   ` Rik van Riel
2008-05-27 15:54     ` Lee Schermerhorn
2008-05-27 16:10       ` Balbir Singh
2008-05-28  1:12       ` KAMEZAWA Hiroyuki
2008-05-28 11:04         ` [RFC PATCH] No Reclaim LRU Infrastructure enhancement for memcgroup KOSAKI Motohiro
2008-05-29  2:30           ` Balbir Singh
2008-05-29 11:14             ` Daisuke Nishimura
2008-05-28 11:49     ` [PATCH -mm 00/16] VM pageout scalability improvements (V8) Balbir Singh
2008-05-28 13:33       ` KOSAKI Motohiro
2008-05-28 13:36         ` Balbir Singh
2008-05-29 12:47 ` Carsten Otte
2008-05-29 14:43   ` Rik van Riel

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20080523195535.715196843@redhat.com \
    --to=riel@redhat.com \
    --cc=akpm@linux-foundation.org \
    --cc=kosaki.motohiro@jp.fujitsu.com \
    --cc=lee.schermerhorn@hp.com \
    --cc=linux-kernel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox