linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* [RFC 0/7] Postphone reclaim laundry to write at high water marks
@ 2007-08-20 21:50 Christoph Lameter
  2007-08-20 21:50 ` [RFC 1/7] release_lru_pages(): Generic release of pages to the LRU Christoph Lameter
                   ` (9 more replies)
  0 siblings, 10 replies; 46+ messages in thread
From: Christoph Lameter @ 2007-08-20 21:50 UTC (permalink / raw)
  To: linux-mm
  Cc: linux-kernel, akpm, dkegel, Peter Zijlstra, David Miller,
	Nick Piggin

One of the problems with reclaim writeout is that it occurs when memory in a
zone is low. A particularly bad problem can occur if memory in a zone is
already low and now the first page that we encounter during reclaim is dirty.
So the writeout function is called without the filesystem or device having
much of a reserve that would allow further allocations. Triggering writeout
of dirty pages early does not improve the memory situation since the actual
writeout of the page is a relatively long process. The call to writepage
will therefore not improve the low memory situation but make it worse
because extra memory may be needed to get the device to write the page.

This patchset fixes that issue by:

1. First reclaiming non-dirty pages. Dirty pages are deferred until reclaim
   has reestablished the high water marks. Then all the dirty pages (the
   laundry) are written out.

2. Reclaim is essentially complete during the writeout phase. So we remove
   PF_MEMALLOC and allow recursive reclaim if we still run into trouble
   during writeout.

-- 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* [RFC 1/7] release_lru_pages(): Generic release of pages to the LRU
  2007-08-20 21:50 [RFC 0/7] Postphone reclaim laundry to write at high water marks Christoph Lameter
@ 2007-08-20 21:50 ` Christoph Lameter
  2007-08-21 14:52   ` Mel Gorman
  2007-08-20 21:50 ` [RFC 2/7] Move checks from pageout() to shrink_page_list Christoph Lameter
                   ` (8 subsequent siblings)
  9 siblings, 1 reply; 46+ messages in thread
From: Christoph Lameter @ 2007-08-20 21:50 UTC (permalink / raw)
  To: linux-mm
  Cc: linux-kernel, akpm, dkegel, Peter Zijlstra, David Miller,
	Nick Piggin

[-- Attachment #1: release_lru_pages --]
[-- Type: text/plain, Size: 3395 bytes --]

Provide a function to generically release pages that were isolated back
to the LRU. The function supports mixing zones etc.

Signed-off-by: Christoph Lameter <clameter@sgi.com>

---
 mm/vmscan.c |   72 ++++++++++++++++++++++++++++++++++--------------------------
 1 file changed, 41 insertions(+), 31 deletions(-)

Index: linux-2.6/mm/vmscan.c
===================================================================
--- linux-2.6.orig/mm/vmscan.c	2007-08-19 23:12:43.000000000 -0700
+++ linux-2.6/mm/vmscan.c	2007-08-19 23:13:24.000000000 -0700
@@ -581,6 +581,42 @@ keep:
 	return nr_reclaimed;
 }
 
+/*
+ * Put back any unfreeable pages.
+ */
+void release_lru_pages(struct list_head *page_list)
+{
+	struct page *page;
+	struct pagevec pvec;
+	struct zone *zone = NULL;
+
+	pagevec_init(&pvec, 1);
+	while (!list_empty(page_list)) {
+		page = lru_to_page(page_list);
+		VM_BUG_ON(PageLRU(page));
+		if (zone != page_zone(page)) {
+			if (zone)
+				spin_unlock_irq(&zone->lru_lock);
+			zone = page_zone(page);
+			spin_lock_irq(&zone->lru_lock);
+		}
+		SetPageLRU(page);
+		list_del(&page->lru);
+		if (PageActive(page))
+			add_page_to_active_list(zone, page);
+		else
+			add_page_to_inactive_list(zone, page);
+		if (!pagevec_add(&pvec, page)) {
+			spin_unlock_irq(&zone->lru_lock);
+			__pagevec_release(&pvec);
+			spin_lock_irq(&zone->lru_lock);
+		}
+	}
+	if (zone)
+		spin_unlock_irq(&zone->lru_lock);
+	pagevec_release(&pvec);
+}
+
 /* LRU Isolation modes. */
 #define ISOLATE_INACTIVE 0	/* Isolate inactive pages. */
 #define ISOLATE_ACTIVE 1	/* Isolate active pages. */
@@ -756,21 +792,17 @@ static unsigned long shrink_inactive_lis
 				struct zone *zone, struct scan_control *sc)
 {
 	LIST_HEAD(page_list);
-	struct pagevec pvec;
 	unsigned long nr_scanned = 0;
 	unsigned long nr_reclaimed = 0;
 
-	pagevec_init(&pvec, 1);
-
 	lru_add_drain();
-	spin_lock_irq(&zone->lru_lock);
 	do {
-		struct page *page;
 		unsigned long nr_taken;
 		unsigned long nr_scan;
 		unsigned long nr_freed;
 		unsigned long nr_active;
 
+		spin_lock_irq(&zone->lru_lock);
 		nr_taken = isolate_lru_pages(sc->swap_cluster_max,
 			     &zone->inactive_list,
 			     &page_list, &nr_scan, sc->order,
@@ -794,34 +826,12 @@ static unsigned long shrink_inactive_lis
 		} else
 			__count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
 		__count_zone_vm_events(PGSTEAL, zone, nr_freed);
+		local_irq_enable();
+		release_lru_pages(&page_list);
 
-		if (nr_taken == 0)
-			goto done;
-
-		spin_lock(&zone->lru_lock);
-		/*
-		 * Put back any unfreeable pages.
-		 */
-		while (!list_empty(&page_list)) {
-			page = lru_to_page(&page_list);
-			VM_BUG_ON(PageLRU(page));
-			SetPageLRU(page);
-			list_del(&page->lru);
-			if (PageActive(page))
-				add_page_to_active_list(zone, page);
-			else
-				add_page_to_inactive_list(zone, page);
-			if (!pagevec_add(&pvec, page)) {
-				spin_unlock_irq(&zone->lru_lock);
-				__pagevec_release(&pvec);
-				spin_lock_irq(&zone->lru_lock);
-			}
-		}
+		if (!nr_taken)
+			break;
   	} while (nr_scanned < max_scan);
-	spin_unlock(&zone->lru_lock);
-done:
-	local_irq_enable();
-	pagevec_release(&pvec);
 	return nr_reclaimed;
 }
 

-- 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* [RFC 2/7] Move checks from pageout() to shrink_page_list
  2007-08-20 21:50 [RFC 0/7] Postphone reclaim laundry to write at high water marks Christoph Lameter
  2007-08-20 21:50 ` [RFC 1/7] release_lru_pages(): Generic release of pages to the LRU Christoph Lameter
@ 2007-08-20 21:50 ` Christoph Lameter
  2007-08-20 21:50 ` [RFC 3/7] shrink_page_list: Support isolating dirty pages on laundry list Christoph Lameter
                   ` (7 subsequent siblings)
  9 siblings, 0 replies; 46+ messages in thread
From: Christoph Lameter @ 2007-08-20 21:50 UTC (permalink / raw)
  To: linux-mm
  Cc: linux-kernel, akpm, dkegel, Peter Zijlstra, David Miller,
	Nick Piggin

[-- Attachment #1: move_checks_to_shrink_page_list --]
[-- Type: text/plain, Size: 4876 bytes --]

This is necessary because we soon will do other things than calling
pageout() from shrink_page_list().

Signed-off-by: Christoph Lameter <clameter@sgi.com>

---
 mm/vmscan.c |   90 ++++++++++++++++++++++++++++++------------------------------
 1 file changed, 45 insertions(+), 45 deletions(-)

Index: linux-2.6/mm/vmscan.c
===================================================================
--- linux-2.6.orig/mm/vmscan.c	2007-08-19 21:39:55.000000000 -0700
+++ linux-2.6/mm/vmscan.c	2007-08-19 21:47:56.000000000 -0700
@@ -273,8 +273,6 @@ static void handle_write_error(struct ad
 
 /* possible outcome of pageout() */
 typedef enum {
-	/* failed to write page out, page is locked */
-	PAGE_KEEP,
 	/* move page to the active list, page is locked */
 	PAGE_ACTIVATE,
 	/* page has been sent to the disk successfully, page is unlocked */
@@ -289,44 +287,6 @@ typedef enum {
  */
 static pageout_t pageout(struct page *page, struct address_space *mapping)
 {
-	/*
-	 * If the page is dirty, only perform writeback if that write
-	 * will be non-blocking.  To prevent this allocation from being
-	 * stalled by pagecache activity.  But note that there may be
-	 * stalls if we need to run get_block().  We could test
-	 * PagePrivate for that.
-	 *
-	 * If this process is currently in generic_file_write() against
-	 * this page's queue, we can perform writeback even if that
-	 * will block.
-	 *
-	 * If the page is swapcache, write it back even if that would
-	 * block, for some throttling. This happens by accident, because
-	 * swap_backing_dev_info is bust: it doesn't reflect the
-	 * congestion state of the swapdevs.  Easy to fix, if needed.
-	 * See swapfile.c:page_queue_congested().
-	 */
-	if (!is_page_cache_freeable(page))
-		return PAGE_KEEP;
-	if (!mapping) {
-		/*
-		 * Some data journaling orphaned pages can have
-		 * page->mapping == NULL while being dirty with clean buffers.
-		 */
-		if (PagePrivate(page)) {
-			if (try_to_free_buffers(page)) {
-				ClearPageDirty(page);
-				printk("%s: orphaned page\n", __FUNCTION__);
-				return PAGE_CLEAN;
-			}
-		}
-		return PAGE_KEEP;
-	}
-	if (mapping->a_ops->writepage == NULL)
-		return PAGE_ACTIVATE;
-	if (!may_write_to_queue(mapping->backing_dev_info))
-		return PAGE_KEEP;
-
 	if (clear_page_dirty_for_io(page)) {
 		int res;
 		struct writeback_control wbc = {
@@ -504,18 +464,58 @@ static unsigned long shrink_page_list(st
 			if (!sc->may_writepage)
 				goto keep_locked;
 
+			/*
+			 * If the page is dirty, only perform writeback if
+			 * that write will be non-blocking.  To prevent this
+			 * allocation from being stalled by pagecache
+			 * activity.  But note that there may be stalls if
+			 * we need to run get_block().  We could test
+			 * PagePrivate for that.
+			 *
+			 * If this process is currently in
+			 * generic_file_write() against this page's queue,
+			 * we can perform writeback even if that will block.
+			 *
+			 * If the page is swapcache, write it back even if
+			 * that would block, for some throttling. This happens
+			 * by accident, because swap_backing_dev_info is bust:
+			 * it doesn't reflect the congestion state of the
+			 * swapdevs.  Easy to fix, if needed.
+			 * See swapfile.c:page_queue_congested().
+			 */
+			if (!is_page_cache_freeable(page))
+				goto keep_locked;
+			if (!mapping) {
+				/*
+				 * Some data journaling orphaned pages can
+				 * have page->mapping == NULL while being
+				 * dirty with clean buffers.
+				 */
+				if (PagePrivate(page)) {
+					if (try_to_free_buffers(page)) {
+						ClearPageDirty(page);
+						printk("%s: orphaned page\n",
+								__FUNCTION__);
+						goto release_page;
+					}
+				}
+				goto keep_locked;
+			}
+			if (mapping->a_ops->writepage == NULL)
+				goto activate_locked;
+			if (!may_write_to_queue(mapping->backing_dev_info))
+				goto keep_locked;
+
 			/* Page is dirty, try to write it out here */
 			switch(pageout(page, mapping)) {
-			case PAGE_KEEP:
-				goto keep_locked;
 			case PAGE_ACTIVATE:
 				goto activate_locked;
 			case PAGE_SUCCESS:
 				if (PageWriteback(page) || PageDirty(page))
 					goto keep;
 				/*
-				 * A synchronous write - probably a ramdisk.  Go
-				 * ahead and try to reclaim the page.
+				 * A synchronous write - probably a ramdisk.
+				 * Go ahead and try to reclaim the page.
 				 */
 				if (TestSetPageLocked(page))
 					goto keep;
@@ -526,7 +526,7 @@ static unsigned long shrink_page_list(st
 				; /* try to free the page below */
 			}
 		}
-
+release_page:
 		/*
 		 * If the page has buffers, try to free the buffer mappings
 		 * associated with this page. If we succeed we try to free

-- 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* [RFC 3/7] shrink_page_list: Support isolating dirty pages on laundry list
  2007-08-20 21:50 [RFC 0/7] Postphone reclaim laundry to write at high water marks Christoph Lameter
  2007-08-20 21:50 ` [RFC 1/7] release_lru_pages(): Generic release of pages to the LRU Christoph Lameter
  2007-08-20 21:50 ` [RFC 2/7] Move checks from pageout() to shrink_page_list Christoph Lameter
@ 2007-08-20 21:50 ` Christoph Lameter
  2007-08-21 15:04   ` Mel Gorman
  2007-08-20 21:50 ` [RFC 4/7] Pass laundry through shrink_inactive_list() and shrink_zone() Christoph Lameter
                   ` (6 subsequent siblings)
  9 siblings, 1 reply; 46+ messages in thread
From: Christoph Lameter @ 2007-08-20 21:50 UTC (permalink / raw)
  To: linux-mm
  Cc: linux-kernel, akpm, dkegel, Peter Zijlstra, David Miller,
	Nick Piggin

[-- Attachment #1: shrink_modes --]
[-- Type: text/plain, Size: 2329 bytes --]

If a laundry list is specified then do not write out pages but put
dirty pages on a laundry list for later processing.

Signed-off-by: Christoph Lameter <clameter@sgi.com>

---
 mm/vmscan.c |   23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

Index: linux-2.6/mm/vmscan.c
===================================================================
--- linux-2.6.orig/mm/vmscan.c	2007-08-19 23:13:28.000000000 -0700
+++ linux-2.6/mm/vmscan.c	2007-08-19 23:27:00.000000000 -0700
@@ -380,16 +380,22 @@ cannot_free:
 }
 
 /*
- * shrink_page_list() returns the number of reclaimed pages
+ * shrink_page_list() returns the number of reclaimed pages.
+ *
+ * If laundry is specified then dirty pages are put onto the
+ * laundry list and no writes are triggered.
  */
 static unsigned long shrink_page_list(struct list_head *page_list,
-					struct scan_control *sc)
+		struct scan_control *sc, struct list_head *laundry)
 {
 	LIST_HEAD(ret_pages);
 	struct pagevec freed_pvec;
 	int pgactivate = 0;
 	unsigned long nr_reclaimed = 0;
 
+	if (list_empty(page_list))
+		return 0;
+
 	cond_resched();
 
 	pagevec_init(&freed_pvec, 1);
@@ -407,10 +413,11 @@ static unsigned long shrink_page_list(st
 		if (TestSetPageLocked(page))
 			goto keep;
 
-		VM_BUG_ON(PageActive(page));
-
 		sc->nr_scanned++;
 
+		if (PageActive(page))
+			goto keep_locked;
+
 		if (!sc->may_swap && page_mapped(page))
 			goto keep_locked;
 
@@ -506,6 +513,12 @@ static unsigned long shrink_page_list(st
 			if (!may_write_to_queue(mapping->backing_dev_info))
 				goto keep_locked;
 
+			if (laundry) {
+				list_add(&page->lru, laundry);
+				unlock_page(page);
+				continue;
+			}
+
 			/* Page is dirty, try to write it out here */
 			switch(pageout(page, mapping)) {
 			case PAGE_ACTIVATE:
@@ -817,7 +830,7 @@ static unsigned long shrink_inactive_lis
 		spin_unlock_irq(&zone->lru_lock);
 
 		nr_scanned += nr_scan;
-		nr_freed = shrink_page_list(&page_list, sc);
+		nr_freed = shrink_page_list(&page_list, sc, NULL);
 		nr_reclaimed += nr_freed;
 		local_irq_disable();
 		if (current_is_kswapd()) {

-- 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* [RFC 4/7] Pass laundry through shrink_inactive_list() and shrink_zone()
  2007-08-20 21:50 [RFC 0/7] Postphone reclaim laundry to write at high water marks Christoph Lameter
                   ` (2 preceding siblings ...)
  2007-08-20 21:50 ` [RFC 3/7] shrink_page_list: Support isolating dirty pages on laundry list Christoph Lameter
@ 2007-08-20 21:50 ` Christoph Lameter
  2007-08-20 21:50 ` [RFC 5/7] Laundry handling for direct reclaim Christoph Lameter
                   ` (5 subsequent siblings)
  9 siblings, 0 replies; 46+ messages in thread
From: Christoph Lameter @ 2007-08-20 21:50 UTC (permalink / raw)
  To: linux-mm
  Cc: linux-kernel, akpm, dkegel, Peter Zijlstra, David Miller,
	Nick Piggin

[-- Attachment #1: shrink_zones --]
[-- Type: text/plain, Size: 3231 bytes --]

Both functions are equipped with an additional laundry parameter
that is then passed to shrink_page_list.

Signed-off-by: Christoph Lameter <clameter@sgi.com>

---
 mm/vmscan.c |   16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

Index: linux-2.6/mm/vmscan.c
===================================================================
--- linux-2.6.orig/mm/vmscan.c	2007-08-19 23:27:16.000000000 -0700
+++ linux-2.6/mm/vmscan.c	2007-08-19 23:30:15.000000000 -0700
@@ -802,7 +802,7 @@ static unsigned long clear_active_flags(
  * of reclaimed pages
  */
 static unsigned long shrink_inactive_list(unsigned long max_scan,
-				struct zone *zone, struct scan_control *sc)
+	struct zone *zone, struct scan_control *sc, struct list_head *laundry)
 {
 	LIST_HEAD(page_list);
 	unsigned long nr_scanned = 0;
@@ -830,7 +830,7 @@ static unsigned long shrink_inactive_lis
 		spin_unlock_irq(&zone->lru_lock);
 
 		nr_scanned += nr_scan;
-		nr_freed = shrink_page_list(&page_list, sc, NULL);
+		nr_freed = shrink_page_list(&page_list, sc, laundry);
 		nr_reclaimed += nr_freed;
 		local_irq_disable();
 		if (current_is_kswapd()) {
@@ -1030,7 +1030,7 @@ force_reclaim_mapped:
  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
  */
 static unsigned long shrink_zone(int priority, struct zone *zone,
-				struct scan_control *sc)
+			struct scan_control *sc, struct list_head *laundry)
 {
 	unsigned long nr_active;
 	unsigned long nr_inactive;
@@ -1072,7 +1072,7 @@ static unsigned long shrink_zone(int pri
 					(unsigned long)sc->swap_cluster_max);
 			nr_inactive -= nr_to_scan;
 			nr_reclaimed += shrink_inactive_list(nr_to_scan, zone,
-								sc);
+								sc, laundry);
 		}
 	}
 
@@ -1121,7 +1121,7 @@ static unsigned long shrink_zones(int pr
 
 		sc->all_unreclaimable = 0;
 
-		nr_reclaimed += shrink_zone(priority, zone, sc);
+		nr_reclaimed += shrink_zone(priority, zone, sc, NULL);
 	}
 	return nr_reclaimed;
 }
@@ -1341,7 +1341,7 @@ loop_again:
 			temp_priority[i] = priority;
 			sc.nr_scanned = 0;
 			note_zone_scanning_priority(zone, priority);
-			nr_reclaimed += shrink_zone(priority, zone, &sc);
+			nr_reclaimed += shrink_zone(priority, zone, &sc, NULL);
 			reclaim_state->reclaimed_slab = 0;
 			nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
 						lru_pages);
@@ -1539,7 +1539,7 @@ static unsigned long shrink_all_zones(un
 			zone->nr_scan_inactive = 0;
 			nr_to_scan = min(nr_pages,
 				zone_page_state(zone, NR_INACTIVE));
-			ret += shrink_inactive_list(nr_to_scan, zone, sc);
+			ret += shrink_inactive_list(nr_to_scan, zone, sc, NULL);
 			if (ret >= nr_pages)
 				return ret;
 		}
@@ -1780,7 +1780,7 @@ static int __zone_reclaim(struct zone *z
 		priority = ZONE_RECLAIM_PRIORITY;
 		do {
 			note_zone_scanning_priority(zone, priority);
-			nr_reclaimed += shrink_zone(priority, zone, &sc);
+			nr_reclaimed += shrink_zone(priority, zone, &sc, NULL);
 			priority--;
 		} while (priority >= 0 && nr_reclaimed < nr_pages);
 	}

-- 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* [RFC 5/7] Laundry handling for direct reclaim
  2007-08-20 21:50 [RFC 0/7] Postphone reclaim laundry to write at high water marks Christoph Lameter
                   ` (3 preceding siblings ...)
  2007-08-20 21:50 ` [RFC 4/7] Pass laundry through shrink_inactive_list() and shrink_zone() Christoph Lameter
@ 2007-08-20 21:50 ` Christoph Lameter
  2007-08-21 15:06   ` Mel Gorman
  2007-08-21 15:19   ` Mel Gorman
  2007-08-20 21:50 ` [RFC 6/7] kswapd: Do laundry after reclaim Christoph Lameter
                   ` (4 subsequent siblings)
  9 siblings, 2 replies; 46+ messages in thread
From: Christoph Lameter @ 2007-08-20 21:50 UTC (permalink / raw)
  To: linux-mm
  Cc: linux-kernel, akpm, dkegel, Peter Zijlstra, David Miller,
	Nick Piggin

[-- Attachment #1: direct_reclaim --]
[-- Type: text/plain, Size: 2230 bytes --]

Direct reclaim collects a global laundry list in try_to_free_pages().

Pages are only written back after a reclaim pass is complete.

Signed-off-by: Christoph Lameter <clameter@sgi.com>

---
 mm/vmscan.c |   12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

Index: linux-2.6/mm/vmscan.c
===================================================================
--- linux-2.6.orig/mm/vmscan.c	2007-08-19 23:30:15.000000000 -0700
+++ linux-2.6/mm/vmscan.c	2007-08-19 23:53:43.000000000 -0700
@@ -1099,7 +1099,7 @@ static unsigned long shrink_zone(int pri
  * scan then give up on it.
  */
 static unsigned long shrink_zones(int priority, struct zone **zones,
-					struct scan_control *sc)
+			struct scan_control *sc, struct list_head *laundry)
 {
 	unsigned long nr_reclaimed = 0;
 	int i;
@@ -1121,7 +1121,7 @@ static unsigned long shrink_zones(int pr
 
 		sc->all_unreclaimable = 0;
 
-		nr_reclaimed += shrink_zone(priority, zone, sc, NULL);
+		nr_reclaimed += shrink_zone(priority, zone, sc, laundry);
 	}
 	return nr_reclaimed;
 }
@@ -1156,6 +1156,7 @@ unsigned long try_to_free_pages(struct z
 		.swappiness = vm_swappiness,
 		.order = order,
 	};
+	LIST_HEAD(laundry);
 
 	count_vm_event(ALLOCSTALL);
 
@@ -1170,16 +1171,19 @@ unsigned long try_to_free_pages(struct z
 	}
 
 	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
+
 		sc.nr_scanned = 0;
 		if (!priority)
 			disable_swap_token();
-		nr_reclaimed += shrink_zones(priority, zones, &sc);
+		nr_reclaimed += shrink_zones(priority, zones, &sc, &laundry);
 		shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
 		if (reclaim_state) {
 			nr_reclaimed += reclaim_state->reclaimed_slab;
 			reclaim_state->reclaimed_slab = 0;
 		}
+
 		total_scanned += sc.nr_scanned;
+
 		if (nr_reclaimed >= sc.swap_cluster_max) {
 			ret = 1;
 			goto out;
@@ -1223,6 +1227,8 @@ out:
 
 		zone->prev_priority = priority;
 	}
+	nr_reclaimed += shrink_page_list(&laundry, &sc, NULL);
+	release_lru_pages(&laundry);
 	return ret;
 }
 

-- 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* [RFC 6/7] kswapd: Do laundry after reclaim
  2007-08-20 21:50 [RFC 0/7] Postphone reclaim laundry to write at high water marks Christoph Lameter
                   ` (4 preceding siblings ...)
  2007-08-20 21:50 ` [RFC 5/7] Laundry handling for direct reclaim Christoph Lameter
@ 2007-08-20 21:50 ` Christoph Lameter
  2007-08-20 21:50 ` [RFC 7/7] Switch of PF_MEMALLOC during writeout Christoph Lameter
                   ` (3 subsequent siblings)
  9 siblings, 0 replies; 46+ messages in thread
From: Christoph Lameter @ 2007-08-20 21:50 UTC (permalink / raw)
  To: linux-mm
  Cc: linux-kernel, akpm, dkegel, Peter Zijlstra, David Miller,
	Nick Piggin

[-- Attachment #1: kswapd --]
[-- Type: text/plain, Size: 1711 bytes --]

Collect dirty pages and perform writeout when everything else is done.

Signed-off-by: Christoph Lameter <clameter@sgi.com>

---
 mm/vmscan.c |    7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

Index: linux-2.6/mm/vmscan.c
===================================================================
--- linux-2.6.orig/mm/vmscan.c	2007-08-19 23:53:43.000000000 -0700
+++ linux-2.6/mm/vmscan.c	2007-08-19 23:53:47.000000000 -0700
@@ -1273,6 +1273,7 @@ static unsigned long balance_pgdat(pg_da
 	 * this zone was successfully refilled to free_pages == pages_high.
 	 */
 	int temp_priority[MAX_NR_ZONES];
+	LIST_HEAD(laundry);
 
 loop_again:
 	total_scanned = 0;
@@ -1347,7 +1348,7 @@ loop_again:
 			temp_priority[i] = priority;
 			sc.nr_scanned = 0;
 			note_zone_scanning_priority(zone, priority);
-			nr_reclaimed += shrink_zone(priority, zone, &sc, NULL);
+			nr_reclaimed += shrink_zone(priority, zone, &sc, &laundry);
 			reclaim_state->reclaimed_slab = 0;
 			nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
 						lru_pages);
@@ -1374,6 +1375,7 @@ loop_again:
 		 * OK, kswapd is getting into trouble.  Take a nap, then take
 		 * another pass across the zones.
 		 */
+		throttle_vm_writeout(GFP_KERNEL);
 		if (total_scanned && priority < DEF_PRIORITY - 2)
 			congestion_wait(WRITE, HZ/10);
 
@@ -1404,7 +1406,8 @@ out:
 
 		goto loop_again;
 	}
-
+	nr_reclaimed += shrink_page_list(&laundry, &sc, NULL);
+	release_lru_pages(&laundry);
 	return nr_reclaimed;
 }
 

-- 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* [RFC 7/7] Switch of PF_MEMALLOC during writeout
  2007-08-20 21:50 [RFC 0/7] Postphone reclaim laundry to write at high water marks Christoph Lameter
                   ` (5 preceding siblings ...)
  2007-08-20 21:50 ` [RFC 6/7] kswapd: Do laundry after reclaim Christoph Lameter
@ 2007-08-20 21:50 ` Christoph Lameter
  2007-08-20 23:08   ` Andi Kleen
  2007-08-21 10:36 ` [RFC 0/7] Postphone reclaim laundry to write at high water marks Peter Zijlstra
                   ` (2 subsequent siblings)
  9 siblings, 1 reply; 46+ messages in thread
From: Christoph Lameter @ 2007-08-20 21:50 UTC (permalink / raw)
  To: linux-mm
  Cc: linux-kernel, akpm, dkegel, Peter Zijlstra, David Miller,
	Nick Piggin

[-- Attachment #1: nopfmemalloc --]
[-- Type: text/plain, Size: 1568 bytes --]

Switch off PF_MEMALLOC while the laundry is written out at the end of both
direct and kswapd reclaim.

This works because reclaim is essentially complete at that point, so we are
not holding any locks. The write occurs when the memory in the zones is at
the high water mark so it is unlikely that writeout will get into trouble.
If it does, reclaim can be called recursively to reclaim more pages.

Signed-off-by: Christoph Lameter <clameter@sgi.com>

---
 mm/vmscan.c |   10 ++++++++++
 1 file changed, 10 insertions(+)

Index: linux-2.6/mm/vmscan.c
===================================================================
--- linux-2.6.orig/mm/vmscan.c	2007-08-19 23:53:47.000000000 -0700
+++ linux-2.6/mm/vmscan.c	2007-08-19 23:55:29.000000000 -0700
@@ -1227,8 +1227,16 @@ out:
 
 		zone->prev_priority = priority;
 	}
+
+	/*
+	 * Trigger writeout. Drop PF_MEMALLOC for writeback
+	 * since we are holding no locks. Callbacks into
+	 * reclaim should be fine
+	 */
+	current->flags &= ~PF_MEMALLOC;
 	nr_reclaimed += shrink_page_list(&laundry, &sc, NULL);
 	release_lru_pages(&laundry);
+	current->flags |= PF_MEMALLOC;
 	return ret;
 }
 
@@ -1406,8 +1414,10 @@ out:
 
 		goto loop_again;
 	}
+	current->flags &= ~PF_MEMALLOC;
 	nr_reclaimed += shrink_page_list(&laundry, &sc, NULL);
 	release_lru_pages(&laundry);
+	current->flags |= PF_MEMALLOC;
 	return nr_reclaimed;
 }
 

-- 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC 7/7] Switch of PF_MEMALLOC during writeout
  2007-08-20 21:50 ` [RFC 7/7] Switch of PF_MEMALLOC during writeout Christoph Lameter
@ 2007-08-20 23:08   ` Andi Kleen
  2007-08-20 23:19     ` Christoph Lameter
  0 siblings, 1 reply; 46+ messages in thread
From: Andi Kleen @ 2007-08-20 23:08 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: linux-mm, linux-kernel

Christoph Lameter <clameter@sgi.com> writes:

> Switch off PF_MEMALLOC during both direct and kswapd reclaim.
> 
> This works because we are not holding any locks at that point because
> reclaim is essentially complete. The write occurs when the memory on
> the zones is at the high water mark so it is unlikely that writeout
> will get into trouble. If so then reclaim can be called recursively to
> reclaim more pages.

What would stop multiple recursions in extreme low memory cases? Seems 
risky to me and risking stack overflow.  Perhaps define another flag to catch that?

-Andi

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC 7/7] Switch of PF_MEMALLOC during writeout
  2007-08-20 23:08   ` Andi Kleen
@ 2007-08-20 23:19     ` Christoph Lameter
  2007-08-21  1:13       ` Andi Kleen
  0 siblings, 1 reply; 46+ messages in thread
From: Christoph Lameter @ 2007-08-20 23:19 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-mm, linux-kernel

On Mon, 21 Aug 2007, Andi Kleen wrote:

> Christoph Lameter <clameter@sgi.com> writes:
> 
> > Switch off PF_MEMALLOC during both direct and kswapd reclaim.
> > 
> > This works because we are not holding any locks at that point because
> > reclaim is essentially complete. The write occurs when the memory on
> > the zones is at the high water mark so it is unlikely that writeout
> > will get into trouble. If so then reclaim can be called recursively to
> > reclaim more pages.
> 
> What would stop multiple recursions in extreme low memory cases? Seems 
> risky to me and risking stack overflow.  Perhaps define another flag to catch that?

Right. I am not sure exactly how to handle that. There is also the issue
of the writes being deferred. Maybe we could use pdflush to write out the
pages, and increase the priority of pdflush so that it runs immediately
when notified? shrink_page_list() would gather the dirty pages in pvecs
and then forward them to pdflush. That may make the whole thing much
cleaner.

Opinions?


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC 7/7] Switch of PF_MEMALLOC during writeout
  2007-08-20 23:19     ` Christoph Lameter
@ 2007-08-21  1:13       ` Andi Kleen
  0 siblings, 0 replies; 46+ messages in thread
From: Andi Kleen @ 2007-08-21  1:13 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Andi Kleen, linux-mm, linux-kernel

> Right. I am not sure exactly how to handle that. There is also the issue 
> of the writes being deferred. I thought maybe of using pdflush to writeout 
> the pages? Maybe increase priority of the pdflush so that it runs 
> immediately when notified. Shrink_page_list would gather the dirty pages 
> in pvecs and then forward to a pdflush. That may make the whole thing much 
> cleaner.

Not sure anything complicated is needed.

You could just add another process flag and set PF_MEMALLOC on the first
recursion?

-Andi

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC 0/7] Postphone reclaim laundry to write at high water marks
  2007-08-20 21:50 [RFC 0/7] Postphone reclaim laundry to write at high water marks Christoph Lameter
                   ` (6 preceding siblings ...)
  2007-08-20 21:50 ` [RFC 7/7] Switch of PF_MEMALLOC during writeout Christoph Lameter
@ 2007-08-21 10:36 ` Peter Zijlstra
  2007-08-21 20:48   ` Christoph Lameter
  2007-08-21 15:16 ` Rik van Riel
  2007-08-21 15:51 ` Dave McCracken
  9 siblings, 1 reply; 46+ messages in thread
From: Peter Zijlstra @ 2007-08-21 10:36 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: linux-mm, linux-kernel

On Mon, 2007-08-20 at 14:50 -0700, Christoph Lameter wrote:
> One of the problems with reclaim writeout is that it occurs when memory in a
> zone is low. A particular bad problem can occur if memory in a zone is
> already low and now the first page that we encounter during reclaim is dirty.
> So the writeout function is called without the filesystem or device having
> much of a reserve that would allow further allocations. Triggering writeout
> of dirty pages early does not improve the memory situation since the actual
> writeout of the page is a relatively long process. The call to writepage
> will therefore not improve the low memory situation but make it worse
> because extra memory may be needed to get the device to write the page.
> 
> This patchset fixes that issue by:
> 
> 1. First reclaiming non dirty pages. Dirty pages are deferred until reclaim
>    has reestablished the high marks. Then all the dirty pages (the laundry)
>    is written out.
> 
> 2. Reclaim is essentially complete during the writeout phase. So we remove
>    PF_MEMALLOC and allow recursive reclaim if we still run into trouble
>    during writeout.

This almost insta-OOMs with anonymous workloads.


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC 1/7] release_lru_pages(): Generic release of pages to the LRU
  2007-08-20 21:50 ` [RFC 1/7] release_lru_pages(): Generic release of pages to the LRU Christoph Lameter
@ 2007-08-21 14:52   ` Mel Gorman
  2007-08-21 20:51     ` Christoph Lameter
  0 siblings, 1 reply; 46+ messages in thread
From: Mel Gorman @ 2007-08-21 14:52 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: linux-mm, linux-kernel, akpm, dkegel, Peter Zijlstra,
	David Miller, Nick Piggin

On (20/08/07 14:50), Christoph Lameter didst pronounce:
> Provide a function to generically release pages that were isolated back
> to the LRU. The function supports mixing zones etc.
> 
> Signed-off-by: Christoph Lameter <clameter@sgi.com>
> 
> ---
>  mm/vmscan.c |   72 ++++++++++++++++++++++++++++++++++--------------------------
>  1 file changed, 41 insertions(+), 31 deletions(-)
> 
> Index: linux-2.6/mm/vmscan.c
> ===================================================================
> --- linux-2.6.orig/mm/vmscan.c	2007-08-19 23:12:43.000000000 -0700
> +++ linux-2.6/mm/vmscan.c	2007-08-19 23:13:24.000000000 -0700
> @@ -581,6 +581,42 @@ keep:
>  	return nr_reclaimed;
>  }
>  
> +/*
> + * Put back any unfreeable pages.
> + */
> +void release_lru_pages(struct list_head *page_list)
> +{

Can the migrate.c#putback_lru_pages() be replaced with this?

> +	struct page *page;
> +	struct pagevec pvec;
> +	struct zone *zone = NULL;
> +
> +	pagevec_init(&pvec, 1);
> +	while (!list_empty(page_list)) {
> +		page = lru_to_page(page_list);
> +		VM_BUG_ON(PageLRU(page));
> +		if (zone != page_zone(page)) {
> +			if (zone)
> +				spin_unlock_irq(&zone->lru_lock);
> +			zone = page_zone(page);
> +			spin_lock_irq(&zone->lru_lock);

Is this really necessary? What situation would occur that would have a
list of pages in multiple zones?

Also, it may be worth commenting here that __pagevec_release() is able to
handle lists of pages in multiple zones.

> +		}
> +		SetPageLRU(page);
> +		list_del(&page->lru);
> +		if (PageActive(page))
> +			add_page_to_active_list(zone, page);
> +		else
> +			add_page_to_inactive_list(zone, page);
> +		if (!pagevec_add(&pvec, page)) {
> +			spin_unlock_irq(&zone->lru_lock);
> +			__pagevec_release(&pvec);
> +			spin_lock_irq(&zone->lru_lock);
> +		}
> +	}
> +	if (zone)
> +		spin_unlock_irq(&zone->lru_lock);
> +	pagevec_release(&pvec);
> +}
> +
>  /* LRU Isolation modes. */
>  #define ISOLATE_INACTIVE 0	/* Isolate inactive pages. */
>  #define ISOLATE_ACTIVE 1	/* Isolate active pages. */
> @@ -756,21 +792,17 @@ static unsigned long shrink_inactive_lis
>  				struct zone *zone, struct scan_control *sc)
>  {
>  	LIST_HEAD(page_list);
> -	struct pagevec pvec;
>  	unsigned long nr_scanned = 0;
>  	unsigned long nr_reclaimed = 0;
>  
> -	pagevec_init(&pvec, 1);
> -
>  	lru_add_drain();
> -	spin_lock_irq(&zone->lru_lock);
>  	do {
> -		struct page *page;
>  		unsigned long nr_taken;
>  		unsigned long nr_scan;
>  		unsigned long nr_freed;
>  		unsigned long nr_active;
>  
> +		spin_lock_irq(&zone->lru_lock);
>  		nr_taken = isolate_lru_pages(sc->swap_cluster_max,
>  			     &zone->inactive_list,
>  			     &page_list, &nr_scan, sc->order,
> @@ -794,34 +826,12 @@ static unsigned long shrink_inactive_lis
>  		} else
>  			__count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
>  		__count_zone_vm_events(PGSTEAL, zone, nr_freed);
> +		local_irq_enable();
> +		release_lru_pages(&page_list);
>  

Separate these apart by a line. I thought the local_irq_enable() was related
to the call to release_lru_pages(&page_list) while reading the patch
which isn't the case at all.

> -		if (nr_taken == 0)
> -			goto done;
> -
> -		spin_lock(&zone->lru_lock);
> -		/*
> -		 * Put back any unfreeable pages.
> -		 */
> -		while (!list_empty(&page_list)) {
> -			page = lru_to_page(&page_list);
> -			VM_BUG_ON(PageLRU(page));
> -			SetPageLRU(page);
> -			list_del(&page->lru);
> -			if (PageActive(page))
> -				add_page_to_active_list(zone, page);
> -			else
> -				add_page_to_inactive_list(zone, page);
> -			if (!pagevec_add(&pvec, page)) {
> -				spin_unlock_irq(&zone->lru_lock);
> -				__pagevec_release(&pvec);
> -				spin_lock_irq(&zone->lru_lock);
> -			}
> -		}
> +		if (!nr_taken)
> +			break;
>    	} while (nr_scanned < max_scan);
> -	spin_unlock(&zone->lru_lock);
> -done:
> -	local_irq_enable();
> -	pagevec_release(&pvec);
>  	return nr_reclaimed;
>  }
>  

-- 
Mel Gorman
Part-time Phd Student                          Linux Technology Center
University of Limerick                         IBM Dublin Software Lab

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC 3/7] shrink_page_list: Support isolating dirty pages on laundry list
  2007-08-20 21:50 ` [RFC 3/7] shrink_page_list: Support isolating dirty pages on laundry list Christoph Lameter
@ 2007-08-21 15:04   ` Mel Gorman
  2007-08-21 20:53     ` Christoph Lameter
  0 siblings, 1 reply; 46+ messages in thread
From: Mel Gorman @ 2007-08-21 15:04 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: linux-mm, linux-kernel, akpm, dkegel, Peter Zijlstra,
	David Miller, Nick Piggin

On (20/08/07 14:50), Christoph Lameter didst pronounce:
> If a laundry list is specified then do not write out pages but put
> dirty pages on a laundry list for later processing.
> 
> Signed-off-by: Christoph Lameter <clameter@sgi.com>
> 
> ---
>  mm/vmscan.c |   23 ++++++++++++++++++-----
>  1 file changed, 18 insertions(+), 5 deletions(-)
> 
> Index: linux-2.6/mm/vmscan.c
> ===================================================================
> --- linux-2.6.orig/mm/vmscan.c	2007-08-19 23:13:28.000000000 -0700
> +++ linux-2.6/mm/vmscan.c	2007-08-19 23:27:00.000000000 -0700
> @@ -380,16 +380,22 @@ cannot_free:
>  }
>  
>  /*
> - * shrink_page_list() returns the number of reclaimed pages
> + * shrink_page_list() returns the number of reclaimed pages.
> + *
> + * If laundry is specified then dirty pages are put onto the
> + * laundry list and no writes are triggered.
>   */
>  static unsigned long shrink_page_list(struct list_head *page_list,
> -					struct scan_control *sc)
> +		struct scan_control *sc, struct list_head *laundry)
>  {
>  	LIST_HEAD(ret_pages);
>  	struct pagevec freed_pvec;
>  	int pgactivate = 0;
>  	unsigned long nr_reclaimed = 0;
>  
> +	if (list_empty(page_list))
> +		return 0;
> +

This needs a comment to explain why shrink_page_list() would be called
with an empty list.

>  	cond_resched();
>  
>  	pagevec_init(&freed_pvec, 1);
> @@ -407,10 +413,11 @@ static unsigned long shrink_page_list(st
>  		if (TestSetPageLocked(page))
>  			goto keep;
>  
> -		VM_BUG_ON(PageActive(page));
> -

This needs explanation in the leader. It implies that later you expect active
and inactive pages to be passed to shrink_page_list(), i.e. we now need to keep
an eye out for where shrink_active_list() is sending pages to shrink_page_list()
instead of simply rotating the active list to the inactive.

>  		sc->nr_scanned++;
>  
> +		if (PageActive(page))
> +			goto keep_locked;
> +
>  		if (!sc->may_swap && page_mapped(page))
>  			goto keep_locked;
>  
> @@ -506,6 +513,12 @@ static unsigned long shrink_page_list(st
>  			if (!may_write_to_queue(mapping->backing_dev_info))
>  				goto keep_locked;
>  
> +			if (laundry) {
> +				list_add(&page->lru, laundry);
> +				unlock_page(page);
> +				continue;
> +			}

This needs a comment. What you are doing is explained in the leader but
it may not help a future reader of the code.

Also, with laundry specified there is no longer a check for PagePrivate
to see if the buffers can be freed and got rid of. According to the
comments in the next code block;

                 * We do this even if the page is PageDirty().
                 * try_to_release_page() does not perform I/O, but it is
                 * possible for a page to have PageDirty set, but it is actually
                 * clean (all its buffers are clean)

Is this intentional?

> +
>  			/* Page is dirty, try to write it out here */
>  			switch(pageout(page, mapping)) {
>  			case PAGE_ACTIVATE:
> @@ -817,7 +830,7 @@ static unsigned long shrink_inactive_lis
>  		spin_unlock_irq(&zone->lru_lock);
>  
>  		nr_scanned += nr_scan;
> -		nr_freed = shrink_page_list(&page_list, sc);
> +		nr_freed = shrink_page_list(&page_list, sc, NULL);
>  		nr_reclaimed += nr_freed;
>  		local_irq_disable();
>  		if (current_is_kswapd()) {
> 

-- 
Mel Gorman
Part-time Phd Student                          Linux Technology Center
University of Limerick                         IBM Dublin Software Lab

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC 5/7] Laundry handling for direct reclaim
  2007-08-20 21:50 ` [RFC 5/7] Laundry handling for direct reclaim Christoph Lameter
@ 2007-08-21 15:06   ` Mel Gorman
  2007-08-21 20:55     ` Christoph Lameter
  2007-08-21 15:19   ` Mel Gorman
  1 sibling, 1 reply; 46+ messages in thread
From: Mel Gorman @ 2007-08-21 15:06 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: linux-mm, linux-kernel, akpm, dkegel, Peter Zijlstra,
	David Miller, Nick Piggin

On (20/08/07 14:50), Christoph Lameter didst pronounce:
> Direct reclaim collects a global laundry list in try_to_free_pages().
> 
> Pages are only written back after a reclaim pass is complete.
> 
> Signed-off-by: Christoph Lameter <clameter@sgi.com>
> 
> ---
>  mm/vmscan.c |   12 +++++++++---
>  1 file changed, 9 insertions(+), 3 deletions(-)
> 
> Index: linux-2.6/mm/vmscan.c
> ===================================================================
> --- linux-2.6.orig/mm/vmscan.c	2007-08-19 23:30:15.000000000 -0700
> +++ linux-2.6/mm/vmscan.c	2007-08-19 23:53:43.000000000 -0700
> @@ -1099,7 +1099,7 @@ static unsigned long shrink_zone(int pri
>   * scan then give up on it.
>   */
>  static unsigned long shrink_zones(int priority, struct zone **zones,
> -					struct scan_control *sc)
> +			struct scan_control *sc, struct list_head *laundry)
>  {
>  	unsigned long nr_reclaimed = 0;
>  	int i;
> @@ -1121,7 +1121,7 @@ static unsigned long shrink_zones(int pr
>  
>  		sc->all_unreclaimable = 0;
>  
> -		nr_reclaimed += shrink_zone(priority, zone, sc, NULL);
> +		nr_reclaimed += shrink_zone(priority, zone, sc, laundry);
>  	}
>  	return nr_reclaimed;
>  }
> @@ -1156,6 +1156,7 @@ unsigned long try_to_free_pages(struct z
>  		.swappiness = vm_swappiness,
>  		.order = order,
>  	};
> +	LIST_HEAD(laundry);

Why is the laundry not made part of the scan_control?

>  
>  	count_vm_event(ALLOCSTALL);
>  
> @@ -1170,16 +1171,19 @@ unsigned long try_to_free_pages(struct z
>  	}
>  
>  	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
> +
>  		sc.nr_scanned = 0;
>  		if (!priority)
>  			disable_swap_token();
> -		nr_reclaimed += shrink_zones(priority, zones, &sc);
> +		nr_reclaimed += shrink_zones(priority, zones, &sc, &laundry);
>  		shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
>  		if (reclaim_state) {
>  			nr_reclaimed += reclaim_state->reclaimed_slab;
>  			reclaim_state->reclaimed_slab = 0;
>  		}
> +
>  		total_scanned += sc.nr_scanned;
> +
>  		if (nr_reclaimed >= sc.swap_cluster_max) {
>  			ret = 1;
>  			goto out;
> @@ -1223,6 +1227,8 @@ out:
>  
>  		zone->prev_priority = priority;
>  	}
> +	nr_reclaimed += shrink_page_list(&laundry, &sc, NULL);
> +	release_lru_pages(&laundry);
>  	return ret;
>  }
>  
> 
> -- 
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

-- 
-- 
Mel Gorman
Part-time Phd Student                          Linux Technology Center
University of Limerick                         IBM Dublin Software Lab

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC 0/7] Postphone reclaim laundry to write at high water marks
  2007-08-20 21:50 [RFC 0/7] Postphone reclaim laundry to write at high water marks Christoph Lameter
                   ` (7 preceding siblings ...)
  2007-08-21 10:36 ` [RFC 0/7] Postphone reclaim laundry to write at high water marks Peter Zijlstra
@ 2007-08-21 15:16 ` Rik van Riel
  2007-08-21 20:59   ` Christoph Lameter
  2007-08-21 15:51 ` Dave McCracken
  9 siblings, 1 reply; 46+ messages in thread
From: Rik van Riel @ 2007-08-21 15:16 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: linux-mm, linux-kernel, akpm, dkegel, Peter Zijlstra,
	David Miller, Nick Piggin

Christoph Lameter wrote:

> 1. First reclaiming non dirty pages. Dirty pages are deferred until reclaim
>    has reestablished the high marks. Then all the dirty pages (the laundry)
>    is written out.

That sounds like a horrendously bad idea.  While one process
is busy freeing all the non dirty pages, other processes can
allocate those pages, leaving you with no memory to free up
the dirty pages!

How exactly are you planning to prevent that problem?

Also, writing out all the dirty pages at once seems like it
could hurt latency quite badly, especially on large systems.

-- 
Politics is the struggle between those who want to make their country
the best in the world, and those who believe it already is.  Each group
calls the other unpatriotic.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC 5/7] Laundry handling for direct reclaim
  2007-08-20 21:50 ` [RFC 5/7] Laundry handling for direct reclaim Christoph Lameter
  2007-08-21 15:06   ` Mel Gorman
@ 2007-08-21 15:19   ` Mel Gorman
  2007-08-21 21:00     ` Christoph Lameter
  1 sibling, 1 reply; 46+ messages in thread
From: Mel Gorman @ 2007-08-21 15:19 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: linux-mm, linux-kernel, akpm, dkegel, Peter Zijlstra,
	David Miller, Nick Piggin

On (20/08/07 14:50), Christoph Lameter didst pronounce:
> Direct reclaim collects a global laundry list in try_to_free_pages().
> 
> Pages are only written back after a reclaim pass is complete.
> 
> Signed-off-by: Christoph Lameter <clameter@sgi.com>
> 
> ---
>  mm/vmscan.c |   12 +++++++++---
>  1 file changed, 9 insertions(+), 3 deletions(-)
> 
> Index: linux-2.6/mm/vmscan.c
> ===================================================================
> --- linux-2.6.orig/mm/vmscan.c	2007-08-19 23:30:15.000000000 -0700
> +++ linux-2.6/mm/vmscan.c	2007-08-19 23:53:43.000000000 -0700
> @@ -1099,7 +1099,7 @@ static unsigned long shrink_zone(int pri
>   * scan then give up on it.
>   */
>  static unsigned long shrink_zones(int priority, struct zone **zones,
> -					struct scan_control *sc)
> +			struct scan_control *sc, struct list_head *laundry)
>  {
>  	unsigned long nr_reclaimed = 0;
>  	int i;
> @@ -1121,7 +1121,7 @@ static unsigned long shrink_zones(int pr
>  
>  		sc->all_unreclaimable = 0;
>  
> -		nr_reclaimed += shrink_zone(priority, zone, sc, NULL);
> +		nr_reclaimed += shrink_zone(priority, zone, sc, laundry);
>  	}
>  	return nr_reclaimed;
>  }
> @@ -1156,6 +1156,7 @@ unsigned long try_to_free_pages(struct z
>  		.swappiness = vm_swappiness,
>  		.order = order,
>  	};
> +	LIST_HEAD(laundry);
>  
>  	count_vm_event(ALLOCSTALL);
>  
> @@ -1170,16 +1171,19 @@ unsigned long try_to_free_pages(struct z
>  	}
>  
>  	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
> +
>  		sc.nr_scanned = 0;
>  		if (!priority)
>  			disable_swap_token();
> -		nr_reclaimed += shrink_zones(priority, zones, &sc);
> +		nr_reclaimed += shrink_zones(priority, zones, &sc, &laundry);
>  		shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
>  		if (reclaim_state) {
>  			nr_reclaimed += reclaim_state->reclaimed_slab;
>  			reclaim_state->reclaimed_slab = 0;
>  		}
> +
>  		total_scanned += sc.nr_scanned;
> +

Could this not isolate a load of dirty pages on the laundry list and then
shortly later go to sleep in congestion_wait()? It would appear that, with
writeout deferred, going to sleep is going to do nothing to help
the situation.

>  		if (nr_reclaimed >= sc.swap_cluster_max) {
>  			ret = 1;
>  			goto out;
> @@ -1223,6 +1227,8 @@ out:
>  
>  		zone->prev_priority = priority;
>  	}
> +	nr_reclaimed += shrink_page_list(&laundry, &sc, NULL);
> +	release_lru_pages(&laundry);
>  	return ret;
>  }
>  

-- 
Mel Gorman
Part-time Phd Student                          Linux Technology Center
University of Limerick                         IBM Dublin Software Lab

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC 0/7] Postphone reclaim laundry to write at high water marks
  2007-08-20 21:50 [RFC 0/7] Postphone reclaim laundry to write at high water marks Christoph Lameter
                   ` (8 preceding siblings ...)
  2007-08-21 15:16 ` Rik van Riel
@ 2007-08-21 15:51 ` Dave McCracken
  2007-08-21 21:03   ` Christoph Lameter
  9 siblings, 1 reply; 46+ messages in thread
From: Dave McCracken @ 2007-08-21 15:51 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: linux-mm, linux-kernel

On Monday 20 August 2007, Christoph Lameter wrote:
> 1. First reclaiming non dirty pages. Dirty pages are deferred until reclaim
>    has reestablished the high marks. Then all the dirty pages (the laundry)
>    is written out.

I don't buy it.  What happens when there aren't enough clean pages in the 
system to achieve the high water mark?  I'm guessing we'd get a quick OOM (as 
observed by Peter).

> 2. Reclaim is essentially complete during the writeout phase. So we remove
>    PF_MEMALLOC and allow recursive reclaim if we still run into trouble
>    during writeout.

You're assuming the system is static and won't allocate new pages behind your 
back.  We could be back to critically low memory before the write happens.

More broadly, we need to be proactive about getting dirty pages cleaned before 
they consume the system.  Deferring the write just makes it harder to keep 
up.

Dave McCracken

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC 0/7] Postphone reclaim laundry to write at high water marks
  2007-08-21 10:36 ` [RFC 0/7] Postphone reclaim laundry to write at high water marks Peter Zijlstra
@ 2007-08-21 20:48   ` Christoph Lameter
  2007-08-21 21:13     ` Peter Zijlstra
  0 siblings, 1 reply; 46+ messages in thread
From: Christoph Lameter @ 2007-08-21 20:48 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: linux-mm, linux-kernel

On Tue, 21 Aug 2007, Peter Zijlstra wrote:

> This almost insta-OOMs with anonymous workloads.

What does the workload do? So writeout needs to begin earlier. There are 
likely issues with throttling.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC 1/7] release_lru_pages(): Generic release of pages to the LRU
  2007-08-21 14:52   ` Mel Gorman
@ 2007-08-21 20:51     ` Christoph Lameter
  0 siblings, 0 replies; 46+ messages in thread
From: Christoph Lameter @ 2007-08-21 20:51 UTC (permalink / raw)
  To: Mel Gorman
  Cc: linux-mm, linux-kernel, akpm, dkegel, Peter Zijlstra,
	David Miller, Nick Piggin

On Tue, 21 Aug 2007, Mel Gorman wrote:

> > + */
> > +void release_lru_pages(struct list_head *page_list)
> > +{
> 
> Can the migrate.c#putback_lru_pages() be replaced with this?

Correct. We can get rid of the putback_lru_pages in migrate.c 
with this.

> > +	struct page *page;
> > +	struct pagevec pvec;
> > +	struct zone *zone = NULL;
> > +
> > +	pagevec_init(&pvec, 1);
> > +	while (!list_empty(page_list)) {
> > +		page = lru_to_page(page_list);
> > +		VM_BUG_ON(PageLRU(page));
> > +		if (zone != page_zone(page)) {
> > +			if (zone)
> > +				spin_unlock_irq(&zone->lru_lock);
> > +			zone = page_zone(page);
> > +			spin_lock_irq(&zone->lru_lock);
> 
> > Is this really necessary? What situation would occur that would have a
> > list of pages in multiple zones?

Because we reclaim from multiple zones and gather laundry from different 
zones.

> Also, it may be worth commenting here that __pagevec_release() is able to
> handle lists of pages in multiple zones.

Ok.

> >  			__count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
> >  		__count_zone_vm_events(PGSTEAL, zone, nr_freed);
> > +		local_irq_enable();
> > +		release_lru_pages(&page_list);
> >  
> 
> Separate these apart by a line. I thought the local_irq_enable() was related
> to the call to release_lru_pages(&page_list) while reading the patch
> which isn't the case at all.

Will do.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC 3/7] shrink_page_list: Support isolating dirty pages on laundry list
  2007-08-21 15:04   ` Mel Gorman
@ 2007-08-21 20:53     ` Christoph Lameter
  0 siblings, 0 replies; 46+ messages in thread
From: Christoph Lameter @ 2007-08-21 20:53 UTC (permalink / raw)
  To: Mel Gorman
  Cc: linux-mm, linux-kernel, akpm, dkegel, Peter Zijlstra,
	David Miller, Nick Piggin

On Tue, 21 Aug 2007, Mel Gorman wrote:

> > +	if (list_empty(page_list))
> > +		return 0;
> > +
> 
> This needs a comment to explain why shrink_page_list() would be called
> with an empty list.

Ok.

> > @@ -407,10 +413,11 @@ static unsigned long shrink_page_list(st
> >  		if (TestSetPageLocked(page))
> >  			goto keep;
> >  
> > -		VM_BUG_ON(PageActive(page));
> > -
> 
> This needs explanation in the leader. It implies that later you expect active
> and inactive pages to be passed to shrink_page. i.e. We now need to keep an
> eye out for where shrink_active_list() is sending pages to shrink_page_list()
> instead of simply rotating the active list to the inactive.

Ok.

> 
> >  		sc->nr_scanned++;
> >  
> > +		if (PageActive(page))
> > +			goto keep_locked;
> > +
> >  		if (!sc->may_swap && page_mapped(page))
> >  			goto keep_locked;
> >  
> > @@ -506,6 +513,12 @@ static unsigned long shrink_page_list(st
> >  			if (!may_write_to_queue(mapping->backing_dev_info))
> >  				goto keep_locked;
> >  
> > +			if (laundry) {
> > +				list_add(&page->lru, laundry);
> > +				unlock_page(page);
> > +				continue;
> > +			}
> 
> This needs a comment. What you are doing is explained in the leader but
> it may not help a future reader of the code.
> 
> Also, with laundry specified there is no longer a check for PagePrivate
> to see if the buffers can be freed and got rid of. According to the
> comments in the next code block;

The check for buffers comes after the writeout. Writeout occurs when 
laundry == NULL.

> 
>                  * We do this even if the page is PageDirty().
>                  * try_to_release_page() does not perform I/O, but it is
>                  * possible for a page to have PageDirty set, but it is actually
>                  * clean (all its buffers are clean)
> 
> Is this intentional?

Yes buffers will be removed after writeout. Writeout requires buffers.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC 5/7] Laundry handling for direct reclaim
  2007-08-21 15:06   ` Mel Gorman
@ 2007-08-21 20:55     ` Christoph Lameter
  0 siblings, 0 replies; 46+ messages in thread
From: Christoph Lameter @ 2007-08-21 20:55 UTC (permalink / raw)
  To: Mel Gorman
  Cc: linux-mm, linux-kernel, akpm, dkegel, Peter Zijlstra,
	David Miller, Nick Piggin

On Tue, 21 Aug 2007, Mel Gorman wrote:

> > @@ -1156,6 +1156,7 @@ unsigned long try_to_free_pages(struct z
> >  		.swappiness = vm_swappiness,
> >  		.order = order,
> >  	};
> > +	LIST_HEAD(laundry);
> 
> Why is the laundry not made part of the scan_control?

That is one possibility. The other is to treat laundry as a lru type list 
under zone->lru_lock. This would allow the writeback process (whichever 
that is) to be independent of the producer of the laundry. Dirty pages 
could be isolated from an atomic context.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC 0/7] Postphone reclaim laundry to write at high water marks
  2007-08-21 15:16 ` Rik van Riel
@ 2007-08-21 20:59   ` Christoph Lameter
  2007-08-21 21:14     ` Rik van Riel
  0 siblings, 1 reply; 46+ messages in thread
From: Christoph Lameter @ 2007-08-21 20:59 UTC (permalink / raw)
  To: Rik van Riel
  Cc: linux-mm, linux-kernel, akpm, dkegel, Peter Zijlstra,
	David Miller, Nick Piggin

On Tue, 21 Aug 2007, Rik van Riel wrote:

> Christoph Lameter wrote:
> 
> > 1. First reclaiming non dirty pages. Dirty pages are deferred until reclaim
> >    has reestablished the high marks. Then all the dirty pages (the laundry)
> >    is written out.
> 
> That sounds like a horrendously bad idea.  While one process
> is busy freeing all the non dirty pages, other processes can
> allocate those pages, leaving you with no memory to free up
> the dirty pages!

What is preventing that from occurring right now? If the dirty pages are
aligned in the right way you can have the exact same situation.
 
> Also, writing out all the dirty pages at once seems like it
> could hurt latency quite badly, especially on large systems.

We only write back the dirty pages that we are about to reclaim not all of 
them. The bigger batching occurs if we go through multiple priorities. 
Plus writeback in the sync reclaim case is stopped if the device becomes 
contended anyways.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC 5/7] Laundry handling for direct reclaim
  2007-08-21 15:19   ` Mel Gorman
@ 2007-08-21 21:00     ` Christoph Lameter
  0 siblings, 0 replies; 46+ messages in thread
From: Christoph Lameter @ 2007-08-21 21:00 UTC (permalink / raw)
  To: Mel Gorman
  Cc: linux-mm, linux-kernel, akpm, dkegel, Peter Zijlstra,
	David Miller, Nick Piggin

On Tue, 21 Aug 2007, Mel Gorman wrote:

> > +		nr_reclaimed += shrink_zones(priority, zones, &sc, &laundry);
> >  		shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
> >  		if (reclaim_state) {
> >  			nr_reclaimed += reclaim_state->reclaimed_slab;
> >  			reclaim_state->reclaimed_slab = 0;
> >  		}
> > +
> >  		total_scanned += sc.nr_scanned;
> > +
> 
> Could this not isolate a load of dirty pages on the laundry list and then
> shortly later go to sleep in congestion_wait() ? It would appear that with
> writeout deferred that the going to sleep is going to do nothing to help
> the situation.

Yep that seems to be the problem that Peter saw. We need to throttle 
later.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC 0/7] Postphone reclaim laundry to write at high water marks
  2007-08-21 15:51 ` Dave McCracken
@ 2007-08-21 21:03   ` Christoph Lameter
  0 siblings, 0 replies; 46+ messages in thread
From: Christoph Lameter @ 2007-08-21 21:03 UTC (permalink / raw)
  To: Dave McCracken; +Cc: linux-mm, linux-kernel

[-- Attachment #1: Type: TEXT/PLAIN, Size: 1333 bytes --]

On Tue, 21 Aug 2007, Dave McCracken wrote:

> On Monday 20 August 2007, Christoph Lameter wrote:
> > 1. First reclaiming non dirty pages. Dirty pages are deferred until reclaim
> >    has reestablished the high marks. Then all the dirty pages (the laundry)
> >    is written out.
> 
> I don't buy it.  What happens when there aren't enough clean pages in the 
> system to achieve the high water mark?  I'm guessing we'd get a quick OOM (as 
> observed by Peter).

We reclaim the clean pages that there are (removing the executable 
pages from memory) and then we do writeback.

The quick OOM is due to throttling not working right AFAIK.

> > 2. Reclaim is essentially complete during the writeout phase. So we remove
> >    PF_MEMALLOC and allow recursive reclaim if we still run into trouble
> >    during writeout.
> 
> You're assuming the system is static and won't allocate new pages behind your 
> back.  We could be back to critically low memory before the write happens.

Yes and that occurs now too.

> More broadly, we need to be proactive about getting dirty pages cleaned before 
> they consume the system.  Deferring the write just makes it harder to keep 
> up.

Cleaning dirty pages through writeout consumes memory. Writing dirty pages 
out early makes the memory situation even worse.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC 0/7] Postphone reclaim laundry to write at high water marks
  2007-08-21 20:48   ` Christoph Lameter
@ 2007-08-21 21:13     ` Peter Zijlstra
  2007-08-21 21:29       ` Christoph Lameter
  0 siblings, 1 reply; 46+ messages in thread
From: Peter Zijlstra @ 2007-08-21 21:13 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: linux-mm, linux-kernel

On Tue, 2007-08-21 at 13:48 -0700, Christoph Lameter wrote:
> On Tue, 21 Aug 2007, Peter Zijlstra wrote:
> 
> > This almost insta-OOMs with anonymous workloads.
> 
> What does the workload do? So writeout needs to begin earlier. There are 
> likely issues with throttling.

The workload is a single program mapping 256M of anonymous memory and
cycling through it with writes, run on a 128M setup.

It quickly ends up with all of memory in the laundry list and then
recursing into __alloc_pages which will fail to make progress and OOMs.

But aside from the numerous issues with the patch set as presented, I'm
not seeing the big picture: why are you doing this?

Anonymous pages are there to stay, and we cannot tell people how to
use them. So we need some free or freeable pages in order to avoid the
vm deadlock that arises from all memory being dirty.

Currently we keep them free; this has the advantage that the buddy
allocator can at least try to coalesce them.

'Optimizing' this by switching to freeable pages has mainly
disadvantages IMHO: finding them scrambles LRU order and complexifies
reclaim, and all that for a relatively small gain in space for clean
pagecache pages.

Please, stop writing patches and write down a solid proposal of how you
envision the VM working in the various scenarios and why its better than
the current approach.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC 0/7] Postphone reclaim laundry to write at high water marks
  2007-08-21 20:59   ` Christoph Lameter
@ 2007-08-21 21:14     ` Rik van Riel
  2007-08-21 21:30       ` Christoph Lameter
  0 siblings, 1 reply; 46+ messages in thread
From: Rik van Riel @ 2007-08-21 21:14 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: linux-mm, linux-kernel, akpm, dkegel, Peter Zijlstra,
	David Miller, Nick Piggin

Christoph Lameter wrote:
> On Tue, 21 Aug 2007, Rik van Riel wrote:
> 
>> Christoph Lameter wrote:
>>
>>> 1. First reclaiming non dirty pages. Dirty pages are deferred until reclaim
>>>    has reestablished the high marks. Then all the dirty pages (the laundry)
>>>    is written out.
>> That sounds like a horrendously bad idea.  While one process
>> is busy freeing all the non dirty pages, other processes can
>> allocate those pages, leaving you with no memory to free up
>> the dirty pages!
> 
> What is preventing that from occurring right now? If the dirty pages are 
> aligned in the right way you can have the exact same situation.

For one, dirty page writeout is done even when free memory
is low.  The kernel will dig into the PF_MEMALLOC reserves,
instead of deciding not to do writeout unless there is lots
of free memory.

Secondly, why would you want to recreate this worst case on
purpose every time the pageout code runs?

-- 
Politics is the struggle between those who want to make their country
the best in the world, and those who believe it already is.  Each group
calls the other unpatriotic.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC 0/7] Postphone reclaim laundry to write at high water marks
  2007-08-21 21:13     ` Peter Zijlstra
@ 2007-08-21 21:29       ` Christoph Lameter
  2007-08-21 21:43         ` Rik van Riel
  2007-08-21 22:09         ` Peter Zijlstra
  0 siblings, 2 replies; 46+ messages in thread
From: Christoph Lameter @ 2007-08-21 21:29 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: linux-mm, linux-kernel

On Tue, 21 Aug 2007, Peter Zijlstra wrote:

> It quickly ends up with all of memory in the laundry list and then
> recursing into __alloc_pages which will fail to make progress and OOMs.

Hmmmm... Okay that needs to be addressed. Reserves need to be used and we 
only should enter reclaim if that runs out (like the first patch that I 
did).

> But aside from the numerous issues with the patch set as presented, I'm
> not seeing the seeing the big picture, why are you doing this.

I want general improvements to reclaim to address the issues that you see 
and other issues related to reclaim instead of the strange code that makes 
PF_MEMALLOC allocs compete for allocations from a single slab and putting 
logic into the kernel to decide which allocs to fail. We can reclaim after 
all. Its just a matter of finding the right way to do this. 

> Anonymous pages are a there to stay, and we cannot tell people how to
> use them. So we need some free or freeable pages in order to avoid the
> vm deadlock that arises from all memory dirty.

No one is trying to abolish Anonymous pages. Free memory is readily 
available on demand if one calls reclaim. Your scheme introduces complex 
negotiations over a few scraps of memory when large amounts of memory 
would still be readily available if one would do the right thing and call 
into reclaim.

> 'Optimizing' this by switching to freeable pages has mainly
> disadvantages IMHO, finding them scrambles LRU order and complexifies
> relcaim and all that for a relatively small gain in space for clean
> pagecache pages.

Sounds like you would like to change the way we handle memory in general 
in the VM? Reclaim (and thus finding freeable pages) is basic to Linux 
memory management.

> Please, stop writing patches and write down a solid proposal of how you
> envision the VM working in the various scenarios and why its better than
> the current approach.

Sorry I just got into this a short time ago and I may need a few cycles 
to get this all straight. An approach that uses memory instead of 
ignoring available memory is certainly better.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC 0/7] Postphone reclaim laundry to write at high water marks
  2007-08-21 21:14     ` Rik van Riel
@ 2007-08-21 21:30       ` Christoph Lameter
  0 siblings, 0 replies; 46+ messages in thread
From: Christoph Lameter @ 2007-08-21 21:30 UTC (permalink / raw)
  To: Rik van Riel
  Cc: linux-mm, linux-kernel, akpm, dkegel, Peter Zijlstra,
	David Miller, Nick Piggin

On Tue, 21 Aug 2007, Rik van Riel wrote:

> > What is preventing that from occurring right now? If the dirty pags are
> > aligned in the right way you can have the exact same situation.
> 
> For one, dirty page writeout is done even when free memory
> is low.  The kernel will dig into the PF_MEMALLOC reserves,
> instead of deciding not to do writeout unless there is lots
> of free memory.

Right that is a fundamental problem with this RFC. We need to be able to 
get into PF_MEMALLOC reserves for writeout.
  
> Secondly, why would you want to recreate this worst case on
> purpose every time the pageout code runs?

I did not intend that to occur.


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC 0/7] Postphone reclaim laundry to write at high water marks
  2007-08-21 21:29       ` Christoph Lameter
@ 2007-08-21 21:43         ` Rik van Riel
  2007-08-21 22:32           ` Christoph Lameter
  2007-08-21 22:09         ` Peter Zijlstra
  1 sibling, 1 reply; 46+ messages in thread
From: Rik van Riel @ 2007-08-21 21:43 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Peter Zijlstra, linux-mm, linux-kernel

Christoph Lameter wrote:

> I want general improvements to reclaim to address the issues that you see 
> and other issues related to reclaim instead of the strange code that makes 
> PF_MEMALLOC allocs compete for allocations from a single slab and putting 
> logic into the kernel to decide which allocs to fail. We can reclaim after 
> all. Its just a matter of finding the right way to do this. 

The simplest way of achieving that would be to allow
recursion of the page reclaim code, under the condition
that the second level call can only reclaim clean pages,
while the "outer" call does what the VM does today.

-- 
Politics is the struggle between those who want to make their country
the best in the world, and those who believe it already is.  Each group
calls the other unpatriotic.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC 0/7] Postphone reclaim laundry to write at high water marks
  2007-08-21 21:29       ` Christoph Lameter
  2007-08-21 21:43         ` Rik van Riel
@ 2007-08-21 22:09         ` Peter Zijlstra
  2007-08-21 22:43           ` Christoph Lameter
  2007-08-23 12:08           ` Andrea Arcangeli
  1 sibling, 2 replies; 46+ messages in thread
From: Peter Zijlstra @ 2007-08-21 22:09 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: linux-mm, linux-kernel

On Tue, 2007-08-21 at 14:29 -0700, Christoph Lameter wrote:
> On Tue, 21 Aug 2007, Peter Zijlstra wrote:
> 
> > It quickly ends up with all of memory in the laundry list and then
> > recursing into __alloc_pages which will fail to make progress and OOMs.
> 
> Hmmmm... Okay that needs to be addressed. Reserves need to be used and we 
> only should enter reclaim if that runs out (like the first patch that I 
> did).
> 
> > But aside from the numerous issues with the patch set as presented, I'm
> > not seeing the seeing the big picture, why are you doing this.
> 
> I want general improvements to reclaim to address the issues that you see 
> and other issues related to reclaim instead of the strange code that makes 
> PF_MEMALLOC allocs compete for allocations from a single slab and putting 
> logic into the kernel to decide which allocs to fail. We can reclaim after 
> all. Its just a matter of finding the right way to do this. 

The latest patch I posted got rid of that global slab.

Also, all I want is for slab to honour gfp flags like page allocation
does, nothing more, nothing less.

(well, actually slightly less, since I'm only really interested in the
ALLOC_MIN|ALLOC_HIGH|ALLOC_HARDER -> ALLOC_NO_WATERMARKS transition and
not all higher ones)

I want slab to fail when a similar page alloc would fail, no magic.

Strictly speaking:

if:

 page = alloc_page(gfp);

fails but:

 obj = kmem_cache_alloc(s, gfp);

succeeds, then it's a bug.

But I'm not actually needing it that strict, just the ALLOC_NO_WATERMARK
part needs to be done, ALLOC_HARDER, ALLOC_HIGH those may fudge a bit.

> > Anonymous pages are a there to stay, and we cannot tell people how to
> > use them. So we need some free or freeable pages in order to avoid the
> > vm deadlock that arises from all memory dirty.
> 
> No one is trying to abolish Anonymous pages. Free memory is readily 
> available on demand if one calls reclaim. Your scheme introduces complex 
> negotiations over a few scraps of memory when large amounts of memory 
> would still be readily available if one would do the right thing and call 
> into reclaim.

This is the thing I contend, there need not be large amounts of memory
around. In my test prog the hot code path fits into a single page, the
rest can be anonymous.

> > 'Optimizing' this by switching to freeable pages has mainly
> > disadvantages IMHO, finding them scrambles LRU order and complexifies
> > relcaim and all that for a relatively small gain in space for clean
> > pagecache pages.
> 
> Sounds like you would like to change the way we handle memory in general 
> in the VM? Reclaim (and thus finding freeable pages) is basic to Linux 
> memory management.

Not quite, currently we have free pages in the reserves, if you want to
replace some (or all) of that by freeable pages then that is a change.

I'm just using the reserves.

> > Please, stop writing patches and write down a solid proposal of how you
> > envision the VM working in the various scenarios and why its better than
> > the current approach.
> 
> Sorry I just got into this a short time ago and I may need a few cycles 
> to get this all straight. An approach that uses memory instead of 
> ignoring available memory is certainly better.

Sure if and when possible. There will always be need to fall back to the
reserves.

A bit off-topic, re that reclaim from atomic context:
Currently we try to hold spinlocks only for short periods of time so
that reclaim can be preempted, if you run all of reclaim from a
non-preemptible context you get very large preemption latencies and if
done from int context it'd also generate large int latencies.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC 0/7] Postphone reclaim laundry to write at high water marks
  2007-08-21 21:43         ` Rik van Riel
@ 2007-08-21 22:32           ` Christoph Lameter
  2007-08-23 12:05             ` Andrea Arcangeli
  0 siblings, 1 reply; 46+ messages in thread
From: Christoph Lameter @ 2007-08-21 22:32 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Peter Zijlstra, linux-mm, dkegel, akpm, Nick Piggin, ak,
	linux-kernel

On Tue, 21 Aug 2007, Rik van Riel wrote:

> Christoph Lameter wrote:
> 
> > I want general improvements to reclaim to address the issues that you see
> > and other issues related to reclaim instead of the strange code that makes
> > PF_MEMALLOC allocs compete for allocations from a single slab and putting
> > logic into the kernel to decide which allocs to fail. We can reclaim after
> > all. Its just a matter of finding the right way to do this. 
> 
> The simplest way of achieving that would be to allow
> recursion of the page reclaim code, under the condition
> that the second level call can only reclaim clean pages,
> while the "outer" call does what the VM does today.

Yes that is what the precursor to this patchset does.

See http://marc.info/?l=linux-mm&m=118710207203449&w=2

This one did not even come up to the level of the earlier one. Sigh.

The way forward may be:

1. Like in the earlier patchset allow reentry to reclaim under 
   PF_MEMALLOC if we are out of all memory.

2. Do the laundry as here but do not write out laundry directly.
   Instead move laundry to a new lru style list in the zone structure.
   This will allow the recursive reclaim to also trigger writeout
   of pages (what this patchset was supposed to accomplish).

3. Perform writeback only from kswapd. Make other threads
   wait on kswapd if memory is low, we can wait and writeback still
   has to progress.

4. Then allow reclaim of GFP_ATOMIC allocs (see
   http://marc.info/?l=linux-kernel&m=118710595617696&w=2). Atomic
   reclaim can then also put pages onto the zone laundry lists from where
   it is going to be picked up and written out by kswapd ASAP. This one
   may be tricky so maybe keep this separate.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC 0/7] Postphone reclaim laundry to write at high water marks
  2007-08-21 22:09         ` Peter Zijlstra
@ 2007-08-21 22:43           ` Christoph Lameter
  2007-08-22  7:02             ` Peter Zijlstra
  2007-08-22  7:45             ` Ingo Molnar
  2007-08-23 12:08           ` Andrea Arcangeli
  1 sibling, 2 replies; 46+ messages in thread
From: Christoph Lameter @ 2007-08-21 22:43 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: linux-mm, linux-kernel

On Wed, 22 Aug 2007, Peter Zijlstra wrote:

> Also, all I want is for slab to honour gfp flags like page allocation
> does, nothing more, nothing less.
> 
> (well, actually slightly less, since I'm only really interrested in the
> ALLOC_MIN|ALLOC_HIGH|ALLOC_HARDER -> ALLOC_NO_WATERMARKS transition and
> not all higher ones)

I am still not sure what that brings you. There may be multiple 
PF_MEMALLOC going on at the same time. On a large system with N cpus
there may be more than N of these that can steal objects from one another. 

A NUMA system will be shot anyways if memory gets that problematic to 
handle since the OS cannot effectively place memory if all zones are 
overallocated so that only a few pages are left.


> I want slab to fail when a similar page alloc would fail, no magic.

Yes I know. I do not want allocations to fail but that reclaim occurs in 
order to avoid failing any allocation. We need provisions that 
make sure that we never get into such a bad memory situation that would
cause severe slowness and usually end up in a livelock anyways.

> > > Anonymous pages are a there to stay, and we cannot tell people how to
> > > use them. So we need some free or freeable pages in order to avoid the
> > > vm deadlock that arises from all memory dirty.
> > 
> > No one is trying to abolish Anonymous pages. Free memory is readily 
> > available on demand if one calls reclaim. Your scheme introduces complex 
> > negotiations over a few scraps of memory when large amounts of memory 
> > would still be readily available if one would do the right thing and call 
> > into reclaim.
> 
> This is the thing I contend, there need not be large amounts of memory
> around. In my test prog the hot code path fits into a single page, the
> rest can be anonymous.

Thats a bit extreme.... We need to make sure that there are larger amounts 
of memory around. Pages are used for all sorts of short term uses (like 
slab shrinking etc etc.). If memory is that low that a single page matters
then we are in very bad shape anyways.

> > Sounds like you would like to change the way we handle memory in general 
> > in the VM? Reclaim (and thus finding freeable pages) is basic to Linux 
> > memory management.
> 
> Not quite, currently we have free pages in the reserves, if you want to
> replace some (or all) of that by freeable pages then that is a change.

We have free pages primarily to optimize the allocation. Meaning we do not 
have to run reclaim on every call. We want to use all of memory. The 
reserves are there for the case that we cannot call into reclaim. The easy 
solution if that is problematic is to enhance the reclaim to work in the
critical situations that we care about.


> > Sorry I just got into this a short time ago and I may need a few cycles 
> > to get this all straight. An approach that uses memory instead of 
> > ignoring available memory is certainly better.
> 
> Sure if and when possible. There will always be need to fall back to the
> reserves.

Maybe. But we can certainly avoid that as much as possible which would 
also increase our ability to use all available memory instead of leaving 
some of it unused.

> A bit off-topic, re that reclaim from atomic context:
> Currently we try to hold spinlocks only for short periods of time so
> that reclaim can be preempted, if you run all of reclaim from a
> non-preemptible context you get very large preemption latencies and if
> done from int context it'd also generate large int latencies.

If you call into the page allocator from an interrupt context then you are 
already in bad shape since we may check pcps lists and then potentially 
have to traverse the zonelists and check all sorts of things. If we 
would implement atomic reclaim then the reserves may become a latency 
optimization. At least we will not fail anymore if the reserves are out.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC 0/7] Postphone reclaim laundry to write at high water marks
  2007-08-21 22:43           ` Christoph Lameter
@ 2007-08-22  7:02             ` Peter Zijlstra
  2007-08-22 19:04               ` Christoph Lameter
  2007-08-22  7:45             ` Ingo Molnar
  1 sibling, 1 reply; 46+ messages in thread
From: Peter Zijlstra @ 2007-08-22  7:02 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: linux-mm, linux-kernel, riel

On Tue, 2007-08-21 at 15:43 -0700, Christoph Lameter wrote:
> On Wed, 22 Aug 2007, Peter Zijlstra wrote:
> 
> > Also, all I want is for slab to honour gfp flags like page allocation
> > does, nothing more, nothing less.
> > 
> > (well, actually slightly less, since I'm only really interrested in the
> > ALLOC_MIN|ALLOC_HIGH|ALLOC_HARDER -> ALLOC_NO_WATERMARKS transition and
> > not all higher ones)
> 
> I am still not sure what that brings you. There may be multiple 
> PF_MEMALLOC going on at the same time. On a large system with N cpus
> there may be more than N of these that can steal objects from one another. 

Yes, quite aware of that, and have ideas on how to properly fix that.
Once it is, the reserves can be shrunk too, perhaps you can work on
this?

> A NUMA system will be shot anyways if memory gets that problematic to 
> handle since the OS cannot effectively place memory if all zones are 
> overallocated so that only a few pages are left.

Also not a new problem.

> > I want slab to fail when a similar page alloc would fail, no magic.
> 
> Yes I know. I do not want allocations to fail but that reclaim occurs in 
> order to avoid failing any allocation. We need provisions that 
> make sure that we never get into such a bad memory situation that would
> cause severe slowless and usually end up in a livelock anyways.

Its unavoidable, at some point it just happens. Also using reclaim
doesn't seem like the ideal way to get out of live-locks since reclaim
itself can live-lock on these large boxen.

> > > > Anonymous pages are a there to stay, and we cannot tell people how to
> > > > use them. So we need some free or freeable pages in order to avoid the
> > > > vm deadlock that arises from all memory dirty.
> > > 
> > > No one is trying to abolish Anonymous pages. Free memory is readily 
> > > available on demand if one calls reclaim. Your scheme introduces complex 
> > > negotiations over a few scraps of memory when large amounts of memory 
> > > would still be readily available if one would do the right thing and call 
> > > into reclaim.
> > 
> > This is the thing I contend, there need not be large amounts of memory
> > around. In my test prog the hot code path fits into a single page, the
> > rest can be anonymous.
> 
> Thats a bit extreme.... We need to make sure that there are larger amounts 
> of memory around. Pages are used for all shorts of short term uses (like 
> slab shrinking etc etc.). If memory is that low that a single page matters
> then we are in very bad shape anyways.

Yes we are, but its a legitimate situation. Denying it won't get us very
far. Also placing a large bound on anonymous memory usage is not going
to be appreciated by the userspace people.

Slab cache will also be at a minimum if the pressure persists for a
while.

> > > Sounds like you would like to change the way we handle memory in general 
> > > in the VM? Reclaim (and thus finding freeable pages) is basic to Linux 
> > > memory management.
> > 
> > Not quite, currently we have free pages in the reserves, if you want to
> > replace some (or all) of that by freeable pages then that is a change.
> 
> We have free pages primarily to optimize the allocation. Meaning we do not 
> have to run reclaim on every call. We want to use all of memory. The 
> reserves are there for the case that we cannot call into reclaim. 

> The easy 
> solution if that is problematic is to enhance the reclaim to work in the
> critical situations that we care about.

As shown, there are cases where there just isn't any memory to reclaim.
Please accept this.

Also, by reclaiming memory and getting out of the tight spot you give
the rest of the system access to that memory, and it can be used for
other things than getting out of the tight spot.

You really want a separate allocation state that allows only reclaim to
access memory.

> > > Sorry I just got into this a short time ago and I may need a few cycles 
> > > to get this all straight. An approach that uses memory instead of 
> > > ignoring available memory is certainly better.
> > 
> > Sure if and when possible. There will always be need to fall back to the
> > reserves.
> 
> Maybe. But we can certainly avoid that as much as possible which would 
> also increase our ability to use all available memory instead of leaving 
> some of it unused./
> 
> > A bit off-topic, re that reclaim from atomic context:
> > Currently we try to hold spinlocks only for short periods of time so
> > that reclaim can be preempted, if you run all of reclaim from a
> > non-preemptible context you get very large preemption latencies and if
> > done from int context it'd also generate large int latencies.
> 
> If you call into the page allocator from an interrupt context then you are 
> already in bad shape since we may check pcps lists and then potentially 
> have to traverse the zonelists and check all sorts of things. 

Only an issue on these obscenely large NUMA boxen, normal machines don't
have large zone lists. No reason to hurt the small boxen in favour of
the large boxen.

> If we 
> would implement atomic reclaim then the reserves may become a latency 
> optimizations. At least we will not fail anymore if the reserves are out.

Yes it will, because there is no guarantee that there is anything
reclaimable.

Also, failing a memory allocation isn't bad, why are you so worried
about that? It happens all the time.


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC 0/7] Postphone reclaim laundry to write at high water marks
  2007-08-21 22:43           ` Christoph Lameter
  2007-08-22  7:02             ` Peter Zijlstra
@ 2007-08-22  7:45             ` Ingo Molnar
  2007-08-22 19:19               ` Christoph Lameter
  1 sibling, 1 reply; 46+ messages in thread
From: Ingo Molnar @ 2007-08-22  7:45 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Peter Zijlstra, linux-mm, linux-kernel

* Christoph Lameter <clameter@sgi.com> wrote:

> > I want slab to fail when a similar page alloc would fail, no magic.
> 
> Yes I know. I do not want allocations to fail but that reclaim occurs 
> in order to avoid failing any allocation. We need provisions that make 
> sure that we never get into such a bad memory situation that would 
> cause severe slowless and usually end up in a livelock anyways.

Could you outline the "big picture" as you see it? To me your argument 
that reclaim can always be done instantly and that the cases where it 
cannot be done are pathological and need to be avoided is fundamentally 
dangerous and quite a bit short-sighted at first glance.

The big picture way to think about this is the following: the free page 
pool is the "cache" of the MM. It's what "greases" the mechanism and 
bridges the inevitable reclaim latency and makes "atomic memory" 
available to the reclaim mechanism itself. We _cannot_ remove that cache 
without a conceptual replacement (or a _very_ robust argument and proof 
that the free pages pool is not needed at all - this would be a major 
design change (and a stupid mistake IMO)). Your patchset, in essence, 
tries to claim that we dont really need this cache and that all that 
matters is to keep enough clean pagecache pages around. That approach 
misses the full picture and i dont think we can progress without 
agreeing on the fundamentals first.

That "cache" cannot be handled in your scheme: a fully or mostly 
anonymous workload (tons of apps are like that) instantly destroys the 
"there is always a minimal amount of atomically reclaimable pages 
around" property of freelists, and this cannot be talked or tweaked 
around by twiddling any existing property of anonymous reclaim. 
Anonymous memory is dirty and takes ages to reclaim. The fact that your 
patchset causes an easy anonymous OOM further underlines this flaw of 
your thinking. Not making anonymous workloads OOM is the _hardest_ part 
of the MM, by far. Pagecache reclaim is a breeze in comparison :-)

So there is a large and fundamental rift between having pages on the 
freelist (instantly available to any context) and having them on the 
(current) LRU where they might or might not be clean, etc. The freelists 
are an implicit guarantee of buffering and atomicity and they can and do 
save the day if everything else fails to keep stuff insta-freeable. (And 
then we havent even considered the performance and scalability 
differences between picking from the pcp freelists versus picking pages 
from the LRU, havent considered the better higher-order page allocation 
property of the buddy pool and havent considered the atomicity of 
in-irq-handler allocations.)

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC 0/7] Postphone reclaim laundry to write at high water marks
  2007-08-22  7:02             ` Peter Zijlstra
@ 2007-08-22 19:04               ` Christoph Lameter
  2007-08-22 20:03                 ` Peter Zijlstra
  0 siblings, 1 reply; 46+ messages in thread
From: Christoph Lameter @ 2007-08-22 19:04 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: linux-mm, linux-kernel, riel

On Wed, 22 Aug 2007, Peter Zijlstra wrote:

> Its unavoidable, at some point it just happens. Also using reclaim
> doesn't seem like the ideal way to get out of live-locks since reclaim
> itself can live-lock on these large boxen.

If reclaim can live lock then it needs to be fixed.

> As shown, there are cases where there just isn't any memory to reclaim.
> Please accept this.

That is an extreme case that AFAIK we currently ignore and could be 
avoided with some effort. The initial PF_MEMALLOC patchset seems to be 
still enough to deal with your issues.

> Also, by reclaiming memory and getting out of the tight spot you give
> the rest of the system access to that memory, and it can be used for
> other things than getting out of the tight spot.

The rest of the system may have its own tight spots. Language like "the 
tight spot" sets off all sorts of alarms over here since you seem to be 
thinking about a system doing a single task. The system may be handling 
multiple critical tasks on various devices that have various memory needs. 
So multiple critical spots can happen concurrently in multiple 
application contexts.

> You really want a separate allocation state that allows only reclaim to
> access memory.

We have that with PF_MEMALLOC.

> Also, failing a memory allocation isn't bad, why are you so worried
> about that? It happens all the time.

It's a performance impact and plainly does not make sense if there is 
reclaimable memory available. The common action of the vm is to reclaim if 
there is a demand for memory. Now we suddenly abandon that approach?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC 0/7] Postphone reclaim laundry to write at high water marks
  2007-08-22  7:45             ` Ingo Molnar
@ 2007-08-22 19:19               ` Christoph Lameter
  0 siblings, 0 replies; 46+ messages in thread
From: Christoph Lameter @ 2007-08-22 19:19 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Peter Zijlstra, linux-mm, linux-kernel

On Wed, 22 Aug 2007, Ingo Molnar wrote:

> Could you outline the "big picture" as you see it? To me your argument 
> that reclaim can always be done instantly and that the cases where it 
> cannot be done are pathological and need to be avoided is fundamentally 
> dangerous and quite a bit short-sighted at first glance.

That is a bit overdrawing my argument. The issues that Peter saw can be 
fixed by allowing recursive reclaim (see the earlier patchset). The rest 
is so far sugar on top or building extreme cases where we already have 
trouble.

> The big picture way to think about this is the following: the free page 
> pool is the "cache" of the MM. It's what "greases" the mechanism and 
> bridges the inevitable reclaim latency and makes "atomic memory" 
> available to the reclaim mechanism itself. We _cannot_ remove that cache 
> without a conceptual replacement (or a _very_ robust argument and proof 
> that the free pages pool is not needed at all - this would be a major 
> design change (and a stupid mistake IMO)). Your patchset, in essence, 
> tries to claim that we dont really need this cache and that all that 
> matters is to keep enough clean pagecache pages around. That approach 
> misses the full picture and i dont think we can progress without 
> agreeing on the fundamentals first.

The patchset attempts to deal with the reserves in a more intelligent 
way in order not to fail when this pool becomes exhausted because some
device needs a lot of memory in the writeout path.

> That "cache" cannot be handled in your scheme: a fully or mostly 
> anonymous workload (tons of apps are like that) instantly destroys the 
> "there is always a minimal amount of atomically reclaimable pages 
> around" property of freelists, and this cannot be talked or tweaked 
> around by twiddling any existing property of anonymous reclaim. 

An extreme anonymous workload like the one discussed here can even cause the 
current VM to fail. Realistically at least portions of the executable and 
various slab caches will remain in memory in addition to the reserves.

> Anonymous memory is dirty and takes ages to reclaim. The fact that your 
> patchset causes an easy anonymous OOM further underlines this flaw of 
> your thinking. Not making anonymous workloads OOM is the _hardest_ part 
> of the MM, by far. Pagecache reclaim is a breeze in comparison :-)

The central flaw in my thinking was the switching off of PF_MEMALLOC on the 
writeout path instead of allowing recursive PF_MEMALLOC reclaim as in the 
first patch. But the first patchset did not have that flaw.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC 0/7] Postphone reclaim laundry to write at high water marks
  2007-08-22 19:04               ` Christoph Lameter
@ 2007-08-22 20:03                 ` Peter Zijlstra
  2007-08-22 20:16                   ` Christoph Lameter
  2007-08-23 12:16                   ` Andrea Arcangeli
  0 siblings, 2 replies; 46+ messages in thread
From: Peter Zijlstra @ 2007-08-22 20:03 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: linux-mm, linux-kernel, riel

On Wed, 2007-08-22 at 12:04 -0700, Christoph Lameter wrote:
> On Wed, 22 Aug 2007, Peter Zijlstra wrote:
> 
> > Its unavoidable, at some point it just happens. Also using reclaim
> > doesn't seem like the ideal way to get out of live-locks since reclaim
> > itself can live-lock on these large boxen.
> 
> If reclaim can live lock then it needs to be fixed.

Riel is working on that.

> > As shown, there are cases where there just isn't any memory to reclaim.
> > Please accept this.
> 
> That is an extreme case that AFAIK we currently ignore and could be 
> avoided with some effort.

Its not extreme, not even rare, and its handled now. Its what
PF_MEMALLOC is for.

> The initial PF_MEMALLOC patchset seems to be 
> still enough to deal with your issues.

No it isnt. 

Take the anonymous workload, user-space will block once the page
allocator hits ALLOC_MIN. Network will be able to receive until
ALLOC_MIN|ALLOC_HIGH - if the completion doesn't arrive by then it will
start dropping all packets until there is memory again. But userspace is
wedged and hence will not consume the network traffic, hence we
deadlock.

Even if there is something to reclaim initially, if the pressure
persists that can eventually be exhausted.

> > Also, by reclaiming memory and getting out of the tight spot you give
> > the rest of the system access to that memory, and it can be used for
> > other things than getting out of the tight spot.
> 
> The rest of the system may have their own tights spots. Language the "the 
> tight spot" sets up all sort of alarms over here since you seem to be 
> thinking about a system doing a single task.

reclaim

>  The system may be handling 
> multiple critical tasks on various devices that have various memory needs. 
> So multiple critical spots can happen concurrently in multiple 
> application contexts.

yes, reclaim can be unbounded concurrent, and that is one of the
(theoretically) major problems we currently have.

> > You really want a separate allocation state that allows only reclaim to
> > access memory.
> 
> We have that with PF_MEMALLOC.

Exactly. But if you recognise the need for PF_MEMALLOC then what is this
argument about?

Networking can currently be seen as having two states:

 1 receive packets and consume memory
 2 drop all packets (when out of memory)

I need a 3rd state:

 3 receiving packets but not consuming memory

Now, I need this state when we're in PF_MEMALLOC territory, because I
need to be able to process an unspecified amount of network traffic in
order to receive the writeout completion.

In order to operate this 3rd network state, some memory is needed in
which packets can be received and when deemed not important freed and
reused.

It needs a bounded amount of memory in order to process an unbounded
amount of network traffic.

What exactly is not clear about this? If you accept the need for
PF_MEMALLOC you surely must also agree that at the point you're using it
running reclaim is useless.

> > Also, failing a memory allocation isn't bad, why are you so worried
> > about that? It happens all the time.
> 
> Its a performance impact and plainly does not make sense if there is 
> reclaimable memory availble. The common action of the vm is to reclaim if 
> there is a demand for memory. Now we suddenly abandon that approach?

I'm utterly confused by this, on one hand you recognise the need for
PF_MEMALLOC but on the other hand you're saying its not needed and
anybody needing memory (even reclaim itself) should use reclaim.


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC 0/7] Postphone reclaim laundry to write at high water marks
  2007-08-22 20:03                 ` Peter Zijlstra
@ 2007-08-22 20:16                   ` Christoph Lameter
  2007-08-23  7:39                     ` Peter Zijlstra
  2007-08-26  4:52                     ` Rik van Riel
  2007-08-23 12:16                   ` Andrea Arcangeli
  1 sibling, 2 replies; 46+ messages in thread
From: Christoph Lameter @ 2007-08-22 20:16 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: linux-mm, linux-kernel, riel

On Wed, 22 Aug 2007, Peter Zijlstra wrote:

> > That is an extreme case that AFAIK we currently ignore and could be 
> > avoided with some effort.
> 
> Its not extreme, not even rare, and its handled now. Its what
> PF_MEMALLOC is for.

No its not. If you have all pages allocated as anonymous pages and your 
writeout requires more pages than available in the reserves then you are 
screwed either way regardless if you have PF_MEMALLOC set or not.

> > The initial PF_MEMALLOC patchset seems to be 
> > still enough to deal with your issues.
> 
> Take the anonyous workload, user-space will block once the page
> allocator hits ALLOC_MIN. Network will be able to receive until
> ALLOC_MIN|ALLOC_HIGH - if the completion doesn't arrive by then it will
> start dropping all packets until there is memory again. But userspace is
> wedged and hence will not consume the network traffic, hence we
> deadlock.
> 
> Even if there is something to reclaim initially, if the pressure
> persists that can eventually be exhausted.

Sure ultimately you will end up with pages that are all unreclaimable if 
you reclaim all reclaimable memory.

> > multiple critical tasks on various devices that have various memory needs. 
> > So multiple critical spots can happen concurrently in multiple 
> > application contexts.
> 
> yes, reclaim can be unbounded concurrent, and that is one of the
> (theoretically) major problems we currently have.

So your patchset is not fixing it?

> > We have that with PF_MEMALLOC.
> 
> Exactly. But if you recognise the need for PF_MEMALLOC then what is this
> argument about?

The PF_MEMALLOC patchset f.e. is about avoiding to go out of 
memory when there is still memory available even if we are doing a 
PF_MEMALLOC allocation and would OOM otherwise.

> Networking can currently be seen as having two states:
> 
>  1 receive packets and consume memory
>  2 drop all packets (when out of memory)
> 
> I need a 3rd state:
> 
>  3 receiving packets but not consuming memory

So far a good idea. If you are not consuming memory then why are the 
allocators involved?
 
> Now, I need this state when we're in PF_MEMALLOC territory, because I
> need to be able to process an unspecified amount of network traffic in
> order to receive the writeout completion.
> 
> In order to operate this 3rd network state, some memory is needed in
> which packets can be received and when deemed not important freed and
> reused.
> 
> It needs a bounded amount of memory in order to process an unbounded
> amount of network traffic.
> 
> What exactly is not clear about this? If you accept the need for
> PF_MEMALLOC you surely must also agree that at the point you're using it
> running reclaim is useless.

Yes looks like you would like to add something to the network layer to 
filter important packets. As long as you stay within PF_MEMALLOC 
boundaries you can allocate and throw packets away. If you want to have a 
reserve that is secure and just for you then you need to take it away from 
the reserves (which in turn will lead reclaim to restore them).

> > > Also, failing a memory allocation isn't bad, why are you so worried
> > > about that? It happens all the time.
> > 
> > Its a performance impact and plainly does not make sense if there is 
> > reclaimable memory availble. The common action of the vm is to reclaim if 
> > there is a demand for memory. Now we suddenly abandon that approach?
> 
> I'm utterly confused by this, on one hand you recognise the need for
> PF_MEMALLOC but on the other hand you're saying its not needed and
> anybody needing memory (even reclaim itself) should use reclaim.

The VM reclaims memory on demand but in exceptional limited cases where we 
cannot do so we use the reserves. I am sure you know this.


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC 0/7] Postphone reclaim laundry to write at high water marks
  2007-08-22 20:16                   ` Christoph Lameter
@ 2007-08-23  7:39                     ` Peter Zijlstra
  2007-08-26  4:52                     ` Rik van Riel
  1 sibling, 0 replies; 46+ messages in thread
From: Peter Zijlstra @ 2007-08-23  7:39 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: linux-mm, linux-kernel, riel

On Wed, 2007-08-22 at 13:16 -0700, Christoph Lameter wrote:
> On Wed, 22 Aug 2007, Peter Zijlstra wrote:


> > > > As shown, there are cases where there just isn't any memory to reclaim.
                                                                       ^^^^^^^
> > > > Please accept this.

> > > That is an extreme case that AFAIK we currently ignore and could be 
> > > avoided with some effort.
> > 
> > Its not extreme, not even rare, and its handled now. Its what
> > PF_MEMALLOC is for.
> 
> No its not. If you have all pages allocated as anonymous pages and your 
> writeout requires more pages than available in the reserves then you are 
> screwed either way regardless if you have PF_MEMALLOC set or not.

Christoph, we were talking about memory to reclaim, not about exhausting
the reserves.

> > > The initial PF_MEMALLOC patchset seems to be 
> > > still enough to deal with your issues.
> > 
> > Take the anonyous workload, user-space will block once the page
> > allocator hits ALLOC_MIN. Network will be able to receive until
> > ALLOC_MIN|ALLOC_HIGH - if the completion doesn't arrive by then it will
> > start dropping all packets until there is memory again. But userspace is
> > wedged and hence will not consume the network traffic, hence we
> > deadlock.
> > 
> > Even if there is something to reclaim initially, if the pressure
> > persists that can eventually be exhausted.
> 
> Sure ultimately you will end up with pages that are all unreclaimable if 
> you reclaim all reclaimable memory.
> 
> > > multiple critical tasks on various devices that have various memory needs. 
> > > So multiple critical spots can happen concurrently in multiple 
> > > application contexts.
> > 
> > yes, reclaim can be unbounded concurrent, and that is one of the
> > (theoretically) major problems we currently have.
> 
> So your patchset is not fixing it?

No, and I never said it would. I've been meaning to do one that does
though. Just haven't come around to actually doing it :-/

> > > We have that with PF_MEMALLOC.
> > 
> > Exactly. But if you recognise the need for PF_MEMALLOC then what is this
> > argument about?
> 
> The PF_MEMALLOC patchset f.e. is about avoiding to go out of 
> memory when there is still memory available even if we are doing a 
> PF_MEMALLOC allocation and would OOM otherwise.

Right, but as long as there is a need for PF_MEMALLOC there is a need
for the patches I proposed.

> > Networking can currently be seen as having two states:
> > 
> >  1 receive packets and consume memory
> >  2 drop all packets (when out of memory)
> > 
> > I need a 3rd state:
> > 
> >  3 receiving packets but not consuming memory
> 
> So far a good idea. If you are not consuming memory then why are the 
> allocators involved?

Because I do need to receive some packets, its just that I'll free them
again. So it won't keep consuming memory. This needs a little pool of
memory in order to operate in a stable state.

Its: alloc, receive, inspect, free
total memory use: 0
memory delta: a little
 
(its just that you need to be able to receive a significant number of
packets, not 1, due to funny things like ip-defragmentation before you
can be sure to actually receive 1 whole tcp packet - but the idea is the
same)

> > Now, I need this state when we're in PF_MEMALLOC territory, because I
> > need to be able to process an unspecified amount of network traffic in
> > order to receive the writeout completion.
> > 
> > In order to operate this 3rd network state, some memory is needed in
> > which packets can be received and when deemed not important freed and
> > reused.
> > 
> > It needs a bounded amount of memory in order to process an unbounded
> > amount of network traffic.
> > 
> > What exactly is not clear about this? If you accept the need for
> > PF_MEMALLOC you surely must also agree that at the point you're using it
> > running reclaim is useless.
> 
> Yes looks like you would like to add something to the network layer to 
> filter important packets. As long as you stay within PF_MEMALLOC 
> boundaries you can allocate and throw packets away. If you want to have a 
> reserve that is secure and just for you then you need to take it away from 
> the reserves (which in turn will lead reclaim to restore them).

Ah, but also note that _using_ PF_MEMALLOC is the trigger to enter that
3rd network state. These two are tightly coupled. You only need this 3rd
state when under PF_MEMALLOC, otherwise we could just receive normally.

So, my thinking was that, if the current reserves are good enough to
keep the system 'deadlock' free, I can just enlarge the reserves by
whatever it is I need for that network state and we're all good, no?

Why separate these two? If the current reserve is large enough (and
theoretically it is not - but I'm meaning to fix that) it will not
consume the extra memory I added below.

Note how:
  [PATCH 09/10] mm: emergency pool
pushes up the current reserves in a fashion so as to maintain the
relative operating range of the page allocator (distance between
min,low,high and scaling of the wmarks under ALLOC_HIGH|ALLOC_HARDER).

> > > > Also, failing a memory allocation isn't bad, why are you so worried
> > > > about that? It happens all the time.
> > > 
> > > Its a performance impact and plainly does not make sense if there is 
> > > reclaimable memory availble. The common action of the vm is to reclaim if 
> > > there is a demand for memory. Now we suddenly abandon that approach?
> > 
> > I'm utterly confused by this, on one hand you recognise the need for
> > PF_MEMALLOC but on the other hand you're saying its not needed and
> > anybody needing memory (even reclaim itself) should use reclaim.
> 
> The VM reclaims memory on demand but in exceptional limited cases where we 
> cannot do so we use the reserves. I am sure you know this.

Its the abandon part I got confused about. I'm not at all abandoning
reclaim, its just that I must operate under PF_MEMALLOC, so reclaim is
pointless.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC 0/7] Postphone reclaim laundry to write at high water marks
  2007-08-21 22:32           ` Christoph Lameter
@ 2007-08-23 12:05             ` Andrea Arcangeli
  2007-08-23 20:23               ` Christoph Lameter
  0 siblings, 1 reply; 46+ messages in thread
From: Andrea Arcangeli @ 2007-08-23 12:05 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Rik van Riel, Peter Zijlstra, linux-mm, dkegel, akpm, Nick Piggin,
	ak, linux-kernel

On Tue, Aug 21, 2007 at 03:32:25PM -0700, Christoph Lameter wrote:
> 1. Like in the earlier patchset allow reentry to reclaim under 
>    PF_MEMALLOC if we are out of all memory.

Can you simply tweak on the may_writepage flag only to achieve the
second pass? We're talking here about a totally non-performance case,
almost impossible to hit in practice unless you do real weird things,
and certainly very unlikely to happen. So I'm unsure what's all that
complexity just to make a regular pass on the lru looking for clean
pages, something may_writepage=0 already does.

Like Andi said at most one may_writepage=0 recursion should be
allowed.

If the PF_MEMALLOC is found empty, I agree entering reclaim a second
time with may_writepage=0 sounds theoretically a good idea (in
practice it should never be necessary). printk must also be printed to
warn the user he was risking to deadlock for real and he has to
increase the min_free_kbytes.

> 2. Do the laundry as here but do not write out laundry directly.
>    Instead move laundry to a new lru style list in the zone structure.
>    This will allow the recursive reclaim to also trigger writeout
>    of pages (what this patchset was supposed to accomplish).

A new lru for this sounds overkill to me, we're talking about deadlock
avoidance, this has absolutely nothing to do with real life 99.9999%
of runtime of all kernels out there.

> 3. Perform writeback only from kswapd. Make other threads
>    wait on kswapd if memory is low, we can wait and writeback still
>    has to progress.

What does it buy you to think about other threads? The whole trouble is
that PF_MEMALLOC is global, no matter which thread (pdflush like other
email to Andi or kswapd here) still it'll deadlock the same way. If
your intent is to limit the max number of in-flight writepage that
could be achieved with a semaphore, not by context switching for no
good reason. kswapd is needed for atomic allocations and to pipeline
the VM so that the vm runs more likely asynchronous inside kswapd.

> 4. Then allow reclaim of GFP_ATOMIC allocs (see
>    http://marc.info/?l=linux-kernel&m=118710595617696&w=2). Atomic
>    reclaim can then also put pages onto the zone laundry lists from where
>    it is going to be picked up and written out by kswapd ASAP. This one
>    may be tricky so maybe keep this separate.

That sounds a bit risky, there are latency considerations here to
make, GFP_ATOMIC will run with irq locally disabled and it may hang
for indefinite amount of time (O(N)). So irq latency may break and it
may be better to lose a packet once in a while than to hang
interrupts. If you want to do this you'd probably need to add a new
GFP_ATOMIC_RECLAIM or similar.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC 0/7] Postphone reclaim laundry to write at high water marks
  2007-08-21 22:09         ` Peter Zijlstra
  2007-08-21 22:43           ` Christoph Lameter
@ 2007-08-23 12:08           ` Andrea Arcangeli
  2007-08-23 12:59             ` Peter Zijlstra
  1 sibling, 1 reply; 46+ messages in thread
From: Andrea Arcangeli @ 2007-08-23 12:08 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Christoph Lameter, linux-mm, linux-kernel

On Wed, Aug 22, 2007 at 12:09:03AM +0200, Peter Zijlstra wrote:
> Strictly speaking:
> 
> if:
> 
>  page = alloc_page(gfp);
> 
> fails but:
> 
>  obj = kmem_cache_alloc(s, gfp);
> 
> succeeds then its a bug.

Why? this is like saying that if alloc_pages(order=1) fails but
alloc_pages(order=0) succeeds then it's a bug. Obviously it's not a
bug.

The only bug is if slab allocations <=4k fails despite
alloc_pages(order=0) would succeed.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC 0/7] Postphone reclaim laundry to write at high water marks
  2007-08-22 20:03                 ` Peter Zijlstra
  2007-08-22 20:16                   ` Christoph Lameter
@ 2007-08-23 12:16                   ` Andrea Arcangeli
  1 sibling, 0 replies; 46+ messages in thread
From: Andrea Arcangeli @ 2007-08-23 12:16 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Christoph Lameter, linux-mm, linux-kernel, riel

On Wed, Aug 22, 2007 at 10:03:45PM +0200, Peter Zijlstra wrote:
> Its not extreme, not even rare, and its handled now. Its what
> PF_MEMALLOC is for.

Agreed. This is the whole point, either you limit the max amount of
anon memory, slab, alloc_pages a driver can do or you reserve a
pool. Guess what? In practice limiting the max ram a driver can eat in
alloc_pages, at the same time while limiting the max amount of pages
that can be anon ram, etc..etc.. is called "reserving a pool of
freepages for PF_MEMALLOC".

Now in theory we could try a may_writepage=0 second reclaim pass
before using the PF_MEMALLOC pool but would that make any difference
other than being slower? We can argue what should be done first but
the PF_MEMALLOC pool isn't likely to go away with this patch... only
way to make it go away is to have every subsystem including tcp
incoming to have mempools for everything which is too complicated to
implement so we have to live with the imperfect world that just works good
enough.

This logic of falling back in a may_writepage=0 pass will make things
a bit more reliable but certainly not perfect and it doesn't obsolete
the need of the current code IMHO.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC 0/7] Postphone reclaim laundry to write at high water marks
  2007-08-23 12:08           ` Andrea Arcangeli
@ 2007-08-23 12:59             ` Peter Zijlstra
  0 siblings, 0 replies; 46+ messages in thread
From: Peter Zijlstra @ 2007-08-23 12:59 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Christoph Lameter, linux-mm, linux-kernel

On Thu, 2007-08-23 at 14:08 +0200, Andrea Arcangeli wrote:
> On Wed, Aug 22, 2007 at 12:09:03AM +0200, Peter Zijlstra wrote:
> > Strictly speaking:
> > 
> > if:
> > 
> >  page = alloc_page(gfp);
> > 
> > fails but:
> > 
> >  obj = kmem_cache_alloc(s, gfp);
> > 
> > succeeds then its a bug.
> 
> Why? this is like saying that if alloc_pages(order=1) fails but
> alloc_pages(order=0) succeeds then it's a bug. Obviously it's not a
> bug.
> 
> The only bug is if slab allocations <=4k fails despite
> alloc_pages(order=0) would succeed.

That would be currently true. However I need it to be stricter.

I'm wanting to do networked swap. And in order to be able to receive
writeout completions when in the PF_MEMALLOC region I need to introduce
a new network state. This is because it needs to operate in a steady
state with limited (bounded) memory use.

Normal network either consumes memory, or fails to receive anything at
all.

So this new network state will allocate space for a packet, receive the
packet from the NIC, inspect the packet, and toss the packet when its
not found to be aimed at the VM (ie. does not contain a writeout
completion).

So the total memory consumption of this state is 0 (it always frees
what it takes), but the memory use is non-0 yet bounded (it does
temporarily use memory, but will limit itself to never exceed a given
maximum).

Because the network stack runs on the slab allocator in generic (both
kmem_cache and kmalloc) I need this extra guarantee so that a slab
allocated from the reserves will not serve objects to some random
non-critical application.

If this is not restricted this network state can leak memory to outside
of PF_MEMALLOC and will not be stable.

So what I need is:

  kmem_cache_alloc(s, gfp) to fail when alloc_page(gfp) fails

agreeing on the extra condition:

  when kmem_cache_size(s) <= PAGE_SIZE

and the extra note that:

  I only really need it to fail for ALLOC_NO_WATERMARKS, the other
  levels like ALLOC_HIGH and ALLOC_HARDER are not critical.

Which ends up with:

  if the current gfp-context does not allow ALLOC_NO_WATERMARKS
allocations, and alloc_page() fails, so must kmem_cache_alloc(s,) if
kmem_cache_size(s) <= PAGE_SIZE.

(yes this leaves jumbo frames broken)

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC 0/7] Postphone reclaim laundry to write at high water marks
  2007-08-23 12:05             ` Andrea Arcangeli
@ 2007-08-23 20:23               ` Christoph Lameter
  0 siblings, 0 replies; 46+ messages in thread
From: Christoph Lameter @ 2007-08-23 20:23 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Rik van Riel, Peter Zijlstra, linux-mm, dkegel, akpm, Nick Piggin,
	ak, linux-kernel

On Thu, 23 Aug 2007, Andrea Arcangeli wrote:

> On Tue, Aug 21, 2007 at 03:32:25PM -0700, Christoph Lameter wrote:
> > 1. Like in the earlier patchset allow reentry to reclaim under 
> >    PF_MEMALLOC if we are out of all memory.
> 
> Can you simply tweak on the may_writepage flag only to achieve the
> second pass? We're talking here about a totally non-performance case,
> almost impossible to hit in practice unless you do real weird things,
> and certainly very unlikely to happen. So I'm unsure what's all that
> complexity just to make a regular pass on the lru looking for clean
> pages, something may_writepage=0 already does.
> 

Yes that is what the PF_MEMALLOC patch that I posted before does. This 
discussion gets me more and more to thinking that the recursive reclaim on 
PF_MEMALLOC is all that is needed for emergency situations (to get out of 
the "tight spot").

See
http://marc.info/?l=linux-kernel&m=118710219116624&w=2

> If the PF_MEMALLOC is found empty, I agree entering reclaim a second
> time with may_writepage=0 sounds theoretically a good idea (in
> practice it should never be necessary). printk must also be printed to
> warn the user he was risking to deadlock for real and he has to
> increase the min_free_kbytes.

Ok. I can add a printk to that one.

> That sounds a bit risky, there are latency considerations here to
> make, GFP_ATOMIC will run with irq locally disabled and it may hang
> for indefinite amount of time (O(N)). So irq latency may break and it
> may be better to lose a packet once in a while than to hang
> interrupts. If you want to do this you'd probably need to add a new
> GFP_ATOMIC_RECLAIM or similar.

Well we could do the same as for PF_MEMALLOC: print a warning and then 
reclaim nevertheless if we cannot fail (We already have a GFP_NOFAIL 
flag). It is better to generate a latency than the system failing 
altogether. However the GFP_ATOMIC reclaim patchset is a 
bit more invasive (http://marc.info/?l=linux-mm&m=118710584014150&w=2). 
Maybe this is too much churn for the rare need of such a reclaim.
 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [RFC 0/7] Postphone reclaim laundry to write at high water marks
  2007-08-22 20:16                   ` Christoph Lameter
  2007-08-23  7:39                     ` Peter Zijlstra
@ 2007-08-26  4:52                     ` Rik van Riel
  1 sibling, 0 replies; 46+ messages in thread
From: Rik van Riel @ 2007-08-26  4:52 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Peter Zijlstra, linux-mm, linux-kernel

Christoph Lameter wrote:
> On Wed, 22 Aug 2007, Peter Zijlstra wrote:
> 
>>> That is an extreme case that AFAIK we currently ignore and could be 
>>> avoided with some effort.
>> It's not extreme, not even rare, and it's handled now. It's what
>> PF_MEMALLOC is for.
> 
> No, it's not. If you have all pages allocated as anonymous pages and your 
> writeout requires more pages than are available in the reserves then you are 
> screwed either way, regardless of whether you have PF_MEMALLOC set or not.

Only if the _first_ writeout needs more pages.

If the sum of all writeouts needs more pages than you have
available, that is fine.  After all, buffer heads and some
other metadata are freed on IO completion.

Recursive reclaim will also be able to free the data pages
after IO completion, and really fix the problem.

-- 
Politics is the struggle between those who want to make their country
the best in the world, and those who believe it already is.  Each group
calls the other unpatriotic.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 46+ messages in thread

end of thread, other threads:[~2007-08-26  4:52 UTC | newest]

Thread overview: 46+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-08-20 21:50 [RFC 0/7] Postphone reclaim laundry to write at high water marks Christoph Lameter
2007-08-20 21:50 ` [RFC 1/7] release_lru_pages(): Generic release of pages to the LRU Christoph Lameter
2007-08-21 14:52   ` Mel Gorman
2007-08-21 20:51     ` Christoph Lameter
2007-08-20 21:50 ` [RFC 2/7] Move checks from pageout() to shrink_page_list Christoph Lameter
2007-08-20 21:50 ` [RFC 3/7] shrink_page_list: Support isolating dirty pages on laundry list Christoph Lameter
2007-08-21 15:04   ` Mel Gorman
2007-08-21 20:53     ` Christoph Lameter
2007-08-20 21:50 ` [RFC 4/7] Pass laundry through shrink_inactive_list() and shrink_zone() Christoph Lameter
2007-08-20 21:50 ` [RFC 5/7] Laundry handling for direct reclaim Christoph Lameter
2007-08-21 15:06   ` Mel Gorman
2007-08-21 20:55     ` Christoph Lameter
2007-08-21 15:19   ` Mel Gorman
2007-08-21 21:00     ` Christoph Lameter
2007-08-20 21:50 ` [RFC 6/7] kswapd: Do laundry after reclaim Christoph Lameter
2007-08-20 21:50 ` [RFC 7/7] Switch of PF_MEMALLOC during writeout Christoph Lameter
2007-08-20 23:08   ` Andi Kleen
2007-08-20 23:19     ` Christoph Lameter
2007-08-21  1:13       ` Andi Kleen
2007-08-21 10:36 ` [RFC 0/7] Postphone reclaim laundry to write at high water marks Peter Zijlstra
2007-08-21 20:48   ` Christoph Lameter
2007-08-21 21:13     ` Peter Zijlstra
2007-08-21 21:29       ` Christoph Lameter
2007-08-21 21:43         ` Rik van Riel
2007-08-21 22:32           ` Christoph Lameter
2007-08-23 12:05             ` Andrea Arcangeli
2007-08-23 20:23               ` Christoph Lameter
2007-08-21 22:09         ` Peter Zijlstra
2007-08-21 22:43           ` Christoph Lameter
2007-08-22  7:02             ` Peter Zijlstra
2007-08-22 19:04               ` Christoph Lameter
2007-08-22 20:03                 ` Peter Zijlstra
2007-08-22 20:16                   ` Christoph Lameter
2007-08-23  7:39                     ` Peter Zijlstra
2007-08-26  4:52                     ` Rik van Riel
2007-08-23 12:16                   ` Andrea Arcangeli
2007-08-22  7:45             ` Ingo Molnar
2007-08-22 19:19               ` Christoph Lameter
2007-08-23 12:08           ` Andrea Arcangeli
2007-08-23 12:59             ` Peter Zijlstra
2007-08-21 15:16 ` Rik van Riel
2007-08-21 20:59   ` Christoph Lameter
2007-08-21 21:14     ` Rik van Riel
2007-08-21 21:30       ` Christoph Lameter
2007-08-21 15:51 ` Dave McCracken
2007-08-21 21:03   ` Christoph Lameter

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).