* [RFC][PATCH] big continuous memory allocator v2
From: KAMEZAWA Hiroyuki @ 2010-09-07 2:45 UTC
To: linux-mm@kvack.org
Cc: linux-kernel@vger.kernel.org, minchan.kim@gmail.com, Mel Gorman,
kosaki.motohiro@jp.fujitsu.com
This is a page allocator based on the memory migration/hotplug code.
It passed some small tests and may be easier to read than the previous one.
==
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
This patch adds a memory allocator for contiguous memory larger than MAX_ORDER.
alloc_contig_pages(hint, size, node);
This function allocates 'size' contiguous pages on 'node' whose physical
address is higher than 'hint'. Both size and hint are specified in pfns.
Each allocated page has page_count() set to 1.
The return value is the first page of the allocated range.
free_contig_pages(start, size)
frees all pages in the range.
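A minimal usage sketch of the interface (the 16MB size and the error
handling are only illustrative, not part of the patch):

	unsigned long nr = (16 << 20) >> PAGE_SHIFT;	/* 16MB worth of pages */
	struct page *page;

	page = alloc_contig_pages(0, nr, -1);	/* hint=0, any node */
	if (!page)
		return -ENOMEM;		/* failure is an expected outcome */
	/* ... use the physically contiguous range ... */
	free_contig_pages(page, nr);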
This patch does the following:
- find an area which can be ISOLATED, skipping memory holes.
- migrate LRU pages out of the area.
- steal the chunk of pages from the buddy allocator.
Most of the code is for detecting a candidate range for the allocation;
migration/isolation reuses the memory hotplug code.
This is fully experimental and written as an example.
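(For concreteness: with the default MAX_ORDER of 11, MAX_ORDER_NR_PAGES is
1024, i.e. 4MB with 4KB pages. A request of size = 3000 pfns is therefore
rounded up to pages = 3072, the search walks the memory map in 1024-pfn
steps, and the 72 tail pages beyond 'size' are freed back after allocation.)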
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
include/linux/page-isolation.h | 9 +
mm/memory_hotplug.c | 86 -----------
mm/page_alloc.c | 28 +++
mm/page_isolation.c | 301 +++++++++++++++++++++++++++++++++++++++++
4 files changed, 340 insertions(+), 84 deletions(-)
Index: kametest/mm/page_isolation.c
===================================================================
--- kametest.orig/mm/page_isolation.c
+++ kametest/mm/page_isolation.c
@@ -3,8 +3,11 @@
*/
#include <linux/mm.h>
+#include <linux/swap.h>
#include <linux/page-isolation.h>
#include <linux/pageblock-flags.h>
+#include <linux/mm_inline.h>
+#include <linux/migrate.h>
#include "internal.h"
static inline struct page *
@@ -140,3 +143,301 @@ int test_pages_isolated(unsigned long st
spin_unlock_irqrestore(&zone->lock, flags);
return ret ? 0 : -EBUSY;
}
+
+#define MIGRATION_RETRY (5)
+
+/*
+ * Scanning pfns is much easier than scanning the LRU list.
+ * Scan pfns from start to end and return the pfn of the first LRU page.
+ */
+unsigned long scan_lru_pages(unsigned long start, unsigned long end)
+{
+ unsigned long pfn;
+ struct page *page;
+
+ for (pfn = start; pfn < end; pfn++) {
+ if (pfn_valid(pfn)) {
+ page = pfn_to_page(pfn);
+ if (PageLRU(page))
+ return pfn;
+ }
+ }
+ return pfn;
+}
+
+/* Migrate all LRU pages in the range to somewhere else */
+static struct page *
+hotremove_migrate_alloc(struct page *page, unsigned long private, int **x)
+{
+ /* This should be improooooved!! */
+ return alloc_page(GFP_HIGHUSER_MOVABLE);
+}
+
+#define NR_MOVE_AT_ONCE_PAGES (256)
+int do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
+{
+ unsigned long pfn;
+ struct page *page;
+ int move_pages = NR_MOVE_AT_ONCE_PAGES;
+ int not_managed = 0;
+ int ret = 0;
+ LIST_HEAD(source);
+
+ for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
+ if (!pfn_valid(pfn))
+ continue;
+ page = pfn_to_page(pfn);
+ if (!page_count(page))
+ continue;
+ /*
+ * We can skip free pages. And we can only deal with pages on
+ * LRU.
+ */
+ ret = isolate_lru_page(page);
+ if (!ret) { /* Success */
+ list_add_tail(&page->lru, &source);
+ move_pages--;
+ inc_zone_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
+
+ } else {
+ /* Because we don't hold the big zone->lock, we should
+ check this again here. */
+ if (page_count(page))
+ not_managed++;
+#ifdef CONFIG_DEBUG_VM
+ printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
+ pfn);
+ dump_page(page);
+#endif
+ }
+ }
+ ret = -EBUSY;
+ if (not_managed) {
+ if (!list_empty(&source))
+ putback_lru_pages(&source);
+ goto out;
+ }
+ ret = 0;
+ if (list_empty(&source))
+ goto out;
+ /* this function returns # of failed pages */
+ ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1);
+
+out:
+ return ret;
+}
+
+
+/*
+ * An interface to isolate pages of a specified size within a range.
+ * The purpose is to return contiguous free pages larger than MAX_ORDER.
+ * The code below is very slow and sleeps, so never call it from
+ * performance-critical code.
+ */
+
+struct page_range {
+ unsigned long base, end, pages;
+};
+
+static inline unsigned long MAX_O_ALIGN(unsigned long x) {
+ return ALIGN(x, MAX_ORDER_NR_PAGES);
+}
+
+static inline unsigned long MAX_O_BASE(unsigned long x) {
+ return (x & ~(MAX_ORDER_NR_PAGES - 1));
+}
+
+int __get_contig_block(unsigned long pfn, unsigned long nr_pages, void *arg)
+{
+ struct page_range *blockinfo = arg;
+ unsigned long end;
+
+ end = pfn + nr_pages;
+ pfn = MAX_O_ALIGN(pfn);
+ end = MAX_O_BASE(end);
+ if (end < pfn)
+ return 0;
+ if (end - pfn >= blockinfo->pages) {
+ blockinfo->base = pfn;
+ blockinfo->end = end;
+ return 1;
+ }
+ return 0;
+}
+
+static void __trim_zone(struct page_range *range)
+{
+ struct zone *zone;
+ unsigned long pfn;
+ /*
+ * In most cases, zones' [start_pfn, end_pfn) ranges don't
+ * overlap each other. But some arches allow overlaps and
+ * we need to check for that here.
+ */
+ for (pfn = range->base, zone = page_zone(pfn_to_page(pfn));
+ pfn < range->end;
+ pfn += MAX_ORDER_NR_PAGES) {
+
+ if (zone != page_zone(pfn_to_page(pfn)))
+ break;
+ }
+ range->end = min(pfn, range->end);
+ return;
+}
+static unsigned long __find_contig_block(unsigned long base,
+ unsigned long end, unsigned long pages)
+{
+ unsigned long pfn;
+ struct page_range blockinfo;
+ int ret;
+
+ /* Skip memory holes */
+retry:
+ blockinfo.base = base;
+ blockinfo.end = end;
+ blockinfo.pages = pages;
+ /*
+ * Returns a contiguous page range within [base, end) which is
+ * at least 'pages' long.
+ */
+ ret = walk_system_ram_range(base, end - base, &blockinfo,
+ __get_contig_block);
+ if (!ret)
+ return 0;
+
+ __trim_zone(&blockinfo);
+ /* OK, we found a contiguous memory chunk of sufficient size. Isolate it. */
+ for (pfn = blockinfo.base; pfn + pages < blockinfo.end;
+ pfn += MAX_ORDER_NR_PAGES) {
+ /*
+ * Now, we know [base,end) of a contiguous chunk.
+ * Don't need to take care of memory holes.
+ */
+ if (!start_isolate_page_range(pfn, pfn + pages))
+ return pfn;
+ }
+ /* failed */
+ if (blockinfo.end + pages < end) {
+ /* Move base address and find the next block of RAM. */
+ base = blockinfo.end;
+ goto retry;
+ }
+ return 0;
+}
+
+/**
+ * alloc_contig_pages - allocate contiguous physical pages
+ * @hint: the base pfn at which to start searching for free space
+ * @size: size of the requested area (in # of pages)
+ * @node: the node to allocate memory from. If -1, any node is allowed.
+ *
+ * Searches the physical memory map for an area of @size pages and checks
+ * whether contiguous free space can be created there. If it seems
+ * possible, tries to create the contiguous space with page migration.
+ *
+ * Returns the first page of the contiguous block; on failure, NULL is
+ * returned. Each page in the area has page_count() set to 1. Because
+ * this function does page migration, it is very heavy and may sleep.
+ * Callers must treat a NULL return as an expected outcome, not a
+ * special case.
+ *
+ * Currently, the returned range is aligned to MAX_ORDER.
+ */
+
+struct page *alloc_contig_pages(unsigned long hint,
+ unsigned long size, int node)
+{
+ unsigned long base, found, end, pages, start;
+ struct page *ret = NULL;
+ int migration_failed;
+ struct zone *zone;
+
+ hint = MAX_O_ALIGN(hint);
+ /* the request size is rounded up to MAX_ORDER alignment */
+ pages = MAX_O_ALIGN(size);
+ found = 0;
+retry:
+ for_each_populated_zone(zone) {
+ unsigned long zone_end_pfn;
+
+ if (node >= 0 && node != zone_to_nid(zone))
+ continue;
+ if (zone->present_pages < pages)
+ continue;
+ base = MAX_O_ALIGN(zone->zone_start_pfn);
+ base = max(base, hint);
+ zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
+ if (base + pages > zone_end_pfn)
+ continue;
+ found = __find_contig_block(base, zone_end_pfn, pages);
+ /* Next try will see the next block. */
+ hint = base + MAX_ORDER_NR_PAGES;
+ if (found)
+ break;
+ }
+
+ if (!found)
+ goto out;
+ /*
+ * OK, at this point we have contiguous pageblocks marked as "isolated";
+ * try migration.
+ *
+ * FIXME: the permanent-migration-failure detection logic seems not very
+ * precise.
+ */
+ end = found + pages;
+ /* scan_lru_pages() finds the next PG_lru page in the range */
+ for (start = scan_lru_pages(found, end), migration_failed = 0;
+ start < end;
+ start = scan_lru_pages(start, end)) {
+ if (do_migrate_range(start, end)) {
+ /* would it be better to try another block? */
+ if (++migration_failed >= MIGRATION_RETRY)
+ break;
+ /* take a rest and synchronize LRU etc. */
+ lru_add_drain_all();
+ flush_scheduled_work();
+ cond_resched();
+ drain_all_pages();
+ } else /* reset migration_failure counter */
+ migration_failed = 0;
+ }
+
+ lru_add_drain_all();
+ flush_scheduled_work();
+ drain_all_pages();
+ /* Check all pages are isolated */
+ if (test_pages_isolated(found, end)) {
+ undo_isolate_page_range(found, pages);
+ /* We failed at [start...???) migration. */
+ hint = MAX_O_ALIGN(start + 1);
+ goto retry; /* goto next chunk */
+ }
+ /*
+ * OK, at this point the [found...found+pages) range is isolated and free.
+ * All pages in the range will be taken from the buddy lists with
+ * page_count(page) = 1.
+ */
+ ret = pfn_to_page(found);
+ alloc_contig_freed_pages(found, found + pages);
+ /* unset ISOLATE */
+ undo_isolate_page_range(found, pages);
+ /* Free the unneeded pages at the tail */
+ for (start = found + size; start < found + pages; start++)
+ __free_page(pfn_to_page(start));
+out:
+ return ret;
+
+}
+
+
+void free_contig_pages(struct page *page, int nr_pages)
+{
+ int i;
+ for (i = 0; i < nr_pages; i++)
+ __free_page(page + i);
+}
+
+EXPORT_SYMBOL_GPL(alloc_contig_pages);
+EXPORT_SYMBOL_GPL(free_contig_pages);
Index: kametest/include/linux/page-isolation.h
===================================================================
--- kametest.orig/include/linux/page-isolation.h
+++ kametest/include/linux/page-isolation.h
@@ -33,5 +33,14 @@ test_pages_isolated(unsigned long start_
extern int set_migratetype_isolate(struct page *page);
extern void unset_migratetype_isolate(struct page *page);
+/* For contiguous memory alloc */
+extern int do_migrate_range(unsigned long start_pfn, unsigned long end_pfn);
+extern void alloc_contig_freed_pages(unsigned long pfn, unsigned long end);
+extern unsigned long scan_lru_pages(unsigned long start, unsigned long end);
+
+
+extern struct page *alloc_contig_pages(unsigned long hint,
+ unsigned long size, int node);
+extern void free_contig_pages(struct page *page, int nr_pages);
#endif
Index: kametest/mm/memory_hotplug.c
===================================================================
--- kametest.orig/mm/memory_hotplug.c
+++ kametest/mm/memory_hotplug.c
@@ -568,7 +568,7 @@ out:
}
EXPORT_SYMBOL_GPL(add_memory);
-#ifdef CONFIG_MEMORY_HOTREMOVE
+#if defined(CONFIG_MEMORY_HOTREMOVE) || defined(CONFIG_CONTIG_ALLOC)
/*
* A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
* set and the size of the free page is given by page_order(). Using this,
@@ -643,87 +643,6 @@ static int test_pages_in_a_zone(unsigned
}
/*
- * Scanning pfn is much easier than scanning lru list.
- * Scan pfn from start to end and Find LRU page.
- */
-int scan_lru_pages(unsigned long start, unsigned long end)
-{
- unsigned long pfn;
- struct page *page;
- for (pfn = start; pfn < end; pfn++) {
- if (pfn_valid(pfn)) {
- page = pfn_to_page(pfn);
- if (PageLRU(page))
- return pfn;
- }
- }
- return 0;
-}
-
-static struct page *
-hotremove_migrate_alloc(struct page *page, unsigned long private, int **x)
-{
- /* This should be improooooved!! */
- return alloc_page(GFP_HIGHUSER_MOVABLE);
-}
-
-#define NR_OFFLINE_AT_ONCE_PAGES (256)
-static int
-do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
-{
- unsigned long pfn;
- struct page *page;
- int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
- int not_managed = 0;
- int ret = 0;
- LIST_HEAD(source);
-
- for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
- if (!pfn_valid(pfn))
- continue;
- page = pfn_to_page(pfn);
- if (!page_count(page))
- continue;
- /*
- * We can skip free pages. And we can only deal with pages on
- * LRU.
- */
- ret = isolate_lru_page(page);
- if (!ret) { /* Success */
- list_add_tail(&page->lru, &source);
- move_pages--;
- inc_zone_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
-
- } else {
- /* Becasue we don't have big zone->lock. we should
- check this again here. */
- if (page_count(page))
- not_managed++;
-#ifdef CONFIG_DEBUG_VM
- printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
- pfn);
- dump_page(page);
-#endif
- }
- }
- ret = -EBUSY;
- if (not_managed) {
- if (!list_empty(&source))
- putback_lru_pages(&source);
- goto out;
- }
- ret = 0;
- if (list_empty(&source))
- goto out;
- /* this function returns # of failed pages */
- ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1);
-
-out:
- return ret;
-}
-
-/*
* remove from free_area[] and mark all as Reserved.
*/
static int
@@ -740,7 +659,6 @@ offline_isolated_pages(unsigned long sta
walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL,
offline_isolated_pages_cb);
}
-
/*
* Check that all pages in the range, recorded as memory resource, are isolated.
*/
@@ -833,7 +751,7 @@ repeat:
}
pfn = scan_lru_pages(start_pfn, end_pfn);
- if (pfn) { /* We have page on LRU */
+ if (pfn != end_pfn) { /* We have page on LRU */
ret = do_migrate_range(pfn, end_pfn);
if (!ret) {
drain = 1;
Index: kametest/mm/page_alloc.c
===================================================================
--- kametest.orig/mm/page_alloc.c
+++ kametest/mm/page_alloc.c
@@ -5401,6 +5401,34 @@ out:
spin_unlock_irqrestore(&zone->lock, flags);
}
+void alloc_contig_freed_pages(unsigned long pfn, unsigned long end)
+{
+ struct page *page;
+ struct zone *zone;
+ int order;
+ unsigned long start = pfn;
+
+ zone = page_zone(pfn_to_page(pfn));
+ spin_lock_irq(&zone->lock);
+ while (pfn < end) {
+ VM_BUG_ON(!pfn_valid(pfn));
+ page = pfn_to_page(pfn);
+ VM_BUG_ON(page_count(page));
+ VM_BUG_ON(!PageBuddy(page));
+ list_del(&page->lru);
+ order = page_order(page);
+ zone->free_area[order].nr_free--;
+ rmv_page_order(page);
+ __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order));
+ pfn += 1 << order;
+ }
+ spin_unlock_irq(&zone->lock);
+
+ /* After this, pages in the range can be freed one by one */
+ for (pfn = start; pfn < end; pfn++)
+ prep_new_page(pfn_to_page(pfn), 0, 0);
+}
+
#ifdef CONFIG_MEMORY_HOTREMOVE
/*
* All pages in the range must be isolated before calling this.
* Re: [RFC][PATCH] big continuous memory allocator v2
From: Andi Kleen @ 2010-09-07 7:29 UTC
To: KAMEZAWA Hiroyuki
Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org,
minchan.kim@gmail.com, Mel Gorman, kosaki.motohiro@jp.fujitsu.com
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> writes:
> This is a page allocator based on the memory migration/hotplug code.
> It passed some small tests and may be easier to read than the previous one.
Maybe I'm missing context here, but what is the use case for this?
If this works well enough, the 1GB page code for x86, which currently
only supports allocation at boot time due to the MAX_ORDER problem,
could be moved over to runtime allocation. This would make
GB pages a lot nicer to use.
I think it would still need a large movable area declared
at boot, right? (But a movable area is better than
pre-reserved memory.)
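(For reference, such a movable region can already be carved out with the
existing kernelcore=/movablecore= boot parameters; for example, booting
with

	movablecore=2G

sets aside roughly 2GB of ZONE_MOVABLE, which only holds migratable
allocations.)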
On the other hand, I'm not sure the VM is really up to speed
in managing such large areas.
-Andi
--
ak@linux.intel.com -- Speaking for myself only.
* Re: [RFC][PATCH] big continuous memory allocator v2
From: KAMEZAWA Hiroyuki @ 2010-09-07 8:25 UTC
To: Andi Kleen
Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org,
minchan.kim@gmail.com, Mel Gorman, kosaki.motohiro@jp.fujitsu.com
On Tue, 07 Sep 2010 09:29:21 +0200
Andi Kleen <andi@firstfloor.org> wrote:
> KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> writes:
>
> > This is a page allocator based on the memory migration/hotplug code.
> > It passed some small tests and may be easier to read than the previous one.
>
> Maybe I'm missing context here, but what is the use case for this?
>
I hear some drivers want to allocate xxMB of contiguous area (camera?).
Maybe the embedded guys can answer the question.
> If this works well enough, the 1GB page code for x86, which currently
> only supports allocation at boot time due to the MAX_ORDER problem,
> could be moved over to runtime allocation. This would make
> GB pages a lot nicer to use.
>
> I think it would still need a large movable area declared
> at boot, right? (But a movable area is better than
> pre-reserved memory.)
>
Right.
I think a main use case is allocation at init time rather than via a boot
option. If modules can allocate a big chunk in __init_module() at boot,
a boot option will not be necessary, which is more user friendly.
There is plenty of free space before applications start running.
If on-demand loading of modules is required, it's safe to use MOVABLE zones.
> On the other hand, I'm not sure the VM is really up to speed
> in managing such large areas.
>
Thanks,
-Kame
* Re: [RFC][PATCH] big continuous memory allocator v2
From: Minchan Kim @ 2010-09-07 8:37 UTC
To: KAMEZAWA Hiroyuki
Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org, Mel Gorman,
kosaki.motohiro@jp.fujitsu.com
Nice cleanup.
I have some comments below.
On Mon, Sep 6, 2010 at 7:45 PM, KAMEZAWA Hiroyuki
<kamezawa.hiroyu@jp.fujitsu.com> wrote:
>
> This is a page allocator based on the memory migration/hotplug code.
> It passed some small tests and may be easier to read than the previous one.
>
> ==
> From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
>
> This patch adds a memory allocator for contiguous memory larger than MAX_ORDER.
>
> alloc_contig_pages(hint, size, node);
I have been thinking this patch is good for dumb device drivers which
want big contiguous memory. So if a device driver wants big memory and
can tolerate latency or failure, this is a good solution, I think.
Drivers that can't tolerate failure have to use the MOVABLE zone.
For that, I hope we get an option like ALLOC_FIXED (like MAP_FIXED).
That's because embedded people want to be aware of memory BANKs,
so getting free pages they don't want can be pointless.
In addition, I hope it can support a CROSS_ZONE migration mode.
Most small systems can't support swap, so if we can't migrate
anon pages into other zones, the external fragmentation problem still happens.
I think reclaim (e.g. discarding file-backed pages) can become one option to
prevent the problem. But it costs more, so we can make it an opt-in calling mode.
(That should be acceptable, since the caller already knows this function is very costly.)
ex) alloc_contig_pages(hint, size, node, ALLOC_FIXED|ALLOC_RECLAIM);
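Spelling that out as a rough sketch (the flag names, values, and the
four-argument signature are hypothetical):

	/* hypothetical allocation-mode flags, mirroring MAP_FIXED semantics */
	#define ALLOC_FIXED	(1 << 0) /* only [hint, hint+size) is acceptable */
	#define ALLOC_RECLAIM	(1 << 1) /* may reclaim file-backed pages in range */

	extern struct page *alloc_contig_pages(unsigned long hint,
				unsigned long size, int node, int flags);

	/* e.g. a driver that must allocate from one specific memory BANK */
	page = alloc_contig_pages(bank_start_pfn, bank_pages, -1,
				  ALLOC_FIXED | ALLOC_RECLAIM);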
Thanks, Kame.
--
Kind regards,
Minchan Kim
* Re: [RFC][PATCH] big continuous memory allocator v2
From: Andi Kleen @ 2010-09-07 8:46 UTC
To: KAMEZAWA Hiroyuki
Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org,
minchan.kim@gmail.com, Mel Gorman, kosaki.motohiro@jp.fujitsu.com
On Tue, 7 Sep 2010 17:25:59 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> On Tue, 07 Sep 2010 09:29:21 +0200
> Andi Kleen <andi@firstfloor.org> wrote:
>
> > KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> writes:
> >
> > > This is a page allocator based on the memory migration/hotplug code.
> > > It passed some small tests and may be easier to read than the
> > > previous one.
> >
> > Maybe I'm missing context here, but what is the use case for this?
> >
>
> I hear some drivers want to allocate xxMB of contiguous area (camera?).
> Maybe the embedded guys can answer the question.
OK, what I wanted to say is: assuming you can make this work
nicely, and the delays (swap storms?) it is likely to cause are not
too severe, it would be interesting for improving the 1GB pages on x86.
This would be a major use case and probably be enough
to keep the code around.
But it depends on how well it works.
e.g. when the zone is already fully filled, how long
does the allocation of 1GB take?
How about when parallel programs are allocating/freeing
in it too?
What's the worst-case delay under stress?
Does it cause swap storms?
One more issue: it would be good to be able to decide
in advance whether the OOM killer is likely to be triggered (and if so,
reject the allocation in the first place).
-Andi
--
ak@linux.intel.com -- Speaking for myself only.
* Re: [RFC][PATCH] big continuous memory allocator v2
From: KAMEZAWA Hiroyuki @ 2010-09-07 8:47 UTC
To: Minchan Kim
Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org, Mel Gorman,
kosaki.motohiro@jp.fujitsu.com
On Tue, 7 Sep 2010 01:37:27 -0700
Minchan Kim <minchan.kim@gmail.com> wrote:
> Nice cleanup.
> I have some comments below.
>
> On Mon, Sep 6, 2010 at 7:45 PM, KAMEZAWA Hiroyuki
> <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> >
> > This is a page allocator based on the memory migration/hotplug code.
> > It passed some small tests and may be easier to read than the previous one.
> >
> > ==
> > From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> >
> > This patch adds a memory allocator for contiguous memory larger than MAX_ORDER.
> >
> > alloc_contig_pages(hint, size, node);
>
> > I have been thinking this patch is good for dumb device drivers which
> > want big contiguous memory. So if a device driver wants big memory and
> > can tolerate latency or failure, this is a good solution, I think.
> > Drivers that can't tolerate failure have to use the MOVABLE zone.
> >
> > For that, I hope we get an option like ALLOC_FIXED (like MAP_FIXED).
> > That's because embedded people want to be aware of memory BANKs,
> > so getting free pages they don't want can be pointless.
>
Okay.
> > In addition, I hope it can support a CROSS_ZONE migration mode.
> > Most small systems can't support swap, so if we can't migrate
> > anon pages into other zones, the external fragmentation problem still happens.
>
Currently, this code migrates pages to anywhere, including across zones, nodes, etc.
(because it just uses GFP_HIGHUSER_MOVABLE).
> > I think reclaim (e.g. discarding file-backed pages) can become one option to
> > prevent the problem. But it costs more, so we can make it an opt-in calling mode.
> > (That should be acceptable, since the caller already knows this function is very costly.)
>
> ex) alloc_contig_pages(hint, size, node, ALLOC_FIXED|ALLOC_RECLAIM);
>
The migration's page allocation code will cause memory reclaim and
a kswapd wakeup if memory is short. But hmm, there is no code like
reclaim_memory_within(start, end).
And I guess LRU pages within the range which cannot be migrated
can't simply be dropped. As another consideration, a
shrink_slab_within(start, end)
could improve the success rate (and this would be good for memory hotplug, too).
I'll start by adding ALLOC_FIXED.
Thanks,
-Kame
* Re: [RFC][PATCH] big continuous memory allocator v2
From: KAMEZAWA Hiroyuki @ 2010-09-07 9:03 UTC
To: Andi Kleen
Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org,
minchan.kim@gmail.com, Mel Gorman, kosaki.motohiro@jp.fujitsu.com
On Tue, 7 Sep 2010 10:46:35 +0200
Andi Kleen <andi@firstfloor.org> wrote:
> On Tue, 7 Sep 2010 17:25:59 +0900
> KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
>
> > On Tue, 07 Sep 2010 09:29:21 +0200
> > Andi Kleen <andi@firstfloor.org> wrote:
> >
> > > KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> writes:
> > >
> > > > This is a page allocator based on the memory migration/hotplug code.
> > > > It passed some small tests and may be easier to read than the
> > > > previous one.
> > >
> > > Maybe I'm missing context here, but what is the use case for this?
> > >
> >
> > I hear some drivers want to allocate xxMB of contiguous area (camera?).
> > Maybe the embedded guys can answer the question.
>
> OK, what I wanted to say is: assuming you can make this work
> nicely, and the delays (swap storms?) it is likely to cause are not
> too severe, it would be interesting for improving the 1GB pages on x86.
>
Oh, I didn't consider that. Hmm. If x86 really wants to support 1GB pages,
MAX_ORDER would have to be raised. (I'm sorry if this was already discussed.)
> This would be a major use case and probably be enough
> to keep the code around.
>
> But it depends on how well it works.
>
Sure.
> e.g. when the zone is already fully filled, how long
> does the allocation of 1GB take?
>
Maybe not very quick, even slow.
> How about when parallel programs are allocating/freeing
> in it too?
>
This code doesn't assume that. I wonder whether I should add a mutex,
because this code generates IPIs for draining some per-cpu lists.
I think 1GB pages should be preallocated, as current hugepages are.
> What's the worst-case delay under stress?
>
Memory offlining itself is robust under stress because it marks
the pageblocks ISOLATED. But the allocation of 1GB of memory is the problem.
I have an idea (see below).
> Does it cause swap storms?
>
Maybe the same as allocating 1GB of memory when memory is full.
It's an LRU matter.
> One more issue: it would be good to be able to decide
> in advance whether the OOM killer is likely to be triggered (and if so,
> reject the allocation in the first place).
>
Checking the amount of free memory and swap before starting?
It sounds nice. I'd like to add something like that, perhaps along the
lines of the sketch below.
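Something like this rough pre-check, perhaps (the helper name and the
2x margin are made up; it is a heuristic, not a guarantee):

	#include <linux/mm.h>
	#include <linux/swap.h>

	/* refuse early if free memory plus free swap cannot absorb the
	 * pages we would have to migrate out of the target range */
	static bool contig_alloc_feasible(unsigned long nr_pages)
	{
		struct sysinfo si;

		si_meminfo(&si);	/* fills si.freeram, in pages */
		si_swapinfo(&si);	/* fills si.freeswap, in pages */
		return si.freeram + si.freeswap > 2 * nr_pages;
	}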
Or change my patch's logic to:
1. allocate the required migration target pages (1GB worth) up front
2. start migration to the allocated pages.
3. create the big page.
Then we can use some GFP_xxx flags at step (1) and do tuning as usual VM
code does. A rough sketch of step (1) follows.
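A hedged sketch of that allocate-first step (the helper is invented;
steps 2 and 3 would reuse the isolation/migration code from the patch):

	#include <linux/gfp.h>
	#include <linux/list.h>
	#include <linux/mm.h>

	/* Grab the migration targets up front, so reclaim pressure and an
	 * early -ENOMEM happen before any pageblock is isolated. */
	static int prealloc_migration_targets(unsigned long nr, gfp_t gfp,
					      struct list_head *targets)
	{
		struct page *page, *tmp;
		unsigned long i;

		for (i = 0; i < nr; i++) {
			page = alloc_page(gfp);
			if (!page)
				goto rollback;
			list_add_tail(&page->lru, targets);
		}
		return 0;
	rollback:
		list_for_each_entry_safe(page, tmp, targets, lru) {
			list_del(&page->lru);
			__free_page(page);
		}
		return -ENOMEM;
	}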
Thanks,
-Kame
* Re: [RFC][PATCH] big continuous memory allocator v2
From: Andi Kleen @ 2010-09-07 9:45 UTC
To: KAMEZAWA Hiroyuki
Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org,
minchan.kim@gmail.com, Mel Gorman, kosaki.motohiro@jp.fujitsu.com
On Tue, 7 Sep 2010 18:03:54 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> Oh, I didn't consider that. Hmm. If x86 really wants to support 1GB
> pages, MAX_ORDER would have to be raised. (I'm sorry if this was
> already discussed.)
That doesn't really work: it requires aligning all the zones to 1GB
too (not practical) and has a lot of overhead. Also, for the normal
case it wouldn't work anyway due to fragmentation.
> > One more issue: it would be good to be able to decide
> > in advance whether the OOM killer is likely to be triggered (and if so,
> > reject the allocation in the first place).
> >
>
> Checking the amount of free memory and swap before starting?
> It sounds nice. I'd like to add something like that.
That would be the simple variant, but perhaps it could
even consider parallel traffic? (I guess that would
be difficult.) Or perhaps bail out early if OOM looks likely.
-Andi
--
ak@linux.intel.com -- Speaking for myself only.
* Re: [RFC][PATCH] big continuous memory allocator v2
From: Minchan Kim @ 2010-09-07 14:51 UTC
To: KAMEZAWA Hiroyuki
Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org, Mel Gorman,
kosaki.motohiro@jp.fujitsu.com
On Tue, Sep 07, 2010 at 05:47:43PM +0900, KAMEZAWA Hiroyuki wrote:
> On Tue, 7 Sep 2010 01:37:27 -0700
> Minchan Kim <minchan.kim@gmail.com> wrote:
>
> > Nice cleanup.
> > I have some comments below.
> >
> > On Mon, Sep 6, 2010 at 7:45 PM, KAMEZAWA Hiroyuki
> > <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > >
> > > This is a page allocator based on the memory migration/hotplug code.
> > > It passed some small tests and may be easier to read than the previous one.
> > >
> > > ==
> > > From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> > >
> > > This patch adds a memory allocator for contiguous memory larger than MAX_ORDER.
> > >
> > > alloc_contig_pages(hint, size, node);
> >
> > I have been thinking this patch is good for dumb device drivers which
> > want big contiguous memory. So if a device driver wants big memory and
> > can tolerate latency or failure, this is a good solution, I think.
> > Drivers that can't tolerate failure have to use the MOVABLE zone.
> >
> > For that, I hope we get an option like ALLOC_FIXED (like MAP_FIXED).
> > That's because embedded people want to be aware of memory BANKs,
> > so getting free pages they don't want can be pointless.
> >
> Okay.
>
>
> > In addition, I hope it can support a CROSS_ZONE migration mode.
> > Most small systems can't support swap, so if we can't migrate
> > anon pages into other zones, the external fragmentation problem still happens.
> >
> Currently, this code migrates pages to anywhere, including across zones, nodes, etc.
> (because it just uses GFP_HIGHUSER_MOVABLE).
>
> > I think reclaim (e.g. discarding file-backed pages) can become one option to
> > prevent the problem. But it costs more, so we can make it an opt-in calling mode.
> > (That should be acceptable, since the caller already knows this function is very costly.)
> >
>
> > ex) alloc_contig_pages(hint, size, node, ALLOC_FIXED|ALLOC_RECLAIM);
> >
>
> The migration's page allocation code will cause memory reclaim and
> a kswapd wakeup if memory is short. But hmm, there is no code like
Yes, but it's useless, because that's not the zone/node we want to reclaim.
The zone we want to reclaim is not the zone where the allocation failed but
the zone which includes alloc_contig_pages's hint address.
>
> reclaim_memory_within(start, end).
>
> And I guess LRU pages within the range which cannot be migrated
> can't simply be dropped. As another consideration, a
>
> shrink_slab_within(start, end)
> could improve the success rate (and this would be good for memory hotplug, too).
And it can help with normal external memory fragmentation, too.
>
> I'll start by adding ALLOC_FIXED.
I am looking forward to seeing your next version. :)
Thanks, Kame.
--
Kind regards,
Minchan Kim