[RFC][PATCH 1/6] Create Critical Page Pool

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Matthew Dobson <colpatch@us.ibm.com>
To: linux-kernel@vger.kernel.org
Cc: andrea@suse.de, Sridhar Samudrala <sri@us.ibm.com>,
	pavel@suse.cz, Andrew Morton <akpm@osdl.org>,
	Linux Memory Management <linux-mm@kvack.org>
Subject: [RFC][PATCH 1/6] Create Critical Page Pool
Date: Tue, 13 Dec 2005 23:52:46 -0800	[thread overview]
Message-ID: <439FCF4E.3090202@us.ibm.com> (raw)
In-Reply-To: <439FCECA.3060909@us.ibm.com>

[-- Attachment #1: Type: text/plain, Size: 238 bytes --]

Create the basic Critical Page Pool.  Any allocation specifying
__GFP_CRITICAL will, as a last resort before failing the allocation, try to
get a page from the critical pool.  For now, only singleton (order 0) pages
are supported.

-Matt

[-- Attachment #2: critical_pool.patch --]
[-- Type: text/x-patch, Size: 11007 bytes --]

Implement a Critical Page Pool:

* Write a number of pages into /proc/sys/vm/critical_pages
* These pages will be reserved for 'critical' allocations, signified
     by the __GFP_CRITICAL flag passed to an allocation request
* Reading /proc/sys/vm/critical_pages will give statistics about the
     critical page pool

Signed-off-by: Matthew Dobson <colpatch@us.ibm.com>

Index: linux-2.6.15-rc5+critical_pool/Documentation/sysctl/vm.txt
===================================================================
--- linux-2.6.15-rc5+critical_pool.orig/Documentation/sysctl/vm.txt	2005-12-13 15:56:55.819232488 -0800
+++ linux-2.6.15-rc5+critical_pool/Documentation/sysctl/vm.txt	2005-12-13 16:01:57.783326968 -0800
@@ -26,6 +26,7 @@ Currently, these files are in /proc/sys/
 - min_free_kbytes
 - laptop_mode
 - block_dump
+- critical_pages
 
 ==============================================================
 
@@ -102,3 +103,12 @@ This is used to force the Linux VM to ke
 of kilobytes free.  The VM uses this number to compute a pages_min
 value for each lowmem zone in the system.  Each lowmem zone gets 
 a number of reserved free pages based proportionally on its size.
+
+==============================================================
+
+critical_pages:
+
+This is used to force the Linux VM to reserve a pool of pages for
+emergency (__GFP_CRITICAL) allocations.  Allocations with this flag
+MUST succeed.
+The number written into this file is the number of pages to reserve.
Index: linux-2.6.15-rc5+critical_pool/include/linux/gfp.h
===================================================================
--- linux-2.6.15-rc5+critical_pool.orig/include/linux/gfp.h	2005-12-13 15:56:55.820232336 -0800
+++ linux-2.6.15-rc5+critical_pool/include/linux/gfp.h	2005-12-13 15:56:57.531972112 -0800
@@ -47,6 +47,7 @@ struct vm_area_struct;
 #define __GFP_ZERO	((__force gfp_t)0x8000u)/* Return zeroed page on success */
 #define __GFP_NOMEMALLOC ((__force gfp_t)0x10000u) /* Don't use emergency reserves */
 #define __GFP_HARDWALL   ((__force gfp_t)0x20000u) /* Enforce hardwall cpuset memory allocs */
+#define __GFP_CRITICAL	((__force gfp_t)0x40000u) /* Critical allocation. MUST succeed! */
 
 #define __GFP_BITS_SHIFT 20	/* Room for 20 __GFP_FOO bits */
 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
@@ -55,7 +56,7 @@ struct vm_area_struct;
 #define GFP_LEVEL_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS| \
 			__GFP_COLD|__GFP_NOWARN|__GFP_REPEAT| \
 			__GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP| \
-			__GFP_NOMEMALLOC|__GFP_HARDWALL)
+			__GFP_NOMEMALLOC|__GFP_HARDWALL|__GFP_CRITICAL)
 
 #define GFP_ATOMIC	(__GFP_HIGH)
 #define GFP_NOIO	(__GFP_WAIT)
@@ -64,6 +65,8 @@ struct vm_area_struct;
 #define GFP_USER	(__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
 #define GFP_HIGHUSER	(__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | \
 			 __GFP_HIGHMEM)
+#define GFP_ATOMIC_CRIT	(GFP_ATOMIC | __GFP_CRITICAL)
+#define GFP_KERNEL_CRIT	(GFP_KERNEL | __GFP_CRITICAL)
 
 /* Flag - indicates that the buffer will be suitable for DMA.  Ignored on some
    platforms, used as appropriate on others */
Index: linux-2.6.15-rc5+critical_pool/include/linux/sysctl.h
===================================================================
--- linux-2.6.15-rc5+critical_pool.orig/include/linux/sysctl.h	2005-12-13 15:56:55.820232336 -0800
+++ linux-2.6.15-rc5+critical_pool/include/linux/sysctl.h	2005-12-13 16:02:13.757898464 -0800
@@ -180,6 +180,7 @@ enum
 	VM_VFS_CACHE_PRESSURE=26, /* dcache/icache reclaim pressure */
 	VM_LEGACY_VA_LAYOUT=27, /* legacy/compatibility virtual address space layout */
 	VM_SWAP_TOKEN_TIMEOUT=28, /* default time for token time out */
+	VM_CRITICAL_PAGES=29,	/* # of pages to reserve for __GFP_CRITICAL allocs */
 };
 
 
Index: linux-2.6.15-rc5+critical_pool/kernel/sysctl.c
===================================================================
--- linux-2.6.15-rc5+critical_pool.orig/kernel/sysctl.c	2005-12-13 15:56:55.820232336 -0800
+++ linux-2.6.15-rc5+critical_pool/kernel/sysctl.c	2005-12-13 16:01:57.784326816 -0800
@@ -849,6 +849,16 @@ static ctl_table vm_table[] = {
 		.strategy	= &sysctl_jiffies,
 	},
 #endif
+	{
+		.ctl_name	= VM_CRITICAL_PAGES,
+		.procname	= "critical_pages",
+		.data		= &critical_pages,
+		.maxlen		= sizeof(critical_pages),
+		.mode		= 0644,
+		.proc_handler	= &critical_pages_sysctl_handler,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
 	{ .ctl_name = 0 }
 };
 
Index: linux-2.6.15-rc5+critical_pool/mm/page_alloc.c
===================================================================
--- linux-2.6.15-rc5+critical_pool.orig/mm/page_alloc.c	2005-12-13 15:56:55.820232336 -0800
+++ linux-2.6.15-rc5+critical_pool/mm/page_alloc.c	2005-12-13 16:01:57.810322864 -0800
@@ -53,6 +53,68 @@ unsigned long totalram_pages __read_most
 unsigned long totalhigh_pages __read_mostly;
 long nr_swap_pages;
 
+/* The number of pages to maintain in the critical page pool */
+int critical_pages = 0;
+
+/* Critical Page Pool control structure */
+static struct critical_pool {
+	unsigned int free, min_free;
+	spinlock_t lock;
+	struct list_head pages;
+} critical_pool = {
+	.free		= 0,
+	.min_free	= 0,
+	.lock		= SPIN_LOCK_UNLOCKED,
+	.pages		= LIST_HEAD_INIT(critical_pool.pages),
+};
+
+/* MCD - This needs to be arch specific */
+#define CRITICAL_POOL_GFPMASK	(GFP_KERNEL)
+
+static inline int is_critical_pool_full(void)
+{
+	return critical_pool.free >= critical_pages;
+}
+
+static inline struct page *get_critical_page(gfp_t gfpmask)
+{
+	struct page *page = NULL;
+
+	spin_lock(&critical_pool.lock);
+	if (!list_empty(&critical_pool.pages)) {
+		/* Grab the next free critical pool page */
+		page = list_entry(critical_pool.pages.next, struct page, lru);
+		list_del(&page->lru);
+		critical_pool.free--;
+		if (critical_pool.free < critical_pool.min_free)
+			critical_pool.min_free = critical_pool.free;
+	}
+	spin_unlock(&critical_pool.lock);
+
+	return page;
+}
+
+static inline int put_critical_page(struct page *page)
+{
+	int ret = 0;
+
+	spin_lock(&critical_pool.lock);
+	if (!is_critical_pool_full()) {
+		/*
+		 * We snached this page away in the process of being freed, so
+		 * we must re-increment it's count so we don't cause problems
+		 * when we hand it back out to a future __GFP_CRITICAL alloc.
+		 */
+		BUG_ON(!get_page_testone(page));
+		list_add(&page->lru, &critical_pool.pages);
+		critical_pool.free++;
+		ret = 1;
+	}
+	spin_unlock(&critical_pool.lock);
+
+	return ret;
+}
+
 /*
  * results with 256, 32 in the lowmem_reserve sysctl:
  *	1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
@@ -302,6 +364,10 @@ static inline void __free_pages_bulk (st
 
 	if (unlikely(order))
 		destroy_compound_page(page, order);
+	else if (!is_critical_pool_full())
+		/* If the critical pool isn't full, add this page to it */
+		if (put_critical_page(page))
+			return;
 
 	page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
 
@@ -997,6 +1063,19 @@ rebalance:
 	}
 
 nopage:
+	/*
+	 * This is the LAST DITCH effort.
+	 * We maintain a pool of singleton (order 0) 'critical' pages,
+	 * specifically for allocation requests marked __GFP_CRITICAL.
+	 * Rather than fail one of these allocations, take a page,
+	 * if there are any, from the critical pool.
+	 */
+	if ((gfp_mask & __GFP_CRITICAL) && !order) {
+		page = get_critical_page(gfp_mask);
+		if (page)
+			goto got_pg;
+	}
+
 	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
 		printk(KERN_WARNING "%s: page allocation failure."
 			" order:%d, mode:0x%x\n",
@@ -2539,6 +2618,90 @@ int lowmem_reserve_ratio_sysctl_handler(
 	return 0;
 }
 
+static inline void fill_critical_pool(int num)
+{
+	struct page *page;
+	int i;
+
+	for (i = 0; i < num; i++) {
+		page = alloc_page(CRITICAL_POOL_GFPMASK);
+		if (!page)
+			return;
+		spin_lock(&critical_pool.lock);
+		list_add(&page->lru, &critical_pool.pages);
+		critical_pool.free++;
+		critical_pool.min_free++;
+		spin_unlock(&critical_pool.lock);
+	}
+}
+
+static inline void drain_critical_pool(int num)
+{
+	struct page *page;
+	int i;
+
+	for (i = 0; i < num; i++) {
+		spin_lock(&critical_pool.lock);
+		BUG_ON(critical_pool.free < 0);
+		if (list_empty(&critical_pool.pages) || !critical_pool.free) {
+			spin_unlock(&critical_pool.lock);
+			break;
+		}
+			
+		page = list_entry(critical_pool.pages.next, struct page, lru);
+		list_del(&page->lru);
+		critical_pool.free--;
+		if (critical_pool.free < critical_pool.min_free)
+			critical_pool.min_free = critical_pool.free;
+		spin_unlock(&critical_pool.lock);
+			
+		__free_pages(page, 0);
+	}
+}
+
+/*
+ * critical_pages_sysctl_handler - handle r/w of /proc/sys/vm/critical_pages
+ *     On write, add/remove pages to/from the critical page pool.
+ *     On read, dump out some simple stats about the critical page pool.
+ */
+int critical_pages_sysctl_handler(ctl_table *table, int write,
+				   struct file *file, void __user *buffer,
+				   size_t *length, loff_t *ppos)
+{
+	char buf[100];
+	int num, len;
+
+	if (write) {
+		proc_dointvec(table, write, file, buffer, length, ppos);
+
+		num = critical_pages - critical_pool.free;
+		if (num > 0)
+			fill_critical_pool(num);
+		else if (num < 0)
+			drain_critical_pool(-num);
+
+		return 0;
+	}
+
+	if (!*length || *ppos) {
+		*length = 0;
+		return 0;
+	}
+
+	sprintf(buf, "Critical Page Pool Size: %d\n"
+		"Critical Pages In Use: %d\n"
+		"Max Critical Pages In Use: %d\n", critical_pages,
+		critical_pages - critical_pool.free,
+		critical_pages - critical_pool.min_free);
+	len = strlen(buf);
+	if (copy_to_user(buffer, buf, len))
+		return -EFAULT;
+
+	*length = len;
+	*ppos += len;
+	return 0;
+}
+
 __initdata int hashdist = HASHDIST_DEFAULT;
 
 #ifdef CONFIG_NUMA
Index: linux-2.6.15-rc5+critical_pool/include/linux/mmzone.h
===================================================================
--- linux-2.6.15-rc5+critical_pool.orig/include/linux/mmzone.h	2005-12-13 15:56:55.820232336 -0800
+++ linux-2.6.15-rc5+critical_pool/include/linux/mmzone.h	2005-12-13 15:56:57.537971200 -0800
@@ -422,6 +422,8 @@ int min_free_kbytes_sysctl_handler(struc
 extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *,
 					void __user *, size_t *, loff_t *);
+int critical_pages_sysctl_handler(struct ctl_table *, int, struct file *,
+				  void __user *, size_t *, loff_t *);
 
 #include <linux/topology.h>
 /* Returns the number of the current Node. */
Index: linux-2.6.15-rc5+critical_pool/include/linux/mm.h
===================================================================
--- linux-2.6.15-rc5+critical_pool.orig/include/linux/mm.h	2005-12-13 15:56:55.820232336 -0800
+++ linux-2.6.15-rc5+critical_pool/include/linux/mm.h	2005-12-13 16:01:57.783326968 -0800
@@ -32,6 +32,8 @@ extern int sysctl_legacy_va_layout;
 #define sysctl_legacy_va_layout 0
 #endif
 
+extern int critical_pages;
+
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/processor.h>

next prev parent reply	other threads:[~2005-12-14  7:52 UTC|newest]

Thread overview: 46+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2005-12-14  7:50 [RFC][PATCH 0/6] Critical Page Pool Matthew Dobson
2005-12-14  7:50 ` Matthew Dobson
2005-12-14  7:52 ` Matthew Dobson [this message]
2005-12-14 10:48   ` [RFC][PATCH 1/6] Create " Andrea Arcangeli
2005-12-14 10:48     ` Andrea Arcangeli
2005-12-14 13:30   ` Rik van Riel
2005-12-14 13:30     ` Rik van Riel
2005-12-14 16:26     ` Matthew Dobson
2005-12-14 16:26       ` Matthew Dobson
2005-12-15  3:29       ` Matt Mackall
2005-12-15  3:29         ` Matt Mackall
2005-12-14  7:54 ` [RFC][PATCH 2/6] in_emergency Trigger Matthew Dobson
2005-12-14  7:56 ` [RFC][PATCH 3/6] Slab Prep: get/return_object Matthew Dobson
2005-12-14  8:19   ` Pekka Enberg
2005-12-14  8:19     ` Pekka Enberg
2005-12-14 16:26     ` Matthew Dobson
2005-12-14 16:26       ` Matthew Dobson
2005-12-14  7:58 ` [RFC][PATCH 4/6] Slab Prep: slab_destruct() Matthew Dobson
2005-12-14  8:37   ` Pekka Enberg
2005-12-14  8:37     ` Pekka Enberg
2005-12-14 16:30     ` Matthew Dobson
2005-12-14 16:30       ` Matthew Dobson
2005-12-14  7:59 ` [RFC][PATCH 5/6] Slab Prep: Move cache_grow() Matthew Dobson
2005-12-14  8:02 ` [RFC][PATCH 6/6] Critical Page Pool: Slab Support Matthew Dobson
2005-12-14 10:08 ` [RFC][PATCH 0/6] Critical Page Pool Pavel Machek
2005-12-14 10:08   ` Pavel Machek
2005-12-14 12:01   ` Andrea Arcangeli
2005-12-14 12:01     ` Andrea Arcangeli
2005-12-14 13:03     ` Alan Cox
2005-12-14 13:03       ` Alan Cox
2005-12-14 16:37       ` Matthew Dobson
2005-12-14 16:37         ` Matthew Dobson
2005-12-14 19:17         ` Alan Cox
2005-12-14 19:17           ` Alan Cox
2005-12-15 16:27         ` Pavel Machek
2005-12-15 16:27           ` Pavel Machek
2005-12-14 16:03     ` Matthew Dobson
2005-12-14 16:03       ` Matthew Dobson
2005-12-14 15:55   ` Matthew Dobson
2005-12-14 15:55     ` Matthew Dobson
2005-12-15 16:26     ` Pavel Machek
2005-12-15 16:26       ` Pavel Machek
2005-12-15 21:51       ` Matthew Dobson
2005-12-15 21:51         ` Matthew Dobson
2005-12-16  5:02         ` Sridhar Samudrala
2005-12-16  5:02           ` Sridhar Samudrala

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=439FCF4E.3090202@us.ibm.com \
    --to=colpatch@us.ibm.com \
    --cc=akpm@osdl.org \
    --cc=andrea@suse.de \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=pavel@suse.cz \
    --cc=sri@us.ibm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.