All of lore.kernel.org
 help / color / mirror / Atom feed
From: dave@sr71.net
To: linux-mm@kvack.org
Cc: balbir@in.ibm.com, dave@sr71.net
Subject: [RFC][PATCH] "challenged" memory controller
Date: Tue, 15 Aug 2006 12:20:47 -0700	[thread overview]
Message-ID: <20060815192047.EE4A0960@localhost.localdomain> (raw)

I've been toying with a little memory controller for the past
few weeks, on and off.  My goal was to create something simple
and hackish that would at least be a toy to play with in the
process of creating something that might actually be feasible.

I call it "challenged" because it has some definite limitations.
However, it only adds about 50 lines of code to generic areas
of the VM, and I haven't been the slightest bit careful, yet.
I think it probably also breaks CONFIG_PM and !CONFIG_CPUSETS,
but those are "features". ;)

It uses cpusets for now, just because they are there, and are
relatively easy to modify.  The page->cpuset bit is only
temporary, and I have some plans to remove it later.

How does it work?  It adds two fields to the scan control
structure.  One that tells the scan to only pay attention to
_any_ cpuset over its memory limits, and the other to tell it
to only scan pages for a _particular_ cpuset.

I've been pretty indiscriminately hacking away, so I have the
feeling that there are some more efficient and nicer ways to
hook into the page scanning logic.  Comments are very welcome.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---

 lxc-dave/include/linux/cpuset.h |    4 +
 lxc-dave/include/linux/gfp.h    |    4 -
 lxc-dave/include/linux/swap.h   |    2 
 lxc-dave/kernel/cpuset.c        |   99 +++++++++++++++++++++++++++++++++++++++-
 lxc-dave/mm/page_alloc.c        |   12 ++++
 lxc-dave/mm/rmap.c              |    5 ++
 lxc-dave/mm/vmscan.c            |   33 +++++++++++--
 7 files changed, 149 insertions(+), 10 deletions(-)

diff -puN include/linux/cpuset.h~challenged-memory-controller include/linux/cpuset.h
--- lxc/include/linux/cpuset.h~challenged-memory-controller	2006-08-14 13:22:12.000000000 -0700
+++ lxc-dave/include/linux/cpuset.h	2006-08-15 07:58:15.000000000 -0700
@@ -127,5 +127,9 @@ static inline int cpuset_do_slab_mem_spr
 }
 
 #endif /* !CONFIG_CPUSETS */
+int cpuset_inc_nr_pages(struct cpuset *cs, int nr, gfp_t gfpmask);
+void cpuset_dec_nr_pages(struct cpuset *cs, int nr);
+int cpuset_get_nr_pages(const struct cpuset *cs);
+int cpuset_amount_over_memory_max(const struct cpuset *cs);
 
 #endif /* _LINUX_CPUSET_H */
diff -puN include/linux/gfp.h~challenged-memory-controller include/linux/gfp.h
--- lxc/include/linux/gfp.h~challenged-memory-controller	2006-08-15 07:47:28.000000000 -0700
+++ lxc-dave/include/linux/gfp.h	2006-08-15 07:47:34.000000000 -0700
@@ -108,10 +108,6 @@ static inline enum zone_type gfp_zone(gf
  * optimized to &contig_page_data at compile-time.
  */
 
-#ifndef HAVE_ARCH_FREE_PAGE
-static inline void arch_free_page(struct page *page, int order) { }
-#endif
-
 extern struct page *
 FASTCALL(__alloc_pages(gfp_t, unsigned int, struct zonelist *));
 
diff -puN include/linux/mm.h~challenged-memory-controller include/linux/mm.h
diff -puN include/linux/sched.h~challenged-memory-controller include/linux/sched.h
diff -puN include/linux/swap.h~challenged-memory-controller include/linux/swap.h
--- lxc/include/linux/swap.h~challenged-memory-controller	2006-08-15 07:47:28.000000000 -0700
+++ lxc-dave/include/linux/swap.h	2006-08-15 07:47:34.000000000 -0700
@@ -188,7 +188,7 @@ extern void swap_setup(void);
 
 /* linux/mm/vmscan.c */
 extern unsigned long try_to_free_pages(struct zone **, gfp_t);
-extern unsigned long shrink_all_memory(unsigned long nr_pages);
+extern unsigned long shrink_all_memory(unsigned long nr_pages, struct cpuset *cs);
 extern int vm_swappiness;
 extern int remove_mapping(struct address_space *mapping, struct page *page);
 extern long vm_total_pages;
diff -puN kernel/cpuset.c~challenged-memory-controller kernel/cpuset.c
--- lxc/kernel/cpuset.c~challenged-memory-controller	2006-08-14 13:22:12.000000000 -0700
+++ lxc-dave/kernel/cpuset.c	2006-08-15 08:00:40.000000000 -0700
@@ -21,6 +21,7 @@
 #include <linux/cpu.h>
 #include <linux/cpumask.h>
 #include <linux/cpuset.h>
+#include <linux/delay.h>
 #include <linux/err.h>
 #include <linux/errno.h>
 #include <linux/file.h>
@@ -46,6 +47,7 @@
 #include <linux/spinlock.h>
 #include <linux/stat.h>
 #include <linux/string.h>
+#include <linux/swap.h>
 #include <linux/time.h>
 #include <linux/backing-dev.h>
 #include <linux/sort.h>
@@ -97,6 +99,8 @@ struct cpuset {
 	 * recent time this cpuset changed its mems_allowed.
 	 */
 	int mems_generation;
+	int mems_nr_pages;
+	int mems_max_pages;
 
 	struct fmeter fmeter;		/* memory_pressure filter */
 };
@@ -112,6 +116,55 @@ typedef enum {
 	CS_SPREAD_SLAB,
 } cpuset_flagbits_t;
 
+int shrink_cpuset(struct cpuset *cs, gfp_t gfpmask, int tries)
+{
+	int nr_shrunk = 0;
+	while (cpuset_amount_over_memory_max(cs)) {
+		if (tries-- < 0)
+			break;
+		nr_shrunk += shrink_all_memory(10, cs);
+	}
+	return 0;
+}
+
+int cpuset_inc_nr_pages(struct cpuset *cs, int nr, gfp_t gfpmask)
+{
+	int ret;
+	if (!cs)
+		return 0;
+	cs->mems_nr_pages += nr;
+	if (cpuset_amount_over_memory_max(cs)) {
+		if (!(gfpmask & __GFP_WAIT))
+			return -ENOMEM;
+		ret = shrink_cpuset(cs, gfpmask, 50);
+	}
+	if (cpuset_amount_over_memory_max(cs))
+		return -ENOMEM;
+	return 0;
+}
+void cpuset_dec_nr_pages(struct cpuset *cs, int nr)
+{
+	if (!cs)
+		return;
+	cs->mems_nr_pages -= nr;
+}
+int cpuset_get_nr_pages(const struct cpuset *cs)
+{
+	return cs->mems_nr_pages;
+}
+int cpuset_amount_over_memory_max(const struct cpuset *cs)
+{
+	int amount;
+
+	if (!cs || cs->mems_max_pages < 0)
+		return 0;
+	amount = cs->mems_nr_pages - cs->mems_max_pages;
+	if (amount < 0)
+		amount = 0;
+	return amount;
+}
+
+
 /* convenient tests for these bits */
 static inline int is_cpu_exclusive(const struct cpuset *cs)
 {
@@ -173,6 +226,8 @@ static struct cpuset top_cpuset = {
 	.flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
 	.cpus_allowed = CPU_MASK_ALL,
 	.mems_allowed = NODE_MASK_ALL,
+	.mems_nr_pages = 0,
+	.mems_max_pages = -1,
 	.count = ATOMIC_INIT(0),
 	.sibling = LIST_HEAD_INIT(top_cpuset.sibling),
 	.children = LIST_HEAD_INIT(top_cpuset.children),
@@ -1021,6 +1076,17 @@ static int update_memory_pressure_enable
 	return 0;
 }
 
+static int update_memory_max_nr_pages(struct cpuset *cs, char *buf)
+{
+	int rate = simple_strtol(buf, NULL, 10);
+	int shrunk;
+	int loopnr = 0;
+	cs->mems_max_pages = rate;
+	while (cpuset_amount_over_memory_max(cs))
+		shrunk = shrink_cpuset(cs, 0, 10);
+	return 0;
+}
+
 /*
  * update_flag - read a 0 or a 1 in a file and update associated flag
  * bit:	the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
@@ -1109,6 +1175,7 @@ static int update_flag(cpuset_flagbits_t
  */
 
 #define FM_COEF 933		/* coefficient for half-life of 10 secs */
+#define FM_COEF 93		/* coefficient for half-life of 10 secs */
 #define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
 #define FM_MAXCNT 1000000	/* limit cnt to avoid overflow */
 #define FM_SCALE 1000		/* faux fixed point scale */
@@ -1263,6 +1330,8 @@ typedef enum {
 	FILE_NOTIFY_ON_RELEASE,
 	FILE_MEMORY_PRESSURE_ENABLED,
 	FILE_MEMORY_PRESSURE,
+	FILE_MEMORY_MAX,
+	FILE_MEMORY_USED,
 	FILE_SPREAD_PAGE,
 	FILE_SPREAD_SLAB,
 	FILE_TASKLIST,
@@ -1321,6 +1390,9 @@ static ssize_t cpuset_common_file_write(
 	case FILE_MEMORY_PRESSURE_ENABLED:
 		retval = update_memory_pressure_enabled(cs, buffer);
 		break;
+	case FILE_MEMORY_MAX:
+		retval = update_memory_max_nr_pages(cs, buffer);
+		break;
 	case FILE_MEMORY_PRESSURE:
 		retval = -EACCES;
 		break;
@@ -1441,6 +1513,12 @@ static ssize_t cpuset_common_file_read(s
 	case FILE_MEMORY_PRESSURE:
 		s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter));
 		break;
+	case FILE_MEMORY_MAX:
+		s += sprintf(s, "%d", cs->mems_max_pages);
+		break;
+	case FILE_MEMORY_USED:
+		s += sprintf(s, "%d", cs->mems_nr_pages);
+		break;
 	case FILE_SPREAD_PAGE:
 		*s++ = is_spread_page(cs) ? '1' : '0';
 		break;
@@ -1785,6 +1863,16 @@ static struct cftype cft_cpu_exclusive =
 	.private = FILE_CPU_EXCLUSIVE,
 };
 
+static struct cftype cft_mem_used = {
+	.name = "memory_nr_pages",
+	.private = FILE_MEMORY_USED,
+};
+
+static struct cftype cft_mem_max = {
+	.name = "memory_max_pages",
+	.private = FILE_MEMORY_MAX,
+};
+
 static struct cftype cft_mem_exclusive = {
 	.name = "mem_exclusive",
 	.private = FILE_MEM_EXCLUSIVE,
@@ -1830,6 +1918,10 @@ static int cpuset_populate_dir(struct de
 		return err;
 	if ((err = cpuset_add_file(cs_dentry, &cft_cpu_exclusive)) < 0)
 		return err;
+	if ((err = cpuset_add_file(cs_dentry, &cft_mem_max)) < 0)
+		return err;
+	if ((err = cpuset_add_file(cs_dentry, &cft_mem_used)) < 0)
+		return err;
 	if ((err = cpuset_add_file(cs_dentry, &cft_mem_exclusive)) < 0)
 		return err;
 	if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0)
@@ -1880,6 +1972,8 @@ static long cpuset_create(struct cpuset 
 	INIT_LIST_HEAD(&cs->sibling);
 	INIT_LIST_HEAD(&cs->children);
 	cs->mems_generation = cpuset_mems_generation++;
+	cs->mems_max_pages = parent->mems_max_pages;
+	cs->mems_nr_pages = 0;
 	fmeter_init(&cs->fmeter);
 
 	cs->parent = parent;
@@ -1986,6 +2080,8 @@ int __init cpuset_init_early(void)
 
 	tsk->cpuset = &top_cpuset;
 	tsk->cpuset->mems_generation = cpuset_mems_generation++;
+	tsk->cpuset->mems_max_pages = -1;
+	tsk->cpuset->mems_nr_pages = 0;
 	return 0;
 }
 
@@ -2005,6 +2101,8 @@ int __init cpuset_init(void)
 
 	fmeter_init(&top_cpuset.fmeter);
 	top_cpuset.mems_generation = cpuset_mems_generation++;
+	top_cpuset.mems_max_pages = -1;
+	top_cpuset.mems_nr_pages = 0;
 
 	init_task.cpuset = &top_cpuset;
 
@@ -2438,7 +2536,6 @@ int cpuset_memory_pressure_enabled __rea
 void __cpuset_memory_pressure_bump(void)
 {
 	struct cpuset *cs;
-
 	task_lock(current);
 	cs = current->cpuset;
 	fmeter_markevent(&cs->fmeter);
diff -puN mm/page_alloc.c~challenged-memory-controller mm/page_alloc.c
--- lxc/mm/page_alloc.c~challenged-memory-controller	2006-08-14 13:24:16.000000000 -0700
+++ lxc-dave/mm/page_alloc.c	2006-08-15 07:57:13.000000000 -0700
@@ -470,6 +470,11 @@ static void free_one_page(struct zone *z
 	free_pages_bulk(zone, 1, &list, order);
 }
 
+void arch_free_page(struct page *page, int order)
+{
+	cpuset_dec_nr_pages(page->cpuset, 1<<order);
+}
+
 static void __free_pages_ok(struct page *page, unsigned int order)
 {
 	unsigned long flags;
@@ -1020,6 +1025,9 @@ __alloc_pages(gfp_t gfp_mask, unsigned i
 
 	might_sleep_if(wait);
 
+	if (cpuset_inc_nr_pages(current->cpuset, 1<<order, gfp_mask))
+		return NULL;
+
 restart:
 	z = zonelist->zones;  /* the list of zones suitable for gfp_mask */
 
@@ -1159,6 +1167,10 @@ got_pg:
 	if (page)
 		set_page_owner(page, order, gfp_mask);
 #endif
+	if (page)
+		page->cpuset = current->cpuset;
+	else
+		cpuset_dec_nr_pages(current->cpuset, 1<<order);
 	return page;
 }
 
diff -puN mm/rmap.c~challenged-memory-controller mm/rmap.c
--- lxc/mm/rmap.c~challenged-memory-controller	2006-08-15 07:47:28.000000000 -0700
+++ lxc-dave/mm/rmap.c	2006-08-15 08:01:26.000000000 -0700
@@ -927,3 +927,8 @@ int try_to_unmap(struct page *page, int 
 	return ret;
 }
 
+extern int cpuset_amount_over_memory_max(const struct cpuset *cs);
+int page_has_naughty_cpuset(struct page *page)
+{
+	return cpuset_amount_over_memory_max(page->cpuset);
+}
diff -puN mm/vmscan.c~challenged-memory-controller mm/vmscan.c
--- lxc/mm/vmscan.c~challenged-memory-controller	2006-08-15 07:47:28.000000000 -0700
+++ lxc-dave/mm/vmscan.c	2006-08-15 08:05:03.000000000 -0700
@@ -63,8 +63,9 @@ struct scan_control {
 	int swap_cluster_max;
 
 	int swappiness;
-
 	int all_unreclaimable;
+	int only_pages_with_naughty_cpusets;
+	struct cpuset *only_this_cpuset;
 };
 
 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
@@ -445,6 +446,10 @@ static unsigned long shrink_page_list(st
 
 		VM_BUG_ON(PageActive(page));
 
+		if (cpuset_amount_over_memory_max(sc->only_this_cpuset) &&
+		    page->cpuset && page->cpuset != sc->only_this_cpuset) {
+			goto keep_locked;
+		}
 		sc->nr_scanned++;
 
 		if (!sc->may_swap && page_mapped(page))
@@ -793,9 +798,20 @@ force_reclaim_mapped:
 	spin_unlock_irq(&zone->lru_lock);
 
 	while (!list_empty(&l_hold)) {
+		extern int page_has_naughty_cpuset(struct page *page);
 		cond_resched();
 		page = lru_to_page(&l_hold);
 		list_del(&page->lru);
+		if (sc->only_this_cpuset &&
+		    page->cpuset && page->cpuset != sc->only_this_cpuset) {
+			list_add(&page->lru, &l_active);
+			continue;
+		}
+		if (sc->only_pages_with_naughty_cpusets &&
+		    !page_has_naughty_cpuset(page)) {
+			list_add(&page->lru, &l_active);
+			continue;
+		}
 		if (page_mapped(page)) {
 			if (!reclaim_mapped ||
 			    (total_swap_pages == 0 && PageAnon(page)) ||
@@ -875,6 +891,7 @@ static unsigned long shrink_zone(int pri
 	unsigned long nr_inactive;
 	unsigned long nr_to_scan;
 	unsigned long nr_reclaimed = 0;
+	int nr_scans = 0;
 
 	atomic_inc(&zone->reclaim_in_progress);
 
@@ -897,6 +914,11 @@ static unsigned long shrink_zone(int pri
 		nr_inactive = 0;
 
 	while (nr_active || nr_inactive) {
+		nr_scans++;
+		if (printk_ratelimit())
+			printk("%s() scan nr: %d\n", __func__, nr_scans);
+		if (nr_scans > 20)
+			sc->only_pages_with_naughty_cpusets = 0;
 		if (nr_active) {
 			nr_to_scan = min(nr_active,
 					(unsigned long)sc->swap_cluster_max);
@@ -993,6 +1015,7 @@ unsigned long try_to_free_pages(struct z
 		.swap_cluster_max = SWAP_CLUSTER_MAX,
 		.may_swap = 1,
 		.swappiness = vm_swappiness,
+		.only_pages_with_naughty_cpusets = 1,
 	};
 
 	delay_swap_prefetch();
@@ -1090,6 +1113,7 @@ static unsigned long balance_pgdat(pg_da
 		.may_swap = 1,
 		.swap_cluster_max = SWAP_CLUSTER_MAX,
 		.swappiness = vm_swappiness,
+		.only_pages_with_naughty_cpusets = 1,
 	};
 
 loop_again:
@@ -1310,7 +1334,6 @@ void wakeup_kswapd(struct zone *zone, in
 	wake_up_interruptible(&pgdat->kswapd_wait);
 }
 
-#ifdef CONFIG_PM
 /*
  * Helper function for shrink_all_memory().  Tries to reclaim 'nr_pages' pages
  * from LRU lists system-wide, for given pass and priority, and returns the
@@ -1363,7 +1386,7 @@ static unsigned long shrink_all_zones(un
  * LRU order by reclaiming preferentially
  * inactive > active > active referenced > active mapped
  */
-unsigned long shrink_all_memory(unsigned long nr_pages)
+unsigned long shrink_all_memory(unsigned long nr_pages, struct cpuset *cs)
 {
 	unsigned long lru_pages, nr_slab;
 	unsigned long ret = 0;
@@ -1376,6 +1399,8 @@ unsigned long shrink_all_memory(unsigned
 		.swap_cluster_max = nr_pages,
 		.may_writepage = 1,
 		.swappiness = vm_swappiness,
+		.only_pages_with_naughty_cpusets = 1,
+		.only_this_cpuset = cs,
 	};
 
 	delay_swap_prefetch();
@@ -1462,7 +1487,6 @@ out:
 
 	return ret;
 }
-#endif
 
 #ifdef CONFIG_HOTPLUG_CPU
 /* It's optimal to keep kswapds on the same CPUs as their memory, but
@@ -1568,6 +1592,7 @@ static int __zone_reclaim(struct zone *z
 					SWAP_CLUSTER_MAX),
 		.gfp_mask = gfp_mask,
 		.swappiness = vm_swappiness,
+		.only_pages_with_naughty_cpusets = 1,
 	};
 
 	disable_swap_token();
_

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

             reply	other threads:[~2006-08-15 19:21 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2006-08-15 19:20 dave [this message]
2006-08-15 22:07 ` [RFC][PATCH] "challenged" memory controller Paul Jackson
2006-08-15 22:24   ` Dave Hansen
2006-08-15 22:49     ` Paul Jackson
2006-08-17 10:41   ` Balbir Singh
2006-08-17 14:47     ` Dave Hansen
2006-08-18  3:33       ` Balbir Singh
2006-08-17 16:34     ` Paul Jackson
2006-08-16  5:44 ` Matt Helsley
2006-08-16 14:00 ` Balbir Singh
2006-08-16 20:31 ` Chandra Seetharaman

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20060815192047.EE4A0960@localhost.localdomain \
    --to=dave@sr71.net \
    --cc=balbir@in.ibm.com \
    --cc=linux-mm@kvack.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.