From: Balbir Singh <balbir@linux.vnet.ibm.com>
To: linux-mm@kvack.org
Cc: YAMAMOTO Takashi <yamamoto@valinux.co.jp>,
lizf@cn.fujitsu.com,
KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>,
Balbir Singh <balbir@linux.vnet.ibm.com>,
Rik van Riel <riel@redhat.com>,
Andrew Morton <akpm@linux-foundation.org>,
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Subject: [PATCH 4/4] Memory controller soft limit reclaim on contention (v6)
Date: Sat, 14 Mar 2009 23:01:11 +0530 [thread overview]
Message-ID: <20090314173111.16591.68465.sendpatchset@localhost.localdomain> (raw)
In-Reply-To: <20090314173043.16591.18336.sendpatchset@localhost.localdomain>
Feature: Implement reclaim from groups over their soft limit
From: Balbir Singh <balbir@linux.vnet.ibm.com>
Changelog v6...v5
1. Reclaim arguments to hierarchical reclaim have been merged into one
parameter called reclaim_options.
2. Check if we failed to reclaim from one cgroup during soft limit reclaim;
if so, move on to the next one. This can be very useful if the zonelist
passed to soft limit reclaim has no allocations from the selected
memory cgroup.
3. Coding style cleanups
Changelog v5...v4
1. Throttling is removed; earlier we throttled tasks over their soft limit.
2. Reclaim has been moved back to __alloc_pages_internal; several experiments
and tests showed that it was the best place to reclaim memory. kswapd has
a different goal, which does not work well with a single soft limit for the
memory cgroup.
3. Soft limit reclaim is more targeted, and the number of pages reclaimed
depends on the amount by which the soft limit is exceeded.
Changelog v4...v3
1. soft_reclaim is now called from balance_pgdat
2. soft_reclaim is aware of nodes and zones
3. A mem_cgroup will be throttled if it is undergoing soft limit reclaim
and at the same time tries to allocate pages and exceeds its soft limit.
4. A new mem_cgroup_shrink_zone() routine has been added to shrink zones
particular to a mem cgroup.
Changelog v3...v2
1. Convert several arguments to hierarchical reclaim to flags, thereby
consolidating them
2. The reclaim for soft limits is now triggered from kswapd
3. try_to_free_mem_cgroup_pages() now accepts an optional zonelist argument
Changelog v2...v1
1. Added support for hierarchical soft limits
This patch allows reclaim from memory cgroups on contention (via the
direct reclaim path).
Memory cgroup soft limit reclaim finds the group that exceeds its soft limit
by the largest number of pages, reclaims pages from it, and then reinserts
the cgroup into its correct place in the rbtree.
Reclaim arguments to hierarchical reclaim have been merged into one parameter
called reclaim_options.
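For readers who want the shape of the algorithm without wading through the
diff, below is a minimal userspace sketch of the selection loop. It is
illustrative only: struct group, the qsort-based ordering, and the
reclaim_pages() stub are stand-ins for the kernel's mem_cgroup, rbtree and
try_to_free_mem_cgroup_pages(); only the "pick largest excess, reclaim
excess >> 4, reinsert" policy mirrors this patch.

#include <stdio.h>
#include <stdlib.h>

/* Toy stand-in for struct mem_cgroup: usage and soft limit in pages. */
struct group {
	const char *name;
	unsigned long usage;
	unsigned long soft_limit;
};

static unsigned long excess(const struct group *g)
{
	return g->usage > g->soft_limit ? g->usage - g->soft_limit : 0;
}

/* Sort by excess, largest last; models the rbtree's rightmost node. */
static int by_excess(const void *a, const void *b)
{
	unsigned long ea = excess(a), eb = excess(b);
	return (ea > eb) - (ea < eb);
}

/* Stand-in for hierarchical reclaim: free at most excess >> 4 pages. */
static unsigned long reclaim_pages(struct group *g)
{
	unsigned long goal = excess(g) >> 4;
	g->usage -= goal;
	return goal;
}

int main(void)
{
	struct group groups[] = {
		{ "A", 1000, 400 },	/* 600 pages over its soft limit */
		{ "B",  500, 450 },	/*  50 pages over */
		{ "C",  300, 800 },	/* under its soft limit, never picked */
	};
	int n = sizeof(groups) / sizeof(groups[0]);
	unsigned long reclaimed = 0;

	/*
	 * Keep going until some progress is made, as in
	 * mem_cgroup_soft_limit_reclaim() below.
	 */
	while (!reclaimed) {
		struct group *victim;

		qsort(groups, n, sizeof(groups[0]), by_excess);
		victim = &groups[n - 1];	/* largest excess */
		if (!excess(victim))
			break;			/* nothing over its limit */
		reclaimed += reclaim_pages(victim);
	}
	printf("reclaimed %lu pages\n", reclaimed);
	return 0;
}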
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
---
include/linux/memcontrol.h | 8 ++
include/linux/swap.h | 1
mm/memcontrol.c | 205 ++++++++++++++++++++++++++++++++++++++++----
mm/page_alloc.c | 9 ++
mm/vmscan.c | 5 +
5 files changed, 205 insertions(+), 23 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 18146c9..b99d9c5 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -116,7 +116,8 @@ static inline bool mem_cgroup_disabled(void)
}
extern bool mem_cgroup_oom_called(struct task_struct *task);
-
+unsigned long mem_cgroup_soft_limit_reclaim(struct zonelist *zl,
+ gfp_t gfp_mask);
#else /* CONFIG_CGROUP_MEM_RES_CTLR */
struct mem_cgroup;
@@ -264,6 +265,11 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
}
+static inline
+unsigned long mem_cgroup_soft_limit_reclaim(struct zonelist *zl, gfp_t gfp_mask)
+{
+ return 0;
+}
#endif /* CONFIG_CGROUP_MEM_CONT */
#endif /* _LINUX_MEMCONTROL_H */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 989eb53..c128337 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -215,6 +215,7 @@ static inline void lru_cache_add_active_file(struct page *page)
extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
gfp_t gfp_mask);
extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
+ struct zonelist *zl,
gfp_t gfp_mask, bool noswap,
unsigned int swappiness);
extern int __isolate_lru_page(struct page *page, int mode, int file);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 200d44a..980bd18 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -191,6 +191,7 @@ struct mem_cgroup {
unsigned long last_tree_update; /* Last time the tree was */
/* updated in jiffies */
+ bool on_tree; /* Is the node on tree? */
/*
* statistics. This must be placed at the end of memcg.
*/
@@ -227,18 +228,29 @@ pcg_default_flags[NR_CHARGE_TYPE] = {
#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val) ((val) & 0xffff)
+/*
+ * Bits used for hierarchical reclaim
+ */
+#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0
+#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
+#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
+#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
+#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2
+#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
+
static void mem_cgroup_get(struct mem_cgroup *mem);
static void mem_cgroup_put(struct mem_cgroup *mem);
static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
-static void mem_cgroup_insert_exceeded(struct mem_cgroup *mem)
+static void __mem_cgroup_insert_exceeded(struct mem_cgroup *mem)
{
struct rb_node **p = &mem_cgroup_soft_limit_tree.rb_node;
struct rb_node *parent = NULL;
struct mem_cgroup *mem_node;
- unsigned long flags;
- spin_lock_irqsave(&memcg_soft_limit_tree_lock, flags);
+ if (mem->on_tree)
+ return;
+
mem->usage_in_excess = res_counter_soft_limit_excess(&mem->res);
while (*p) {
parent = *p;
@@ -256,6 +268,23 @@ static void mem_cgroup_insert_exceeded(struct mem_cgroup *mem)
rb_insert_color(&mem->mem_cgroup_node,
&mem_cgroup_soft_limit_tree);
mem->last_tree_update = jiffies;
+ mem->on_tree = true;
+}
+
+static void __mem_cgroup_remove_exceeded(struct mem_cgroup *mem)
+{
+ if (!mem->on_tree)
+ return;
+ rb_erase(&mem->mem_cgroup_node, &mem_cgroup_soft_limit_tree);
+ mem->on_tree = false;
+}
+
+static void mem_cgroup_insert_exceeded(struct mem_cgroup *mem)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&memcg_soft_limit_tree_lock, flags);
+ __mem_cgroup_insert_exceeded(mem);
spin_unlock_irqrestore(&memcg_soft_limit_tree_lock, flags);
}
@@ -263,8 +292,53 @@ static void mem_cgroup_remove_exceeded(struct mem_cgroup *mem)
{
unsigned long flags;
spin_lock_irqsave(&memcg_soft_limit_tree_lock, flags);
- rb_erase(&mem->mem_cgroup_node, &mem_cgroup_soft_limit_tree);
+ __mem_cgroup_remove_exceeded(mem);
+ spin_unlock_irqrestore(&memcg_soft_limit_tree_lock, flags);
+}
+
+unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem)
+{
+ unsigned long flags;
+ unsigned long long excess;
+
+ spin_lock_irqsave(&memcg_soft_limit_tree_lock, flags);
+ excess = mem->usage_in_excess >> PAGE_SHIFT;
spin_unlock_irqrestore(&memcg_soft_limit_tree_lock, flags);
+ return (excess > ULONG_MAX) ? ULONG_MAX : excess;
+}
+
+static struct mem_cgroup *__mem_cgroup_largest_soft_limit_node(void)
+{
+ struct rb_node *rightmost = NULL;
+ struct mem_cgroup *mem = NULL;
+
+retry:
+ rightmost = rb_last(&mem_cgroup_soft_limit_tree);
+ if (!rightmost)
+ goto done; /* Nothing to reclaim from */
+
+ mem = rb_entry(rightmost, struct mem_cgroup, mem_cgroup_node);
+ /*
+ * Remove the node now, but someone else can add it back;
+ * we will add it back at the end of reclaim to its correct
+ * position in the tree.
+ */
+ __mem_cgroup_remove_exceeded(mem);
+ if (!css_tryget(&mem->css) || !res_counter_soft_limit_excess(&mem->res))
+ goto retry;
+done:
+ return mem;
+}
+
+static struct mem_cgroup *mem_cgroup_largest_soft_limit_node(void)
+{
+ struct mem_cgroup *mem;
+ unsigned long flags;
+
+ spin_lock_irqsave(&memcg_soft_limit_tree_lock, flags);
+ mem = __mem_cgroup_largest_soft_limit_node();
+ spin_unlock_irqrestore(&memcg_soft_limit_tree_lock, flags);
+ return mem;
}
static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
@@ -889,14 +963,42 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
* If shrink==true, to avoid freeing too much, this returns immediately.
*/
static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
- gfp_t gfp_mask, bool noswap, bool shrink)
+ struct zonelist *zl,
+ gfp_t gfp_mask,
+ unsigned long reclaim_options)
{
struct mem_cgroup *victim;
int ret, total = 0;
int loop = 0;
+ bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
+ bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
+ bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
+ unsigned long excess = mem_cgroup_get_excess(root_mem);
- while (loop < 2) {
+ while (1) {
+ if (loop >= 2) {
+ if (!check_soft)
+ break;
+ /*
+ * We want to do more targeted reclaim. Reclaiming
+ * excess >> 4 is not so much that we reclaim too
+ * much, nor so little that we keep coming back
+ * to reclaim from this cgroup.
+ */
+ if (total >= (excess >> 4))
+ break;
+ }
victim = mem_cgroup_select_victim(root_mem);
+ /*
+ * In the first loop, don't reclaim from victims below
+ * their soft limit
+ */
+ if (!loop && res_counter_check_under_soft_limit(&victim->res)) {
+ if (victim == root_mem)
+ loop++;
+ css_put(&victim->css);
+ continue;
+ }
if (victim == root_mem)
loop++;
if (!mem_cgroup_local_usage(&victim->stat)) {
@@ -905,8 +1007,9 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
continue;
}
/* we use swappiness of local cgroup */
- ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap,
- get_swappiness(victim));
+ ret = try_to_free_mem_cgroup_pages(victim, zl, gfp_mask,
+ noswap,
+ get_swappiness(victim));
css_put(&victim->css);
/*
* At shrinking usage, we can't check we should stop here or
@@ -916,7 +1019,10 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
if (shrink)
return ret;
total += ret;
- if (mem_cgroup_check_under_limit(root_mem))
+ if (check_soft) {
+ if (res_counter_check_under_soft_limit(&root_mem->res))
+ return total;
+ } else if (mem_cgroup_check_under_limit(root_mem))
return 1 + total;
}
return total;
@@ -1022,7 +1128,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
while (1) {
int ret;
- bool noswap = false;
+ unsigned long flags = 0;
ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res,
&soft_fail_res);
@@ -1035,7 +1141,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
break;
/* mem+swap counter fails */
res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
- noswap = true;
+ flags = MEM_CGROUP_RECLAIM_NOSWAP;
mem_over_limit = mem_cgroup_from_res_counter(fail_res,
memsw);
} else
@@ -1046,8 +1152,8 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
if (!(gfp_mask & __GFP_WAIT))
goto nomem;
- ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask,
- noswap, false);
+ ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
+ gfp_mask, flags);
if (ret)
continue;
@@ -1757,8 +1863,8 @@ int mem_cgroup_shrink_usage(struct page *page,
return 0;
do {
- progress = mem_cgroup_hierarchical_reclaim(mem,
- gfp_mask, true, false);
+ progress = mem_cgroup_hierarchical_reclaim(mem, NULL,
+ gfp_mask, MEM_CGROUP_RECLAIM_NOSWAP);
progress += mem_cgroup_check_under_limit(mem);
} while (!progress && --retry);
@@ -1812,8 +1918,9 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
if (!ret)
break;
- progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL,
- false, true);
+ progress = mem_cgroup_hierarchical_reclaim(memcg, NULL,
+ GFP_KERNEL,
+ MEM_CGROUP_RECLAIM_SHRINK);
curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
/* Usage is reduced ? */
if (curusage >= oldusage)
@@ -1861,7 +1968,9 @@ int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
if (!ret)
break;
- mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true, true);
+ mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
+ MEM_CGROUP_RECLAIM_NOSWAP |
+ MEM_CGROUP_RECLAIM_SHRINK);
curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
/* Usage is reduced ? */
if (curusage >= oldusage)
@@ -1872,6 +1981,62 @@ int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
return ret;
}
+unsigned long mem_cgroup_soft_limit_reclaim(struct zonelist *zl, gfp_t gfp_mask)
+{
+ unsigned long nr_reclaimed = 0;
+ struct mem_cgroup *mem, *next_mem = NULL;
+ unsigned long flags;
+ unsigned long reclaimed;
+
+ /*
+ * This loop can run for a while, especially if mem_cgroups
+ * continuously keep exceeding their soft limit and putting
+ * the system under pressure.
+ */
+ do {
+ if (next_mem)
+ mem = next_mem;
+ else
+ mem = mem_cgroup_largest_soft_limit_node();
+ if (!mem)
+ break;
+
+ reclaimed = mem_cgroup_hierarchical_reclaim(mem, zl,
+ gfp_mask,
+ MEM_CGROUP_RECLAIM_SOFT);
+ nr_reclaimed += reclaimed;
+ spin_lock_irqsave(&memcg_soft_limit_tree_lock, flags);
+
+ /*
+ * If we failed to reclaim anything from this memory cgroup,
+ * it is time to move on to the next one
+ */
+ next_mem = NULL;
+ if (!reclaimed) {
+ do {
+ /*
+ * By the time we get the soft_limit lock
+ * again, someone might have added the
+ * group back onto the RB tree. Iterate to
+ * make sure we get a different mem.
+ * __mem_cgroup_largest_soft_limit_node()
+ * returns NULL if no other cgroup is
+ * present on the tree
+ */
+ next_mem =
+ __mem_cgroup_largest_soft_limit_node();
+ } while (next_mem == mem);
+ }
+ mem->usage_in_excess = res_counter_soft_limit_excess(&mem->res);
+ __mem_cgroup_remove_exceeded(mem);
+ if (mem->usage_in_excess)
+ __mem_cgroup_insert_exceeded(mem);
+ spin_unlock_irqrestore(&memcg_soft_limit_tree_lock, flags);
+ css_put(&mem->css);
+ } while (!nr_reclaimed);
+ return nr_reclaimed;
+}
+
/*
* This routine traverse page_cgroup in given list and drop them all.
* *And* this routine doesn't reclaim page itself, just removes page_cgroup.
@@ -1995,7 +2160,7 @@ try_to_free:
ret = -EINTR;
goto out;
}
- progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
+ progress = try_to_free_mem_cgroup_pages(mem, NULL, GFP_KERNEL,
false, get_swappiness(mem));
if (!progress) {
nr_retries--;
@@ -2600,6 +2765,8 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
mem->last_scanned_child = 0;
mem->usage_in_excess = 0;
mem->last_tree_update = 0; /* Yes, time begins at 0 here */
+ mem->on_tree = false;
+
spin_lock_init(&mem->reclaim_param_lock);
if (parent)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f8fd1e2..5e1a6ca 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1598,7 +1598,14 @@ nofail_alloc:
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
- did_some_progress = try_to_free_pages(zonelist, order, gfp_mask);
+ /*
+ * Try to free up some pages from the memory controller's
+ * soft limit queue.
+ */
+ did_some_progress = mem_cgroup_soft_limit_reclaim(zonelist, gfp_mask);
+ if (order || !did_some_progress)
+ did_some_progress += try_to_free_pages(zonelist, order,
+ gfp_mask);
p->reclaim_state = NULL;
lockdep_clear_current_reclaim_state();
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 15f7737..13001d9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1708,6 +1708,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
+ struct zonelist *zonelist,
gfp_t gfp_mask,
bool noswap,
unsigned int swappiness)
@@ -1721,14 +1722,14 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
.mem_cgroup = mem_cont,
.isolate_pages = mem_cgroup_isolate_pages,
};
- struct zonelist *zonelist;
if (noswap)
sc.may_unmap = 0;
sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
- zonelist = NODE_DATA(numa_node_id())->node_zonelists;
+ if (!zonelist)
+ zonelist = NODE_DATA(numa_node_id())->node_zonelists;
return do_try_to_free_pages(zonelist, &sc);
}
#endif
--
Balbir