From: Tejun Heo <tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
To: hannes-druUgvl0LCNAfugRpC6u6w@public.gmane.org,
mhocko-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org
Cc: cgroups-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
linux-mm-Bw31MaZKKs3YtjvyW6yDsg@public.gmane.org,
vdavydov-bzQdu9zFT3WakBO8gow8eQ@public.gmane.org,
kernel-team-b10kYP2dOMg@public.gmane.org,
Tejun Heo <tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
Subject: [PATCH 3/4] memcg: punt high overage reclaim to return-to-userland path
Date: Fri, 28 Aug 2015 11:25:29 -0400 [thread overview]
Message-ID: <1440775530-18630-4-git-send-email-tj@kernel.org> (raw)
In-Reply-To: <1440775530-18630-1-git-send-email-tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
Currently, try_charge() tries to reclaim memory directly when the high
limit is breached; however, this has a couple of issues.
* try_charge() can be invoked from any in-kernel allocation site and
the reclaim path may use a considerable amount of stack. This can lead to
stack overflows which are extremely difficult to reproduce.
* If the allocation doesn't have __GFP_WAIT, direct reclaim is
skipped. If a process performs only speculative allocations, it can
blow way past the high limit. This is actually easily reproducible
by simply doing "find /". VFS tries speculative !__GFP_WAIT
allocations first, so as long as there's memory which can be
consumed without blocking, it can keep allocating memory regardless
of the high limit.
This patch makes try_charge() always punt the direct reclaim to the
return-to-userland path. If try_charge() detects that the high limit is
breached, it sets current->memcg_over_high to the offending memcg and
schedules execution of mem_cgroup_handle_over_high() which performs
the direct reclaim from the return-to-userland path.
As long as the kernel doesn't have a run-away allocation spree, this
should provide enough protection while making kmemcg behave more
consistently.
Signed-off-by: Tejun Heo <tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
---
include/linux/memcontrol.h | 6 +++++
include/linux/sched.h | 1 +
include/linux/tracehook.h | 3 +++
mm/memcontrol.c | 66 +++++++++++++++++++++++++++++++++++++---------
4 files changed, 64 insertions(+), 12 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 3d28656..8d345a7 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -402,6 +402,8 @@ static inline int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
return inactive * inactive_ratio < active;
}
+void mem_cgroup_handle_over_high(void);
+
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
struct task_struct *p);
@@ -621,6 +623,10 @@ static inline void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
{
}
+static inline void mem_cgroup_handle_over_high(void)
+{
+}
+
static inline void mem_cgroup_oom_enable(void)
{
}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ef73b54..c76b71d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1785,6 +1785,7 @@ struct task_struct {
#endif /* CONFIG_TRACING */
#ifdef CONFIG_MEMCG
struct mem_cgroup *memcg_in_oom;
+ struct mem_cgroup *memcg_over_high; /* reclaim on returning to user */
gfp_t memcg_oom_gfp_mask;
int memcg_oom_order;
#endif
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 84d4972..26c1521 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -50,6 +50,7 @@
#include <linux/ptrace.h>
#include <linux/security.h>
#include <linux/task_work.h>
+#include <linux/memcontrol.h>
struct linux_binprm;
/*
@@ -188,6 +189,8 @@ static inline void tracehook_notify_resume(struct pt_regs *regs)
smp_mb__after_atomic();
if (unlikely(current->task_works))
task_work_run();
+
+ mem_cgroup_handle_over_high();
}
#endif /* <linux/tracehook.h> */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 74abb31..c94b686 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -62,6 +62,7 @@
#include <linux/oom.h>
#include <linux/lockdep.h>
#include <linux/file.h>
+#include <linux/tracehook.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
@@ -1963,6 +1964,33 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
return NOTIFY_OK;
}
+/*
+ * Scheduled by try_charge() to be executed from the userland return path
+ * and reclaims memory over the high limit.
+ */
+void mem_cgroup_handle_over_high(void)
+{
+ struct mem_cgroup *memcg = current->memcg_over_high;
+
+ if (likely(!memcg))
+ return;
+
+ do {
+ unsigned long usage = page_counter_read(&memcg->memory);
+ unsigned long high = ACCESS_ONCE(memcg->high);
+
+ if (usage <= high)
+ continue;
+
+ mem_cgroup_events(memcg, MEMCG_HIGH, 1);
+ try_to_free_mem_cgroup_pages(memcg, usage - high,
+ GFP_KERNEL, true);
+ } while ((memcg = parent_mem_cgroup(memcg)));
+
+ css_put(&current->memcg_over_high->css);
+ current->memcg_over_high = NULL;
+}
+
static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned int nr_pages)
{
@@ -2071,21 +2099,27 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
css_get_many(&memcg->css, batch);
if (batch > nr_pages)
refill_stock(memcg, batch - nr_pages);
- if (!(gfp_mask & __GFP_WAIT))
- goto done;
+
/*
- * If the hierarchy is above the normal consumption range,
- * make the charging task trim their excess contribution.
+ * If the hierarchy is above the normal consumption range, schedule
+ * direct reclaim on returning to userland. We can perform direct
+ * reclaim here if __GFP_WAIT; however, punting has the benefit of
+ * avoiding surprise high stack usages and it's fine to breach the
+ * high limit temporarily while control stays in kernel.
*/
- do {
- unsigned long usage = page_counter_read(&memcg->memory);
- unsigned long high = ACCESS_ONCE(memcg->high);
+ if (!current->memcg_over_high) {
+ struct mem_cgroup *pos = memcg;
- if (usage <= high)
- continue;
- mem_cgroup_events(memcg, MEMCG_HIGH, 1);
- try_to_free_mem_cgroup_pages(memcg, high - usage, gfp_mask, true);
- } while ((memcg = parent_mem_cgroup(memcg)));
+ do {
+ if (page_counter_read(&pos->memory) > pos->high) {
+ /* make user return path rescan from leaf */
+ css_get(&memcg->css);
+ current->memcg_over_high = memcg;
+ set_notify_resume(current);
+ break;
+ }
+ } while ((pos = parent_mem_cgroup(pos)));
+ }
done:
return ret;
}
@@ -5053,6 +5087,13 @@ static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
}
#endif
+static void mem_cgroup_exit(struct cgroup_subsys_state *css,
+ struct cgroup_subsys_state *old_css,
+ struct task_struct *task)
+{
+ mem_cgroup_handle_over_high();
+}
+
/*
* Cgroup retains root cgroups across [un]mount cycles making it necessary
* to verify whether we're attached to the default hierarchy on each mount
@@ -5223,6 +5264,7 @@ struct cgroup_subsys memory_cgrp_subsys = {
.can_attach = mem_cgroup_can_attach,
.cancel_attach = mem_cgroup_cancel_attach,
.attach = mem_cgroup_move_task,
+ .exit = mem_cgroup_exit,
.bind = mem_cgroup_bind,
.dfl_cftypes = memory_files,
.legacy_cftypes = mem_cgroup_legacy_files,
--
2.4.3
WARNING: multiple messages have this Message-ID (diff)
From: Tejun Heo <tj@kernel.org>
To: hannes@cmpxchg.org, mhocko@kernel.org
Cc: cgroups@vger.kernel.org, linux-mm@kvack.org,
vdavydov@parallels.com, kernel-team@fb.com,
Tejun Heo <tj@kernel.org>
Subject: [PATCH 3/4] memcg: punt high overage reclaim to return-to-userland path
Date: Fri, 28 Aug 2015 11:25:29 -0400 [thread overview]
Message-ID: <1440775530-18630-4-git-send-email-tj@kernel.org> (raw)
In-Reply-To: <1440775530-18630-1-git-send-email-tj@kernel.org>
Currently, try_charge() tries to reclaim memory directly when the high
limit is breached; however, this has a couple of issues.
* try_charge() can be invoked from any in-kernel allocation site and
the reclaim path may use a considerable amount of stack. This can lead to
stack overflows which are extremely difficult to reproduce.
* If the allocation doesn't have __GFP_WAIT, direct reclaim is
skipped. If a process performs only speculative allocations, it can
blow way past the high limit. This is actually easily reproducible
by simply doing "find /". VFS tries speculative !__GFP_WAIT
allocations first, so as long as there's memory which can be
consumed without blocking, it can keep allocating memory regardless
of the high limit.
This patch makes try_charge() always punt the direct reclaim to the
return-to-userland path. If try_charge() detects that the high limit is
breached, it sets current->memcg_over_high to the offending memcg and
schedules execution of mem_cgroup_handle_over_high() which performs
the direct reclaim from the return-to-userland path.
As long as the kernel doesn't have a run-away allocation spree, this
should provide enough protection while making kmemcg behave more
consistently.
Signed-off-by: Tejun Heo <tj@kernel.org>
---
include/linux/memcontrol.h | 6 +++++
include/linux/sched.h | 1 +
include/linux/tracehook.h | 3 +++
mm/memcontrol.c | 66 +++++++++++++++++++++++++++++++++++++---------
4 files changed, 64 insertions(+), 12 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 3d28656..8d345a7 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -402,6 +402,8 @@ static inline int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
return inactive * inactive_ratio < active;
}
+void mem_cgroup_handle_over_high(void);
+
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
struct task_struct *p);
@@ -621,6 +623,10 @@ static inline void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
{
}
+static inline void mem_cgroup_handle_over_high(void)
+{
+}
+
static inline void mem_cgroup_oom_enable(void)
{
}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ef73b54..c76b71d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1785,6 +1785,7 @@ struct task_struct {
#endif /* CONFIG_TRACING */
#ifdef CONFIG_MEMCG
struct mem_cgroup *memcg_in_oom;
+ struct mem_cgroup *memcg_over_high; /* reclaim on returning to user */
gfp_t memcg_oom_gfp_mask;
int memcg_oom_order;
#endif
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 84d4972..26c1521 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -50,6 +50,7 @@
#include <linux/ptrace.h>
#include <linux/security.h>
#include <linux/task_work.h>
+#include <linux/memcontrol.h>
struct linux_binprm;
/*
@@ -188,6 +189,8 @@ static inline void tracehook_notify_resume(struct pt_regs *regs)
smp_mb__after_atomic();
if (unlikely(current->task_works))
task_work_run();
+
+ mem_cgroup_handle_over_high();
}
#endif /* <linux/tracehook.h> */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 74abb31..c94b686 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -62,6 +62,7 @@
#include <linux/oom.h>
#include <linux/lockdep.h>
#include <linux/file.h>
+#include <linux/tracehook.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
@@ -1963,6 +1964,33 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
return NOTIFY_OK;
}
+/*
+ * Scheduled by try_charge() to be executed from the userland return path
+ * and reclaims memory over the high limit.
+ */
+void mem_cgroup_handle_over_high(void)
+{
+ struct mem_cgroup *memcg = current->memcg_over_high;
+
+ if (likely(!memcg))
+ return;
+
+ do {
+ unsigned long usage = page_counter_read(&memcg->memory);
+ unsigned long high = ACCESS_ONCE(memcg->high);
+
+ if (usage <= high)
+ continue;
+
+ mem_cgroup_events(memcg, MEMCG_HIGH, 1);
+ try_to_free_mem_cgroup_pages(memcg, usage - high,
+ GFP_KERNEL, true);
+ } while ((memcg = parent_mem_cgroup(memcg)));
+
+ css_put(&current->memcg_over_high->css);
+ current->memcg_over_high = NULL;
+}
+
static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned int nr_pages)
{
@@ -2071,21 +2099,27 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
css_get_many(&memcg->css, batch);
if (batch > nr_pages)
refill_stock(memcg, batch - nr_pages);
- if (!(gfp_mask & __GFP_WAIT))
- goto done;
+
/*
- * If the hierarchy is above the normal consumption range,
- * make the charging task trim their excess contribution.
+ * If the hierarchy is above the normal consumption range, schedule
+ * direct reclaim on returning to userland. We can perform direct
+ * reclaim here if __GFP_WAIT; however, punting has the benefit of
+ * avoiding surprise high stack usages and it's fine to breach the
+ * high limit temporarily while control stays in kernel.
*/
- do {
- unsigned long usage = page_counter_read(&memcg->memory);
- unsigned long high = ACCESS_ONCE(memcg->high);
+ if (!current->memcg_over_high) {
+ struct mem_cgroup *pos = memcg;
- if (usage <= high)
- continue;
- mem_cgroup_events(memcg, MEMCG_HIGH, 1);
- try_to_free_mem_cgroup_pages(memcg, high - usage, gfp_mask, true);
- } while ((memcg = parent_mem_cgroup(memcg)));
+ do {
+ if (page_counter_read(&pos->memory) > pos->high) {
+ /* make user return path rescan from leaf */
+ css_get(&memcg->css);
+ current->memcg_over_high = memcg;
+ set_notify_resume(current);
+ break;
+ }
+ } while ((pos = parent_mem_cgroup(pos)));
+ }
done:
return ret;
}
@@ -5053,6 +5087,13 @@ static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
}
#endif
+static void mem_cgroup_exit(struct cgroup_subsys_state *css,
+ struct cgroup_subsys_state *old_css,
+ struct task_struct *task)
+{
+ mem_cgroup_handle_over_high();
+}
+
/*
* Cgroup retains root cgroups across [un]mount cycles making it necessary
* to verify whether we're attached to the default hierarchy on each mount
@@ -5223,6 +5264,7 @@ struct cgroup_subsys memory_cgrp_subsys = {
.can_attach = mem_cgroup_can_attach,
.cancel_attach = mem_cgroup_cancel_attach,
.attach = mem_cgroup_move_task,
+ .exit = mem_cgroup_exit,
.bind = mem_cgroup_bind,
.dfl_cftypes = memory_files,
.legacy_cftypes = mem_cgroup_legacy_files,
--
2.4.3
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
next prev parent reply other threads:[~2015-08-28 15:25 UTC|newest]
Thread overview: 66+ messages / expand[flat|nested] mbox.gz Atom feed top
2015-08-28 15:25 [PATCHSET] memcg: improve high limit behavior and always enable kmemcg on dfl hier Tejun Heo
2015-08-28 15:25 ` Tejun Heo
[not found] ` <1440775530-18630-1-git-send-email-tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
2015-08-28 15:25 ` [PATCH 1/4] memcg: fix over-high reclaim amount Tejun Heo
2015-08-28 15:25 ` Tejun Heo
2015-08-28 17:06 ` Michal Hocko
[not found] ` <20150828170612.GA21463-2MMpYkNvuYDjFM9bn6wA6Q@public.gmane.org>
2015-08-28 18:32 ` Tejun Heo
2015-08-28 18:32 ` Tejun Heo
[not found] ` <20150828183209.GA9423-qYNAdHglDFBN0TnZuCh8vA@public.gmane.org>
2015-08-31 7:51 ` Michal Hocko
2015-08-31 7:51 ` Michal Hocko
[not found] ` <20150831075133.GA29723-2MMpYkNvuYDjFM9bn6wA6Q@public.gmane.org>
2015-08-31 13:38 ` Tejun Heo
2015-08-31 13:38 ` Tejun Heo
2015-09-01 12:51 ` Michal Hocko
[not found] ` <20150901125149.GD8810-2MMpYkNvuYDjFM9bn6wA6Q@public.gmane.org>
2015-09-01 18:33 ` Tejun Heo
2015-09-01 18:33 ` Tejun Heo
2015-08-28 15:25 ` [PATCH 2/4] memcg: flatten task_struct->memcg_oom Tejun Heo
2015-08-28 15:25 ` Tejun Heo
[not found] ` <1440775530-18630-3-git-send-email-tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
2015-08-28 17:11 ` Michal Hocko
2015-08-28 17:11 ` Michal Hocko
2015-08-28 15:25 ` Tejun Heo [this message]
2015-08-28 15:25 ` [PATCH 3/4] memcg: punt high overage reclaim to return-to-userland path Tejun Heo
[not found] ` <1440775530-18630-4-git-send-email-tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
2015-08-28 16:36 ` Vladimir Davydov
2015-08-28 16:36 ` Vladimir Davydov
2015-08-28 16:48 ` Tejun Heo
2015-08-28 16:48 ` Tejun Heo
2015-08-28 20:32 ` Vladimir Davydov
2015-08-28 20:44 ` Tejun Heo
2015-08-28 20:44 ` Tejun Heo
2015-08-28 22:06 ` Tejun Heo
[not found] ` <20150828220632.GF11089-Gd/HAXX7CRxy/B6EtB590w@public.gmane.org>
2015-08-29 7:59 ` Vladimir Davydov
2015-08-29 7:59 ` Vladimir Davydov
2015-08-30 15:52 ` Vladimir Davydov
2015-08-28 17:13 ` Michal Hocko
2015-08-28 17:13 ` Michal Hocko
[not found] ` <20150828171322.GC21463-2MMpYkNvuYDjFM9bn6wA6Q@public.gmane.org>
2015-08-28 17:56 ` Tejun Heo
2015-08-28 17:56 ` Tejun Heo
2015-08-28 20:45 ` Vladimir Davydov
2015-08-28 20:45 ` Vladimir Davydov
2015-08-28 20:53 ` Tejun Heo
2015-08-28 20:53 ` Tejun Heo
[not found] ` <20150828205301.GB11089-Gd/HAXX7CRxy/B6EtB590w@public.gmane.org>
2015-08-28 21:07 ` Vladimir Davydov
2015-08-28 21:07 ` Vladimir Davydov
2015-08-28 21:14 ` Tejun Heo
2015-08-28 21:14 ` Tejun Heo
2015-08-28 15:25 ` [PATCH 4/4] memcg: always enable kmemcg on the default hierarchy Tejun Heo
2015-08-28 15:25 ` Tejun Heo
[not found] ` <1440775530-18630-5-git-send-email-tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
2015-08-28 16:49 ` Vladimir Davydov
2015-08-28 16:49 ` Vladimir Davydov
2015-08-28 16:56 ` Tejun Heo
2015-08-28 17:14 ` Michal Hocko
2015-08-28 17:14 ` Michal Hocko
2015-08-28 17:41 ` Tejun Heo
2015-09-01 12:44 ` Michal Hocko
[not found] ` <20150901124459.GC8810-2MMpYkNvuYDjFM9bn6wA6Q@public.gmane.org>
2015-09-01 18:51 ` Tejun Heo
2015-09-01 18:51 ` Tejun Heo
2015-09-04 13:30 ` Michal Hocko
[not found] ` <20150904133038.GC8220-2MMpYkNvuYDjFM9bn6wA6Q@public.gmane.org>
2015-09-04 15:38 ` Vladimir Davydov
2015-09-04 15:38 ` Vladimir Davydov
2015-09-07 9:39 ` Michal Hocko
2015-09-07 9:39 ` Michal Hocko
2015-09-07 10:01 ` Vladimir Davydov
2015-09-07 11:03 ` Michal Hocko
2015-09-04 16:18 ` Tejun Heo
[not found] ` <20150904161845.GB25329-qYNAdHglDFBN0TnZuCh8vA@public.gmane.org>
2015-09-07 10:54 ` Michal Hocko
2015-09-07 10:54 ` Michal Hocko
2015-09-08 18:50 ` Tejun Heo
2015-11-05 17:30 ` Michal Hocko
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1440775530-18630-4-git-send-email-tj@kernel.org \
--to=tj-dgejt+ai2ygdnm+yrofe0a@public.gmane.org \
--cc=cgroups-u79uwXL29TY76Z2rM5mHXA@public.gmane.org \
--cc=hannes-druUgvl0LCNAfugRpC6u6w@public.gmane.org \
--cc=kernel-team-b10kYP2dOMg@public.gmane.org \
--cc=linux-mm-Bw31MaZKKs3YtjvyW6yDsg@public.gmane.org \
--cc=mhocko-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org \
--cc=vdavydov-bzQdu9zFT3WakBO8gow8eQ@public.gmane.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.