* [patch -mm 1/4] mm, memcg: introduce per-memcg oom policy tunable
2018-01-17 2:14 [patch -mm 0/4] mm, memcg: introduce oom policies David Rientjes
@ 2018-01-17 2:15 ` David Rientjes
2018-01-17 2:15 ` [patch -mm 2/4] mm, memcg: replace cgroup aware oom killer mount option with tunable David Rientjes
` (4 subsequent siblings)
5 siblings, 0 replies; 52+ messages in thread
From: David Rientjes @ 2018-01-17 2:15 UTC (permalink / raw)
To: Andrew Morton, Roman Gushchin
Cc: Michal Hocko, Vladimir Davydov, Johannes Weiner, Tetsuo Handa,
Tejun Heo, kernel-team, cgroups, linux-doc, linux-kernel,
linux-mm
The cgroup aware oom killer is needlessly declared for the entire system
by a mount option. It's unnecessary to force the system into a single
oom policy: either cgroup aware, or the traditional process aware.
This patch introduces a memory.oom_policy tunable for all mem cgroups.
It is currently a no-op: it can only be set to "none", which is its
default policy. It will be expanded in the next patch to define cgroup
aware oom killer behavior.
This is an extensible interface that can be used to define cgroup aware
assessment of mem cgroup subtrees or the traditional process aware
assessment.
Another benefit of such an approach is that an admin can lock in a
certain policy for the system or for a mem cgroup subtree and can
delegate the policy decision to the user to determine if the kill should
originate from a subcontainer, as indivisible memory consumers
themselves, or selection should be done per process.
Signed-off-by: David Rientjes <rientjes@google.com>
---
Documentation/cgroup-v2.txt | 9 +++++++++
include/linux/memcontrol.h | 11 +++++++++++
mm/memcontrol.c | 35 +++++++++++++++++++++++++++++++++++
3 files changed, 55 insertions(+)
diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -1060,6 +1060,15 @@ PAGE_SIZE multiple when read back.
If cgroup-aware OOM killer is not enabled, ENOTSUPP error
is returned on attempt to access the file.
+ memory.oom_policy
+
+ A read-write single string file which exists on all cgroups. The
+ default value is "none".
+
+ If "none", the OOM killer will use the default policy to choose a
+ victim; that is, it will choose the single process with the largest
+ memory footprint.
+
memory.events
A read-only flat-keyed file which exists on non-root cgroups.
The following entries are defined. Unless specified
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -58,6 +58,14 @@ enum memcg_event_item {
MEMCG_NR_EVENTS,
};
+enum memcg_oom_policy {
+ /*
+ * No special oom policy, process selection is determined by
+ * oom_badness()
+ */
+ MEMCG_OOM_POLICY_NONE,
+};
+
struct mem_cgroup_reclaim_cookie {
pg_data_t *pgdat;
int priority;
@@ -203,6 +211,9 @@ struct mem_cgroup {
/* OOM-Killer disable */
int oom_kill_disable;
+ /* OOM policy for this subtree */
+ enum memcg_oom_policy oom_policy;
+
/*
* Treat the sub-tree as an indivisible memory consumer,
* kill all belonging tasks if the memory cgroup selected
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4417,6 +4417,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (parent) {
memcg->swappiness = mem_cgroup_swappiness(parent);
memcg->oom_kill_disable = parent->oom_kill_disable;
+ memcg->oom_policy = parent->oom_policy;
}
if (parent && parent->use_hierarchy) {
memcg->use_hierarchy = true;
@@ -5534,6 +5535,34 @@ static int memory_stat_show(struct seq_file *m, void *v)
return 0;
}
+static int memory_oom_policy_show(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ enum memcg_oom_policy policy = READ_ONCE(memcg->oom_policy);
+
+ switch (policy) {
+ case MEMCG_OOM_POLICY_NONE:
+ default:
+ seq_puts(m, "none\n");
+ };
+ return 0;
+}
+
+static ssize_t memory_oom_policy_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ ssize_t ret = nbytes;
+
+ buf = strstrip(buf);
+ if (!memcmp("none", buf, min(sizeof("none")-1, nbytes)))
+ memcg->oom_policy = MEMCG_OOM_POLICY_NONE;
+ else
+ ret = -EINVAL;
+
+ return ret;
+}
+
static struct cftype memory_files[] = {
{
.name = "current",
@@ -5575,6 +5604,12 @@ static struct cftype memory_files[] = {
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = memory_stat_show,
},
+ {
+ .name = "oom_policy",
+ .flags = CFTYPE_NS_DELEGATABLE,
+ .seq_show = memory_oom_policy_show,
+ .write = memory_oom_policy_write,
+ },
{ } /* terminate */
};
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 52+ messages in thread* [patch -mm 2/4] mm, memcg: replace cgroup aware oom killer mount option with tunable
2018-01-17 2:14 [patch -mm 0/4] mm, memcg: introduce oom policies David Rientjes
2018-01-17 2:15 ` [patch -mm 1/4] mm, memcg: introduce per-memcg oom policy tunable David Rientjes
@ 2018-01-17 2:15 ` David Rientjes
[not found] ` <alpine.DEB.2.10.1801161812550.28198-X6Q0R45D7oAcqpCFd4KODRPsWskHk0ljAL8bYrjMMd8@public.gmane.org>
` (3 subsequent siblings)
5 siblings, 0 replies; 52+ messages in thread
From: David Rientjes @ 2018-01-17 2:15 UTC (permalink / raw)
To: Andrew Morton, Roman Gushchin
Cc: Michal Hocko, Vladimir Davydov, Johannes Weiner, Tetsuo Handa,
Tejun Heo, kernel-team, cgroups, linux-doc, linux-kernel,
linux-mm
Now that each mem cgroup on the system has a memory.oom_policy tunable to
specify oom kill selection behavior, remove the needless "groupoom" mount
option that requires (1) the entire system to be forced, perhaps
unnecessarily, perhaps unexpectedly, into a single oom policy that
differs from the traditional per process selection, and (2) a remount to
change.
Instead of enabling the cgroup aware oom killer with the "groupoom" mount
option, set the mem cgroup subtree's memory.oom_policy to "cgroup".
Signed-off-by: David Rientjes <rientjes@google.com>
---
Documentation/cgroup-v2.txt | 43 +++++++++++++++++++++----------------------
include/linux/cgroup-defs.h | 5 -----
include/linux/memcontrol.h | 5 +++++
kernel/cgroup/cgroup.c | 13 +------------
mm/memcontrol.c | 17 ++++++++---------
5 files changed, 35 insertions(+), 48 deletions(-)
diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -1069,6 +1069,10 @@ PAGE_SIZE multiple when read back.
victim; that is, it will choose the single process with the largest
memory footprint.
+ If "cgroup", the OOM killer will compare mem cgroups as indivisible
+ memory consumers; that is, they will compare mem cgroup usage rather
+ than process memory footprint. See the "OOM Killer" section.
+
memory.events
A read-only flat-keyed file which exists on non-root cgroups.
The following entries are defined. Unless specified
@@ -1275,37 +1279,32 @@ belonging to the affected files to ensure correct memory ownership.
OOM Killer
~~~~~~~~~~
-Cgroup v2 memory controller implements a cgroup-aware OOM killer.
-It means that it treats cgroups as first class OOM entities.
+Cgroup v2 memory controller implements an optional cgroup-aware out of
+memory killer, which treats cgroups as indivisible OOM entities.
-Cgroup-aware OOM logic is turned off by default and requires
-passing the "groupoom" option on mounting cgroupfs. It can also
-by remounting cgroupfs with the following command::
+This policy is controlled by memory.oom_policy. When a memory cgroup is
+out of memory, its memory.oom_policy will dictate how the OOM killer will
+select a process, or cgroup, to kill. Likewise, when the system is OOM,
+the policy is dictated by the root mem cgroup.
- # mount -o remount,groupoom $MOUNT_POINT
+There are currently two available oom policies:
-Under OOM conditions the memory controller tries to make the best
-choice of a victim, looking for a memory cgroup with the largest
-memory footprint, considering leaf cgroups and cgroups with the
-memory.oom_group option set, which are considered to be an indivisible
-memory consumers.
+ - "none": default, choose the largest single memory hogging process to
+ oom kill, as traditionally the OOM killer has always done.
-By default, OOM killer will kill the biggest task in the selected
-memory cgroup. A user can change this behavior by enabling
-the per-cgroup memory.oom_group option. If set, it causes
-the OOM killer to kill all processes attached to the cgroup,
-except processes with oom_score_adj set to -1000.
+ - "cgroup": choose the cgroup with the largest memory footprint from the
+ subtree as an OOM victim and kill at least one process, depending on
+ memory.oom_group, from it.
-This affects both system- and cgroup-wide OOMs. For a cgroup-wide OOM
-the memory controller considers only cgroups belonging to the sub-tree
-of the OOM'ing cgroup.
+When selecting a cgroup as a victim, the OOM killer will kill the process
+with the largest memory footprint. A user can control this behavior by
+enabling the per-cgroup memory.oom_group option. If set, it causes the
+OOM killer to kill all processes attached to the cgroup, except processes
+with /proc/pid/oom_score_adj set to -1000 (oom disabled).
The root cgroup is treated as a leaf memory cgroup, so it's compared
with other leaf memory cgroups and cgroups with oom_group option set.
-If there are no cgroups with the enabled memory controller,
-the OOM killer is using the "traditional" process-based approach.
-
Please, note that memory charges are not migrating if tasks
are moved between different memory cgroups. Moving tasks with
significant memory footprint may affect OOM victim selection logic.
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -81,11 +81,6 @@ enum {
* Enable cpuset controller in v1 cgroup to use v2 behavior.
*/
CGRP_ROOT_CPUSET_V2_MODE = (1 << 4),
-
- /*
- * Enable cgroup-aware OOM killer.
- */
- CGRP_GROUP_OOM = (1 << 5),
};
/* cftype->flags */
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -64,6 +64,11 @@ enum memcg_oom_policy {
* oom_badness()
*/
MEMCG_OOM_POLICY_NONE,
+ /*
+ * Local cgroup usage is used to select a target cgroup, treating each
+ * mem cgroup as an indivisible consumer
+ */
+ MEMCG_OOM_POLICY_CGROUP,
};
struct mem_cgroup_reclaim_cookie {
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1732,9 +1732,6 @@ static int parse_cgroup_root_flags(char *data, unsigned int *root_flags)
if (!strcmp(token, "nsdelegate")) {
*root_flags |= CGRP_ROOT_NS_DELEGATE;
continue;
- } else if (!strcmp(token, "groupoom")) {
- *root_flags |= CGRP_GROUP_OOM;
- continue;
}
pr_err("cgroup2: unknown option \"%s\"\n", token);
@@ -1751,11 +1748,6 @@ static void apply_cgroup_root_flags(unsigned int root_flags)
cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
else
cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
-
- if (root_flags & CGRP_GROUP_OOM)
- cgrp_dfl_root.flags |= CGRP_GROUP_OOM;
- else
- cgrp_dfl_root.flags &= ~CGRP_GROUP_OOM;
}
}
@@ -1763,8 +1755,6 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root
{
if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
seq_puts(seq, ",nsdelegate");
- if (cgrp_dfl_root.flags & CGRP_GROUP_OOM)
- seq_puts(seq, ",groupoom");
return 0;
}
@@ -5921,8 +5911,7 @@ static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);
static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "nsdelegate\n"
- "groupoom\n");
+ return snprintf(buf, PAGE_SIZE, "nsdelegate\n");
}
static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2798,14 +2798,14 @@ bool mem_cgroup_select_oom_victim(struct oom_control *oc)
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
return false;
- if (!(cgrp_dfl_root.flags & CGRP_GROUP_OOM))
- return false;
-
if (oc->memcg)
root = oc->memcg;
else
root = root_mem_cgroup;
+ if (root->oom_policy != MEMCG_OOM_POLICY_CGROUP)
+ return false;
+
select_victim_memcg(root, oc);
return oc->chosen_memcg;
@@ -5412,9 +5412,6 @@ static int memory_oom_group_show(struct seq_file *m, void *v)
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
bool oom_group = memcg->oom_group;
- if (!(cgrp_dfl_root.flags & CGRP_GROUP_OOM))
- return -ENOTSUPP;
-
seq_printf(m, "%d\n", oom_group);
return 0;
@@ -5428,9 +5425,6 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
int oom_group;
int err;
- if (!(cgrp_dfl_root.flags & CGRP_GROUP_OOM))
- return -ENOTSUPP;
-
err = kstrtoint(strstrip(buf), 0, &oom_group);
if (err)
return err;
@@ -5541,6 +5535,9 @@ static int memory_oom_policy_show(struct seq_file *m, void *v)
enum memcg_oom_policy policy = READ_ONCE(memcg->oom_policy);
switch (policy) {
+ case MEMCG_OOM_POLICY_CGROUP:
+ seq_puts(m, "cgroup\n");
+ break;
case MEMCG_OOM_POLICY_NONE:
default:
seq_puts(m, "none\n");
@@ -5557,6 +5554,8 @@ static ssize_t memory_oom_policy_write(struct kernfs_open_file *of,
buf = strstrip(buf);
if (!memcmp("none", buf, min(sizeof("none")-1, nbytes)))
memcg->oom_policy = MEMCG_OOM_POLICY_NONE;
+ else if (!memcmp("cgroup", buf, min(sizeof("cgroup")-1, nbytes)))
+ memcg->oom_policy = MEMCG_OOM_POLICY_CGROUP;
else
ret = -EINVAL;
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 52+ messages in thread[parent not found: <alpine.DEB.2.10.1801161812550.28198-X6Q0R45D7oAcqpCFd4KODRPsWskHk0ljAL8bYrjMMd8@public.gmane.org>]
* [patch -mm 3/4] mm, memcg: replace memory.oom_group with policy tunable
[not found] ` <alpine.DEB.2.10.1801161812550.28198-X6Q0R45D7oAcqpCFd4KODRPsWskHk0ljAL8bYrjMMd8@public.gmane.org>
@ 2018-01-17 2:15 ` David Rientjes
2018-01-17 15:41 ` Tejun Heo
0 siblings, 1 reply; 52+ messages in thread
From: David Rientjes @ 2018-01-17 2:15 UTC (permalink / raw)
To: Andrew Morton, Roman Gushchin
Cc: Michal Hocko, Vladimir Davydov, Johannes Weiner, Tetsuo Handa,
Tejun Heo, kernel-team-b10kYP2dOMg,
cgroups-u79uwXL29TY76Z2rM5mHXA, linux-doc-u79uwXL29TY76Z2rM5mHXA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA,
linux-mm-Bw31MaZKKs3YtjvyW6yDsg
The behavior of killing an entire indivisible memory consumer, enabled
by memory.oom_group, is an oom policy itself. It specifies that all
usage should be accounted to an ancestor and, if selected by the cgroup
aware oom killer, all processes attached to it and its descendant mem
cgroups should be oom killed.
This is replaced by writing "all" to memory.oom_policy and allows for the
same functionality as the existing memory.oom_group without (1) polluting
the mem cgroup v2 filesystem unnecessarily and (2) unnecessarily when the
"groupoom" mount option is not used (now by writing "cgroup" to the root
mem cgroup's memory.oom_policy).
The "all" oom policy cannot be enabled on the root mem cgroup.
Signed-off-by: David Rientjes <rientjes-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
---
Documentation/cgroup-v2.txt | 51 ++++++++++++++---------------------------
include/linux/memcontrol.h | 18 +++++++--------
mm/memcontrol.c | 55 ++++++++++++---------------------------------
mm/oom_kill.c | 4 ++--
4 files changed, 41 insertions(+), 87 deletions(-)
diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -1035,31 +1035,6 @@ PAGE_SIZE multiple when read back.
high limit is used and monitored properly, this limit's
utility is limited to providing the final safety net.
- memory.oom_group
-
- A read-write single value file which exists on non-root
- cgroups. The default is "0".
-
- If set, OOM killer will consider the memory cgroup as an
- indivisible memory consumers and compare it with other memory
- consumers by it's memory footprint.
- If such memory cgroup is selected as an OOM victim, all
- processes belonging to it or it's descendants will be killed.
-
- This applies to system-wide OOM conditions and reaching
- the hard memory limit of the cgroup and their ancestor.
- If OOM condition happens in a descendant cgroup with it's own
- memory limit, the memory cgroup can't be considered
- as an OOM victim, and OOM killer will not kill all belonging
- tasks.
-
- Also, OOM killer respects the /proc/pid/oom_score_adj value -1000,
- and will never kill the unkillable task, even if memory.oom_group
- is set.
-
- If cgroup-aware OOM killer is not enabled, ENOTSUPP error
- is returned on attempt to access the file.
-
memory.oom_policy
A read-write single string file which exists on all cgroups. The
@@ -1073,6 +1048,11 @@ PAGE_SIZE multiple when read back.
memory consumers; that is, they will compare mem cgroup usage rather
than process memory footprint. See the "OOM Killer" section.
+ If "all", the OOM killer will compare mem cgroups and its subtree
+ as indivisible memory consumers and kill all processes attached to
+ the mem cgroup and its subtree. This policy cannot be set on the
+ root mem cgroup. See the "OOM Killer" section.
+
memory.events
A read-only flat-keyed file which exists on non-root cgroups.
The following entries are defined. Unless specified
@@ -1287,29 +1267,32 @@ out of memory, its memory.oom_policy will dictate how the OOM killer will
select a process, or cgroup, to kill. Likewise, when the system is OOM,
the policy is dictated by the root mem cgroup.
-There are currently two available oom policies:
+There are currently three available oom policies:
- "none": default, choose the largest single memory hogging process to
oom kill, as traditionally the OOM killer has always done.
- "cgroup": choose the cgroup with the largest memory footprint from the
- subtree as an OOM victim and kill at least one process, depending on
- memory.oom_group, from it.
+ subtree as an OOM victim and kill at least one process.
+
+ - "all": choose the cgroup with the largest memory footprint considering
+ itself and its subtree and kill all processes attached (cannot be set on
+ the root mem cgroup).
When selecting a cgroup as a victim, the OOM killer will kill the process
-with the largest memory footprint. A user can control this behavior by
-enabling the per-cgroup memory.oom_group option. If set, it causes the
-OOM killer to kill all processes attached to the cgroup, except processes
-with /proc/pid/oom_score_adj set to -1000 (oom disabled).
+with the largest memory footprint, unless the policy is specified as "all".
+In that case, the OOM killer will kill all processes attached to the cgroup
+and its subtree, except processes with /proc/pid/oom_score_adj set to
+-1000 (oom disabled).
The root cgroup is treated as a leaf memory cgroup, so it's compared
-with other leaf memory cgroups and cgroups with oom_group option set.
+with other leaf memory cgroups.
Please, note that memory charges are not migrating if tasks
are moved between different memory cgroups. Moving tasks with
significant memory footprint may affect OOM victim selection logic.
If it's a case, please, consider creating a common ancestor for
-the source and destination memory cgroups and enabling oom_group
+the source and destination memory cgroups and setting a policy of "all"
on ancestor layer.
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -69,6 +69,11 @@ enum memcg_oom_policy {
* mem cgroup as an indivisible consumer
*/
MEMCG_OOM_POLICY_CGROUP,
+ /*
+ * Same as MEMCG_OOM_POLICY_CGROUP, but all eligible processes attached
+ * to the cgroup and subtree should be oom killed
+ */
+ MEMCG_OOM_POLICY_ALL,
};
struct mem_cgroup_reclaim_cookie {
@@ -219,13 +224,6 @@ struct mem_cgroup {
/* OOM policy for this subtree */
enum memcg_oom_policy oom_policy;
- /*
- * Treat the sub-tree as an indivisible memory consumer,
- * kill all belonging tasks if the memory cgroup selected
- * as OOM victim.
- */
- bool oom_group;
-
/* handle for "memory.events" */
struct cgroup_file events_file;
@@ -513,9 +511,9 @@ bool mem_cgroup_oom_synchronize(bool wait);
bool mem_cgroup_select_oom_victim(struct oom_control *oc);
-static inline bool mem_cgroup_oom_group(struct mem_cgroup *memcg)
+static inline bool mem_cgroup_oom_policy_all(struct mem_cgroup *memcg)
{
- return memcg->oom_group;
+ return memcg->oom_policy == MEMCG_OOM_POLICY_ALL;
}
#ifdef CONFIG_MEMCG_SWAP
@@ -1019,7 +1017,7 @@ static inline bool mem_cgroup_select_oom_victim(struct oom_control *oc)
return false;
}
-static inline bool mem_cgroup_oom_group(struct mem_cgroup *memcg)
+static inline bool mem_cgroup_oom_policy_all(struct mem_cgroup *memcg)
{
return false;
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2715,11 +2715,11 @@ static void select_victim_memcg(struct mem_cgroup *root, struct oom_control *oc)
oc->chosen_points = 0;
/*
- * If OOM is memcg-wide, and the memcg has the oom_group flag set,
- * all tasks belonging to the memcg should be killed.
+ * If OOM is memcg-wide, and the oom policy is "all", all processes
+ * attached to the memcg and subtree should be killed.
* So, we mark the memcg as a victim.
*/
- if (oc->memcg && mem_cgroup_oom_group(oc->memcg)) {
+ if (oc->memcg && mem_cgroup_oom_policy_all(oc->memcg)) {
oc->chosen_memcg = oc->memcg;
css_get(&oc->chosen_memcg->css);
return;
@@ -2728,7 +2728,7 @@ static void select_victim_memcg(struct mem_cgroup *root, struct oom_control *oc)
/*
* The oom_score is calculated for leaf memory cgroups (including
* the root memcg).
- * Non-leaf oom_group cgroups accumulating score of descendant
+ * Cgroups with oom policy of "all" accumulate the score of descendant
* leaf memory cgroups.
*/
rcu_read_lock();
@@ -2736,11 +2736,11 @@ static void select_victim_memcg(struct mem_cgroup *root, struct oom_control *oc)
long score;
/*
- * We don't consider non-leaf non-oom_group memory cgroups
- * as OOM victims.
+ * We don't consider non-leaf memory cgroups without the oom
+ * policy of "all" as oom victims.
*/
if (memcg_has_children(iter) && iter != root_mem_cgroup &&
- !mem_cgroup_oom_group(iter))
+ !mem_cgroup_oom_policy_all(iter))
continue;
/*
@@ -2803,7 +2803,7 @@ bool mem_cgroup_select_oom_victim(struct oom_control *oc)
else
root = root_mem_cgroup;
- if (root->oom_policy != MEMCG_OOM_POLICY_CGROUP)
+ if (root->oom_policy == MEMCG_OOM_POLICY_NONE)
return false;
select_victim_memcg(root, oc);
@@ -5407,33 +5407,6 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
return nbytes;
}
-static int memory_oom_group_show(struct seq_file *m, void *v)
-{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- bool oom_group = memcg->oom_group;
-
- seq_printf(m, "%d\n", oom_group);
-
- return 0;
-}
-
-static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes,
- loff_t off)
-{
- struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
- int oom_group;
- int err;
-
- err = kstrtoint(strstrip(buf), 0, &oom_group);
- if (err)
- return err;
-
- memcg->oom_group = oom_group;
-
- return nbytes;
-}
-
static int memory_events_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
@@ -5538,6 +5511,9 @@ static int memory_oom_policy_show(struct seq_file *m, void *v)
case MEMCG_OOM_POLICY_CGROUP:
seq_puts(m, "cgroup\n");
break;
+ case MEMCG_OOM_POLICY_ALL:
+ seq_puts(m, "all\n");
+ break;
case MEMCG_OOM_POLICY_NONE:
default:
seq_puts(m, "none\n");
@@ -5556,6 +5532,9 @@ static ssize_t memory_oom_policy_write(struct kernfs_open_file *of,
memcg->oom_policy = MEMCG_OOM_POLICY_NONE;
else if (!memcmp("cgroup", buf, min(sizeof("cgroup")-1, nbytes)))
memcg->oom_policy = MEMCG_OOM_POLICY_CGROUP;
+ else if (memcg != root_mem_cgroup &&
+ !memcmp("all", buf, min(sizeof("all")-1, nbytes)))
+ memcg->oom_policy = MEMCG_OOM_POLICY_ALL;
else
ret = -EINVAL;
@@ -5586,12 +5565,6 @@ static struct cftype memory_files[] = {
.seq_show = memory_max_show,
.write = memory_max_write,
},
- {
- .name = "oom_group",
- .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
- .seq_show = memory_oom_group_show,
- .write = memory_oom_group_write,
- },
{
.name = "events",
.flags = CFTYPE_NOT_ON_ROOT,
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -1000,11 +1000,11 @@ static bool oom_kill_memcg_victim(struct oom_control *oc)
return oc->chosen_memcg;
/*
- * If memory.oom_group is set, kill all tasks belonging to the sub-tree
+ * If the oom policy is "all", kill all tasks belonging to the sub-tree
* of the chosen memory cgroup, otherwise kill the task with the biggest
* memory footprint.
*/
- if (mem_cgroup_oom_group(oc->chosen_memcg)) {
+ if (mem_cgroup_oom_policy_all(oc->chosen_memcg)) {
mem_cgroup_scan_tasks(oc->chosen_memcg, oom_kill_memcg_member,
NULL);
/* We have one or more terminating processes at this point. */
^ permalink raw reply [flat|nested] 52+ messages in thread* Re: [patch -mm 3/4] mm, memcg: replace memory.oom_group with policy tunable
2018-01-17 2:15 ` [patch -mm 3/4] mm, memcg: replace memory.oom_group with policy tunable David Rientjes
@ 2018-01-17 15:41 ` Tejun Heo
2018-01-17 16:00 ` Michal Hocko
2018-01-17 22:14 ` David Rientjes
0 siblings, 2 replies; 52+ messages in thread
From: Tejun Heo @ 2018-01-17 15:41 UTC (permalink / raw)
To: David Rientjes
Cc: Andrew Morton, Roman Gushchin, Michal Hocko, Vladimir Davydov,
Johannes Weiner, Tetsuo Handa, kernel-team, cgroups, linux-doc,
linux-kernel, linux-mm
Hello, David.
On Tue, Jan 16, 2018 at 06:15:08PM -0800, David Rientjes wrote:
> The behavior of killing an entire indivisible memory consumer, enabled
> by memory.oom_group, is an oom policy itself. It specifies that all
I thought we discussed this before but maybe I'm misremembering.
There are two parts to the OOM policy. One is victim selection, the
other is the action to take thereafter.
The two are different and conflating the two don't work too well. For
example, please consider what should be given to the delegatee when
delegating a subtree, which often is a good excercise when designing
these APIs.
When a given workload is selected for OOM kill (IOW, selected to free
some memory), whether the workload can handle individual process kills
or not is the property of the workload itself. Some applications can
safely handle some of its processes picked off and killed. Most
others can't and want to be handled as a single unit, which makes it a
property of the workload.
That makes sense in the hierarchy too because whether one process or
the whole workload is killed doesn't infringe upon the parent's
authority over resources which in turn implies that there's nothing to
worry about how the parent's groupoom setting should constrain the
descendants.
OOM victim selection policy is a different beast. As you've mentioned
multiple times, especially if you're worrying about people abusing OOM
policies by creating sub-cgroups and so on, the policy, first of all,
shouldn't be delegatable and secondly should have meaningful
hierarchical restrictions so that a policy that an ancestor chose
can't be nullified by a descendant.
I'm not necessarily against adding hierarchical victim selection
policy tunables; however, I am skeptical whether static tunables on
cgroup hierarchy (including selectable policies) can be made clean and
versatile enough, especially because the resource hierarchy doesn't
necessarily, or rather in most cases, match the OOM victim selection
decision tree, but I'd be happy to be proven wrong.
Without explicit configurations, the only thing the OOM killer needs
to guarantee is that the system can make forward progress. We've
always been tweaking victim selection with or without cgroup and
absolutely shouldn't be locked into a specific heuristics. The
heuristics is an implementaiton detail subject to improvements.
To me, your patchset actually seems to demonstrate that these are
separate issues. The goal of groupoom is just to kill logical units
as cgroup hierarchy can inform the kernel of how workloads are
composed in the userspace. If you want to improve victim selection,
sure, please go ahead, but your argument that groupoom can't be merged
because of victim selection policy doesn't make sense to me.
Thanks.
--
tejun
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [patch -mm 3/4] mm, memcg: replace memory.oom_group with policy tunable
2018-01-17 15:41 ` Tejun Heo
@ 2018-01-17 16:00 ` Michal Hocko
2018-01-17 22:18 ` David Rientjes
2018-01-17 22:14 ` David Rientjes
1 sibling, 1 reply; 52+ messages in thread
From: Michal Hocko @ 2018-01-17 16:00 UTC (permalink / raw)
To: David Rientjes, Tejun Heo
Cc: Andrew Morton, Roman Gushchin, Vladimir Davydov, Johannes Weiner,
Tetsuo Handa, kernel-team, cgroups, linux-doc, linux-kernel,
linux-mm
On Wed 17-01-18 07:41:55, Tejun Heo wrote:
> Hello, David.
>
> On Tue, Jan 16, 2018 at 06:15:08PM -0800, David Rientjes wrote:
> > The behavior of killing an entire indivisible memory consumer, enabled
> > by memory.oom_group, is an oom policy itself. It specifies that all
>
> I thought we discussed this before but maybe I'm misremembering.
> There are two parts to the OOM policy. One is victim selection, the
> other is the action to take thereafter.
Yes we have. Multiple times! The last time I've said the very same thing
was yesterday http://lkml.kernel.org/r/20180116220907.GD17351@dhcp22.suse.cz
> The two are different and conflating the two don't work too well. For
> example, please consider what should be given to the delegatee when
> delegating a subtree, which often is a good excercise when designing
> these APIs.
Absolutely agreed! And moreover, there are not all that many ways what
to do as an action. You just kill a logical entity - be it a process or
a logical group of processes. But you have way too many policies how
to select that entity. Do you want to chose the youngest process/group
because all the older ones have been computing real stuff and you would
lose days of your cpu time? Or should those who pay more should be
protected (aka give them static priorities), or you name it...
I am sorry, I still didn't grasp the full semantic of the proposed
soluton but the mere fact it is starting by conflating selection and the
action is a no go and a wrong API. This is why I've said that what you
(David) outlined yesterday is probably going to suffer from a much
longer discussion and most likely to be not acceptable. Your patchset
proves me correct...
--
Michal Hocko
SUSE Labs
^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [patch -mm 3/4] mm, memcg: replace memory.oom_group with policy tunable
2018-01-17 16:00 ` Michal Hocko
@ 2018-01-17 22:18 ` David Rientjes
2018-01-23 15:13 ` Michal Hocko
0 siblings, 1 reply; 52+ messages in thread
From: David Rientjes @ 2018-01-17 22:18 UTC (permalink / raw)
To: Michal Hocko
Cc: Tejun Heo, Andrew Morton, Roman Gushchin, Vladimir Davydov,
Johannes Weiner, Tetsuo Handa, kernel-team, cgroups, linux-doc,
linux-kernel, linux-mm
On Wed, 17 Jan 2018, Michal Hocko wrote:
> Absolutely agreed! And moreover, there are not all that many ways what
> to do as an action. You just kill a logical entity - be it a process or
> a logical group of processes. But you have way too many policies how
> to select that entity. Do you want to chose the youngest process/group
> because all the older ones have been computing real stuff and you would
> lose days of your cpu time? Or should those who pay more should be
> protected (aka give them static priorities), or you name it...
>
That's an argument for making the interface extensible, yes.
> I am sorry, I still didn't grasp the full semantic of the proposed
> soluton but the mere fact it is starting by conflating selection and the
> action is a no go and a wrong API. This is why I've said that what you
> (David) outlined yesterday is probably going to suffer from a much
> longer discussion and most likely to be not acceptable. Your patchset
> proves me correct...
I'm very happy to change the API if there are better suggestions. That
may end up just being an memory.oom_policy file, as this implements, and
separating out a new memory.oom_action that isn't a boolean value to
either do a full group kill or only a single process. Or it could be what
I suggested in my mail to Tejun, such as "hierarchy killall" written to
memory.oom_policy, which would specify a single policy and then an
optional mechanism. With my proposed patchset, there would then be three
policies: "none", "cgroup", and "tree" and one possible optional
mechanism: "killall".
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [patch -mm 3/4] mm, memcg: replace memory.oom_group with policy tunable
2018-01-17 22:18 ` David Rientjes
@ 2018-01-23 15:13 ` Michal Hocko
0 siblings, 0 replies; 52+ messages in thread
From: Michal Hocko @ 2018-01-23 15:13 UTC (permalink / raw)
To: David Rientjes
Cc: Tejun Heo, Andrew Morton, Roman Gushchin, Vladimir Davydov,
Johannes Weiner, Tetsuo Handa, kernel-team, cgroups, linux-doc,
linux-kernel, linux-mm
On Wed 17-01-18 14:18:33, David Rientjes wrote:
> On Wed, 17 Jan 2018, Michal Hocko wrote:
>
> > Absolutely agreed! And moreover, there are not all that many ways what
> > to do as an action. You just kill a logical entity - be it a process or
> > a logical group of processes. But you have way too many policies how
> > to select that entity. Do you want to chose the youngest process/group
> > because all the older ones have been computing real stuff and you would
> > lose days of your cpu time? Or should those who pay more should be
> > protected (aka give them static priorities), or you name it...
> >
>
> That's an argument for making the interface extensible, yes.
And there is no interface to control the selection yet so we can develop
one on top.
> > I am sorry, I still didn't grasp the full semantic of the proposed
> > soluton but the mere fact it is starting by conflating selection and the
> > action is a no go and a wrong API. This is why I've said that what you
> > (David) outlined yesterday is probably going to suffer from a much
> > longer discussion and most likely to be not acceptable. Your patchset
> > proves me correct...
>
> I'm very happy to change the API if there are better suggestions. That
> may end up just being an memory.oom_policy file, as this implements, and
> separating out a new memory.oom_action that isn't a boolean value to
> either do a full group kill or only a single process. Or it could be what
> I suggested in my mail to Tejun, such as "hierarchy killall" written to
> memory.oom_policy, which would specify a single policy and then an
> optional mechanism. With my proposed patchset, there would then be three
> policies: "none", "cgroup", and "tree" and one possible optional
> mechanism: "killall".
You haven't convinced me at all. This all sounds more like "what if"
than a really thought through interface. I've tried to point out that
having a real policy driven victim selection is a _hard_ thing to do
_right_.
On the other hand oom_group makes semantic sense. It controls the
killable entity and there are usecases which want to consider the full
memcg as a single killable entity. No matter what selection policy we
chose on top. It is just a natural API.
Now you keep arguing about the victim selection and different strategies
to implement it. We will not move forward as long as you keep conflating
the two things, I am afraid.
--
Michal Hocko
SUSE Labs
^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [patch -mm 3/4] mm, memcg: replace memory.oom_group with policy tunable
2018-01-17 15:41 ` Tejun Heo
2018-01-17 16:00 ` Michal Hocko
@ 2018-01-17 22:14 ` David Rientjes
2018-01-19 20:53 ` David Rientjes
1 sibling, 1 reply; 52+ messages in thread
From: David Rientjes @ 2018-01-17 22:14 UTC (permalink / raw)
To: Tejun Heo
Cc: Andrew Morton, Roman Gushchin, Michal Hocko, Vladimir Davydov,
Johannes Weiner, Tetsuo Handa, kernel-team, cgroups, linux-doc,
linux-kernel, linux-mm
On Wed, 17 Jan 2018, Tejun Heo wrote:
> Hello, David.
>
Hi Tejun!
> > The behavior of killing an entire indivisible memory consumer, enabled
> > by memory.oom_group, is an oom policy itself. It specifies that all
>
> I thought we discussed this before but maybe I'm misremembering.
> There are two parts to the OOM policy. One is victim selection, the
> other is the action to take thereafter.
>
> The two are different and conflating the two don't work too well. For
> example, please consider what should be given to the delegatee when
> delegating a subtree, which often is a good excercise when designing
> these APIs.
>
> When a given workload is selected for OOM kill (IOW, selected to free
> some memory), whether the workload can handle individual process kills
> or not is the property of the workload itself. Some applications can
> safely handle some of its processes picked off and killed. Most
> others can't and want to be handled as a single unit, which makes it a
> property of the workload.
>
Yes, this is a valid point. The policy of "tree" and "all" are identical
policies and then the mechanism differs wrt to whether one process is
killed or all eligible processes are killed, respectively. My motivation
for this was to avoid having two different tunables, especially because
later we'll need a way for userspace to influence the decisionmaking to
protect (bias against) important subtrees. What would really be nice is
cgroup.subtree_control-type behavior where we could effect a policy and a
mechanism at the same time. It's not clear how that would be handled to
allow only one policy and one mechanism, however, in a clean way. The
simplest for the user would be a new file, to specify the mechanism and
leave memory.oom_policy alone. Would another file really be warranted?
Not sure.
> That makes sense in the hierarchy too because whether one process or
> the whole workload is killed doesn't infringe upon the parent's
> authority over resources which in turn implies that there's nothing to
> worry about how the parent's groupoom setting should constrain the
> descendants.
>
> OOM victim selection policy is a different beast. As you've mentioned
> multiple times, especially if you're worrying about people abusing OOM
> policies by creating sub-cgroups and so on, the policy, first of all,
> shouldn't be delegatable and secondly should have meaningful
> hierarchical restrictions so that a policy that an ancestor chose
> can't be nullified by a descendant.
>
The goal here was to require a policy of either "tree" or "all" that the
user can't change. They are allowed to have their own oom policies
internal to their subtree, however, for oom conditions in that subtree
alone. However, if the common ancestor hits its limit, it is forced to
either be "tree" or "all" and require hierarchical usage to be considered
instead of localized usage. Either "tree" or "all" is appropriate, and
this may be why you brought up the point about separating them out, i.e.
the policy can be demanded by the common ancestor but the actual mechanism
that the oom killer uses, kill either a single process or the full cgroup,
is left to the user depending on their workload. That sounds reasonable
and I can easily separate the two by introducing a new file, similar to
memory.oom_group but in a more extensible way so that it is not simply a
selection of either full cgroup kill or single process.
> I'm not necessarily against adding hierarchical victim selection
> policy tunables; however, I am skeptical whether static tunables on
> cgroup hierarchy (including selectable policies) can be made clean and
> versatile enough, especially because the resource hierarchy doesn't
> necessarily, or rather in most cases, match the OOM victim selection
> decision tree, but I'd be happy to be proven wrong.
>
Right, and I think that giving users control over their subtrees is a
powerful tool and one that can lead to very effective use of the cgroup v2
hierarchy. Being able to circumvent the oom selection by creating child
cgroups is certainly something that can trivially be prevented. The
argument that users can currently divide their entire processes into
several different smaller processes to circumvent today's heuristic
doesn't mean we can't have "tree"-like comparisons between cgroups to
address that issue itself since all processes charge to the tree itself.
I became convinced of this when I saw the real-world usecases that would
use such a feature on cgroup v2: we want to have hierarchical usage for
comparison when full subtrees are dedicated to individual consumers, for
example, and local mem cgroup usage for comparison when using hierarchies
for top-level /admins and /students cgroups for which Michal provided an
example. These can coexist on systems and it's clear that there needs to
be a system-wide policy decision for the cgroup aware oom killer (the idea
behind the current mount option, which isn't needed anymore). So defining
the actual policy, and mechanism as you pointed out, for subtrees is a
very powerful tool, it's extensible, doesn't require a system to either
fully enable or disable, and doesn't require a remount of cgroup v2 to
change.
> Without explicit configurations, the only thing the OOM killer needs
> to guarantee is that the system can make forward progress. We've
> always been tweaking victim selection with or without cgroup and
> absolutely shouldn't be locked into a specific heuristics. The
> heuristics is an implementaiton detail subject to improvements.
>
> To me, your patchset actually seems to demonstrate that these are
> separate issues. The goal of groupoom is just to kill logical units
> as cgroup hierarchy can inform the kernel of how workloads are
> composed in the userspace. If you want to improve victim selection,
> sure, please go ahead, but your argument that groupoom can't be merged
> because of victim selection policy doesn't make sense to me.
>
memory.oom_group, the mechanism behind what the oom killer chooses to do
after victim selection, is not implemented without the selection heuristic
comparing cgroups as indivisible memory consumers. It could be done first
prior to introducing the new selection criteria. We don't have patches
for that right now, because Roman's work introduces it in the opposite
order. If it is acceptable to add a separate file solely for this
purpose, it's rather trivial to do. My other thought was some kind of
echo "hierarchy killall" > memory.oom_policy where the policy and an
(optional) mechanism could be specified. Your input on the actual
tunables would be very valuable.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [patch -mm 3/4] mm, memcg: replace memory.oom_group with policy tunable
2018-01-17 22:14 ` David Rientjes
@ 2018-01-19 20:53 ` David Rientjes
[not found] ` <alpine.DEB.2.10.1801191251080.177541-X6Q0R45D7oAcqpCFd4KODRPsWskHk0ljAL8bYrjMMd8@public.gmane.org>
0 siblings, 1 reply; 52+ messages in thread
From: David Rientjes @ 2018-01-19 20:53 UTC (permalink / raw)
To: Tejun Heo
Cc: Andrew Morton, Roman Gushchin, Michal Hocko, Vladimir Davydov,
Johannes Weiner, Tetsuo Handa, kernel-team, cgroups, linux-doc,
linux-kernel, linux-mm
On Wed, 17 Jan 2018, David Rientjes wrote:
> Yes, this is a valid point. The policy of "tree" and "all" are identical
> policies and then the mechanism differs wrt to whether one process is
> killed or all eligible processes are killed, respectively. My motivation
> for this was to avoid having two different tunables, especially because
> later we'll need a way for userspace to influence the decisionmaking to
> protect (bias against) important subtrees. What would really be nice is
> cgroup.subtree_control-type behavior where we could effect a policy and a
> mechanism at the same time. It's not clear how that would be handled to
> allow only one policy and one mechanism, however, in a clean way. The
> simplest for the user would be a new file, to specify the mechanism and
> leave memory.oom_policy alone. Would another file really be warranted?
> Not sure.
>
Hearing no response, I'll implement this as a separate tunable in a v2
series assuming there are no better ideas proposed before next week. One
of the nice things about a separate tunable is that an admin can control
the overall policy and they can delegate the mechanism (killall vs one
process) to a user subtree. I agree with your earlier point that killall
vs one process is a property of the workload and is better defined
separately.
I'll also look to fix the breakage wrt root mem cgroup comparison with
leaf mem cgroup comparison that is currently in -mm.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 52+ messages in thread
* [patch -mm 4/4] mm, memcg: add hierarchical usage oom policy
2018-01-17 2:14 [patch -mm 0/4] mm, memcg: introduce oom policies David Rientjes
` (2 preceding siblings ...)
[not found] ` <alpine.DEB.2.10.1801161812550.28198-X6Q0R45D7oAcqpCFd4KODRPsWskHk0ljAL8bYrjMMd8@public.gmane.org>
@ 2018-01-17 2:15 ` David Rientjes
2018-01-17 11:46 ` [patch -mm 0/4] mm, memcg: introduce oom policies Roman Gushchin
2018-01-25 23:53 ` [patch -mm v2 0/3] " David Rientjes
5 siblings, 0 replies; 52+ messages in thread
From: David Rientjes @ 2018-01-17 2:15 UTC (permalink / raw)
To: Andrew Morton, Roman Gushchin
Cc: Michal Hocko, Vladimir Davydov, Johannes Weiner, Tetsuo Handa,
Tejun Heo, kernel-team, cgroups, linux-doc, linux-kernel,
linux-mm
One of the three significant concerns brought up about the cgroup aware
oom killer is that its decisionmaking is completely evaded by creating
subcontainers and attaching processes such that the ancestor's usage does
not exceed another cgroup on the system.
In this regard, users who do not distribute their processes over a set of
subcontainers for mem cgroup control, statistics, or other controllers
are unfairly penalized.
This adds an oom policy, "tree", that accounts for hierarchical usage
when comparing cgroups and the cgroup aware oom killer is enabled by an
ancestor. This allows administrators, for example, to require users in
their own top-level mem cgroup subtree to be accounted for with
hierarchical usage. In other words, they can longer evade the oom killer
by using other controllers or subcontainers.
Signed-off-by: David Rientjes <rientjes@google.com>
---
Documentation/cgroup-v2.txt | 12 ++++++++++--
include/linux/memcontrol.h | 9 +++++++--
mm/memcontrol.c | 23 +++++++++++++++--------
3 files changed, 32 insertions(+), 12 deletions(-)
diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -1048,6 +1048,11 @@ PAGE_SIZE multiple when read back.
memory consumers; that is, they will compare mem cgroup usage rather
than process memory footprint. See the "OOM Killer" section.
+ If "tree", the OOM killer will compare mem cgroups and its subtree
+ as indivisible memory consumers when selecting a hierarchy. This
+ policy cannot be set on the root mem cgroup. See the "OOM Killer"
+ section.
+
If "all", the OOM killer will compare mem cgroups and its subtree
as indivisible memory consumers and kill all processes attached to
the mem cgroup and its subtree. This policy cannot be set on the
@@ -1275,6 +1280,9 @@ There are currently three available oom policies:
- "cgroup": choose the cgroup with the largest memory footprint from the
subtree as an OOM victim and kill at least one process.
+ - "tree": choose the cgroup with the largest memory footprint considering
+ itself and its subtree and kill at least one process.
+
- "all": choose the cgroup with the largest memory footprint considering
itself and its subtree and kill all processes attached (cannot be set on
the root mem cgroup).
@@ -1292,8 +1300,8 @@ Please, note that memory charges are not migrating if tasks
are moved between different memory cgroups. Moving tasks with
significant memory footprint may affect OOM victim selection logic.
If it's a case, please, consider creating a common ancestor for
-the source and destination memory cgroups and setting a policy of "all"
-on ancestor layer.
+the source and destination memory cgroups and setting a policy of "tree"
+or "all" on ancestor layer.
IO
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -70,8 +70,13 @@ enum memcg_oom_policy {
*/
MEMCG_OOM_POLICY_CGROUP,
/*
- * Same as MEMCG_OOM_POLICY_CGROUP, but all eligible processes attached
- * to the cgroup and subtree should be oom killed
+ * Tree cgroup usage for all descendant memcg groups, treating each mem
+ * cgroup and its subtree as an indivisible consumer
+ */
+ MEMCG_OOM_POLICY_TREE,
+ /*
+ * Same as MEMCG_OOM_POLICY_TREE, but all eligible processes are also
+ * oom killed
*/
MEMCG_OOM_POLICY_ALL,
};
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2715,11 +2715,11 @@ static void select_victim_memcg(struct mem_cgroup *root, struct oom_control *oc)
oc->chosen_points = 0;
/*
- * If OOM is memcg-wide, and the oom policy is "all", all processes
- * attached to the memcg and subtree should be killed.
- * So, we mark the memcg as a victim.
+ * If OOM is memcg-wide, and the oom policy is "tree" or "all", this
+ * is the selected memcg.
*/
- if (oc->memcg && mem_cgroup_oom_policy_all(oc->memcg)) {
+ if (oc->memcg && (oc->memcg->oom_policy == MEMCG_OOM_POLICY_TREE ||
+ oc->memcg->oom_policy == MEMCG_OOM_POLICY_ALL)) {
oc->chosen_memcg = oc->memcg;
css_get(&oc->chosen_memcg->css);
return;
@@ -2728,8 +2728,8 @@ static void select_victim_memcg(struct mem_cgroup *root, struct oom_control *oc)
/*
* The oom_score is calculated for leaf memory cgroups (including
* the root memcg).
- * Cgroups with oom policy of "all" accumulate the score of descendant
- * leaf memory cgroups.
+ * Cgroups with oom policy of "tree" or "all" accumulate the score of
+ * descendant leaf memory cgroups.
*/
rcu_read_lock();
for_each_mem_cgroup_tree(iter, root) {
@@ -2737,10 +2737,11 @@ static void select_victim_memcg(struct mem_cgroup *root, struct oom_control *oc)
/*
* We don't consider non-leaf memory cgroups without the oom
- * policy of "all" as oom victims.
+ * policy of "tree" or "all" as oom victims.
*/
if (memcg_has_children(iter) && iter != root_mem_cgroup &&
- !mem_cgroup_oom_policy_all(iter))
+ iter->oom_policy != MEMCG_OOM_POLICY_TREE &&
+ iter->oom_policy != MEMCG_OOM_POLICY_ALL)
continue;
/*
@@ -5511,6 +5512,9 @@ static int memory_oom_policy_show(struct seq_file *m, void *v)
case MEMCG_OOM_POLICY_CGROUP:
seq_puts(m, "cgroup\n");
break;
+ case MEMCG_OOM_POLICY_TREE:
+ seq_puts(m, "tree\n");
+ break;
case MEMCG_OOM_POLICY_ALL:
seq_puts(m, "all\n");
break;
@@ -5532,6 +5536,9 @@ static ssize_t memory_oom_policy_write(struct kernfs_open_file *of,
memcg->oom_policy = MEMCG_OOM_POLICY_NONE;
else if (!memcmp("cgroup", buf, min(sizeof("cgroup")-1, nbytes)))
memcg->oom_policy = MEMCG_OOM_POLICY_CGROUP;
+ else if (memcg != root_mem_cgroup &&
+ !memcmp("tree", buf, min(sizeof("tree")-1, nbytes)))
+ memcg->oom_policy = MEMCG_OOM_POLICY_TREE;
else if (memcg != root_mem_cgroup &&
!memcmp("all", buf, min(sizeof("all")-1, nbytes)))
memcg->oom_policy = MEMCG_OOM_POLICY_ALL;
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 52+ messages in thread* Re: [patch -mm 0/4] mm, memcg: introduce oom policies
2018-01-17 2:14 [patch -mm 0/4] mm, memcg: introduce oom policies David Rientjes
` (3 preceding siblings ...)
2018-01-17 2:15 ` [patch -mm 4/4] mm, memcg: add hierarchical usage oom policy David Rientjes
@ 2018-01-17 11:46 ` Roman Gushchin
2018-01-17 22:31 ` David Rientjes
2018-01-25 23:53 ` [patch -mm v2 0/3] " David Rientjes
5 siblings, 1 reply; 52+ messages in thread
From: Roman Gushchin @ 2018-01-17 11:46 UTC (permalink / raw)
To: David Rientjes
Cc: Andrew Morton, Michal Hocko, Vladimir Davydov, Johannes Weiner,
Tetsuo Handa, Tejun Heo, kernel-team, cgroups, linux-doc,
linux-kernel, linux-mm
On Tue, Jan 16, 2018 at 06:14:58PM -0800, David Rientjes wrote:
> There are three significant concerns about the cgroup aware oom killer as
> it is implemented in -mm:
>
> (1) allows users to evade the oom killer by creating subcontainers or
> using other controllers since scoring is done per cgroup and not
> hierarchically,
>
> (2) does not allow the user to influence the decisionmaking, such that
> important subtrees cannot be preferred or biased, and
>
> (3) unfairly compares the root mem cgroup using completely different
> criteria than leaf mem cgroups and allows wildly inaccurate results
> if oom_score_adj is used.
>
> This patchset aims to fix (1) completely and, by doing so, introduces a
> completely extensible user interface that can be expanded in the future.
>
> It eliminates the mount option for the cgroup aware oom killer entirely
> since it is now enabled through the root mem cgroup's oom policy.
>
> It eliminates a pointless tunable, memory.oom_group, that unnecessarily
> pollutes the mem cgroup v2 filesystem and is invalid when cgroup v2 is
> mounted with the "groupoom" option.
You're introducing a new oom_policy knob, which has two separate sets
of possible values for the root and non-root cgroups. I don't think
it aligns with the existing cgroup v2 design.
For the root cgroup it works exactly as mount option, and both "none"
and "cgroup" values have no meaning outside of the root cgroup. We can
discuss if a knob on root cgroup is better than a mount option, or not
(I don't think so), but it has nothing to do with oom policy as you
define it for non-root cgroups.
For non-root cgroups you're introducing "all" and "tree", and the _only_
difference is that in the "all" mode all processes will be killed, rather
than the biggest in the "tree". I find these names confusing, in reality
it's more "evaluate together and kill all" and "evaluate together and
kill one".
So, it's not really the fully hierarchical approach, which I thought,
you were arguing for. You can easily do the same with adding the third
value to the memory.groupoom knob, as I've suggested earlier (say, "disable,
"kill" and "evaluate"), and will be much less confusing.
Thanks!
Roman
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 52+ messages in thread* Re: [patch -mm 0/4] mm, memcg: introduce oom policies
2018-01-17 11:46 ` [patch -mm 0/4] mm, memcg: introduce oom policies Roman Gushchin
@ 2018-01-17 22:31 ` David Rientjes
0 siblings, 0 replies; 52+ messages in thread
From: David Rientjes @ 2018-01-17 22:31 UTC (permalink / raw)
To: Roman Gushchin
Cc: Andrew Morton, Michal Hocko, Vladimir Davydov, Johannes Weiner,
Tetsuo Handa, Tejun Heo, kernel-team, cgroups, linux-doc,
linux-kernel, linux-mm
On Wed, 17 Jan 2018, Roman Gushchin wrote:
> You're introducing a new oom_policy knob, which has two separate sets
> of possible values for the root and non-root cgroups. I don't think
> it aligns with the existing cgroup v2 design.
>
The root mem cgroup can use "none" or "cgroup" to either disable or
enable, respectively, the cgroup aware oom killer. These are available to
non root mem cgroups as well, with the addition of hierarchical usage
comparison to prevent the ability to completely evade the oom killer by
the user by creating sub cgroups. Just as we have files that are made
available on the root cgroup and not others, I think it's fine to allow
only certain policies on the root mem cgroup. As I wrote to Tejun, I'm
fine with the policy being separated from the mechanism. That can be
resolved in that email thread, but the overall point of this patchset is
to allow hierarchical comparison on some subtrees and not on others. We
can avoid the mount option in the same manner.
> For the root cgroup it works exactly as mount option, and both "none"
> and "cgroup" values have no meaning outside of the root cgroup. We can
> discuss if a knob on root cgroup is better than a mount option, or not
> (I don't think so), but it has nothing to do with oom policy as you
> define it for non-root cgroups.
>
It certainly does have value outside of the root cgroup: for system oom
conditions it is possible to choose the largest process, largest single
cgroup, or largest subtree to kill from. For memcg oom conditions it's
possible for a consumer in a subtree to define whether it wants the
largest memory hogging process to be oom killed or the largest of its
child sub cgroups. This would be needed for a job scheduler in its own
subtree, for example, that creates its own subtrees to limit jobs
scheduled by individual users who have the power over their subtree but
not their limit. This is a real-world example. Michal also provided his
own example concerning top-level /admins and /students. They want to use
"cgroup" themselves and then /students/roman would be forced to "tree".
Keep in mind that this patchset alone makes the interface extensible and
addresses one of my big three concerns, but the comparison of the root mem
cgroup compared to other individual cgroups and cgroups maintaining a
subtree needs to be fixed separately so that we don't completely evade all
of these oom policies by using /proc/pid/oom_score_adj in the root mem
cgroup. That's a separate issue that needs to be addressed. My last
concern, about userspace influence, can probably be addressed after this
is merged by yet another tunable since it's vital that important cgroups
can be protected. It does make sense for an important cgroup to be able
to use 50% of memory without being oom killed, and that's impossible if
cgroup v2 has been mounted with your mount option.
^ permalink raw reply [flat|nested] 52+ messages in thread
* [patch -mm v2 0/3] mm, memcg: introduce oom policies
2018-01-17 2:14 [patch -mm 0/4] mm, memcg: introduce oom policies David Rientjes
` (4 preceding siblings ...)
2018-01-17 11:46 ` [patch -mm 0/4] mm, memcg: introduce oom policies Roman Gushchin
@ 2018-01-25 23:53 ` David Rientjes
2018-01-25 23:53 ` [patch -mm v2 1/3] mm, memcg: introduce per-memcg oom policy tunable David Rientjes
` (2 more replies)
5 siblings, 3 replies; 52+ messages in thread
From: David Rientjes @ 2018-01-25 23:53 UTC (permalink / raw)
To: Andrew Morton, Roman Gushchin
Cc: Michal Hocko, Vladimir Davydov, Johannes Weiner, Tetsuo Handa,
Tejun Heo, kernel-team, cgroups, linux-doc, linux-kernel,
linux-mm
There are three significant concerns about the cgroup aware oom killer as
it is implemented in -mm:
(1) allows users to evade the oom killer by creating subcontainers or
using other controllers since scoring is done per cgroup and not
hierarchically,
(2) does not allow the user to influence the decisionmaking, such that
important subtrees cannot be preferred or biased, and
(3) unfairly compares the root mem cgroup using completely different
criteria than leaf mem cgroups and allows wildly inaccurate results
if oom_score_adj is used.
This patchset aims to fix (1) completely and, by doing so, introduces a
completely extensible user interface that can be expanded in the future.
It eliminates the mount option for the cgroup aware oom killer entirely
since it is now enabled through the root mem cgroup's oom policy.
It preserves memory.oom_group behavior.
---
Applied on top of -mm.
Documentation/cgroup-v2.txt | 64 ++++++++++++++++++++++++++++-----------------
include/linux/cgroup-defs.h | 5 ----
include/linux/memcontrol.h | 21 +++++++++++++++
kernel/cgroup/cgroup.c | 13 +--------
mm/memcontrol.c | 64 ++++++++++++++++++++++++++++++++++++---------
5 files changed, 114 insertions(+), 53 deletions(-)
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 52+ messages in thread* [patch -mm v2 1/3] mm, memcg: introduce per-memcg oom policy tunable
2018-01-25 23:53 ` [patch -mm v2 0/3] " David Rientjes
@ 2018-01-25 23:53 ` David Rientjes
2018-01-26 17:15 ` Michal Hocko
2018-01-25 23:53 ` [patch -mm v2 2/3] mm, memcg: replace cgroup aware oom killer mount option with tunable David Rientjes
2018-01-25 23:53 ` [patch -mm v2 3/3] mm, memcg: add hierarchical usage oom policy David Rientjes
2 siblings, 1 reply; 52+ messages in thread
From: David Rientjes @ 2018-01-25 23:53 UTC (permalink / raw)
To: Andrew Morton, Roman Gushchin
Cc: Michal Hocko, Vladimir Davydov, Johannes Weiner, Tetsuo Handa,
Tejun Heo, kernel-team, cgroups, linux-doc, linux-kernel,
linux-mm
The cgroup aware oom killer is needlessly declared for the entire system
by a mount option. It's unnecessary to force the system into a single
oom policy: either cgroup aware, or the traditional process aware.
This patch introduces a memory.oom_policy tunable for all mem cgroups.
It is currently a no-op: it can only be set to "none", which is its
default policy. It will be expanded in the next patch to define cgroup
aware oom killer behavior.
This is an extensible interface that can be used to define cgroup aware
assessment of mem cgroup subtrees or the traditional process aware
assessment.
Another benefit of such an approach is that an admin can lock in a
certain policy for the system or for a mem cgroup subtree and can
delegate the policy decision to the user to determine if the kill should
originate from a subcontainer, as indivisible memory consumers
themselves, or selection should be done per process.
Signed-off-by: David Rientjes <rientjes@google.com>
---
Documentation/cgroup-v2.txt | 9 +++++++++
include/linux/memcontrol.h | 11 +++++++++++
mm/memcontrol.c | 35 +++++++++++++++++++++++++++++++++++
3 files changed, 55 insertions(+)
diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -1065,6 +1065,15 @@ PAGE_SIZE multiple when read back.
If cgroup-aware OOM killer is not enabled, ENOTSUPP error
is returned on attempt to access the file.
+ memory.oom_policy
+
+ A read-write single string file which exists on all cgroups. The
+ default value is "none".
+
+ If "none", the OOM killer will use the default policy to choose a
+ victim; that is, it will choose the single process with the largest
+ memory footprint.
+
memory.events
A read-only flat-keyed file which exists on non-root cgroups.
The following entries are defined. Unless specified
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -58,6 +58,14 @@ enum memcg_event_item {
MEMCG_NR_EVENTS,
};
+enum memcg_oom_policy {
+ /*
+ * No special oom policy, process selection is determined by
+ * oom_badness()
+ */
+ MEMCG_OOM_POLICY_NONE,
+};
+
struct mem_cgroup_reclaim_cookie {
pg_data_t *pgdat;
int priority;
@@ -203,6 +211,9 @@ struct mem_cgroup {
/* OOM-Killer disable */
int oom_kill_disable;
+ /* OOM policy for this subtree */
+ enum memcg_oom_policy oom_policy;
+
/*
* Treat the sub-tree as an indivisible memory consumer,
* kill all belonging tasks if the memory cgroup selected
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4417,6 +4417,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (parent) {
memcg->swappiness = mem_cgroup_swappiness(parent);
memcg->oom_kill_disable = parent->oom_kill_disable;
+ memcg->oom_policy = parent->oom_policy;
}
if (parent && parent->use_hierarchy) {
memcg->use_hierarchy = true;
@@ -5534,6 +5535,34 @@ static int memory_stat_show(struct seq_file *m, void *v)
return 0;
}
+static int memory_oom_policy_show(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ enum memcg_oom_policy policy = READ_ONCE(memcg->oom_policy);
+
+ switch (policy) {
+ case MEMCG_OOM_POLICY_NONE:
+ default:
+ seq_puts(m, "none\n");
+ };
+ return 0;
+}
+
+static ssize_t memory_oom_policy_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ ssize_t ret = nbytes;
+
+ buf = strstrip(buf);
+ if (!memcmp("none", buf, min(sizeof("none")-1, nbytes)))
+ memcg->oom_policy = MEMCG_OOM_POLICY_NONE;
+ else
+ ret = -EINVAL;
+
+ return ret;
+}
+
static struct cftype memory_files[] = {
{
.name = "current",
@@ -5575,6 +5604,12 @@ static struct cftype memory_files[] = {
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = memory_stat_show,
},
+ {
+ .name = "oom_policy",
+ .flags = CFTYPE_NS_DELEGATABLE,
+ .seq_show = memory_oom_policy_show,
+ .write = memory_oom_policy_write,
+ },
{ } /* terminate */
};
^ permalink raw reply [flat|nested] 52+ messages in thread* Re: [patch -mm v2 1/3] mm, memcg: introduce per-memcg oom policy tunable
2018-01-25 23:53 ` [patch -mm v2 1/3] mm, memcg: introduce per-memcg oom policy tunable David Rientjes
@ 2018-01-26 17:15 ` Michal Hocko
2018-01-29 22:38 ` David Rientjes
0 siblings, 1 reply; 52+ messages in thread
From: Michal Hocko @ 2018-01-26 17:15 UTC (permalink / raw)
To: David Rientjes
Cc: Andrew Morton, Roman Gushchin, Vladimir Davydov, Johannes Weiner,
Tetsuo Handa, Tejun Heo, kernel-team, cgroups, linux-doc,
linux-kernel, linux-mm
On Thu 25-01-18 15:53:45, David Rientjes wrote:
> The cgroup aware oom killer is needlessly declared for the entire system
> by a mount option. It's unnecessary to force the system into a single
> oom policy: either cgroup aware, or the traditional process aware.
>
> This patch introduces a memory.oom_policy tunable for all mem cgroups.
> It is currently a no-op: it can only be set to "none", which is its
> default policy. It will be expanded in the next patch to define cgroup
> aware oom killer behavior.
>
> This is an extensible interface that can be used to define cgroup aware
> assessment of mem cgroup subtrees or the traditional process aware
> assessment.
>
So what is the actual semantic and scope of this policy. Does it apply
only down the hierarchy. Also how do you compare cgroups with different
policies? Let's say you have
root
/ | \
A B C
/ \ / \
D E F G
Assume A: cgroup, B: oom_group=1, C: tree, G: oom_group=1
Now we have the global OOM killer to choose a victim. From a quick
glance over those patches, it seems that we will be comparing only
tasks because root->oom_policy != MEMCG_OOM_POLICY_CGROUP. A, B and C
policies are ignored. Moreover If I select any of B's tasks then I will
happily kill it breaking the expectation that the whole memcg will go
away. Weird, don't you think? Or did I misunderstand?
So let's assume that root: cgroup. Then we are finally comparing
cgroups. D, E, B, C. Of those D, E and F do not have any
policy. Do they inherit their policy from the parent? If they don't then
we should be comparing their tasks separately, no? The code disagrees
because once we are in the cgroup mode, we do not care about separate
tasks.
Let's say we choose C because it has the largest cumulative consumption.
It is not oom_group so it will select a task from F, G. Again you are
breaking oom_group policy of G if you kill a single task. So you would
have to be recursive here. That sounds fixable though. Just be
recursive.
Then you say
> Another benefit of such an approach is that an admin can lock in a
> certain policy for the system or for a mem cgroup subtree and can
> delegate the policy decision to the user to determine if the kill should
> originate from a subcontainer, as indivisible memory consumers
> themselves, or selection should be done per process.
And the code indeed doesn't check oom_policy on each level of the
hierarchy, unless I am missing something. So the subgroup is simply
locked in to the oom_policy parent has chosen. That is not the case for
the tree policy.
So look how we are comparing cumulative groups without policy with
groups with policy with subtrees. Either I have grossly misunderstood
something or this is massively inconsistent and it doesn't make much
sense to me. Root memcg without cgroup policy will simply turn off the
whole thing for the global OOM case. So you really need to enable it
there but then it is not really clear how to configure lower levels.
From the above it seems that you are more interested in memcg OOMs and
want to give different hierarchies different policies but you quickly
hit the similar inconsistencies there as well.
I am not sure how extensible this is actually. How do we place
priorities on top?
> Signed-off-by: David Rientjes <rientjes@google.com>
--
Michal Hocko
SUSE Labs
^ permalink raw reply [flat|nested] 52+ messages in thread* Re: [patch -mm v2 1/3] mm, memcg: introduce per-memcg oom policy tunable
2018-01-26 17:15 ` Michal Hocko
@ 2018-01-29 22:38 ` David Rientjes
[not found] ` <alpine.DEB.2.10.1801291418150.29670-X6Q0R45D7oAcqpCFd4KODRPsWskHk0ljAL8bYrjMMd8@public.gmane.org>
0 siblings, 1 reply; 52+ messages in thread
From: David Rientjes @ 2018-01-29 22:38 UTC (permalink / raw)
To: Michal Hocko
Cc: Andrew Morton, Roman Gushchin, Vladimir Davydov, Johannes Weiner,
Tetsuo Handa, Tejun Heo, kernel-team, cgroups, linux-doc,
linux-kernel, linux-mm
On Fri, 26 Jan 2018, Michal Hocko wrote:
> > The cgroup aware oom killer is needlessly declared for the entire system
> > by a mount option. It's unnecessary to force the system into a single
> > oom policy: either cgroup aware, or the traditional process aware.
> >
> > This patch introduces a memory.oom_policy tunable for all mem cgroups.
> > It is currently a no-op: it can only be set to "none", which is its
> > default policy. It will be expanded in the next patch to define cgroup
> > aware oom killer behavior.
> >
> > This is an extensible interface that can be used to define cgroup aware
> > assessment of mem cgroup subtrees or the traditional process aware
> > assessment.
> >
>
> So what is the actual semantic and scope of this policy. Does it apply
> only down the hierarchy. Also how do you compare cgroups with different
> policies? Let's say you have
> root
> / | \
> A B C
> / \ / \
> D E F G
>
> Assume A: cgroup, B: oom_group=1, C: tree, G: oom_group=1
>
At each level of the hierarchy, memory.oom_policy compares immediate
children, it's the only way that an admin can lock in a specific oom
policy like "tree" and then delegate the subtree to the user. If you've
configured it as above, comparing A and C should be the same based on the
cumulative usage of their child mem cgroups.
The policy for B hasn't been specified, but since it does not have any
children "cgroup" and "tree" should be the same.
> Now we have the global OOM killer to choose a victim. From a quick
> glance over those patches, it seems that we will be comparing only
> tasks because root->oom_policy != MEMCG_OOM_POLICY_CGROUP. A, B and C
> policies are ignored.
Right, a policy of "none" reverts its subtree back to per-process
comparison if you are either not using the cgroup aware oom killer or your
subtree is not using the cgroup aware oom killer.
> Moreover If I select any of B's tasks then I will
> happily kill it breaking the expectation that the whole memcg will go
> away. Weird, don't you think? Or did I misunderstand?
>
It's just as weird as the behavior of memory.oom_group today without using
the mount option :) In that case, mem_cgroup_select_oom_victim() always
returns false and the value of memory.oom_group is ignored. I agree that
it's weird in -mm and there's nothing preventing us from separating
memory.oom_group from the cgroup aware oom killer and allowing it to be
set regardless of a selection change. If memory.oom_group is set, and the
kill originates from that mem cgroup, kill all processes attached to it
and its subtree.
This is a criticism of the current implementation in -mm, however, my
extension only respects its weirdness.
> So let's assume that root: cgroup. Then we are finally comparing
> cgroups. D, E, B, C. Of those D, E and F do not have any
> policy. Do they inherit their policy from the parent? If they don't then
> we should be comparing their tasks separately, no? The code disagrees
> because once we are in the cgroup mode, we do not care about separate
> tasks.
>
No, perhaps I wasn't clear in the documentation: the policy at each level
of the hierarchy is specified by memory.oom_policy and compares its
immediate children with that policy. So the per-cgroup usage of A, B, and
C and compared regardless of A, B, and C's own oom policies.
> Let's say we choose C because it has the largest cumulative consumption.
> It is not oom_group so it will select a task from F, G. Again you are
> breaking oom_group policy of G if you kill a single task. So you would
> have to be recursive here. That sounds fixable though. Just be
> recursive.
>
I fully agree, but that's (another) implementation detail of what is in
-mm that isn't specified. I think where you're going is the complete
separation of mem cgroup selection from memory.oom_group. I agree, and we
can fix that. memory.oom_group also shouldn't depend on any mount option,
it can be set or unset depending on the properties of the workload.
> Then you say
>
> > Another benefit of such an approach is that an admin can lock in a
> > certain policy for the system or for a mem cgroup subtree and can
> > delegate the policy decision to the user to determine if the kill should
> > originate from a subcontainer, as indivisible memory consumers
> > themselves, or selection should be done per process.
>
> And the code indeed doesn't check oom_policy on each level of the
> hierarchy, unless I am missing something. So the subgroup is simply
> locked in to the oom_policy parent has chosen. That is not the case for
> the tree policy.
>
> So look how we are comparing cumulative groups without policy with
> groups with policy with subtrees. Either I have grossly misunderstood
> something or this is massively inconsistent and it doesn't make much
> sense to me. Root memcg without cgroup policy will simply turn off the
> whole thing for the global OOM case. So you really need to enable it
> there but then it is not really clear how to configure lower levels.
>
> From the above it seems that you are more interested in memcg OOMs and
> want to give different hierarchies different policies but you quickly
> hit the similar inconsistencies there as well.
>
> I am not sure how extensible this is actually. How do we place
> priorities on top?
>
If you're referring to strict priorities where one cgroup can be preferred
or biased against regardless of usage, that would be an extension with yet
another tunable. Userspace influence over the selection is not addressed
by this patchset, nor is the unfair comparison of the root mem cgroup with
leaf mem cgroups. My suggestion previously was a memory.oom_value
tunable, which is configured depending on its parent's memory.oom_policy.
If "cgroup" or "tree" it behaves as oom_score_adj-type behavior, i.e. it's
an adjustment on the usage. This way, a subtree's usage can have a
certain amount of memory discounted, for example, because it is supposed
to use more than 50% of memory. If a new "priority" memory.oom_policy
were implemented, which would be trivial, comparison between child cgroups
would be as simple as comparing memory.oom_value integers.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 52+ messages in thread
* [patch -mm v2 2/3] mm, memcg: replace cgroup aware oom killer mount option with tunable
2018-01-25 23:53 ` [patch -mm v2 0/3] " David Rientjes
2018-01-25 23:53 ` [patch -mm v2 1/3] mm, memcg: introduce per-memcg oom policy tunable David Rientjes
@ 2018-01-25 23:53 ` David Rientjes
2018-01-26 0:00 ` Andrew Morton
2018-01-25 23:53 ` [patch -mm v2 3/3] mm, memcg: add hierarchical usage oom policy David Rientjes
2 siblings, 1 reply; 52+ messages in thread
From: David Rientjes @ 2018-01-25 23:53 UTC (permalink / raw)
To: Andrew Morton, Roman Gushchin
Cc: Michal Hocko, Vladimir Davydov, Johannes Weiner, Tetsuo Handa,
Tejun Heo, kernel-team, cgroups, linux-doc, linux-kernel,
linux-mm
Now that each mem cgroup on the system has a memory.oom_policy tunable to
specify oom kill selection behavior, remove the needless "groupoom" mount
option that requires (1) the entire system to be forced, perhaps
unnecessarily, perhaps unexpectedly, into a single oom policy that
differs from the traditional per process selection, and (2) a remount to
change.
Instead of enabling the cgroup aware oom killer with the "groupoom" mount
option, set the mem cgroup subtree's memory.oom_policy to "cgroup".
Signed-off-by: David Rientjes <rientjes@google.com>
---
Documentation/cgroup-v2.txt | 43 +++++++++++++++++++++----------------------
include/linux/cgroup-defs.h | 5 -----
include/linux/memcontrol.h | 5 +++++
kernel/cgroup/cgroup.c | 13 +------------
mm/memcontrol.c | 17 ++++++++---------
5 files changed, 35 insertions(+), 48 deletions(-)
diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -1074,6 +1074,10 @@ PAGE_SIZE multiple when read back.
victim; that is, it will choose the single process with the largest
memory footprint.
+ If "cgroup", the OOM killer will compare mem cgroups as indivisible
+ memory consumers; that is, they will compare mem cgroup usage rather
+ than process memory footprint. See the "OOM Killer" section.
+
memory.events
A read-only flat-keyed file which exists on non-root cgroups.
The following entries are defined. Unless specified
@@ -1280,37 +1284,32 @@ belonging to the affected files to ensure correct memory ownership.
OOM Killer
~~~~~~~~~~
-Cgroup v2 memory controller implements a cgroup-aware OOM killer.
-It means that it treats cgroups as first class OOM entities.
+Cgroup v2 memory controller implements an optional cgroup-aware out of
+memory killer, which treats cgroups as indivisible OOM entities.
-Cgroup-aware OOM logic is turned off by default and requires
-passing the "groupoom" option on mounting cgroupfs. It can also
-by remounting cgroupfs with the following command::
+This policy is controlled by memory.oom_policy. When a memory cgroup is
+out of memory, its memory.oom_policy will dictate how the OOM killer will
+select a process, or cgroup, to kill. Likewise, when the system is OOM,
+the policy is dictated by the root mem cgroup.
- # mount -o remount,groupoom $MOUNT_POINT
+There are currently two available oom policies:
-Under OOM conditions the memory controller tries to make the best
-choice of a victim, looking for a memory cgroup with the largest
-memory footprint, considering leaf cgroups and cgroups with the
-memory.oom_group option set, which are considered to be an indivisible
-memory consumers.
+ - "none": default, choose the largest single memory hogging process to
+ oom kill, as traditionally the OOM killer has always done.
-By default, OOM killer will kill the biggest task in the selected
-memory cgroup. A user can change this behavior by enabling
-the per-cgroup memory.oom_group option. If set, it causes
-the OOM killer to kill all processes attached to the cgroup,
-except processes with oom_score_adj set to -1000.
+ - "cgroup": choose the cgroup with the largest memory footprint from the
+ subtree as an OOM victim and kill at least one process, depending on
+ memory.oom_group, from it.
-This affects both system- and cgroup-wide OOMs. For a cgroup-wide OOM
-the memory controller considers only cgroups belonging to the sub-tree
-of the OOM'ing cgroup.
+When selecting a cgroup as a victim, the OOM killer will kill the process
+with the largest memory footprint. A user can control this behavior by
+enabling the per-cgroup memory.oom_group option. If set, it causes the
+OOM killer to kill all processes attached to the cgroup, except processes
+with /proc/pid/oom_score_adj set to -1000 (oom disabled).
The root cgroup is treated as a leaf memory cgroup, so it's compared
with other leaf memory cgroups and cgroups with oom_group option set.
-If there are no cgroups with the enabled memory controller,
-the OOM killer is using the "traditional" process-based approach.
-
Please, note that memory charges are not migrating if tasks
are moved between different memory cgroups. Moving tasks with
significant memory footprint may affect OOM victim selection logic.
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -81,11 +81,6 @@ enum {
* Enable cpuset controller in v1 cgroup to use v2 behavior.
*/
CGRP_ROOT_CPUSET_V2_MODE = (1 << 4),
-
- /*
- * Enable cgroup-aware OOM killer.
- */
- CGRP_GROUP_OOM = (1 << 5),
};
/* cftype->flags */
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -64,6 +64,11 @@ enum memcg_oom_policy {
* oom_badness()
*/
MEMCG_OOM_POLICY_NONE,
+ /*
+ * Local cgroup usage is used to select a target cgroup, treating each
+ * mem cgroup as an indivisible consumer
+ */
+ MEMCG_OOM_POLICY_CGROUP,
};
struct mem_cgroup_reclaim_cookie {
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1732,9 +1732,6 @@ static int parse_cgroup_root_flags(char *data, unsigned int *root_flags)
if (!strcmp(token, "nsdelegate")) {
*root_flags |= CGRP_ROOT_NS_DELEGATE;
continue;
- } else if (!strcmp(token, "groupoom")) {
- *root_flags |= CGRP_GROUP_OOM;
- continue;
}
pr_err("cgroup2: unknown option \"%s\"\n", token);
@@ -1751,11 +1748,6 @@ static void apply_cgroup_root_flags(unsigned int root_flags)
cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
else
cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
-
- if (root_flags & CGRP_GROUP_OOM)
- cgrp_dfl_root.flags |= CGRP_GROUP_OOM;
- else
- cgrp_dfl_root.flags &= ~CGRP_GROUP_OOM;
}
}
@@ -1763,8 +1755,6 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root
{
if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
seq_puts(seq, ",nsdelegate");
- if (cgrp_dfl_root.flags & CGRP_GROUP_OOM)
- seq_puts(seq, ",groupoom");
return 0;
}
@@ -5922,8 +5912,7 @@ static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);
static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "nsdelegate\n"
- "groupoom\n");
+ return snprintf(buf, PAGE_SIZE, "nsdelegate\n");
}
static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2798,14 +2798,14 @@ bool mem_cgroup_select_oom_victim(struct oom_control *oc)
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
return false;
- if (!(cgrp_dfl_root.flags & CGRP_GROUP_OOM))
- return false;
-
if (oc->memcg)
root = oc->memcg;
else
root = root_mem_cgroup;
+ if (root->oom_policy != MEMCG_OOM_POLICY_CGROUP)
+ return false;
+
select_victim_memcg(root, oc);
return oc->chosen_memcg;
@@ -5412,9 +5412,6 @@ static int memory_oom_group_show(struct seq_file *m, void *v)
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
bool oom_group = memcg->oom_group;
- if (!(cgrp_dfl_root.flags & CGRP_GROUP_OOM))
- return -ENOTSUPP;
-
seq_printf(m, "%d\n", oom_group);
return 0;
@@ -5428,9 +5425,6 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
int oom_group;
int err;
- if (!(cgrp_dfl_root.flags & CGRP_GROUP_OOM))
- return -ENOTSUPP;
-
err = kstrtoint(strstrip(buf), 0, &oom_group);
if (err)
return err;
@@ -5541,6 +5535,9 @@ static int memory_oom_policy_show(struct seq_file *m, void *v)
enum memcg_oom_policy policy = READ_ONCE(memcg->oom_policy);
switch (policy) {
+ case MEMCG_OOM_POLICY_CGROUP:
+ seq_puts(m, "cgroup\n");
+ break;
case MEMCG_OOM_POLICY_NONE:
default:
seq_puts(m, "none\n");
@@ -5557,6 +5554,8 @@ static ssize_t memory_oom_policy_write(struct kernfs_open_file *of,
buf = strstrip(buf);
if (!memcmp("none", buf, min(sizeof("none")-1, nbytes)))
memcg->oom_policy = MEMCG_OOM_POLICY_NONE;
+ else if (!memcmp("cgroup", buf, min(sizeof("cgroup")-1, nbytes)))
+ memcg->oom_policy = MEMCG_OOM_POLICY_CGROUP;
else
ret = -EINVAL;
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 52+ messages in thread* Re: [patch -mm v2 2/3] mm, memcg: replace cgroup aware oom killer mount option with tunable
2018-01-25 23:53 ` [patch -mm v2 2/3] mm, memcg: replace cgroup aware oom killer mount option with tunable David Rientjes
@ 2018-01-26 0:00 ` Andrew Morton
2018-01-26 22:20 ` David Rientjes
0 siblings, 1 reply; 52+ messages in thread
From: Andrew Morton @ 2018-01-26 0:00 UTC (permalink / raw)
To: David Rientjes
Cc: Roman Gushchin, Michal Hocko, Vladimir Davydov, Johannes Weiner,
Tetsuo Handa, Tejun Heo, kernel-team, cgroups, linux-doc,
linux-kernel, linux-mm
On Thu, 25 Jan 2018 15:53:48 -0800 (PST) David Rientjes <rientjes@google.com> wrote:
> Now that each mem cgroup on the system has a memory.oom_policy tunable to
> specify oom kill selection behavior, remove the needless "groupoom" mount
> option that requires (1) the entire system to be forced, perhaps
> unnecessarily, perhaps unexpectedly, into a single oom policy that
> differs from the traditional per process selection, and (2) a remount to
> change.
>
> Instead of enabling the cgroup aware oom killer with the "groupoom" mount
> option, set the mem cgroup subtree's memory.oom_policy to "cgroup".
Can we retain the groupoom mount option and use its setting to set the
initial value of every memory.oom_policy? That way the mount option
remains somewhat useful and we're back-compatible?
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [patch -mm v2 2/3] mm, memcg: replace cgroup aware oom killer mount option with tunable
2018-01-26 0:00 ` Andrew Morton
@ 2018-01-26 22:20 ` David Rientjes
2018-01-26 22:39 ` Andrew Morton
0 siblings, 1 reply; 52+ messages in thread
From: David Rientjes @ 2018-01-26 22:20 UTC (permalink / raw)
To: Andrew Morton
Cc: Roman Gushchin, Michal Hocko, Vladimir Davydov, Johannes Weiner,
Tetsuo Handa, Tejun Heo, kernel-team, cgroups, linux-doc,
linux-kernel, linux-mm
On Thu, 25 Jan 2018, Andrew Morton wrote:
> > Now that each mem cgroup on the system has a memory.oom_policy tunable to
> > specify oom kill selection behavior, remove the needless "groupoom" mount
> > option that requires (1) the entire system to be forced, perhaps
> > unnecessarily, perhaps unexpectedly, into a single oom policy that
> > differs from the traditional per process selection, and (2) a remount to
> > change.
> >
> > Instead of enabling the cgroup aware oom killer with the "groupoom" mount
> > option, set the mem cgroup subtree's memory.oom_policy to "cgroup".
>
> Can we retain the groupoom mount option and use its setting to set the
> initial value of every memory.oom_policy? That way the mount option
> remains somewhat useful and we're back-compatible?
>
-ECONFUSED. We want to have a mount option that has the sole purpose of
doing echo cgroup > /mnt/cgroup/memory.oom_policy?
Please note that this patchset is not only to remove a mount option, it is
to allow oom policies to be configured per subtree such that users whom
you delegate those subtrees to cannot evade the oom policy that is set at
a higher level. The goal is to prevent the user from needing to organize
their hierarchy is a specific way to work around this constraint and use
things like limiting the number of child cgroups that user is allowed to
create only to work around the oom policy. With a cgroup v2 single
hierarchy it severely limits the amount of control the user has over their
processes because they are locked into a very specific hierarchy
configuration solely to not allow the user to evade oom kill.
This, and fixes to fairly compare the root mem cgroup with leaf mem
cgroups, are essential before the feature is merged otherwise it yields
wildly unpredictable (and unexpected, since its interaction with
oom_score_adj isn't documented) results as I already demonstrated where
cgroups with 1GB of usage are killed instead of 6GB workers outside of
that subtree.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [patch -mm v2 2/3] mm, memcg: replace cgroup aware oom killer mount option with tunable
2018-01-26 22:20 ` David Rientjes
@ 2018-01-26 22:39 ` Andrew Morton
2018-01-26 22:52 ` David Rientjes
0 siblings, 1 reply; 52+ messages in thread
From: Andrew Morton @ 2018-01-26 22:39 UTC (permalink / raw)
To: David Rientjes
Cc: Roman Gushchin, Michal Hocko, Vladimir Davydov, Johannes Weiner,
Tetsuo Handa, Tejun Heo, kernel-team, cgroups, linux-doc,
linux-kernel, linux-mm
On Fri, 26 Jan 2018 14:20:24 -0800 (PST) David Rientjes <rientjes@google.com> wrote:
> On Thu, 25 Jan 2018, Andrew Morton wrote:
>
> > > Now that each mem cgroup on the system has a memory.oom_policy tunable to
> > > specify oom kill selection behavior, remove the needless "groupoom" mount
> > > option that requires (1) the entire system to be forced, perhaps
> > > unnecessarily, perhaps unexpectedly, into a single oom policy that
> > > differs from the traditional per process selection, and (2) a remount to
> > > change.
> > >
> > > Instead of enabling the cgroup aware oom killer with the "groupoom" mount
> > > option, set the mem cgroup subtree's memory.oom_policy to "cgroup".
> >
> > Can we retain the groupoom mount option and use its setting to set the
> > initial value of every memory.oom_policy? That way the mount option
> > remains somewhat useful and we're back-compatible?
> >
>
> -ECONFUSED. We want to have a mount option that has the sole purpose of
> doing echo cgroup > /mnt/cgroup/memory.oom_policy?
Approximately. Let me put it another way: can we modify your patchset
so that the mount option remains, and continues to have a sufficiently
same effect? For backward compatibility.
> This, and fixes to fairly compare the root mem cgroup with leaf mem
> cgroups, are essential before the feature is merged otherwise it yields
> wildly unpredictable (and unexpected, since its interaction with
> oom_score_adj isn't documented) results as I already demonstrated where
> cgroups with 1GB of usage are killed instead of 6GB workers outside of
> that subtree.
OK, so Roman's new feature is incomplete: it satisfies some use cases
but not others. And we kinda have a plan to address the other use
cases in the future.
There's nothing wrong with that! As long as we don't break existing
setups while evolving the feature. How do we do that?
^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [patch -mm v2 2/3] mm, memcg: replace cgroup aware oom killer mount option with tunable
2018-01-26 22:39 ` Andrew Morton
@ 2018-01-26 22:52 ` David Rientjes
2018-01-27 0:17 ` Andrew Morton
0 siblings, 1 reply; 52+ messages in thread
From: David Rientjes @ 2018-01-26 22:52 UTC (permalink / raw)
To: Andrew Morton
Cc: Roman Gushchin, Michal Hocko, Vladimir Davydov, Johannes Weiner,
Tetsuo Handa, Tejun Heo, kernel-team, cgroups, linux-doc,
linux-kernel, linux-mm
On Fri, 26 Jan 2018, Andrew Morton wrote:
> > -ECONFUSED. We want to have a mount option that has the sole purpose of
> > doing echo cgroup > /mnt/cgroup/memory.oom_policy?
>
> Approximately. Let me put it another way: can we modify your patchset
> so that the mount option remains, and continues to have a sufficiently
> same effect? For backward compatibility.
>
The mount option would exist solely to set the oom policy of the root mem
cgroup, it would lose its effect of mandating that policy for any subtree
since it would become configurable by the user if delegated.
Let me put it another way: if the cgroup aware oom killer is merged for
4.16 without this patchset, userspace can reasonably infer the oom policy
from checking how cgroups were mounted. If my followup patchset were
merged for 4.17, that's invalid and it becomes dependent on kernel
version: it could have the "groupoom" mount option but configured through
the root mem cgroup's memory.oom_policy to not be cgroup aware at all.
That inconsistency, to me, is unfair to burden userspace with.
> > This, and fixes to fairly compare the root mem cgroup with leaf mem
> > cgroups, are essential before the feature is merged otherwise it yields
> > wildly unpredictable (and unexpected, since its interaction with
> > oom_score_adj isn't documented) results as I already demonstrated where
> > cgroups with 1GB of usage are killed instead of 6GB workers outside of
> > that subtree.
>
> OK, so Roman's new feature is incomplete: it satisfies some use cases
> but not others. And we kinda have a plan to address the other use
> cases in the future.
>
Those use cases are also undocumented such that the user doesn't know the
behavior they are opting into. Nowhere in the patchset does it mention
anything about oom_score_adj other than being oom disabled. It doesn't
mention that a per-process tunable now depends strictly on whether it is
attached to root or not. It specifies a fair comparison between the root
mem cgroup and leaf mem cgroups, which is obviously incorrect by the
implementation itself. So I'm not sure the user would know which use
cases it is valid for, which is why I've been trying to make it generally
purposeful and documented.
> There's nothing wrong with that! As long as we don't break existing
> setups while evolving the feature. How do we do that?
>
We'd break the setups that actually configure their cgroups and processes
to abide by the current implementation since we'd need to discount
oom_score_adj from the the root mem cgroup usage to fix it.
There hasn't been any feedback on v2 of my patchset that would suggest
changes are needed. I think we all recognize the shortcoming that it is
addressing. The only feedback on v1, the need for memory.oom_group as a
separate tunable, has been addressed in v2. What are we waiting for?
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [patch -mm v2 2/3] mm, memcg: replace cgroup aware oom killer mount option with tunable
2018-01-26 22:52 ` David Rientjes
@ 2018-01-27 0:17 ` Andrew Morton
2018-01-29 10:46 ` Michal Hocko
2018-01-29 22:16 ` David Rientjes
0 siblings, 2 replies; 52+ messages in thread
From: Andrew Morton @ 2018-01-27 0:17 UTC (permalink / raw)
To: David Rientjes
Cc: Roman Gushchin, Michal Hocko, Vladimir Davydov, Johannes Weiner,
Tetsuo Handa, Tejun Heo, kernel-team, cgroups, linux-doc,
linux-kernel, linux-mm
On Fri, 26 Jan 2018 14:52:59 -0800 (PST) David Rientjes <rientjes@google.com> wrote:
> On Fri, 26 Jan 2018, Andrew Morton wrote:
>
> > > -ECONFUSED. We want to have a mount option that has the sole purpose of
> > > doing echo cgroup > /mnt/cgroup/memory.oom_policy?
> >
> > Approximately. Let me put it another way: can we modify your patchset
> > so that the mount option remains, and continues to have a sufficiently
> > same effect? For backward compatibility.
> >
>
> The mount option would exist solely to set the oom policy of the root mem
> cgroup, it would lose its effect of mandating that policy for any subtree
> since it would become configurable by the user if delegated.
Why can't we propagate the mount option into the subtrees?
If the user then alters that behaviour with new added-by-David tunables
then fine, that's still backward compatible.
> Let me put it another way: if the cgroup aware oom killer is merged for
> 4.16 without this patchset, userspace can reasonably infer the oom policy
> from checking how cgroups were mounted. If my followup patchset were
> merged for 4.17, that's invalid and it becomes dependent on kernel
> version: it could have the "groupoom" mount option but configured through
> the root mem cgroup's memory.oom_policy to not be cgroup aware at all.
That concern seems unreasonable to me. Is an application *really*
going to peek at the mount options to figure out what its present oom
policy is? Well, maybe. But that's a pretty dopey thing to do and I
wouldn't lose much sleep over breaking any such application in the very
unlikely case that such a thing was developed in that two-month window.
If that's really a concern then let's add (to Roman's patchset) a
proper interface for an application to query its own oom policy.
> That inconsistency, to me, is unfair to burden userspace with.
>
> > > This, and fixes to fairly compare the root mem cgroup with leaf mem
> > > cgroups, are essential before the feature is merged otherwise it yields
> > > wildly unpredictable (and unexpected, since its interaction with
> > > oom_score_adj isn't documented) results as I already demonstrated where
> > > cgroups with 1GB of usage are killed instead of 6GB workers outside of
> > > that subtree.
> >
> > OK, so Roman's new feature is incomplete: it satisfies some use cases
> > but not others. And we kinda have a plan to address the other use
> > cases in the future.
> >
>
> Those use cases are also undocumented such that the user doesn't know the
> behavior they are opting into. Nowhere in the patchset does it mention
> anything about oom_score_adj other than being oom disabled. It doesn't
> mention that a per-process tunable now depends strictly on whether it is
> attached to root or not. It specifies a fair comparison between the root
> mem cgroup and leaf mem cgroups, which is obviously incorrect by the
> implementation itself. So I'm not sure the user would know which use
> cases it is valid for, which is why I've been trying to make it generally
> purposeful and documented.
Documentation patches are nice. We can cc:stable them too, so no huge
hurry.
> > There's nothing wrong with that! As long as we don't break existing
> > setups while evolving the feature. How do we do that?
> >
>
> We'd break the setups that actually configure their cgroups and processes
> to abide by the current implementation since we'd need to discount
> oom_score_adj from the the root mem cgroup usage to fix it.
Am having trouble understanding that. Expand, please?
Can we address this (and other such) issues in the (interim)
documentation?
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 52+ messages in thread
* Re: [patch -mm v2 2/3] mm, memcg: replace cgroup aware oom killer mount option with tunable
2018-01-27 0:17 ` Andrew Morton
@ 2018-01-29 10:46 ` Michal Hocko
[not found] ` <20180129104657.GC21609-2MMpYkNvuYDjFM9bn6wA6Q@public.gmane.org>
2018-01-29 22:16 ` David Rientjes
1 sibling, 1 reply; 52+ messages in thread
From: Michal Hocko @ 2018-01-29 10:46 UTC (permalink / raw)
To: Andrew Morton
Cc: David Rientjes, Roman Gushchin, Vladimir Davydov, Johannes Weiner,
Tetsuo Handa, Tejun Heo, kernel-team, cgroups, linux-doc,
linux-kernel, linux-mm
On Fri 26-01-18 16:17:35, Andrew Morton wrote:
> On Fri, 26 Jan 2018 14:52:59 -0800 (PST) David Rientjes <rientjes@google.com> wrote:
[...]
> > Those use cases are also undocumented such that the user doesn't know the
> > behavior they are opting into. Nowhere in the patchset does it mention
> > anything about oom_score_adj other than being oom disabled. It doesn't
> > mention that a per-process tunable now depends strictly on whether it is
> > attached to root or not. It specifies a fair comparison between the root
> > mem cgroup and leaf mem cgroups, which is obviously incorrect by the
> > implementation itself. So I'm not sure the user would know which use
> > cases it is valid for, which is why I've been trying to make it generally
> > purposeful and documented.
>
> Documentation patches are nice. We can cc:stable them too, so no huge
> hurry.
What about this?
From c02d8bc1396d5ab3785d01f577e2ee74e5dd985e Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Mon, 29 Jan 2018 11:42:59 +0100
Subject: [PATCH] oom, memcg: clarify root memcg oom accounting
David Rientjes has pointed out that the current way how the root memcg
is accounted for the cgroup aware OOM killer is undocumented. Unlike
regular cgroups there is no accounting going on in the root memcg
(mostly for performance reasons). Therefore we are suming up oom_badness
of its tasks. This might result in an over accounting because of the
oom_score_adj setting. Document this for now.
Signed-off-by: Michal Hocko <mhocko@suse.com>
---
Documentation/cgroup-v2.txt | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index 2eaed1e2243d..7dff106bba57 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -1292,7 +1292,11 @@ the memory controller considers only cgroups belonging to the sub-tree
of the OOM'ing cgroup.
The root cgroup is treated as a leaf memory cgroup, so it's compared
-with other leaf memory cgroups and cgroups with oom_group option set.
+with other leaf memory cgroups and cgroups with oom_group option
+set. Due to internal implementation restrictions the size of the root
+cgroup is a cumulative sum of oom_badness of all its tasks (in other
+words oom_score_adj of each task is obeyed). This might change in the
+future.
If there are no cgroups with the enabled memory controller,
the OOM killer is using the "traditional" process-based approach.
--
2.15.1
--
Michal Hocko
SUSE Labs
^ permalink raw reply related [flat|nested] 52+ messages in thread* Re: [patch -mm v2 2/3] mm, memcg: replace cgroup aware oom killer mount option with tunable
2018-01-27 0:17 ` Andrew Morton
2018-01-29 10:46 ` Michal Hocko
@ 2018-01-29 22:16 ` David Rientjes
1 sibling, 0 replies; 52+ messages in thread
From: David Rientjes @ 2018-01-29 22:16 UTC (permalink / raw)
To: Andrew Morton
Cc: Roman Gushchin, Michal Hocko, Vladimir Davydov, Johannes Weiner,
Tetsuo Handa, Tejun Heo, kernel-team, cgroups, linux-doc,
linux-kernel, linux-mm
On Fri, 26 Jan 2018, Andrew Morton wrote:
> > > > -ECONFUSED. We want to have a mount option that has the sole purpose of
> > > > doing echo cgroup > /mnt/cgroup/memory.oom_policy?
> > >
> > > Approximately. Let me put it another way: can we modify your patchset
> > > so that the mount option remains, and continues to have a sufficiently
> > > same effect? For backward compatibility.
> > >
> >
> > The mount option would exist solely to set the oom policy of the root mem
> > cgroup, it would lose its effect of mandating that policy for any subtree
> > since it would become configurable by the user if delegated.
>
> Why can't we propagate the mount option into the subtrees?
>
> If the user then alters that behaviour with new added-by-David tunables
> then fine, that's still backward compatible.
>
It's not, if you look for the "groupoom" mount option it will specify two
different things: the entire hierarchy is locked into a single per-cgroup
usage comparison (Roman's original patchset), and entire hierarchy had an
initial oom policy set which could have subsequently changed (my
extension). With memory.oom_policy you need to query what the effective
policy is, checking for "groupoom" is entirely irrelevant, it was only the
initial setting.
Thus, if memory.oom_policy is going to be merged in the future, it
necessarily obsoletes the mount option. It would depend on the kernel
version to determine its meaning.
I'm struggling to see the benefit of simply not reviewing patches that
build off the original and merging a patchset early. What are we gaining?
> > Let me put it another way: if the cgroup aware oom killer is merged for
> > 4.16 without this patchset, userspace can reasonably infer the oom policy
> > from checking how cgroups were mounted. If my followup patchset were
> > merged for 4.17, that's invalid and it becomes dependent on kernel
> > version: it could have the "groupoom" mount option but configured through
> > the root mem cgroup's memory.oom_policy to not be cgroup aware at all.
>
> That concern seems unreasonable to me. Is an application *really*
> going to peek at the mount options to figure out what its present oom
> policy is? Well, maybe. But that's a pretty dopey thing to do and I
> wouldn't lose much sleep over breaking any such application in the very
> unlikely case that such a thing was developed in that two-month window.
>
It's not dopey, it's the only way that any userspace can determine what
process is going to be oom killed! That policy will dictate how the
cgroup hierarchy is configured without my extension, there's no other way
to prefer or bias processes.
How can a userspace cgroup manager possibly construct a cgroup v2
hierarchy with expected oom kill behavior if it is not peeking at the
mount option?
My concern is that if extended with my patchset the mount option itself
becomes obsolete and then peeking at it is irrelevant to the runtime
behavior!
> > > There's nothing wrong with that! As long as we don't break existing
> > > setups while evolving the feature. How do we do that?
> >
> > We'd break the setups that actually configure their cgroups and processes
> > to abide by the current implementation since we'd need to discount
> > oom_score_adj from the the root mem cgroup usage to fix it.
>
> Am having trouble understanding that. Expand, please?
>
> Can we address this (and other such) issues in the (interim)
> documentation?
>
This point isn't a documentation issue at all, this is the fact that
oom_score_adj is only effective for the root mem cgroup. If the user is
fully aware of the implementation, it does not change the fact that he or
she will construct their cgroup hierarchy and attach processes to it to
abide by the behavior. That is the breakage that I am concerned about.
An example: you have a log scraper that is running with
/proc/pid/oom_score_adj == 999. It's best effort, it can be killed, we'll
retry the next time if the system has memory available. This is partly
why oom_adj and oom_score_adj exist and is used on production systems.
If you attach that process to an unlimited mem cgroup dedicated to system
daemons purely for the rich stats that mem cgroup provides, this breaks
the oom_score_adj setting solely because it's attached to the cgroup. On
system-wide oom, it is no longer the killed process merely because it is
attached to an unlimited child cgroup. This is not the only such example:
this occurs for any process attached to a cgroup.
^ permalink raw reply [flat|nested] 52+ messages in thread
* [patch -mm v2 3/3] mm, memcg: add hierarchical usage oom policy
2018-01-25 23:53 ` [patch -mm v2 0/3] " David Rientjes
2018-01-25 23:53 ` [patch -mm v2 1/3] mm, memcg: introduce per-memcg oom policy tunable David Rientjes
2018-01-25 23:53 ` [patch -mm v2 2/3] mm, memcg: replace cgroup aware oom killer mount option with tunable David Rientjes
@ 2018-01-25 23:53 ` David Rientjes
2 siblings, 0 replies; 52+ messages in thread
From: David Rientjes @ 2018-01-25 23:53 UTC (permalink / raw)
To: Andrew Morton, Roman Gushchin
Cc: Michal Hocko, Vladimir Davydov, Johannes Weiner, Tetsuo Handa,
Tejun Heo, kernel-team, cgroups, linux-doc, linux-kernel,
linux-mm
One of the three significant concerns brought up about the cgroup aware
oom killer is that its decisionmaking is completely evaded by creating
subcontainers and attaching processes such that the ancestor's usage does
not exceed another cgroup on the system.
In this regard, users who do not distribute their processes over a set of
subcontainers for mem cgroup control, statistics, or other controllers
are unfairly penalized.
This adds an oom policy, "tree", that accounts for hierarchical usage
when comparing cgroups and the cgroup aware oom killer is enabled by an
ancestor. This allows administrators, for example, to require users in
their own top-level mem cgroup subtree to be accounted for with
hierarchical usage. In other words, they can longer evade the oom killer
by using other controllers or subcontainers.
Signed-off-by: David Rientjes <rientjes@google.com>
---
Documentation/cgroup-v2.txt | 12 ++++++++++--
include/linux/memcontrol.h | 5 +++++
mm/memcontrol.c | 12 +++++++++---
3 files changed, 24 insertions(+), 5 deletions(-)
diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -1078,6 +1078,11 @@ PAGE_SIZE multiple when read back.
memory consumers; that is, they will compare mem cgroup usage rather
than process memory footprint. See the "OOM Killer" section.
+ If "tree", the OOM killer will compare mem cgroups and its subtree
+ as indivisible memory consumers when selecting a hierarchy. This
+ policy cannot be set on the root mem cgroup. See the "OOM Killer"
+ section.
+
memory.events
A read-only flat-keyed file which exists on non-root cgroups.
The following entries are defined. Unless specified
@@ -1301,6 +1306,9 @@ There are currently two available oom policies:
subtree as an OOM victim and kill at least one process, depending on
memory.oom_group, from it.
+ - "tree": choose the cgroup with the largest memory footprint considering
+ itself and its subtree and kill at least one process.
+
When selecting a cgroup as a victim, the OOM killer will kill the process
with the largest memory footprint. A user can control this behavior by
enabling the per-cgroup memory.oom_group option. If set, it causes the
@@ -1314,8 +1322,8 @@ Please, note that memory charges are not migrating if tasks
are moved between different memory cgroups. Moving tasks with
significant memory footprint may affect OOM victim selection logic.
If it's a case, please, consider creating a common ancestor for
-the source and destination memory cgroups and enabling oom_group
-on ancestor layer.
+the source and destination memory cgroups and setting a policy of "tree"
+and enabling oom_group on an ancestor layer.
IO
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -69,6 +69,11 @@ enum memcg_oom_policy {
* mem cgroup as an indivisible consumer
*/
MEMCG_OOM_POLICY_CGROUP,
+ /*
+ * Tree cgroup usage for all descendant memcg groups, treating each mem
+ * cgroup and its subtree as an indivisible consumer
+ */
+ MEMCG_OOM_POLICY_TREE,
};
struct mem_cgroup_reclaim_cookie {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2728,7 +2728,7 @@ static void select_victim_memcg(struct mem_cgroup *root, struct oom_control *oc)
/*
* The oom_score is calculated for leaf memory cgroups (including
* the root memcg).
- * Non-leaf oom_group cgroups accumulating score of descendant
+ * Cgroups with oom policy of "tree" accumulate the score of descendant
* leaf memory cgroups.
*/
rcu_read_lock();
@@ -2737,10 +2737,11 @@ static void select_victim_memcg(struct mem_cgroup *root, struct oom_control *oc)
/*
* We don't consider non-leaf non-oom_group memory cgroups
- * as OOM victims.
+ * without the oom policy of "tree" as OOM victims.
*/
if (memcg_has_children(iter) && iter != root_mem_cgroup &&
- !mem_cgroup_oom_group(iter))
+ !mem_cgroup_oom_group(iter) &&
+ iter->oom_policy != MEMCG_OOM_POLICY_TREE)
continue;
/*
@@ -5538,6 +5539,9 @@ static int memory_oom_policy_show(struct seq_file *m, void *v)
case MEMCG_OOM_POLICY_CGROUP:
seq_puts(m, "cgroup\n");
break;
+ case MEMCG_OOM_POLICY_TREE:
+ seq_puts(m, "tree\n");
+ break;
case MEMCG_OOM_POLICY_NONE:
default:
seq_puts(m, "none\n");
@@ -5556,6 +5560,8 @@ static ssize_t memory_oom_policy_write(struct kernfs_open_file *of,
memcg->oom_policy = MEMCG_OOM_POLICY_NONE;
else if (!memcmp("cgroup", buf, min(sizeof("cgroup")-1, nbytes)))
memcg->oom_policy = MEMCG_OOM_POLICY_CGROUP;
+ else if (!memcmp("tree", buf, min(sizeof("tree")-1, nbytes)))
+ memcg->oom_policy = MEMCG_OOM_POLICY_TREE;
else
ret = -EINVAL;
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 52+ messages in thread