* [patch 1/8] fork: collapse copy_flags into copy_process
  [not found] ` <alpine.DEB.2.02.1312021504170.13465@chino.kir.corp.google.com>
@ 2013-12-04  5:19 ` David Rientjes
  2013-12-04  5:19 ` [patch 2/8] mm, mempolicy: rename slab_node for clarity David Rientjes
  ` (6 more replies)
  0 siblings, 7 replies; 39+ messages in thread
From: David Rientjes @ 2013-12-04 5:19 UTC
To: Andrew Morton
Cc: Michal Hocko, KAMEZAWA Hiroyuki, Johannes Weiner, Mel Gorman,
    Rik van Riel, Pekka Enberg, Christoph Lameter, linux-kernel,
    linux-mm, cgroups

copy_flags() does not use its clone_flags formal parameter and can be
collapsed into copy_process() for cleaner code.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 kernel/fork.c | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1066,15 +1066,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	return 0;
 }
 
-static void copy_flags(unsigned long clone_flags, struct task_struct *p)
-{
-	unsigned long new_flags = p->flags;
-
-	new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
-	new_flags |= PF_FORKNOEXEC;
-	p->flags = new_flags;
-}
-
 SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
 {
 	current->clear_child_tid = tidptr;
@@ -1223,7 +1214,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->did_exec = 0;
 	delayacct_tsk_init(p);	/* Must remain after dup_task_struct() */
-	copy_flags(clone_flags, p);
+	p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
+	p->flags |= PF_FORKNOEXEC;
 	INIT_LIST_HEAD(&p->children);
 	INIT_LIST_HEAD(&p->sibling);
 	rcu_copy_process(p);
* Re: [patch 2/8] mm, mempolicy: rename slab_node for clarity
  2013-12-04  5:19 ` [patch 1/8] fork: collapse copy_flags into copy_process David Rientjes
@ 2013-12-04  5:19 ` David Rientjes
  [not found]   ` <alpine.DEB.2.02.1312032117330.29733-X6Q0R45D7oAcqpCFd4KODRPsWskHk0ljAL8bYrjMMd8@public.gmane.org>
  2013-12-04  5:20 ` [patch 3/8] mm, mempolicy: remove per-process flag David Rientjes
  ` (5 subsequent siblings)
  6 siblings, 1 reply; 39+ messages in thread
From: David Rientjes @ 2013-12-04 5:19 UTC
To: Andrew Morton
Cc: Michal Hocko, KAMEZAWA Hiroyuki, Johannes Weiner, Mel Gorman,
    Rik van Riel, Pekka Enberg, Christoph Lameter, linux-kernel,
    linux-mm, cgroups

slab_node() is actually a mempolicy function, so rename it to
mempolicy_slab_node() to make it clearer that it is used for processes
with mempolicies.

At the same time, clean up its code by saving numa_mem_id() in a local
variable (since we require a node with memory, not just any node) and
remove an obsolete comment that assumes the mempolicy is actually passed
into the function.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 include/linux/mempolicy.h |  2 +-
 mm/mempolicy.c            | 15 ++++++---------
 mm/slab.c                 |  4 ++--
 mm/slub.c                 |  2 +-
 4 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -151,7 +151,7 @@ extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
 extern bool init_nodemask_of_mempolicy(nodemask_t *mask);
 extern bool mempolicy_nodemask_intersects(struct task_struct *tsk,
 				const nodemask_t *mask);
-extern unsigned slab_node(void);
+extern unsigned int mempolicy_slab_node(void);
 
 extern enum zone_type policy_zone;

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1783,21 +1783,18 @@ static unsigned interleave_nodes(struct mempolicy *policy)
 /*
  * Depending on the memory policy provide a node from which to allocate the
  * next slab entry.
- * @policy must be protected by freeing by the caller.  If @policy is
- * the current task's mempolicy, this protection is implicit, as only the
- * task can change it's policy.  The system default policy requires no
- * such protection.
  */
-unsigned slab_node(void)
+unsigned int mempolicy_slab_node(void)
 {
 	struct mempolicy *policy;
+	int node = numa_mem_id();
 
 	if (in_interrupt())
-		return numa_node_id();
+		return node;
 
 	policy = current->mempolicy;
 	if (!policy || policy->flags & MPOL_F_LOCAL)
-		return numa_node_id();
+		return node;
 
 	switch (policy->mode) {
 	case MPOL_PREFERRED:
@@ -1817,11 +1814,11 @@ unsigned slab_node(void)
 		struct zonelist *zonelist;
 		struct zone *zone;
 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
-		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
+		zonelist = &NODE_DATA(node)->node_zonelists[0];
 		(void)first_zones_zonelist(zonelist, highest_zoneidx,
 							&policy->v.nodes,
 							&zone);
-		return zone ? zone->node : numa_node_id();
+		return zone ? zone->node : node;
 	}
 
 	default:

diff --git a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3042,7 +3042,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
 	if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
 		nid_alloc = cpuset_slab_spread_node();
 	else if (current->mempolicy)
-		nid_alloc = slab_node();
+		nid_alloc = mempolicy_slab_node();
 	if (nid_alloc != nid_here)
 		return ____cache_alloc_node(cachep, flags, nid_alloc);
 	return NULL;
@@ -3074,7 +3074,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
 
 retry_cpuset:
 	cpuset_mems_cookie = get_mems_allowed();
-	zonelist = node_zonelist(slab_node(), flags);
+	zonelist = node_zonelist(mempolicy_slab_node(), flags);
 
 retry:
 	/*

diff --git a/mm/slub.c b/mm/slub.c
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1663,7 +1663,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
 
 	do {
 		cpuset_mems_cookie = get_mems_allowed();
-		zonelist = node_zonelist(slab_node(), flags);
+		zonelist = node_zonelist(mempolicy_slab_node(), flags);
 		for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
 			struct kmem_cache_node *n;
* Re: [patch 2/8] mm, mempolicy: rename slab_node for clarity
  [not found] ` <alpine.DEB.2.02.1312032117330.29733-X6Q0R45D7oAcqpCFd4KODRPsWskHk0ljAL8bYrjMMd8@public.gmane.org>
@ 2013-12-04 15:21 ` Christoph Lameter
  0 siblings, 0 replies; 39+ messages in thread
From: Christoph Lameter @ 2013-12-04 15:21 UTC
To: David Rientjes
Cc: Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki, Johannes Weiner,
    Mel Gorman, Rik van Riel, Pekka Enberg,
    linux-kernel-u79uwXL29TY76Z2rM5mHXA, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
    cgroups-u79uwXL29TY76Z2rM5mHXA

On Tue, 3 Dec 2013, David Rientjes wrote:

> slab_node() is actually a mempolicy function, so rename it to
> mempolicy_slab_node() to make it clearer that it is used for processes
> with mempolicies.

Acked-by: Christoph Lameter <cl-vYTEC60ixJUAvxtiuMwx3w@public.gmane.org>
* [patch 3/8] mm, mempolicy: remove per-process flag
  2013-12-04  5:19 ` [patch 1/8] fork: collapse copy_flags into copy_process David Rientjes
  2013-12-04  5:19 ` [patch 2/8] mm, mempolicy: rename slab_node for clarity David Rientjes
@ 2013-12-04  5:20 ` David Rientjes
  2013-12-04 15:24   ` Christoph Lameter
  2013-12-04  5:20 ` [patch 4/8] mm, memcg: add tunable for oom reserves David Rientjes
  ` (4 subsequent siblings)
  6 siblings, 1 reply; 39+ messages in thread
From: David Rientjes @ 2013-12-04 5:20 UTC
To: Andrew Morton
Cc: Michal Hocko, KAMEZAWA Hiroyuki, Johannes Weiner, Mel Gorman,
    Rik van Riel, Pekka Enberg, Christoph Lameter, linux-kernel,
    linux-mm, cgroups

PF_MEMPOLICY is an unnecessary optimization for CONFIG_SLAB users.
There's no significant performance degradation to checking
current->mempolicy rather than current->flags & PF_MEMPOLICY in the
allocation path, especially since this is considered unlikely().

Per-process flags are a scarce resource, so we should free them up
whenever possible.  We'll be using this one shortly for memcg oom
reserves.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 include/linux/mempolicy.h |  5 -----
 include/linux/sched.h     |  1 -
 kernel/fork.c             |  1 -
 mm/mempolicy.c            | 31 -------------------------------
 mm/slab.c                 |  4 ++--
 5 files changed, 2 insertions(+), 40 deletions(-)

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -143,7 +143,6 @@ extern void numa_policy_init(void);
 extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
 				enum mpol_rebind_step step);
 extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);
-extern void mpol_fix_fork_child_flag(struct task_struct *p);
 
 extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
 				unsigned long addr, gfp_t gfp_flags,
@@ -266,10 +265,6 @@ static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 {
 }
 
-static inline void mpol_fix_fork_child_flag(struct task_struct *p)
-{
-}
-
 static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
 		unsigned long addr, gfp_t gfp_flags,
 		struct mempolicy **mpol, nodemask_t **nodemask)

diff --git a/include/linux/sched.h b/include/linux/sched.h
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1695,7 +1695,6 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut,
 #define PF_SPREAD_SLAB	0x02000000	/* Spread some slab caches over cpuset */
 #define PF_NO_SETAFFINITY 0x04000000	/* Userland is not allowed to meddle with cpus_allowed */
 #define PF_MCE_EARLY	0x08000000	/* Early kill for mce process policy */
-#define PF_MEMPOLICY	0x10000000	/* Non-default NUMA mempolicy */
 #define PF_MUTEX_TESTER	0x20000000	/* Thread belongs to the rt mutex tester */
 #define PF_FREEZER_SKIP	0x40000000	/* Freezer should not count it as freezable */
 #define PF_SUSPEND_TASK 0x80000000	/* this thread called freeze_processes and should not be frozen */

diff --git a/kernel/fork.c b/kernel/fork.c
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1261,7 +1261,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		p->mempolicy = NULL;
 		goto bad_fork_cleanup_cgroup;
 	}
-	mpol_fix_fork_child_flag(p);
 #endif
 #ifdef CONFIG_CPUSETS
 	p->cpuset_mem_spread_rotor = NUMA_NO_NODE;

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -796,36 +796,6 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
 	return err;
 }
 
-/*
- * Update task->flags PF_MEMPOLICY bit: set iff non-default
- * mempolicy.  Allows more rapid checking of this (combined perhaps
- * with other PF_* flag bits) on memory allocation hot code paths.
- *
- * If called from outside this file, the task 'p' should -only- be
- * a newly forked child not yet visible on the task list, because
- * manipulating the task flags of a visible task is not safe.
- *
- * The above limitation is why this routine has the funny name
- * mpol_fix_fork_child_flag().
- *
- * It is also safe to call this with a task pointer of current,
- * which the static wrapper mpol_set_task_struct_flag() does,
- * for use within this file.
- */
-
-void mpol_fix_fork_child_flag(struct task_struct *p)
-{
-	if (p->mempolicy)
-		p->flags |= PF_MEMPOLICY;
-	else
-		p->flags &= ~PF_MEMPOLICY;
-}
-
-static void mpol_set_task_struct_flag(void)
-{
-	mpol_fix_fork_child_flag(current);
-}
-
 /* Set the process memory policy */
 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 			     nodemask_t *nodes)
@@ -862,7 +832,6 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 	}
 	old = current->mempolicy;
 	current->mempolicy = new;
-	mpol_set_task_struct_flag();
 	if (new && new->mode == MPOL_INTERLEAVE &&
 	    nodes_weight(new->v.nodes))
 		current->il_next = first_node(new->v.nodes);

diff --git a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3027,7 +3027,7 @@ out:
 
 #ifdef CONFIG_NUMA
 /*
- * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY.
+ * Try allocating on another node if PF_SPREAD_SLAB or a mempolicy is set.
  *
  * If we are in_interrupt, then process context, including cpusets and
  * mempolicy, may not apply and should not be used for allocation policy.
  */
@@ -3259,7 +3259,7 @@ __do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
 {
 	void *objp;
 
-	if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
+	if (current->mempolicy || unlikely(current->flags & PF_SPREAD_SLAB)) {
 		objp = alternate_node_alloc(cache, flags);
 		if (objp)
 			goto out;
* Re: [patch 3/8] mm, mempolicy: remove per-process flag
  2013-12-04  5:20 ` [patch 3/8] mm, mempolicy: remove per-process flag David Rientjes
@ 2013-12-04 15:24 ` Christoph Lameter
  2013-12-05  0:53   ` David Rientjes
  0 siblings, 1 reply; 39+ messages in thread
From: Christoph Lameter @ 2013-12-04 15:24 UTC
To: David Rientjes
Cc: Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki, Johannes Weiner,
    Mel Gorman, Rik van Riel, Pekka Enberg, linux-kernel, linux-mm,
    cgroups

On Tue, 3 Dec 2013, David Rientjes wrote:

> PF_MEMPOLICY is an unnecessary optimization for CONFIG_SLAB users.
> There's no significant performance degradation to checking
> current->mempolicy rather than current->flags & PF_MEMPOLICY in the
> allocation path, especially since this is considered unlikely().

The use of current->mempolicy increases the cache footprint since it
sits in a rarely used cacheline.  This performance issue would occur
when memory policies are not used, since that cacheline would then have
to be touched regardless of whether memory policies are in effect or
not.  PF_MEMPOLICY was used to avoid touching the cacheline.
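Christoph's layout argument can be made concrete with a userspace sketch:
task->flags shares a cacheline with hot fields, while the mempolicy
pointer sits in rarely touched state, so testing the pointer drags in an
extra cold cacheline.  The struct below is a hypothetical stand-in, not
the kernel's task_struct:

#include <stdio.h>
#include <stddef.h>

/* Toy stand-in for task_struct: "flags" lives among hot fields while
 * "mempolicy" is buried in rarely used state, so a pointer check pulls
 * in an additional (cold) cacheline on every allocation. */
struct toy_task {
	unsigned long flags;		/* hot: read on many fast paths */
	char hot_fields[56];		/* other frequently accessed state */
	char cold_fields[1024];		/* rarely touched state */
	void *mempolicy;		/* cold: only set for NUMA policies */
};

int main(void)
{
	printf("flags:     offset %4zu -> cacheline %zu\n",
	       offsetof(struct toy_task, flags),
	       offsetof(struct toy_task, flags) / 64);
	printf("mempolicy: offset %4zu -> cacheline %zu\n",
	       offsetof(struct toy_task, mempolicy),
	       offsetof(struct toy_task, mempolicy) / 64);
	return 0;
}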
* Re: [patch 3/8] mm, mempolicy: remove per-process flag
  2013-12-04 15:24 ` Christoph Lameter
@ 2013-12-05  0:53 ` David Rientjes
  2013-12-05 19:05   ` Christoph Lameter
  0 siblings, 1 reply; 39+ messages in thread
From: David Rientjes @ 2013-12-05 0:53 UTC
To: Christoph Lameter
Cc: Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki, Johannes Weiner,
    Mel Gorman, Rik van Riel, Pekka Enberg, linux-kernel, linux-mm,
    cgroups

On Wed, 4 Dec 2013, Christoph Lameter wrote:

> > PF_MEMPOLICY is an unnecessary optimization for CONFIG_SLAB users.
> > There's no significant performance degradation to checking
> > current->mempolicy rather than current->flags & PF_MEMPOLICY in the
> > allocation path, especially since this is considered unlikely().
>
> The use of current->mempolicy increases the cache footprint since it
> sits in a rarely used cacheline.  This performance issue would occur
> when memory policies are not used, since that cacheline would then have
> to be touched regardless of whether memory policies are in effect or
> not.  PF_MEMPOLICY was used to avoid touching the cacheline.
>

Right, but it turns out not to matter in practice.  As one of the
non-default CONFIG_SLAB users, and since PF_MEMPOLICY only does
something for CONFIG_SLAB, we tested this patch and saw no degradation
on specjbb, which stresses the allocator in terms of throughput:

	with patch:    128761.54 SPECjbb2005 bops
	without patch: 127576.65 SPECjbb2005 bops

These per-process flags are a scarce resource, so I don't think
PF_MEMPOLICY warrants a bit when it's not shown to be advantageous in
configurations without mempolicy usage where it's intended to optimize,
especially for a non-default slab allocator.
* Re: [patch 3/8] mm, mempolicy: remove per-process flag
  2013-12-05  0:53 ` David Rientjes
@ 2013-12-05 19:05 ` Christoph Lameter
  2013-12-05 23:53   ` David Rientjes
  0 siblings, 1 reply; 39+ messages in thread
From: Christoph Lameter @ 2013-12-05 19:05 UTC
To: David Rientjes
Cc: Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki, Johannes Weiner,
    Mel Gorman, Rik van Riel, Pekka Enberg, linux-kernel, linux-mm,
    cgroups

On Wed, 4 Dec 2013, David Rientjes wrote:

> Right, but it turns out not to matter in practice.  As one of the
> non-default CONFIG_SLAB users, and since PF_MEMPOLICY only does
> something for CONFIG_SLAB, we tested this patch and saw no degradation
> on specjbb, which stresses the allocator in terms of throughput:
>
> 	with patch:    128761.54 SPECjbb2005 bops
> 	without patch: 127576.65 SPECjbb2005 bops

Specjbb?  What does Java have to do with this?  Can you run the
synthetic in-kernel slab benchmark, like this one:

	https://lkml.org/lkml/2009/10/13/459

> These per-process flags are a scarce resource, so I don't think
> PF_MEMPOLICY warrants a bit when it's not shown to be advantageous in
> configurations without mempolicy usage where it's intended to optimize,
> especially for a non-default slab allocator.

PF_MEMPOLICY was advantageous when Paul Jackson introduced and
benchmarked it.

SLUB supports mempolicies through alloc_pages(), but it will allocate
all objects out of one slab page before retrieving another page
following the policy.  That's why PF_MEMPOLICY and the other per-object
handling can be avoided in its fastpath.  Thus PF_MEMPOLICY is not that
important there.

However, SLAB is still the allocator in use for RHEL, which puts some
importance on still supporting SLAB.
* Re: [patch 3/8] mm, mempolicy: remove per-process flag
  2013-12-05 19:05 ` Christoph Lameter
@ 2013-12-05 23:53 ` David Rientjes
  2013-12-06 14:46   ` Christoph Lameter
  0 siblings, 1 reply; 39+ messages in thread
From: David Rientjes @ 2013-12-05 23:53 UTC
To: Christoph Lameter
Cc: Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki, Johannes Weiner,
    Mel Gorman, Rik van Riel, Pekka Enberg, linux-kernel, linux-mm,
    cgroups

On Thu, 5 Dec 2013, Christoph Lameter wrote:

> Specjbb?  What does Java have to do with this?  Can you run the
> synthetic in-kernel slab benchmark, like this one:
>
> 	https://lkml.org/lkml/2009/10/13/459
>

We actually carry that in our production kernel and have updated it to
build on 3.11; I'll run it and netperf TCP_RR as well, thanks.

> However, SLAB is still the allocator in use for RHEL, which puts some
> importance on still supporting SLAB.
>

Google also uses it exclusively, so I'm definitely not saying we can
ignore it just because it's not the default.  I haven't seen any
performance regression from removing it, but I'll post the numbers on
the slab benchmark and netperf TCP_RR when I have them.
* Re: [patch 3/8] mm, mempolicy: remove per-process flag
  2013-12-05 23:53 ` David Rientjes
@ 2013-12-06 14:46 ` Christoph Lameter
  0 siblings, 0 replies; 39+ messages in thread
From: Christoph Lameter @ 2013-12-06 14:46 UTC
To: David Rientjes
Cc: Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki, Johannes Weiner,
    Mel Gorman, Rik van Riel, Pekka Enberg, linux-kernel, linux-mm,
    cgroups

On Thu, 5 Dec 2013, David Rientjes wrote:

> We actually carry that in our production kernel and have updated it to
> build on 3.11; I'll run it and netperf TCP_RR as well, thanks.

If you get around to it then please post the updated version.  Maybe we
can get that merged at some point; it keeps floating around after all.
* [patch 4/8] mm, memcg: add tunable for oom reserves
  2013-12-04  5:19 ` [patch 1/8] fork: collapse copy_flags into copy_process David Rientjes
  ` (2 preceding siblings ...)
@ 2013-12-04  5:20 ` David Rientjes
  2013-12-04  5:20 ` [patch 5/8] res_counter: remove interface for locked charging and uncharging David Rientjes
  ` (3 subsequent siblings)
  6 siblings, 0 replies; 39+ messages in thread
From: David Rientjes @ 2013-12-04 5:20 UTC
To: Andrew Morton
Cc: Michal Hocko, KAMEZAWA Hiroyuki, Johannes Weiner, Mel Gorman,
    Rik van Riel, Pekka Enberg, Christoph Lameter, linux-kernel,
    linux-mm, cgroups

Userspace needs a way to define the amount of memory reserves that
processes handling oom conditions may utilize.  This patch adds a
per-memcg oom reserve field and a file, memory.oom_reserve_in_bytes, to
manipulate its value.

If an attempt is made to reduce currently utilized memory reserves by
writing a smaller value to memory.oom_reserve_in_bytes, it will fail
with -EBUSY until some memory is uncharged.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 mm/memcontrol.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -274,6 +274,9 @@ struct mem_cgroup {
 	/* OOM-Killer disable */
 	int		oom_kill_disable;
 
+	/* reserves for handling oom conditions, protected by res.lock */
+	unsigned long long oom_reserve;
+
 	/* set when res.limit == memsw.limit */
 	bool		memsw_is_minimum;
 
@@ -5893,6 +5896,51 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
 	return 0;
 }
 
+static int mem_cgroup_resize_oom_reserve(struct mem_cgroup *memcg,
+					 unsigned long long new_limit)
+{
+	struct res_counter *res = &memcg->res;
+	u64 limit, usage;
+	int ret = 0;
+
+	spin_lock(&res->lock);
+	limit = res->limit;
+	usage = res->usage;
+
+	if (usage > limit && usage - limit > new_limit) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	memcg->oom_reserve = new_limit;
+out:
+	spin_unlock(&res->lock);
+	return ret;
+}
+
+static u64 mem_cgroup_oom_reserve_read(struct cgroup_subsys_state *css,
+				       struct cftype *cft)
+{
+	return mem_cgroup_from_css(css)->oom_reserve;
+}
+
+static int mem_cgroup_oom_reserve_write(struct cgroup_subsys_state *css,
+					struct cftype *cft, const char *buffer)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+	unsigned long long val;
+	int ret;
+
+	if (mem_cgroup_is_root(memcg))
+		return -EINVAL;
+
+	ret = res_counter_memparse_write_strategy(buffer, &val);
+	if (ret)
+		return ret;
+
+	return mem_cgroup_resize_oom_reserve(memcg, val);
+}
+
 #ifdef CONFIG_MEMCG_KMEM
 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 {
@@ -6024,6 +6072,11 @@ static struct cftype mem_cgroup_files[] = {
 		.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
 	},
 	{
+		.name = "oom_reserve_in_bytes",
+		.read_u64 = mem_cgroup_oom_reserve_read,
+		.write_string = mem_cgroup_oom_reserve_write,
+	},
+	{
 		.name = "pressure_level",
 		.register_event = vmpressure_register_event,
 		.unregister_event = vmpressure_unregister_event,
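To make the -EBUSY rule concrete, here is a userspace model of the check
in mem_cgroup_resize_oom_reserve() above; the numbers are illustrative
only:

#include <stdio.h>

/* Model of the resize check: shrinking the reserve fails while the
 * charges already sitting above the limit still exceed the new value. */
static int resize_oom_reserve(unsigned long long usage,
			      unsigned long long limit,
			      unsigned long long new_reserve)
{
	if (usage > limit && usage - limit > new_reserve)
		return -1;	/* -EBUSY in the patch */
	return 0;		/* new reserve accepted */
}

int main(void)
{
	/* limit of 128M with 8M of reserve currently charged above it */
	unsigned long long limit = 128ULL << 20;
	unsigned long long usage = limit + (8ULL << 20);

	printf("shrink reserve to 4M:  %s\n",
	       resize_oom_reserve(usage, limit, 4ULL << 20) ? "-EBUSY" : "ok");
	printf("shrink reserve to 16M: %s\n",
	       resize_oom_reserve(usage, limit, 16ULL << 20) ? "-EBUSY" : "ok");
	return 0;
}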
* [patch 5/8] res_counter: remove interface for locked charging and uncharging
  2013-12-04  5:19 ` [patch 1/8] fork: collapse copy_flags into copy_process David Rientjes
  ` (2 preceding siblings ...)
  2013-12-04  5:20 ` [patch 4/8] mm, memcg: add tunable for oom reserves David Rientjes
@ 2013-12-04  5:20 ` David Rientjes
  2013-12-04  5:20 ` [patch 6/8] res_counter: add interface for maximum nofail charge David Rientjes
  ` (2 subsequent siblings)
  6 siblings, 0 replies; 39+ messages in thread
From: David Rientjes @ 2013-12-04 5:20 UTC
To: Andrew Morton
Cc: Michal Hocko, KAMEZAWA Hiroyuki, Johannes Weiner, Mel Gorman,
    Rik van Riel, Pekka Enberg, Christoph Lameter, linux-kernel,
    linux-mm, cgroups

The res_counter_{charge,uncharge}_locked() variants are not used in the
kernel outside of the resource counter code itself, so remove the
interface.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 Documentation/cgroups/resource_counter.txt | 14 ++------------
 include/linux/res_counter.h                |  6 +-----
 kernel/res_counter.c                       | 23 ++++++++++++-----------
 3 files changed, 15 insertions(+), 28 deletions(-)

diff --git a/Documentation/cgroups/resource_counter.txt b/Documentation/cgroups/resource_counter.txt
--- a/Documentation/cgroups/resource_counter.txt
+++ b/Documentation/cgroups/resource_counter.txt
@@ -76,24 +76,14 @@ to work with it.
 	limit_fail_at parameter is set to the particular res_counter element
 	where the charging failed.
 
- d. int res_counter_charge_locked
-			(struct res_counter *rc, unsigned long val, bool force)
-
-	The same as res_counter_charge(), but it must not acquire/release the
-	res_counter->lock internally (it must be called with res_counter->lock
-	held). The force parameter indicates whether we can bypass the limit.
-
- e. u64 res_counter_uncharge[_locked]
-			(struct res_counter *rc, unsigned long val)
+ d. u64 res_counter_uncharge(struct res_counter *rc, unsigned long val)
 
 	When a resource is released (freed) it should be de-accounted
 	from the resource counter it was accounted to.  This is called
 	"uncharging". The return value of this function indicate the amount
 	of charges still present in the counter.
 
-	The _locked routines imply that the res_counter->lock is taken.
-
- f. u64 res_counter_uncharge_until
+ e. u64 res_counter_uncharge_until
 		(struct res_counter *rc, struct res_counter *top,
 		 unsinged long val)

diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h
--- a/include/linux/res_counter.h
+++ b/include/linux/res_counter.h
@@ -104,15 +104,13 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent);
  * units, e.g. numbers, bytes, Kbytes, etc
  *
  * returns 0 on success and <0 if the counter->usage will exceed the
- * counter->limit _locked call expects the counter->lock to be taken
+ * counter->limit
  *
  * charge_nofail works the same, except that it charges the resource
  * counter unconditionally, and returns < 0 if the after the current
  * charge we are over limit.
  */
 
-int __must_check res_counter_charge_locked(struct res_counter *counter,
-		unsigned long val, bool force);
 int __must_check res_counter_charge(struct res_counter *counter,
 		unsigned long val, struct res_counter **limit_fail_at);
 int res_counter_charge_nofail(struct res_counter *counter,
@@ -125,12 +123,10 @@ int res_counter_charge_nofail(struct res_counter *counter,
  * @val: the amount of the resource
  *
  * these calls check for usage underflow and show a warning on the console
- * _locked call expects the counter->lock to be taken
  *
  * returns the total charges still present in @counter.
  */
 
-u64 res_counter_uncharge_locked(struct res_counter *counter, unsigned long val);
 u64 res_counter_uncharge(struct res_counter *counter, unsigned long val);
 
 u64 res_counter_uncharge_until(struct res_counter *counter,

diff --git a/kernel/res_counter.c b/kernel/res_counter.c
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -22,8 +22,18 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent)
 	counter->parent = parent;
 }
 
-int res_counter_charge_locked(struct res_counter *counter, unsigned long val,
-			      bool force)
+static u64 res_counter_uncharge_locked(struct res_counter *counter,
+				       unsigned long val)
+{
+	if (WARN_ON(counter->usage < val))
+		val = counter->usage;
+
+	counter->usage -= val;
+	return counter->usage;
+}
+
+static int res_counter_charge_locked(struct res_counter *counter,
+				     unsigned long val, bool force)
 {
 	int ret = 0;
 
@@ -86,15 +96,6 @@ int res_counter_charge_nofail(struct res_counter *counter, unsigned long val,
 	return __res_counter_charge(counter, val, limit_fail_at, true);
 }
 
-u64 res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
-{
-	if (WARN_ON(counter->usage < val))
-		val = counter->usage;
-
-	counter->usage -= val;
-	return counter->usage;
-}
-
 u64 res_counter_uncharge_until(struct res_counter *counter,
 			       struct res_counter *top,
 			       unsigned long val)
* [patch 6/8] res_counter: add interface for maximum nofail charge
  2013-12-04  5:19 ` [patch 1/8] fork: collapse copy_flags into copy_process David Rientjes
  ` (3 preceding siblings ...)
  2013-12-04  5:20 ` [patch 5/8] res_counter: remove interface for locked charging and uncharging David Rientjes
@ 2013-12-04  5:20 ` David Rientjes
  2013-12-04  5:20 ` [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves David Rientjes
  2013-12-04  5:20 ` [patch 8/8] mm, memcg: add memcg oom reserve documentation David Rientjes
  6 siblings, 0 replies; 39+ messages in thread
From: David Rientjes @ 2013-12-04 5:20 UTC
To: Andrew Morton
Cc: Michal Hocko, KAMEZAWA Hiroyuki, Johannes Weiner, Mel Gorman,
    Rik van Riel, Pekka Enberg, Christoph Lameter, linux-kernel,
    linux-mm, cgroups

For memcg oom reserves, we'll need a resource counter interface that,
like res_counter_charge_nofail(), will not fail when exceeding the memcg
limit, but that may only exceed it up to a ceiling.

This patch adds res_counter_charge_nofail_max(), which will exceed the
resource counter limit, but only up to a maximum defined value.  If it
fails to charge the resource, it returns -ENOMEM.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 include/linux/res_counter.h | 10 +++++++++-
 kernel/res_counter.c        | 27 +++++++++++++++++++++------
 2 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h
--- a/include/linux/res_counter.h
+++ b/include/linux/res_counter.h
@@ -107,14 +107,22 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent);
  * counter->limit
  *
  * charge_nofail works the same, except that it charges the resource
- * counter unconditionally, and returns < 0 if the after the current
+ * counter unconditionally, and returns < 0 if after the current
  * charge we are over limit.
+ *
+ * charge_nofail_max is the same as charge_nofail, except that the
+ * resource counter usage can only exceed the limit by the max
+ * difference.  Unlike charge_nofail, charge_nofail_max returns < 0
+ * only if the current charge fails because of the max difference.
  */
 
 int __must_check res_counter_charge(struct res_counter *counter,
 		unsigned long val, struct res_counter **limit_fail_at);
 int res_counter_charge_nofail(struct res_counter *counter,
 		unsigned long val, struct res_counter **limit_fail_at);
+int res_counter_charge_nofail_max(struct res_counter *counter,
+		unsigned long val, struct res_counter **limit_fail_at,
+		unsigned long max);
 
 /*
  * uncharge - tell that some portion of the resource is released

diff --git a/kernel/res_counter.c b/kernel/res_counter.c
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -33,15 +33,19 @@ static u64 res_counter_uncharge_locked(struct res_counter *counter,
 }
 
 static int res_counter_charge_locked(struct res_counter *counter,
-				     unsigned long val, bool force)
+				     unsigned long val, bool force,
+				     unsigned long max)
 {
 	int ret = 0;
 
 	if (counter->usage + val > counter->limit) {
 		counter->failcnt++;
-		ret = -ENOMEM;
+		if (max == ULONG_MAX)
+			ret = -ENOMEM;
 		if (!force)
 			return ret;
+		if (counter->usage + val - counter->limit > max)
+			return -ENOMEM;
 	}
 
 	counter->usage += val;
@@ -51,7 +55,8 @@ static int res_counter_charge_locked(struct res_counter *counter,
 }
 
 static int __res_counter_charge(struct res_counter *counter, unsigned long val,
-				struct res_counter **limit_fail_at, bool force)
+				struct res_counter **limit_fail_at, bool force,
+				unsigned long max)
 {
 	int ret, r;
 	unsigned long flags;
@@ -62,7 +67,7 @@ static int __res_counter_charge(struct res_counter *counter, unsigned long val,
 	local_irq_save(flags);
 	for (c = counter; c != NULL; c = c->parent) {
 		spin_lock(&c->lock);
-		r = res_counter_charge_locked(c, val, force);
+		r = res_counter_charge_locked(c, val, force, max);
 		spin_unlock(&c->lock);
 		if (r < 0 && !ret) {
 			ret = r;
@@ -87,13 +92,23 @@ static int __res_counter_charge(struct res_counter *counter, unsigned long val,
 int res_counter_charge(struct res_counter *counter, unsigned long val,
 			struct res_counter **limit_fail_at)
 {
-	return __res_counter_charge(counter, val, limit_fail_at, false);
+	return __res_counter_charge(counter, val, limit_fail_at, false,
+				    ULONG_MAX);
 }
 
 int res_counter_charge_nofail(struct res_counter *counter, unsigned long val,
 			      struct res_counter **limit_fail_at)
 {
-	return __res_counter_charge(counter, val, limit_fail_at, true);
+	return __res_counter_charge(counter, val, limit_fail_at, true,
+				    ULONG_MAX);
+}
+
+int res_counter_charge_nofail_max(struct res_counter *counter,
+				  unsigned long val,
+				  struct res_counter **limit_fail_at,
+				  unsigned long max)
+{
+	return __res_counter_charge(counter, val, limit_fail_at, true, max);
 }
 
 u64 res_counter_uncharge_until(struct res_counter *counter,
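The semantics of the new charge mode can be modeled in userspace.  A
sketch of the rule res_counter_charge_locked() implements after this
patch, with simplified field types and made-up values:

#include <stdio.h>
#include <limits.h>

struct counter { unsigned long usage, limit, failcnt; };

/* Forced charges may exceed the limit, but only by at most "max". */
static int charge(struct counter *c, unsigned long val, int force,
		  unsigned long max)
{
	int ret = 0;

	if (c->usage + val > c->limit) {
		c->failcnt++;
		if (max == ULONG_MAX)
			ret = -1;		/* -ENOMEM */
		if (!force)
			return ret;
		if (c->usage + val - c->limit > max)
			return -1;		/* reserve exhausted */
	}
	c->usage += val;
	return ret;
}

int main(void)
{
	struct counter c = { .usage = 100, .limit = 128 };

	/* nofail_max with a reserve of 32: the first charge fits within
	 * the reserve, the second would exceed it and fails. */
	printf("charge 40: %d (usage %lu)\n", charge(&c, 40, 1, 32), c.usage);
	printf("charge 40: %d (usage %lu)\n", charge(&c, 40, 1, 32), c.usage);
	return 0;
}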
* [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves
  2013-12-04  5:19 ` [patch 1/8] fork: collapse copy_flags into copy_process David Rientjes
  ` (4 preceding siblings ...)
  2013-12-04  5:20 ` [patch 6/8] res_counter: add interface for maximum nofail charge David Rientjes
@ 2013-12-04  5:20 ` David Rientjes
  2013-12-04  5:45   ` Johannes Weiner
  2013-12-04  5:20 ` [patch 8/8] mm, memcg: add memcg oom reserve documentation David Rientjes
  6 siblings, 1 reply; 39+ messages in thread
From: David Rientjes @ 2013-12-04 5:20 UTC
To: Andrew Morton
Cc: Michal Hocko, KAMEZAWA Hiroyuki, Johannes Weiner, Mel Gorman,
    Rik van Riel, Pekka Enberg, Christoph Lameter, linux-kernel,
    linux-mm, cgroups

Now that a per-process flag is available, define it for processes that
handle userspace oom notifications.  This is an optimization to avoid
maintaining a list of such processes attached to a memcg at any given
time and iterating it at charge time.

This flag gets set whenever a process has registered for an oom
notification and is cleared whenever it unregisters.

When memcg reclaim has failed to free any memory, it is necessary for
userspace oom handlers to be able to dip into reserves to pagefault
text, allocate kernel memory to read the "tasks" file, allocate heap,
etc.

System oom conditions are not addressed at this time, but the same
per-process flag can be used in the page allocator to determine whether
userspace oom handlers should be given access to per-zone memory
reserves at a later time, once there is consensus.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 include/linux/sched.h |  1 +
 mm/memcontrol.c       | 47 ++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1695,6 +1695,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut,
 #define PF_SPREAD_SLAB	0x02000000	/* Spread some slab caches over cpuset */
 #define PF_NO_SETAFFINITY 0x04000000	/* Userland is not allowed to meddle with cpus_allowed */
 #define PF_MCE_EARLY	0x08000000	/* Early kill for mce process policy */
+#define PF_OOM_HANDLER	0x10000000	/* Userspace process handling oom conditions */
 #define PF_MUTEX_TESTER	0x20000000	/* Thread belongs to the rt mutex tester */
 #define PF_FREEZER_SKIP	0x40000000	/* Freezer should not count it as freezable */
 #define PF_SUSPEND_TASK 0x80000000	/* this thread called freeze_processes and should not be frozen */

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2590,6 +2590,33 @@ enum {
 	CHARGE_WOULDBLOCK,	/* GFP_WAIT wasn't set and no enough res. */
 };
 
+/*
+ * Processes handling oom conditions are allowed to utilize memory reserves so
+ * that they may handle the condition.
+ */
+static int mem_cgroup_oom_handler_charge(struct mem_cgroup *memcg,
+					 unsigned long csize,
+					 struct mem_cgroup **mem_over_limit)
+{
+	struct res_counter *fail_res;
+	int ret;
+
+	ret = res_counter_charge_nofail_max(&memcg->res, csize, &fail_res,
+					    memcg->oom_reserve);
+	if (!ret && do_swap_account) {
+		ret = res_counter_charge_nofail_max(&memcg->memsw, csize,
+						    &fail_res,
+						    memcg->oom_reserve);
+		if (ret) {
+			res_counter_uncharge(&memcg->res, csize);
+			*mem_over_limit = mem_cgroup_from_res_counter(fail_res,
+								      memsw);
+		}
+	}
+	return !ret ? CHARGE_OK : CHARGE_NOMEM;
+}
+
 static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 				unsigned int nr_pages, unsigned int min_pages,
 				bool invoke_oom)
@@ -2649,6 +2676,13 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	if (mem_cgroup_wait_acct_move(mem_over_limit))
 		return CHARGE_RETRY;
 
+	if (current->flags & PF_OOM_HANDLER) {
+		ret = mem_cgroup_oom_handler_charge(memcg, csize,
+						    &mem_over_limit);
+		if (ret == CHARGE_OK)
+			return CHARGE_OK;
+	}
+
 	if (invoke_oom)
 		mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize));
 
@@ -2696,7 +2730,8 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 		     || fatal_signal_pending(current)))
 		goto bypass;
 
-	if (unlikely(task_in_memcg_oom(current)))
+	if (unlikely(task_in_memcg_oom(current)) &&
+	    !(current->flags & PF_OOM_HANDLER))
 		goto bypass;
 
 	/*
@@ -5825,6 +5860,11 @@ static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css,
 	if (!event)
 		return -ENOMEM;
 
+	/*
+	 * Setting PF_OOM_HANDLER before taking memcg_oom_lock ensures it is
+	 * set before getting added to memcg->oom_notify.
+	 */
+	current->flags |= PF_OOM_HANDLER;
 	spin_lock(&memcg_oom_lock);
 
 	event->eventfd = eventfd;
@@ -5856,6 +5896,11 @@ static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css,
 		}
 	}
 
+	/*
+	 * Clearing PF_OOM_HANDLER before dropping memcg_oom_lock ensures it is
+	 * cleared before receiving another notification.
+	 */
+	current->flags &= ~PF_OOM_HANDLER;
 	spin_unlock(&memcg_oom_lock);
 }
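For reference, a process becomes an oom handler by registering for oom
notifications through the existing cgroup v1 eventfd interface; with
this patch applied, registration is also what sets PF_OOM_HANDLER.  A
minimal sketch, assuming a memcg mounted at /sys/fs/cgroup/memory/mygroup
(the path is hypothetical):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
	uint64_t count;
	char buf[32];
	int efd = eventfd(0, 0);
	int ofd = open("/sys/fs/cgroup/memory/mygroup/memory.oom_control",
		       O_RDONLY);
	int cfd = open("/sys/fs/cgroup/memory/mygroup/cgroup.event_control",
		       O_WRONLY);

	if (efd < 0 || ofd < 0 || cfd < 0) {
		perror("setup");
		return 1;
	}

	/* Writing "<eventfd> <oom_control fd>" registers for oom
	 * notifications; with this series, the kernel would set
	 * PF_OOM_HANDLER on this task here. */
	snprintf(buf, sizeof(buf), "%d %d", efd, ofd);
	if (write(cfd, buf, strlen(buf)) < 0) {
		perror("register");
		return 1;
	}

	while (read(efd, &count, sizeof(count)) == sizeof(count))
		printf("oom event in mygroup, handling it\n");
	return 0;
}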
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves
  2013-12-04  5:20 ` [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves David Rientjes
@ 2013-12-04  5:45 ` Johannes Weiner
  2013-12-05  1:49   ` David Rientjes
  0 siblings, 1 reply; 39+ messages in thread
From: Johannes Weiner @ 2013-12-04 5:45 UTC
To: David Rientjes
Cc: Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki, Mel Gorman,
    Rik van Riel, Pekka Enberg, Christoph Lameter, Tejun Heo, Li Zefan,
    linux-kernel, linux-mm, cgroups

On Tue, Dec 03, 2013 at 09:20:17PM -0800, David Rientjes wrote:

> Now that a per-process flag is available, define it for processes that
> handle userspace oom notifications.  This is an optimization to avoid
> maintaining a list of such processes attached to a memcg at any given
> time and iterating it at charge time.
>
> This flag gets set whenever a process has registered for an oom
> notification and is cleared whenever it unregisters.
>
> When memcg reclaim has failed to free any memory, it is necessary for
> userspace oom handlers to be able to dip into reserves to pagefault
> text, allocate kernel memory to read the "tasks" file, allocate heap,
> etc.

The task handling the OOM of a memcg can obviously not be part of that
same memcg.

I've said this many times in the past, but here is the most recent
thread from Tejun, me, and Li on this topic:

---
On Tue, 3 Dec 2013 at 15:35:48 +0800, Li Zefan wrote:
> On Mon, 2 Dec 2013 at 11:44:06 -0500, Johannes Weiner wrote:
> > On Fri, Nov 29, 2013 at 03:05:25PM -0500, Tejun Heo wrote:
> > > Whoa, so we support oom handler inside the memcg that it handles?
> > > Does that work reliably?  Changing the above detail in this patch
> > > isn't difficult (and we'll later need to update kernfs too) but
> > > supporting such setup properly would be a *lot* of commitment and I'm
> > > very doubtful we'd be able to achieve that by just carefully avoiding
> > > memory allocation in the operations that userland oom handler uses -
> > > that set is destined to expand over time, extremely fragile and will
> > > be hellish to maintain.
> > >
> > > So, I'm not at all excited about committing to this guarantee.  This
> > > one is an easy one but it looks like the first step onto dizzying
> > > slippery slope.
> > >
> > > Am I misunderstanding something here?  Are you and Johannes firm on
> > > supporting this?
> >
> > Handling a memcg OOM from userspace running inside that OOM memcg is
> > completely crazy.  I mean, think about this for just two seconds...
> > Really?
> >
> > I get that people are doing it right now, and if you can get away with
> > it for now, good for you.  But you have to be aware how crazy this is
> > and if it breaks you get to keep the pieces and we are not going to
> > accommodate this in the kernel.  Fix your crazy userspace.
>
> +1
---
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves
  2013-12-04  5:45 ` Johannes Weiner
@ 2013-12-05  1:49 ` David Rientjes
  2013-12-05  2:50   ` Tejun Heo
  0 siblings, 1 reply; 39+ messages in thread
From: David Rientjes @ 2013-12-05 1:49 UTC
To: Johannes Weiner
Cc: Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki, Mel Gorman,
    Rik van Riel, Pekka Enberg, Christoph Lameter, Tejun Heo, Li Zefan,
    linux-kernel, linux-mm, cgroups

On Wed, 4 Dec 2013, Johannes Weiner wrote:

> > Now that a per-process flag is available, define it for processes that
> > handle userspace oom notifications.  This is an optimization to avoid
> > maintaining a list of such processes attached to a memcg at any given
> > time and iterating it at charge time.
> >
> > This flag gets set whenever a process has registered for an oom
> > notification and is cleared whenever it unregisters.
> >
> > When memcg reclaim has failed to free any memory, it is necessary for
> > userspace oom handlers to be able to dip into reserves to pagefault
> > text, allocate kernel memory to read the "tasks" file, allocate heap,
> > etc.
>
> The task handling the OOM of a memcg can obviously not be part of that
> same memcg.
>

Not without memory.oom_reserve_in_bytes that this series adds, that's
true.  Michal expressed interest in the idea of memcg oom reserves in
the past, so I thought I'd share the series.

> On Tue, 3 Dec 2013 at 15:35:48 +0800, Li Zefan wrote:
> > On Mon, 2 Dec 2013 at 11:44:06 -0500, Johannes Weiner wrote:
> > > On Fri, Nov 29, 2013 at 03:05:25PM -0500, Tejun Heo wrote:
> > > > Whoa, so we support oom handler inside the memcg that it handles?
> > > > Does that work reliably?  Changing the above detail in this patch
> > > > isn't difficult (and we'll later need to update kernfs too) but
> > > > supporting such setup properly would be a *lot* of commitment and I'm
> > > > very doubtful we'd be able to achieve that by just carefully avoiding
> > > > memory allocation in the operations that userland oom handler uses -
> > > > that set is destined to expand over time, extremely fragile and will
> > > > be hellish to maintain.
> > > >

It works reliably with this patch series, yes.  I'm not sure what change
this is referring to that would avoid memory allocation for userspace
oom handlers, and I'd agree that it would be difficult to maintain a
no-allocation policy for a subset of processes that are destined to
handle oom conditions.

That's not what this series is addressing, though, and in fact it's
quite the opposite.  It acknowledges that userspace oom handlers need to
allocate and that anything else would be too difficult to maintain
(thereby agreeing with the above), so we must set aside memory that they
are exclusively allowed to access.  For the vast majority of users who
will not use userspace oom handlers, they can just use the default value
of memory.oom_reserve_in_bytes == 0 and they incur absolutely no
side-effects as a result of this series.

For those who do use userspace oom handlers, like Google, this allows us
to set aside memory to allow the userspace oom handlers to kill a
process, dump the heap, send a signal, drop caches, etc. when waking up.

> > > > So, I'm not at all excited about committing to this guarantee.  This
> > > > one is an easy one but it looks like the first step onto dizzying
> > > > slippery slope.
> > > >
> > > > Am I misunderstanding something here?  Are you and Johannes firm on
> > > > supporting this?
> > >
> > > Handling a memcg OOM from userspace running inside that OOM memcg is
> > > completely crazy.  I mean, think about this for just two seconds...
> > > Really?
> > >
> > > I get that people are doing it right now, and if you can get away with
> > > it for now, good for you.  But you have to be aware how crazy this is
> > > and if it breaks you get to keep the pieces and we are not going to
> > > accommodate this in the kernel.  Fix your crazy userspace.

The rest of this email communicates only one thing: someone thinks it's
crazy.  And I agree it would be crazy if we don't allow that class of
process to have access to a pre-defined amount of memory to handle the
situation, which this series adds.
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves
  2013-12-05  1:49 ` David Rientjes
@ 2013-12-05  2:50 ` Tejun Heo
  [not found]   ` <20131205025026.GA26777-Gd/HAXX7CRxy/B6EtB590w@public.gmane.org>
  0 siblings, 1 reply; 39+ messages in thread
From: Tejun Heo @ 2013-12-05 2:50 UTC
To: David Rientjes
Cc: Johannes Weiner, Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki,
    Mel Gorman, Rik van Riel, Pekka Enberg, Christoph Lameter,
    Li Zefan, linux-kernel, linux-mm, cgroups

Hello,

On Wed, Dec 04, 2013 at 05:49:04PM -0800, David Rientjes wrote:

> That's not what this series is addressing, though, and in fact it's
> quite the opposite.  It acknowledges that userspace oom handlers need
> to allocate and that anything else would be too difficult to maintain
> (thereby agreeing with the above), so we must set aside memory that
> they are exclusively allowed to access.  For the vast majority of users
> who will not use userspace oom handlers, they can just use the default
> value of memory.oom_reserve_in_bytes == 0 and they incur absolutely no
> side-effects as a result of this series.

Umm.. without delving into details, aren't you basically creating a
memory cgroup inside a memory cgroup?  Doesn't sound like a
particularly well thought-out plan to me.

> For those who do use userspace oom handlers, like Google, this allows
> us to set aside memory to allow the userspace oom handlers to kill a
> process, dump the heap, send a signal, drop caches, etc. when waking
> up.

Seems kinda obvious.  Put it in a separate cgroup?  You're basically
saying it doesn't want to be under the same memory limit as the
processes that it's looking over.  That's like the definition of being
in a different cgroup.

Thanks.

-- 
tejun
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves
  [not found] ` <20131205025026.GA26777-Gd/HAXX7CRxy/B6EtB590w@public.gmane.org>
@ 2013-12-05 23:49 ` David Rientjes
  [not found]   ` <alpine.DEB.2.02.1312051537550.7717-X6Q0R45D7oAcqpCFd4KODRPsWskHk0ljAL8bYrjMMd8@public.gmane.org>
  0 siblings, 1 reply; 39+ messages in thread
From: David Rientjes @ 2013-12-05 23:49 UTC
To: Tejun Heo
Cc: Johannes Weiner, Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki,
    Mel Gorman, Rik van Riel, Pekka Enberg, Christoph Lameter,
    Li Zefan, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
    linux-mm-Bw31MaZKKs3YtjvyW6yDsg, cgroups-u79uwXL29TY76Z2rM5mHXA

On Wed, 4 Dec 2013, Tejun Heo wrote:

> Hello,
>

Tejun, how are you?

> Umm.. without delving into details, aren't you basically creating a
> memory cgroup inside a memory cgroup?  Doesn't sound like a
> particularly well thought-out plan to me.
>

I agree that we wouldn't need such support if we are only addressing
memcg oom conditions.  We could do things like
A/memory.limit_in_bytes == 128M and A/b/memory.limit_in_bytes == 126M
and then attach the process waiting on A/b/memory.oom_control to A, and
that would work perfectly.

However, we also need to discuss system oom handling.  We have an
interest in being able to allow userspace to handle system oom
conditions since the policy will differ depending on machine and we
can't encode every possible mechanism into the kernel.  For example, on
system oom we want to kill a process from the lowest priority top-level
memcg.  We lack that ability entirely in the kernel, and since the sum
of our top-level memcgs' memory.limit_in_bytes exceeds the amount of
present RAM, we run into these oom conditions a _lot_.

So the first step, in my opinion, is to add a system oom notification on
the root memcg's memory.oom_control, which currently allows registering
an eventfd() notification but never actually triggers.  I did that in a
patch and it was merged into -mm but was pulled out for later
discussion.

Then, we need to ensure that the userspace that is registered to handle
such events can actually run, and that is difficult to do when the
system is oom.  The proposal is to allow such processes, now marked as
PF_OOM_HANDLER, to be able to access pre-defined per-zone memory
reserves in the page allocator.  The only special handling for
PF_OOM_HANDLER in the page allocator itself would be under such oom
conditions (memcg oom conditions have no problem allocating the memory,
only charging it).  The amount of reserves would be defined as
memory.oom_reserve_in_bytes from within the root memcg as defined by
this patch, i.e. allow this amount of memory to be allocated in the page
allocator for PF_OOM_HANDLER below the per-zone min watermarks.

This, I believe, is the cleanest interface for users who choose to use a
non-default policy by setting memory.oom_reserve_in_bytes, and it
constrains all of the code to memcg, which you have to configure for
such support.

The system oom condition is not addressed in this patch series, although
the PF_OOM_HANDLER bit can be used for that purpose.  I didn't post that
patch because the notification on the root memcg's memory.oom_control in
such conditions is currently being debated, so we need to solve that
issue first.

Your opinions and suggestions are more than helpful, thanks.
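The page-allocator side David describes is not part of this series; as a
sketch of the proposed behavior, a PF_OOM_HANDLER task would be allowed
below the min watermark by up to the configured reserve.  A userspace
model of that check, where all names and numbers are hypothetical:

#include <stdio.h>

/* Toy watermark check: ordinary tasks must stay above the min
 * watermark; an oom-handler task may dip below it by "reserve". */
static int allocation_allowed(unsigned long free_pages,
			      unsigned long min_wmark,
			      int oom_handler, unsigned long reserve)
{
	unsigned long floor = min_wmark;

	if (oom_handler)
		floor = (min_wmark > reserve) ? min_wmark - reserve : 0;
	return free_pages > floor;
}

int main(void)
{
	unsigned long min_wmark = 1000, reserve = 256, free_pages = 900;

	printf("ordinary task:  %s\n",
	       allocation_allowed(free_pages, min_wmark, 0, reserve) ?
	       "allocates" : "fails (below min watermark)");
	printf("PF_OOM_HANDLER: %s\n",
	       allocation_allowed(free_pages, min_wmark, 1, reserve) ?
	       "allocates" : "fails");
	return 0;
}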
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves
  [not found] ` <alpine.DEB.2.02.1312051537550.7717-X6Q0R45D7oAcqpCFd4KODRPsWskHk0ljAL8bYrjMMd8@public.gmane.org>
@ 2013-12-06 17:34 ` Johannes Weiner
  2013-12-07 16:38   ` Tim Hockin
  2013-12-06 19:01 ` Tejun Heo
  1 sibling, 1 reply; 39+ messages in thread
From: Johannes Weiner @ 2013-12-06 17:34 UTC
To: David Rientjes
Cc: Tejun Heo, Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki,
    Mel Gorman, Rik van Riel, Pekka Enberg, Christoph Lameter,
    Li Zefan, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
    linux-mm-Bw31MaZKKs3YtjvyW6yDsg, cgroups-u79uwXL29TY76Z2rM5mHXA

On Thu, Dec 05, 2013 at 03:49:57PM -0800, David Rientjes wrote:

> On Wed, 4 Dec 2013, Tejun Heo wrote:
>
> > Hello,
> >
>
> Tejun, how are you?
>
> > Umm.. without delving into details, aren't you basically creating a
> > memory cgroup inside a memory cgroup?  Doesn't sound like a
> > particularly well thought-out plan to me.
> >
>
> I agree that we wouldn't need such support if we are only addressing
> memcg oom conditions.  We could do things like
> A/memory.limit_in_bytes == 128M and A/b/memory.limit_in_bytes == 126M
> and then attach the process waiting on A/b/memory.oom_control to A, and
> that would work perfectly.
>
> However, we also need to discuss system oom handling.  We have an
> interest in being able to allow userspace to handle system oom
> conditions since the policy will differ depending on machine and we
> can't encode every possible mechanism into the kernel.  For example, on
> system oom we want to kill a process from the lowest priority top-level
> memcg.  We lack that ability entirely in the kernel, and since the sum
> of our top-level memcgs' memory.limit_in_bytes exceeds the amount of
> present RAM, we run into these oom conditions a _lot_.

A simple and natural solution to this is to have the global OOM killer
respect cgroups.  You go through all the effort of carefully grouping
tasks into bigger entities that you then arrange hierarchically.  The
global OOM killer should not just treat all tasks as equal peers.

We can add a per-cgroup OOM priority knob and have the global OOM
handler pick victim tasks from the one or more groups that have the
lowest priority.

Out of the box, every cgroup has the same priority, which means we can
add this feature without changing the default behavior.

> So the first step, in my opinion, is to add a system oom notification
> on the root memcg's memory.oom_control, which currently allows
> registering an eventfd() notification but never actually triggers.  I
> did that in a patch and it was merged into -mm but was pulled out for
> later discussion.
>
> Then, we need to ensure that the userspace that is registered to handle
> such events can actually run, and that is difficult to do when the
> system is oom.  The proposal is to allow such processes, now marked as
> PF_OOM_HANDLER, to be able to access pre-defined per-zone memory
> reserves in the page allocator.  The only special handling for
> PF_OOM_HANDLER in the page allocator itself would be under such oom
> conditions (memcg oom conditions have no problem allocating the memory,
> only charging it).  The amount of reserves would be defined as
> memory.oom_reserve_in_bytes from within the root memcg as defined by
> this patch, i.e. allow this amount of memory to be allocated in the
> page allocator for PF_OOM_HANDLER below the per-zone min watermarks.
>
> This, I believe, is the cleanest interface for users who choose to use
> a non-default policy by setting memory.oom_reserve_in_bytes, and it
> constrains all of the code to memcg, which you have to configure for
> such support.
>
> The system oom condition is not addressed in this patch series,
> although the PF_OOM_HANDLER bit can be used for that purpose.  I didn't
> post that patch because the notification on the root memcg's
> memory.oom_control in such conditions is currently being debated, so we
> need to solve that issue first.
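Johannes' proposal, sketched as a userspace model: pick the victim group
by lowest priority, breaking ties by usage.  The knob, the struct, and
the values below are hypothetical; no such interface exists yet:

#include <stdio.h>

struct group {
	const char *name;
	int oom_priority;	/* hypothetical knob: lower = killed first */
	unsigned long usage_mb;
};

int main(void)
{
	struct group groups[] = {
		{ "batch",   1, 3072 },
		{ "serving", 5, 4096 },
		{ "system",  9,  512 },
	};
	struct group *victim = &groups[0];
	int i;

	/* Victims come from the lowest-priority top-level group; within
	 * a priority level, prefer the biggest memory consumer. */
	for (i = 1; i < 3; i++) {
		if (groups[i].oom_priority < victim->oom_priority ||
		    (groups[i].oom_priority == victim->oom_priority &&
		     groups[i].usage_mb > victim->usage_mb))
			victim = &groups[i];
	}
	printf("oom: selecting victim from /%s\n", victim->name);
	return 0;
}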
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves 2013-12-06 17:34 ` Johannes Weiner @ 2013-12-07 16:38 ` Tim Hockin 2013-12-07 17:40 ` Johannes Weiner 0 siblings, 1 reply; 39+ messages in thread From: Tim Hockin @ 2013-12-07 16:38 UTC (permalink / raw) To: Johannes Weiner Cc: Michal Hocko, Li Zefan, KAMEZAWA Hiroyuki, Tejun Heo, Christoph Lameter, David Rientjes, linux-mm, Rik van Riel, Pekka Enberg, cgroups, Mel Gorman, Andrew Morton, linux-kernel We actually started with kernel patches along these lines - per-memcg scores and all of our crazy policy requirements. It turns out that changing policies is hard. When David offered the opportunity to manage it all in user space it sounded like a great idea. If this can be made to work as a high prio daemon with access to reserves, we would like it. Tim On Dec 6, 2013 9:36 AM, "Johannes Weiner" <hannes@cmpxchg.org> wrote: > On Thu, Dec 05, 2013 at 03:49:57PM -0800, David Rientjes wrote: > > On Wed, 4 Dec 2013, Tejun Heo wrote: > > > > > Hello, > > > > > > > Tejun, how are you? > > > > > Umm.. without delving into details, aren't you basically creating a > > > memory cgroup inside a memory cgroup? Doesn't sound like a > > > particularly well thought-out plan to me. > > > > > > > I agree that we wouldn't need such support if we are only addressing > memcg > > oom conditions. We could do things like A/memory.limit_in_bytes == 128M > > and A/b/memory.limit_in_bytes == 126MB and then attach the process > waiting > > on A/b/memory.oom_control to A and that would work perfectly. > > > > However, we also need to discuss system oom handling. We have an > interest > > in being able to allow userspace to handle system oom conditions since > the > > policy will differ depending on machine and we can't encode every > possible > > mechanism into the kernel. For example, on system oom we want to kill a > > process from the lowest priority top-level memcg. We lack that ability > > entirely in the kernel and since the sum of our top-level memcgs' > > memory.limit_in_bytes exceeds the amount of present RAM, we run into > these > > oom conditions a _lot_. > > A simple and natural solution to this is to have the global OOM killer > respect cgroups. You go through all the effort of carefully grouping > tasks into bigger entities that you then arrange hierarchically. The > global OOM killer should not just treat all tasks as equal peers. > > We can add a per-cgroup OOM priority knob and have the global OOM > handler pick victim tasks from the one or more groups that have the > lowest priority. > > Out of the box, every cgroup has the same priority, which means we can > add this feature without changing the default behavior. > > > So the first step, in my opinion, is to add a system oom notification on > > the root memcg's memory.oom_control which currently allows registering an > > eventfd() notification but never actually triggers. I did that in a > patch > > and it was merged into -mm but was pulled out for later discussion. > > > > Then, we need to ensure that the userspace that is registered to handle > > such events can actually allocate memory, and that is difficult to do when the system is oom. The > > proposal is to allow such processes, now marked as PF_OOM_HANDLER, to be > > able to access pre-defined per-zone memory reserves in the page > allocator.
> > The only special handling for PF_OOM_HANDLER in the page allocator itself > > would be under such oom conditions (memcg oom conditions have no problem > > allocating the memory, only charging it). The amount of reserves would > be > > defined as memory.oom_reserve_in_bytes from within the root memcg as > > defined by this patch, i.e. allow this amount of memory to be allocated > in > > the page allocator for PF_OOM_HANDLER below the per-zone min watermarks. > > > > This, I believe, is the cleanest interface for users who choose to use a > > non-default policy by setting memory.oom_reserve_in_bytes and constrains > > all of the code to memcg which you have to configure for such support. > > > > The system oom condition is not addressed in this patch series, although > > the PF_OOM_HANDLER bit can be used for that purpose. I didn't post that > > patch because the notification on the root memcg's memory.oom_control in > > such conditions is currently being debated, so we need to solve that > issue > > first. ^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves 2013-12-07 16:38 ` Tim Hockin @ 2013-12-07 17:40 ` Johannes Weiner 2013-12-07 18:12 ` Tim Hockin 0 siblings, 1 reply; 39+ messages in thread From: Johannes Weiner @ 2013-12-07 17:40 UTC (permalink / raw) To: Tim Hockin Cc: Michal Hocko, Li Zefan, KAMEZAWA Hiroyuki, Tejun Heo, Christoph Lameter, David Rientjes, linux-mm, Rik van Riel, Pekka Enberg, cgroups, Mel Gorman, Andrew Morton, linux-kernel Hello Tim! On Sat, Dec 07, 2013 at 08:38:20AM -0800, Tim Hockin wrote: > We actually started with kernel patches along these lines - per-memcg > scores and all of our crazy policy requirements. > > It turns out that changing policies is hard. > > When David offered the opportunity to manage it all in user space it > sounded like a great idea. > > If this can be made to work as a high prio daemon with access to reserves, > we would like it. We cannot talk solutions if you won't describe the problem. It's understandable that you can't talk about internal details, but it's possible to describe a technical problem in a portable fashion such that people can understand and evaluate it without knowing your whole application. Companies do this all the time. "The way our blackbox works makes it really hard to hook it up to the Linux kernel" is not a very convincing technical argument to change the Linux kernel. Thanks! ^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves 2013-12-07 17:40 ` Johannes Weiner @ 2013-12-07 18:12 ` Tim Hockin [not found] ` <CAAAKZwvanMiz8QZVOU0-SUKYzqcaJAXn0HxYs5+=Zakmnbcfbg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org> 0 siblings, 1 reply; 39+ messages in thread From: Tim Hockin @ 2013-12-07 18:12 UTC (permalink / raw) To: Johannes Weiner Cc: Michal Hocko, KAMEZAWA Hiroyuki, Li Zefan, Tejun Heo, David Rientjes, Christoph Lameter, linux-mm, Rik van Riel, Pekka Enberg, Andrew Morton, Mel Gorman, cgroups, linux-kernel You more or less described the fundamental change - a score per memcg, with a recursive OOM killer which evaluates scores between siblings at the same level. It gets a bit complicated because we have need of wider scoring ranges than are provided by default and because we score PIDs against memcgs at a given scope. We also have some tiebreaker heuristic (age). We also have a handful of features that depend on OOM handling like the aforementioned automatically growing and changing the actual OOM score depending on usage in relation to various thresholds (e.g. we sold you X, and we allow you to go over X but if you do, your likelihood of death in case of system OOM goes up). Do you really want us to teach the kernel policies like this? It would be way easier to do and test in userspace. Tim ^ permalink raw reply [flat|nested] 39+ messages in thread
[parent not found: <CAAAKZwvanMiz8QZVOU0-SUKYzqcaJAXn0HxYs5+=Zakmnbcfbg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>]
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves [not found] ` <CAAAKZwvanMiz8QZVOU0-SUKYzqcaJAXn0HxYs5+=Zakmnbcfbg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org> @ 2013-12-07 19:06 ` Johannes Weiner 2013-12-07 21:04 ` Tim Hockin 0 siblings, 1 reply; 39+ messages in thread From: Johannes Weiner @ 2013-12-07 19:06 UTC (permalink / raw) To: Tim Hockin Cc: Michal Hocko, KAMEZAWA Hiroyuki, Li Zefan, Tejun Heo, David Rientjes, Christoph Lameter, linux-mm-Bw31MaZKKs3YtjvyW6yDsg, Rik van Riel, Pekka Enberg, Andrew Morton, Mel Gorman, cgroups-u79uwXL29TY76Z2rM5mHXA, linux-kernel-u79uwXL29TY76Z2rM5mHXA On Sat, Dec 07, 2013 at 10:12:19AM -0800, Tim Hockin wrote: > You more or less described the fundamental change - a score per memcg, with > a recursive OOM killer which evaluates scores between siblings at the same > level. > > It gets a bit complicated because we have need of wider scoring ranges than > are provided by default If so, I'm sure you can make a convincing case to widen the internal per-task score ranges. The per-memcg score ranges have not even been defined, so this is even easier. > and because we score PIDs against memcgs at a given scope. You are describing bits of a solution, not a problem. And I can't possibly infer a problem from this. > We also have some tiebreaker heuristic (age). Either periodically update the per-memcg score from userspace or implement this in the kernel. We have considered CPU usage history/runtime etc. in the past when picking an OOM victim task. But I'm again just speculating what your problem is, so this may or may not be a feasible solution. > We also have a handful of features that depend on OOM handling like the > aforementioned automatically growing and changing the actual OOM score > depending on usage in relation to various thresholds (e.g. we sold you X, > and we allow you to go over X but if you do, your likelihood of death in > case of system OOM goes up). You can trivially monitor threshold events from userspace with the existing infrastructure and accordingly update the per-memcg score. > Do you really want us to teach the kernel policies like this? It would be > way easier to do and test in userspace. Maybe. Providing fragments of your solution is not an efficient way to communicate the problem. And you have to sell the problem before anybody can be expected to even consider your proposal as one of the possible solutions. ^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves 2013-12-07 19:06 ` Johannes Weiner @ 2013-12-07 21:04 ` Tim Hockin 0 siblings, 0 replies; 39+ messages in thread From: Tim Hockin @ 2013-12-07 21:04 UTC (permalink / raw) To: Johannes Weiner Cc: Michal Hocko, Li Zefan, KAMEZAWA Hiroyuki, Tejun Heo, Christoph Lameter, David Rientjes, linux-mm, Rik van Riel, Pekka Enberg, cgroups, Mel Gorman, Andrew Morton, linux-kernel We have hierarchical "containers". Jobs exist in these containers. The containers can hold sub-containers. In case of system OOM we want to kill in strict priority order. From the root of the hierarchy, choose the lowest priority. This could be a task or a memcg. If a memcg, recurse. We CAN do it in kernel (in fact we do, and I argued for that, and David acquiesced). But doing it in kernel means changes are slow and risky. What we really have is a bunch of features that we offer to our users that need certain OOM-time behaviors and guarantees to be implemented. I don't expect that most of our changes are useful for anyone outside of Google, really. They come with a lot of environmental assumptions. This is why David finally convinced me it was easier to release changes, to fix bugs, and to update kernels if we do this in userspace. I apologize if I am not giving you what you want. I am typing on a phone at the moment. If this still doesn't help I can try from a computer later. Tim On Dec 7, 2013 11:07 AM, "Johannes Weiner" <hannes@cmpxchg.org> wrote: > On Sat, Dec 07, 2013 at 10:12:19AM -0800, Tim Hockin wrote: > > You more or less described the fundamental change - a score per memcg, > with > > a recursive OOM killer which evaluates scores between siblings at the > same > > level. > > > > It gets a bit complicated because we have need of wider scoring ranges > than > > are provided by default > > If so, I'm sure you can make a convincing case to widen the internal > per-task score ranges. The per-memcg score ranges have not even been > defined, so this is even easier. > > > and because we score PIDs against memcgs at a given scope. > > You are describing bits of a solution, not a problem. And I can't > possibly infer a problem from this. > > > We also have some tiebreaker heuristic (age). > > Either periodically update the per-memcg score from userspace or > implement this in the kernel. We have considered CPU usage > history/runtime etc. in the past when picking an OOM victim task. > > But I'm again just speculating what your problem is, so this may or > may not be a feasible solution. > > > We also have a handful of features that depend on OOM handling like the > > aforementioned automatically growing and changing the actual OOM score > > depending on usage in relation to various thresholds (e.g. we sold you > X, > > and we allow you to go over X but if you do, your likelihood of death in > > case of system OOM goes up). > > You can trivially monitor threshold events from userspace with the > existing infrastructure and accordingly update the per-memcg score. > > > Do you really want us to teach the kernel policies like this? It would > be > > way easier to do and test in userspace. > > Maybe. Providing fragments of your solution is not an efficient way > to communicate the problem. And you have to sell the problem before > anybody can be expected to even consider your proposal as one of the > possible solutions.
^ permalink raw reply [flat|nested] 39+ messages in thread
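Tim's kill policy above is concise enough to sketch. The following illustrates the recursion he describes and is not code from their kernel; the entity structure, its fields, and pick_victim() are all invented for this example:

    #include <stdbool.h>
    #include <stddef.h>

    /* An entity is either a task or a memcg; memcgs have children. */
    struct entity {
            int priority;              /* lower value = killed first */
            bool is_memcg;
            struct entity *children;   /* valid only when is_memcg */
            int nr_children;
    };

    /* From the root of the hierarchy, choose the lowest-priority child;
     * if it is a memcg, recurse into it until a task is found. */
    static struct entity *pick_victim(struct entity *group)
    {
            struct entity *victim = NULL;

            for (int i = 0; i < group->nr_children; i++) {
                    struct entity *e = &group->children[i];

                    if (!victim || e->priority < victim->priority)
                            victim = e;  /* ties broken by age in their setup */
            }
            return (victim && victim->is_memcg) ? pick_victim(victim) : victim;
    }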
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves [not found] ` <alpine.DEB.2.02.1312051537550.7717-X6Q0R45D7oAcqpCFd4KODRPsWskHk0ljAL8bYrjMMd8@public.gmane.org> 2013-12-06 17:34 ` Johannes Weiner @ 2013-12-06 19:01 ` Tejun Heo 2013-12-09 20:10 ` David Rientjes 1 sibling, 1 reply; 39+ messages in thread From: Tejun Heo @ 2013-12-06 19:01 UTC (permalink / raw) To: David Rientjes Cc: Johannes Weiner, Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki, Mel Gorman, Rik van Riel, Pekka Enberg, Christoph Lameter, Li Zefan, linux-kernel-u79uwXL29TY76Z2rM5mHXA, linux-mm-Bw31MaZKKs3YtjvyW6yDsg, cgroups-u79uwXL29TY76Z2rM5mHXA Yo, David. On Thu, Dec 05, 2013 at 03:49:57PM -0800, David Rientjes wrote: > Tejun, how are you? Doing pretty good. How's yourself? :) > > Umm.. without delving into details, aren't you basically creating a > > memory cgroup inside a memory cgroup? Doesn't sound like a > > particularly well thought-out plan to me. > > I agree that we wouldn't need such support if we are only addressing memcg > oom conditions. We could do things like A/memory.limit_in_bytes == 128M > and A/b/memory.limit_in_bytes == 126MB and then attach the process waiting > on A/b/memory.oom_control to A and that would work perfectly. Or even just create a separate parallel cgroup A/memory.limit_in_bytes == 126M A-oom/memory.limit_in_bytes = 2M and avoid the extra layer of nesting. > However, we also need to discuss system oom handling. We have an interest > in being able to allow userspace to handle system oom conditions since the > policy will differ depending on machine and we can't encode every possible > mechanism into the kernel. For example, on system oom we want to kill a > process from the lowest priority top-level memcg. We lack that ability > entirely in the kernel and since the sum of our top-level memcgs' > memory.limit_in_bytes exceeds the amount of present RAM, we run into these > oom conditions a _lot_. > > So the first step, in my opinion, is to add a system oom notification on > the root memcg's memory.oom_control which currently allows registering an > eventfd() notification but never actually triggers. I did that in a patch > and it was merged into -mm but was pulled out for later discussion. Hmmm... this seems to be a different topic. You're saying that it'd be beneficial to add userland oom handling at the system level and if that happens having per-memcg oom reserve would be consistent with the system-wide one, right? While I can see some merit in that argument, the whole thing is predicated on system level userland oom handling being justified && even then I'm not quite sure whether "consistent interface" is enough to have oom reserve in all memory cgroups. It feels a bit backwards because, here, the root memcg is the exception, not the other way around. Root is the only one which can't put oom handler in a separate cgroup, so it could make more sense to special case that rather than spreading the interface for global userland oom to everyone else. But, before that, system level userland OOM handling sounds scary to me. I thought about userland OOM handling for memcgs and it does make some sense. ie. there is a different action that userland oom handler can take which kernel oom handler can't - it can expand the limit of the offending cgroup, effectively using OOM handler as a sizing estimator. I'm not sure whether that in itself is a good idea but then again it might not be possible to clearly separate out sizing from oom conditions.
Anyways, but for system level OOM handling, there's no other action userland handler can take. It's not like the OOM handler paging the admin to install more memory is a reasonable mode of operation to support. The *only* action userland OOM handler can take is killing something. Now, if that's the case and we have kernel OOM handler anyway, I think the best course of action is improving kernel OOM handler and teach it to make the decisions that the userland handler would consider good. That should be doable, right? The thing is OOM handling in userland is an inherently fragile thing and it can *never* replace kernel OOM handling. You may reserve any amount of memory you want but there would still be cases that it may fail. It's not like we have owner-based allocation all through the kernel or are willing to pay overhead for such thing. Even if that part can be guaranteed somehow (no idea how), the kernel still can NEVER trust the userland OOM handler. No matter what we do, we need a kernel OOM handler with no resource dependency. So, there isn't anything userland OOM handler can inherently do better and we can't do away with kernel handler no matter what. On both accounts, it seems like the best course of action is making the system-wide kernel OOM handler make better decisions if possible at all. If that's impossible, let's first think about why that's the case before hastily opening this new can of worms. Thanks! -- tejun ^ permalink raw reply [flat|nested] 39+ messages in thread
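To make the parallel-cgroup layout Tejun suggested above concrete: under cgroup v1 it is pure userspace configuration. A sketch follows, where the mount point, group names, byte values, and the handler pid are all illustrative and error handling is omitted:

    #include <stdio.h>
    #include <sys/stat.h>

    static void write_str(const char *path, const char *val)
    {
            FILE *f = fopen(path, "w");

            if (f) {
                    fputs(val, f);
                    fclose(f);
            }
    }

    int main(void)
    {
            /* workloads overcommitted under A, the oom handler under A-oom */
            mkdir("/sys/fs/cgroup/memory/A", 0755);
            mkdir("/sys/fs/cgroup/memory/A-oom", 0755);
            write_str("/sys/fs/cgroup/memory/A/memory.limit_in_bytes", "132120576");   /* 126M */
            write_str("/sys/fs/cgroup/memory/A-oom/memory.limit_in_bytes", "2097152"); /* 2M */
            /* move the (hypothetical) handler pid into the reserved group */
            write_str("/sys/fs/cgroup/memory/A-oom/tasks", "12345");
            return 0;
    }

The disagreement that follows is about whether such a layout can also cover the page allocator under system-wide oom, which no memcg limit controls.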
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves 2013-12-06 19:01 ` Tejun Heo @ 2013-12-09 20:10 ` David Rientjes [not found] ` <alpine.DEB.2.02.1312061441390.8949-X6Q0R45D7oAcqpCFd4KODRPsWskHk0ljAL8bYrjMMd8@public.gmane.org> 0 siblings, 1 reply; 39+ messages in thread From: David Rientjes @ 2013-12-09 20:10 UTC (permalink / raw) To: Tejun Heo Cc: Johannes Weiner, Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki, Mel Gorman, Rik van Riel, Pekka Enberg, Christoph Lameter, Li Zefan, linux-kernel, linux-mm, cgroups On Fri, 6 Dec 2013, Tejun Heo wrote: > > Tejun, how are you? > > Doing pretty good. How's yourself? :) > Not bad, busy with holidays and all that. > > I agree that we wouldn't need such support if we are only addressing memcg > > oom conditions. We could do things like A/memory.limit_in_bytes == 128M > > and A/b/memory.limit_in_bytes == 126MB and then attach the process waiting > > on A/b/memory.oom_control to A and that would work perfectly. > > Or even just create a separate parallel cgroup A/memory.limit_in_bytes > == 126M A-oom/memory.limit_in_bytes = 2M and avoid the extra layer of > nesting. > Indeed. The setup I'm specifically trying to attack is where the sum of the limits of all non-oom handling memcgs (A/b in my model, A in yours) exceed the amount of RAM. If the system has 256MB, /=256MB A=126MB A-oom=2MB B=188MB B-oom=4MB or /=256MB C=128MB D=192MB C/a=126M D/a=188MB then it's possible for A + B or C/a + D/a to cause a system oom condition and meanwhile A-oom/tasks, B-oom/tasks, C/tasks, and D/tasks cannot allocate memory to handle it. > > However, we also need to discuss system oom handling. We have an interest > > in being able to allow userspace to handle system oom conditions since the > > policy will differ depending on machine and we can't encode every possible > > mechanism into the kernel. For example, on system oom we want to kill a > > process from the lowest priority top-level memcg. We lack that ability > > entirely in the kernel and since the sum of our top-level memcgs' > > memory.limit_in_bytes exceeds the amount of present RAM, we run into these > > oom conditions a _lot_. > > > > So the first step, in my opinion, is to add a system oom notification on > > the root memcg's memory.oom_control which currently allows registering an > > eventfd() notification but never actually triggers. I did that in a patch > > and it was merged into -mm but was pulled out for later discussion. > > Hmmm... this seems to be a different topic. You're saying that it'd > be beneficial to add userland oom handling at the system level and if > that happens having per-memcg oom reserve would be consistent with the > system-wide one, right? Right, and apologies for not discussing the system oom handling here since its notification on the root memcg is currently being debated as well. The idea is that admins and users aren't going to be concerned about memory allocation through the page allocator vs memory charging through the memory controller; they simply want memory for their userspace oom handling. And since the notification would be tied to the root memcg, it makes sense to make the amount of memory allowed to allocate exclusively for these handlers a memcg interface. So the cleanest solution, in my opinion, was to add the interface as part of memcg.
> While I can see some merit in that argument, > the whole thing is predicated on system level userland oom handling > being justified && even then I'm not quite sure whether "consistent > interface" is enough to have oom reserve in all memory cgroups. It > feels a bit backwards because, here, the root memcg is the exception, > not the other way around. Root is the only one which can't put oom > handler in a separate cgroup, so it could make more sense to special > case that rather than spreading the interface for global userland oom > to everyone else. > It's really the same thing, though, from the user perspective. They don't care about page allocation failure vs memcg charge failure, they simply want to ensure that the memory set aside for memory.oom_reserve_in_bytes is available in oom conditions. With the suggested alternatives: /=256MB A=126MB A-oom=2MB B=188MB B-oom=4MB or /=256MB C=128MB D=192MB C/a=126M D/a=188MB we can't distinguish between what is able to allocate below per-zone min watermarks in the page allocator as the oom reserve. The key point is that the root memcg is not the only memcg concerned with page allocator memory reserves, it's any oom reserve. If A's usage is 124MB and B's usage is 132MB, we can't specify that processes attached to B-oom should be able to bypass per-zone min watermarks without an interface such as that being proposed. > But, before that, system level userland OOM handling sounds scary to > me. I thought about userland OOM handling for memcgs and it does make > some sense. ie. there is a different action that userland oom handler > can take which kernel oom handler can't - it can expand the limit of > the offending cgroup, effectively using OOM handler as a sizing > estimator. I'm not sure whether that in itself is a good idea but > then again it might not be possible to clearly separate out sizing > from oom conditions. > > Anyways, but for system level OOM handling, there's no other action > userland handler can take. It's not like the OOM handler paging the > admin to install more memory is a reasonable mode of operation to > support. The *only* action userland OOM handler can take is killing > something. Now, if that's the case and we have kernel OOM handler > anyway, I think the best course of action is improving kernel OOM > handler and teach it to make the decisions that the userland handler > would consider good. That should be doable, right? > It's much more powerful than that; you're referring to the mechanism to guarantee future memory freeing so the system or memcg is no longer oom, and that's only one case of possible handling. I have a customer who wants to save heap profiles at the time of oom as well, for example, and their sole desire is to be able to capture memory statistics before the oom kill takes place. The sine qua non is that memory reserves allow something to be done in such conditions: if you try to do a "ps" or "ls" or cat a file in an oom memcg, you hang. We need better functionality to ensure that we can do some action prior to the oom kill itself, whether that comes from userspace or the kernel. We simply cannot rely on things like memory thresholds or vmpressure to grab these heap profiles, there is no guarantee that memory will not be exhausted and the oom kill would already have taken place before the process handling the notification wakes up. 
(And any argument that it is possible by simply making the threshold happen early enough is a non-starter: it does not guarantee the heaps are collected for oom conditions and the oom kill can still occur prematurely in machines that overcommit their memcg limits, as we do.) > The thing is OOM handling in userland is an inherently fragile thing > and it can *never* replace kernel OOM handling. You may reserve any > amount of memory you want but there would still be cases that it may > fail. It's not like we have owner-based allocation all through the > kernel or are willing to pay overhead for such thing. Even if that > part can be guaranteed somehow (no idea how), the kernel still can > NEVER trust the userland OOM handler. No matter what we do, we need a > kernel OOM handler with no resource dependency. > I was never an advocate for the current memory.oom_control behavior that allows you to disable the oom killer indefinitely for a memcg and I agree that it is dangerous if userspace will not cause future memory freeing or toggle the value such that the kernel will kill something. So I agree with you about today's functionality, not with the functionality that this patchset, and the notification on the root memcg for system oom conditions, provides. I also proposed a memory.oom_delay_millisecs that we have used for several years dating back even to cpusets, which simply delays the oom kill such that userspace can do "something" like send a kill itself, collect heap profiles, send a signal to our malloc() implementation to free arena memory, etc. prior to the kernel oom kill. > So, there isn't anything userland OOM handler can inherently do better > and we can't do away with kernel handler no matter what. On both > accounts, it seems like the best course of action is making the > system-wide kernel OOM handler make better decisions if possible at > all. If that's impossible, let's first think about why that's the > case before hastily opening this new can of worms. > We certainly can get away with the kernel oom killer in 99% of cases with this functionality for users who choose to have their own oom handling implementations. We also can't possibly code every single handling policy into the kernel: we can't guarantee that our version of malloc() is able to free memory back to the kernel when waking up on a memory.oom_control notification prior to the memcg oom killer killing something, for example, without this functionality. ^ permalink raw reply [flat|nested] 39+ messages in thread
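The vmpressure notifications David dismisses above use the same cgroup v1 eventfd plumbing as memory.oom_control, registered against memory.pressure_level. A hedged sketch for context; the memcg path is illustrative and error handling is omitted:

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/eventfd.h>

    int main(void)
    {
            char line[64];
            uint64_t count;
            int efd = eventfd(0, 0);
            int pfd = open("/sys/fs/cgroup/memory/A/memory.pressure_level", O_RDONLY);
            int cfd = open("/sys/fs/cgroup/memory/A/cgroup.event_control", O_WRONLY);

            /* "<event_fd> <fd of memory.pressure_level> <level>" */
            snprintf(line, sizeof(line), "%d %d low", efd, pfd);
            write(cfd, line, strlen(line));

            while (read(efd, &count, sizeof(count)) == sizeof(count)) {
                    /* e.g. tell malloc() to drop its queue of freed regions */
            }
            return 0;
    }

Nothing serializes this wakeup against allocation progress, which is exactly David's objection: the oom kill can land before the handler ever runs.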
[parent not found: <alpine.DEB.2.02.1312061441390.8949-X6Q0R45D7oAcqpCFd4KODRPsWskHk0ljAL8bYrjMMd8@public.gmane.org>]
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves [not found] ` <alpine.DEB.2.02.1312061441390.8949-X6Q0R45D7oAcqpCFd4KODRPsWskHk0ljAL8bYrjMMd8@public.gmane.org> @ 2013-12-09 22:37 ` Johannes Weiner 2013-12-10 21:50 ` Tejun Heo 1 sibling, 0 replies; 39+ messages in thread From: Johannes Weiner @ 2013-12-09 22:37 UTC (permalink / raw) To: David Rientjes Cc: Tejun Heo, Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki, Mel Gorman, Rik van Riel, Pekka Enberg, Christoph Lameter, Li Zefan, linux-kernel-u79uwXL29TY76Z2rM5mHXA, linux-mm-Bw31MaZKKs3YtjvyW6yDsg, cgroups-u79uwXL29TY76Z2rM5mHXA On Mon, Dec 09, 2013 at 12:10:44PM -0800, David Rientjes wrote: > On Fri, 6 Dec 2013, Tejun Heo wrote: > > > > Tejun, how are you? > > > > Doing pretty good. How's yourself? :) > > > > Not bad, busy with holidays and all that. > > > > > I agree that we wouldn't need such support if we are only addressing memcg > > > oom conditions. We could do things like A/memory.limit_in_bytes == 128M > > > and A/b/memory.limit_in_bytes == 126MB and then attach the process waiting > > > on A/b/memory.oom_control to A and that would work perfectly. > > > > Or even just create a separate parallel cgroup A/memory.limit_in_bytes > > == 126M A-oom/memory.limit_in_bytes = 2M and avoid the extra layer of > > nesting. > > > > Indeed. The setup I'm specifically trying to attack is where the sum of > the limits of all non-oom handling memcgs (A/b in my model, A in yours) > exceed the amount of RAM. If the system has 256MB, > > /=256MB > A=126MB A-oom=2MB B=188MB B-oom=4MB > > or > > /=256MB > C=128MB D=192MB > C/a=126M D/a=188MB > > then it's possible for A + B or C/a + D/a to cause a system oom condition > and meanwhile A-oom/tasks, B-oom/tasks, C/tasks, and D/tasks cannot > allocate memory to handle it. So your per-memcg handlers want access to PHYSICAL MEMORY reserves during system-wide OOM, but this patch implements MEMORY CHARGE reserves only, which are obviously meaningless during system-wide OOM. In other words, this is an entirely different use case than what this patchset is really about. You have to sell us on the problem first, then we can discuss a solution. Instead, you insist on the solution and keep changing the problem whenever we find it no longer justifies your proposal. > > > However, we also need to discuss system oom handling. We have an interest > > > in being able to allow userspace to handle system oom conditions since the > > > policy will differ depending on machine and we can't encode every possible > > > mechanism into the kernel. For example, on system oom we want to kill a > > > process from the lowest priority top-level memcg. We lack that ability > > > entirely in the kernel and since the sum of our top-level memcgs' > > > memory.limit_in_bytes exceeds the amount of present RAM, we run into these > > > oom conditions a _lot_. > > > > > > So the first step, in my opinion, is to add a system oom notification on > > > the root memcg's memory.oom_control which currently allows registering an > > > eventfd() notification but never actually triggers. I did that in a patch > > > and it was merged into -mm but was pulled out for later discussion. > > > > Hmmm... this seems to be a different topic. You're saying that it'd > > be beneficial to add userland oom handling at the system level and if > > that happens having per-memcg oom reserve would be consistent with the > > system-wide one, right?
> > Right, and apologies for not discussing the system oom handling here since > its notification on the root memcg is currently being debated as well. > The idea is that admins and users aren't going to be concerned about > memory allocation through the page allocator vs memory charging through > the memory controller; they simply want memory for their userspace oom > handling. And since the notification would be tied to the root memcg, it > makes sense to make the amount of memory allowed to allocate exclusively > for these handlers a memcg interface. So the cleanest solution, in my > opinion, was to add the interface as part of memcg. > > > While I can see some merit in that argument, > > the whole thing is predicated on system level userland oom handling > > being justified && even then I'm not quite sure whether "consistent > > interface" is enough to have oom reserve in all memory cgroups. It > > feels a bit backwards because, here, the root memcg is the exception, > > not the other way around. Root is the only one which can't put oom > > handler in a separate cgroup, so it could make more sense to special > > case that rather than spreading the interface for global userland oom > > to everyone else. > > > > It's really the same thing, though, from the user perspective. They don't > care about page allocation failure vs memcg charge failure, they simply > want to ensure that the memory set aside for memory.oom_reserve_in_bytes > is available in oom conditions. With the suggested alternatives: > > /=256MB > A=126MB A-oom=2MB B=188MB B-oom=4MB > > or > > /=256MB > C=128MB D=192MB > C/a=126M D/a=188MB > > we can't distinguish between what is able to allocate below per-zone min > watermarks in the page allocator as the oom reserve. The key point is > that the root memcg is not the only memcg concerned with page allocator > memory reserves, it's any oom reserve. If A's usage is 124MB and B's > usage is 132MB, we can't specify that processes attached to B-oom should > be able to bypass per-zone min watermarks without an interface such as > that being proposed. The per-zone min watermarks are there to allow rudimentary OOM handling inside the kernel to prevent a complete deadlock. You want to hand them out to an indefinite number of (untrusted?) userspace tasks in the hope that they handle the situation? Also, the following concerns from Tejun still apply: > > The thing is OOM handling in userland is an inherently fragile thing > > and it can *never* replace kernel OOM handling. You may reserve any > > amount of memory you want but there would still be cases that it may > > fail. It's not like we have owner-based allocation all through the > > kernel or are willing to pay overhead for such thing. Even if that > > part can be guaranteed somehow (no idea how), the kernel still can > > NEVER trust the userland OOM handler. No matter what we do, we need a > > kernel OOM handler with no resource dependency. Your userspace handler may very much fail, but it may have squandered all the resources for the kernel fallback handling to actually perform its job. I don't know if you are actually allowing every PF_OOM_HANDLER to simply bypass the watermarks in your kernels, but this seems way too fragile for upstream. > > But, before that, system level userland OOM handling sounds scary to > > me. I thought about userland OOM handling for memcgs and it does make > > some sense. ie. 
there is a different action that userland oom handler > > can take which kernel oom handler can't - it can expand the limit of > > the offending cgroup, effectively using OOM handler as a sizing > > estimator. I'm not sure whether that in itself is a good idea but > > then again it might not be possible to clearly separate out sizing > > from oom conditions. > > > > Anyways, but for system level OOM handling, there's no other action > > userland handler can take. It's not like the OOM handler paging the > > admin to install more memory is a reasonable mode of operation to > > support. The *only* action userland OOM handler can take is killing > > something. Now, if that's the case and we have kernel OOM handler > > anyway, I think the best course of action is improving kernel OOM > > handler and teach it to make the decisions that the userland handler > > would consider good. That should be doable, right? > > > > It's much more powerful than that; you're referring to the mechanism to > guarantee future memory freeing so the system or memcg is no longer oom, > and that's only one case of possible handling. I have a customer who > wants to save heap profiles at the time of oom as well, for example, and > their sole desire is to be able to capture memory statistics before the > oom kill takes place. The sine qua non is that memory reserves allow > something to be done in such conditions: if you try to do a "ps" or "ls" > or cat a file in an oom memcg, you hang. This is conflating per-memcg OOM handling and global OOM handling. You can always ps or ls from outside to analyze a memcg OOM and we have established that there is no good reason to try doing it from inside the OOM group. > We need better functionality to ensure that we can do some action > prior to the oom kill itself, whether that comes from userspace or > the kernel. We simply cannot rely on things like memory thresholds > or vmpressure to grab these heap profiles, there is no guarantee > that memory will not be exhausted and the oom kill would already > have taken place before the process handling the notification wakes > up. (And any argument that it is possible by simply making the > threshold happen early enough is a non-starter: it does not > guarantee the heaps are collected for oom conditions and the oom > kill can still occur prematurely in machines that overcommit their > memcg limits, as we do.) > > > The thing is OOM handling in userland is an inherently fragile thing > > and it can *never* replace kernel OOM handling. You may reserve any > > amount of memory you want but there would still be cases that it may > > fail. It's not like we have owner-based allocation all through the > > kernel or are willing to pay overhead for such thing. Even if that > > part can be guaranteed somehow (no idea how), the kernel still can > > NEVER trust the userland OOM handler. No matter what we do, we need a > > kernel OOM handler with no resource dependency. > > > > I was never an advocate for the current memory.oom_control behavior that > allows you to disable the oom killer indefinitely for a memcg and I agree > that it is dangerous if userspace will not cause future memory freeing or > toggle the value such that the kernel will kill something. This is again confusing system-wide OOM with per-memcg OOM. Disabling the per-memcg OOM handler is perfectly fine because any memory demand from higher up the hierarchy will still kill in such a group. 
The problems Tejun describes are only existent in userspace handling of system-wide OOM situations. Which is the thing you are advocating, not what we currently have. > So I agree with you about today's functionality, not with the > functionality that this patchset, and the notification on the root memcg > for system oom conditions, provides. I also proposed a > memory.oom_delay_millisecs that we have used for several years > dating back even to cpusets, which simply delays the oom kill such > that userspace can do "something" like send a kill itself, collect > heap profiles, send a signal to our malloc() implementation to free > arena memory, etc. prior to the kernel oom kill. > > > So, there isn't anything userland OOM handler can inherently do better > > and we can't do away with kernel handler no matter what. On both > > accounts, it seems like the best course of action is making the > > system-wide kernel OOM handler make better decisions if possible at > > all. If that's impossible, let's first think about why that's the > > case before hastily opening this new can of worms. > > > > We certainly can get away with the kernel oom killer in 99% of cases with > this functionality for users who choose to have their own oom handling > implementations. We also can't possibly code every single handling policy > into the kernel: we can't guarantee that our version of malloc() is > able to free memory back to the kernel when waking up on > a memory.oom_control notification prior to the memcg oom killer killing > something, for example, without this functionality. If you have discardable anonymous memory lying around, the volatile memory patches are a much more reliable way of getting rid of it than to wake up a userspace task and wait & pray a few seconds. Page reclaim has been *the* tool to facilitate overcommit for decades while OOM killing has always been a last-resort measure. Why is this not good enough anymore and why is the only solution to give up and do it all in userspace? ^ permalink raw reply [flat|nested] 39+ messages in thread
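For context, the disabling Johannes refers to is an existing cgroup v1 knob: writing "1" to a memcg's memory.oom_control leaves tasks in that group blocked on charge instead of being killed, until userspace frees memory, raises the limit, or re-enables the killer. A minimal sketch with an illustrative path and no error handling:

    #include <fcntl.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/sys/fs/cgroup/memory/A/memory.oom_control", O_WRONLY);

            write(fd, "1", 1);   /* oom_kill_disable = 1, for this memcg only */
            close(fd);
            /* reading the file back reports "oom_kill_disable 1" and
             * "under_oom 0|1", which a handler can poll */
            return 0;
    }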
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves [not found] ` <alpine.DEB.2.02.1312061441390.8949-X6Q0R45D7oAcqpCFd4KODRPsWskHk0ljAL8bYrjMMd8@public.gmane.org> 2013-12-09 22:37 ` Johannes Weiner @ 2013-12-10 21:50 ` Tejun Heo 2013-12-10 23:55 ` David Rientjes 1 sibling, 1 reply; 39+ messages in thread From: Tejun Heo @ 2013-12-10 21:50 UTC (permalink / raw) To: David Rientjes Cc: Johannes Weiner, Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki, Mel Gorman, Rik van Riel, Pekka Enberg, Christoph Lameter, Li Zefan, linux-kernel-u79uwXL29TY76Z2rM5mHXA, linux-mm-Bw31MaZKKs3YtjvyW6yDsg, cgroups-u79uwXL29TY76Z2rM5mHXA Hey, David. On Mon, Dec 09, 2013 at 12:10:44PM -0800, David Rientjes wrote: > Indeed. The setup I'm specifically trying to attack is where the sum of > the limits of all non-oom handling memcgs (A/b in my model, A in yours) > exceed the amount of RAM. If the system has 256MB, > > /=256MB > A=126MB A-oom=2MB B=188MB B-oom=4MB > > or > > /=256MB > C=128MB D=192MB > C/a=126M D/a=188MB > > then it's possible for A + B or C/a + D/a to cause a system oom condition > and meanwhile A-oom/tasks, B-oom/tasks, C/tasks, and D/tasks cannot > allocate memory to handle it. "tasks"? You mean that tasks can't be read reliably once system-OOM is hit regardless of memcg configuration? > Right, and apologies for not discussing the system oom handling here since > its notification on the root memcg is currently being debated as well. > The idea is that admins and users aren't going to be concerned about > memory allocation through the page allocator vs memory charging through > the memory controller; they simply want memory for their userspace oom > handling. And since the notification would be tied to the root memcg, it > makes sense to make the amount of memory allowed to allocate exclusively > for these handlers a memcg interface. So the cleanest solution, in my > opinion, was to add the interface as part of memcg. I'm still not quite following the reasoning. Can you please elaborate on what the distinction between "page allocator" and "charges through memory controller" has to do with this interface? > It's really the same thing, though, from the user perspective. They don't > care about page allocation failure vs memcg charge failure, they simply > want to ensure that the memory set aside for memory.oom_reserve_in_bytes > is available in oom conditions. With the suggested alternatives: > > /=256MB > A=126MB A-oom=2MB B=188MB B-oom=4MB > > or > > /=256MB > C=128MB D=192MB > C/a=126M D/a=188MB > > we can't distinguish between what is able to allocate below per-zone min > watermarks in the page allocator as the oom reserve. The key point is > that the root memcg is not the only memcg concerned with page allocator > memory reserves, it's any oom reserve. If A's usage is 124MB and B's > usage is 132MB, we can't specify that processes attached to B-oom should > be able to bypass per-zone min watermarks without an interface such as > that being proposed. Okay, are you saying that userland OOM handlers will be able to dip into kernel reserve memory? Maybe I'm mistaken but you realize that that reserve is there to make things like task exits work under OOM conditions, right? The only way userland OOM handlers as you describe would work would be creating a separate reserve for them. Aren't you basically suggesting two memcg domains - one which is overcommitted and the other which isn't? 
But if you want to do that, wouldn't that be something which is a natural fit for memcg hierarchy? Not only that, such hierarchical setup would make sense for other controllers too - you're really creating two fundamentally different resource groups. > It's much more powerful than that; you're referring to the mechanism to > guarantee future memory freeing so the system or memcg is no longer oom, > and that's only one case of possible handling. I have a customer who > wants to save heap profiles at the time of oom as well, for example, and > their sole desire is to be able to capture memory statistics before the > oom kill takes place. The sine qua non is that memory reserves allow > something to be done in such conditions: if you try to do a "ps" or "ls" > or cat a file in an oom memcg, you hang. We need better functionality to > ensure that we can do some action prior to the oom kill itself, whether > that comes from userspace or the kernel. We simply cannot rely on things Well, the gotcha there is that you won't be able to do that with system level OOM handler either unless you create a separately reserved memory, which, again, can be achieved using hierarchical memcg setup already. Am I missing something here? > like memory thresholds or vmpressure to grab these heap profiles, there is > no guarantee that memory will not be exhausted and the oom kill would > already have taken place before the process handling the notification > wakes up. (And any argument that it is possible by simply making the > threshold happen early enough is a non-starter: it does not guarantee the > heaps are collected for oom conditions and the oom kill can still occur > prematurely in machines that overcommit their memcg limits, as we do.) I don't really follow your "guarantee" argument regarding OOM. It's not like we have a mathematically concrete definition of OOM conditions. That'd be nice to have but we simply don't have them. As it currently is defined, it's just "oh well, we tried hard enough but nothing seems to give in. whatever". As currently defined, it's an inherently fuzzy and racy thing. Sure, it *could* be meaningful to try to decrease the raciness if the difference is significant but using absolute terms like guarantee is just misleading, IMHO. You can't guarantee much with something which is racy to begin with. ... > conditions, provides. I also proposed a memory.oom_delay_millisecs that > we have used for several years dating back even to cpusets, which simply > delays the oom kill such that userspace can do "something" like send a > kill itself, collect heap profiles, send a signal to our malloc() > implementation to free arena memory, etc. prior to the kernel oom kill. All the above would require a separately reserved memory, right? Also, a curiosity, how would "sending a signal to our malloc()" work? If you mean sending a signal to malloc() in a different process, that's not gonna work. How is that process gonna have memory to process the signal and free memory from malloc() under OOM condition? > We certainly can get away with the kernel oom killer in 99% of cases with > this functionality for users who choose to have their own oom handling > implementations. We also can't possibly code every single handling policy > into the kernel: we can't guarantee that our version of malloc() is > able to free memory back to the kernel when waking up on > a memory.oom_control notification prior to the memcg oom killer killing > something, for example, without this functionality.
So, malloc() is mapped into the same process as the OOM handler which is gonna be able to tap into physically reserved memory? Also, while freeing, it won't need to coordinate with other processes? If I'm not mistaken, we're talking about a lot of additional complexities throughout the whole mm layer for something which seems, to me, achievable through proper memcg configuration without any modification to the kernel and doesn't seem all that necessary for 99% of use cases, as you said. Unless I'm missing something major (quite possible, of course), I think you'd need stronger rationale. Thanks. -- tejun ^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves 2013-12-10 21:50 ` Tejun Heo @ 2013-12-10 23:55 ` David Rientjes 2013-12-11 9:49 ` Mel Gorman 2013-12-11 12:42 ` Tejun Heo 0 siblings, 2 replies; 39+ messages in thread From: David Rientjes @ 2013-12-10 23:55 UTC (permalink / raw) To: Tejun Heo Cc: Johannes Weiner, Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki, Mel Gorman, Rik van Riel, Pekka Enberg, Christoph Lameter, Li Zefan, linux-kernel, linux-mm, cgroups On Tue, 10 Dec 2013, Tejun Heo wrote: > > Indeed. The setup I'm specifically trying to attack is where the sum of > > the limits of all non-oom handling memcgs (A/b in my model, A in yours) > > exceed the amount of RAM. If the system has 256MB, > > > > /=256MB > > A=126MB A-oom=2MB B=188MB B-oom=4MB > > > > or > > > > /=256MB > > C=128MB D=192MB > > C/a=126M D/a=188MB > > > > then it's possible for A + B or C/a + D/a to cause a system oom condition > > and meanwhile A-oom/tasks, B-oom/tasks, C/tasks, and D/tasks cannot > > allocate memory to handle it. > > "tasks"? You mean that tasks can't be read reliably once system-OOM > is hit regardless of memcg configuration? > Not referring to the files themselves, rather the processes listed by those files, sorry. Those processes would not be able to do a ps, ls, or anything useful even if they are mlocked into memory because they cannot allocate memory in oom conditions. > > Right, and apologies for not discussing the system oom handling here since > > its notification on the root memcg is currently being debated as well. > > The idea is that admins and users aren't going to be concerned about > > memory allocation through the page allocator vs memory charging through > > the memory controller; they simply want memory for their userspace oom > > handling. And since the notification would be tied to the root memcg, it > > makes sense to make the amount of memory allowed to allocate exclusively > > for these handlers a memcg interface. So the cleanest solution, in my > > opinion, was to add the interface as part of memcg. > > I'm still not quite following the reasoning. Can you please elaborate > on what the distinction between "page allocator" and "charges through > memory controller" has to do with this interface? > The interface would allow both access to memory reserves through the page allocator and charging above the memcg limit; it is the only way to guarantee that memory can be allocated by processes attached to the memcg in oom conditions. We must be able to do both; otherwise, no matter what overcharge we allow them via memcg, it is still possible for the allocation itself to fail in the page allocator before we even get to that point. The confusion here is because the access to memory reserves in the page allocator is not presented in this series: there is another ongoing discussion about when to notify processes waiting on the root memcg's memory.oom_control about system oom conditions. I can certainly post that patch as well, but it wouldn't apply without resolving that side-thread first. The high order bit is that we need to be able to address system oom conditions as well as memcg oom conditions in userspace and system oom conditions require us to specify the processes that are allowed access to a special memory reserve. We can't do that with sibling or parent memcgs without some new tunable like memory.allow_page_alloc_reserves, but we would also have to specify the amount of reserves allowed.
It seemed clean and straight-forward to specify this as both the system oom memory reserve amount and memcg limit overcharge amount within the same file, memory.oom_reserve_in_bytes as this patch does. > > It's really the same thing, though, from the user perspective. They don't > > care about page allocation failure vs memcg charge failure, they simply > > want to ensure that the memory set aside for memory.oom_reserve_in_bytes > > is available in oom conditions. With the suggested alternatives: > > > > /=256MB > > A=126MB A-oom=2MB B=188MB B-oom=4MB > > > > or > > > > /=256MB > > C=128MB D=192MB > > C/a=126M D/a=188MB > > > > we can't distinguish between what is able to allocate below per-zone min > > watermarks in the page allocator as the oom reserve. The key point is > > that the root memcg is not the only memcg concerned with page allocator > > memory reserves, it's any oom reserve. If A's usage is 124MB and B's > > usage is 132MB, we can't specify that processes attached to B-oom should > > be able to bypass per-zone min watermarks without an interface such as > > that being proposed. > > Okay, are you saying that userland OOM handlers will be able to dip > into kernel reserve memory? Maybe I'm mistaken but you realize that > that reserve is there to make things like task exits work under OOM > conditions, right? The only way userland OOM handlers as you describe > would work would be creating a separate reserve for them. > Yes, PF_OOM_HANDLER processes would be able to allocate this amount as specified by memory.oom_reserve_in_bytes below the per-zone watermarks and the amount of reserves can already be controlled via min_free_kbytes, which we already increase internally for thp. This could obviously be limited to some sane value that is a fraction of the smallest zone's min watermark; that's not a problem: I've never had a memcg or system oom reserve larger than 2MB and most users would probably get away with 256KB or 512KB. > > It's much more powerful than that; you're referring to the mechanism to > > guarantee future memory freeing so the system or memcg is no longer oom, > > and that's only one case of possible handling. I have a customer who > > wants to save heap profiles at the time of oom as well, for example, and > > their sole desire is to be able to capture memory statistics before the > > oom kill takes place. The sine qua non is that memory reserves allow > > something to be done in such conditions: if you try to do a "ps" or "ls" > > or cat a file in an oom memcg, you hang. We need better functionality to > > ensure that we can do some action prior to the oom kill itself, whether > > that comes from userspace or the kernel. We simply cannot rely on things > > Well, the gotcha there is that you won't be able to do that with > system level OOM handler either unless you create a separately > reserved memory, which, again, can be achieved using hierarchical > memcg setup already. Am I missing something here? > System oom conditions would only arise when the usage of memcgs A + B above causes the page allocator to not be able to allocate memory without oom killing something even though the limits of both A and B may not have been reached yet. No userspace oom handler can allocate memory with access to memory reserves in the page allocator in such a context; it's vital that if we are to handle system oom conditions in userspace we give them access to memory that other processes can't allocate.
You could attach a userspace system oom handler to any memcg in this scenario with memory.oom_reserve_in_bytes and since it has PF_OOM_HANDLER it would be able to allocate in reserves in the page allocator and overcharge in its memcg to handle it. This isn't possible only with a hierarchical memcg setup unless you ensure the sum of the limits of the top level memcgs do not equal or exceed the sum of the min watermarks of all memory zones, and we exceed that. > > conditions, provides. I also proposed a memory.oom_delay_millisecs that > > we have used for several years dating back even to cpusets, which simply > > delays the oom kill such that userspace can do "something" like send a > > kill itself, collect heap profiles, send a signal to our malloc() > > implementation to free arena memory, etc. prior to the kernel oom kill. > > All the above would require a separately reserved memory, right? > Also, a curiosity, how would "sending a signal to our malloc()" work? > If you mean sending a signal to malloc() in a different process, > that's not gonna work. How is that process gonna have memory to > process the signal and free memory from malloc() under OOM condition? > The signal is actually a wakeup from vmpressure; we don't want to wait until reclaim is completely exhausted before freeing this memory, we want to do it at VMPRESSURE_LOW. We simply needed a way to avoid the immediate oom kill unless it has a chance to free excess memory from malloc() first. We can also avoid oom killing entirely if, upon memcg oom notification, we can simply increase its limit instead of freeing memory at all: we have internally the notion of "overlimit" memcgs that are the first memcgs to kill on system oom but are allowed to exceed their reservation if memory is available. It's advantageous to require them to aggressively reclaim up to their reservation and then only increase the memcg limit as a last resort. If we hit system oom later, they get killed first. With this functionality, it does not require more than a few pages of memory.oom_reserve_in_bytes to write to memory.limit_in_bytes. > So, malloc() is mapped into the same process as the OOM handler which > is gonna be able to tap into physically reserved memory? Also, while > freeing, it won't need to coordinate with other processes? > This is only one example and our reasoning for it is somewhat convoluted: we require thp's max_ptes_none to be 0 rather than the default HPAGE_PMD_NR-1 because we don't overcharge anonymous memory that isn't used purely for the sake of thp. This causes all of malloc()'s MADV_DONTNEED to force a split of every thp page because the number of pte_none()'s > 0. Instead, it's better to queue these free()'s and perhaps recycle them by zeroing out the memory and returning it on a subsequent malloc() rather than actually doing the MADV_DONTNEED and causing the thp split. We want to do the split under memory pressure, however, and so there's no coordination required other than malloc() dropping its queue of freed regions. > If I'm not mistaken, we're talking about a lot of additional > complexities throughout the whole mm layer for something which seems, > to me, achievable through proper memcg configuration without any > modification to the kernel and doesn't seem all that necessary for 99% > of use cases, as you said. Unless I'm missing something major (quite > possible, of course), I think you'd need stronger rationale.
The stronger rationale is that you can't handle system oom in userspace without this functionality and we need to do so.
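(Patch 7/8 itself is not quoted in this part of the thread, so as a reading aid, here is a rough sketch of the allocator-side hook David describes above. The PF_OOM_HANDLER test and its placement are assumptions reconstructed from the discussion, not the actual patch; ALLOC_NO_WATERMARKS is the existing allocator flag that e.g. TIF_MEMDIE tasks use to dip below the per-zone min watermarks.)

	/*
	 * Sketch only -- not the actual patch 7/8.  A task flagged as an
	 * oom handler is allowed to allocate below the per-zone min
	 * watermarks, bounded in practice at the memcg level by
	 * memory.oom_reserve_in_bytes as described above.
	 */
	static inline int oom_handler_alloc_flags(int alloc_flags)
	{
		if (unlikely(current->flags & PF_OOM_HANDLER))
			alloc_flags |= ALLOC_NO_WATERMARKS;
		return alloc_flags;
	}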
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves 2013-12-10 23:55 ` David Rientjes @ 2013-12-11 9:49 ` Mel Gorman 2013-12-11 12:42 ` Tejun Heo 1 sibling, 0 replies; 39+ messages in thread From: Mel Gorman @ 2013-12-11 9:49 UTC (permalink / raw) To: David Rientjes Cc: Tejun Heo, Johannes Weiner, Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki, Rik van Riel, Pekka Enberg, Christoph Lameter, Li Zefan, linux-kernel, linux-mm, cgroups

On Tue, Dec 10, 2013 at 03:55:48PM -0800, David Rientjes wrote: > > Okay, are you saying that userland OOM handlers will be able to dip > > into kernel reserve memory? Maybe I'm mistaken but you realize that > > that reserve is there to make things like task exits work under OOM > > conditions, right? The only way userland OOM handlers as you describe > > would work would be creating a separate reserve for them. > > > > Yes, PF_OOM_HANDLER processes would be able to allocate this amount as > specified by memory.oom_reserve_in_bytes below the per-zone watermarks, and > the amount of reserves can already be controlled via min_free_kbytes, > which we already increase internally for thp.

THP increased min_free_kbytes for external fragmentation control as it reduces the amount of mixing of the different migrate types within pageblocks. It was not about reserves; increasing reserves was just the most straightforward way of handling the problem. This discussion is closer to swap-over-network than to anything THP did.

Swap-over-network takes care to only allocate memory from reserves if the allocation was required for swapping, and rejects all other allocation requests to the extent that they can get throttled in throttle_direct_reclaim. Once allocated from reserves for swapping, care is taken that the allocations are not leaked to other users (e.g. is_obj_pfmemalloc checks in slab). It does not look like PF_OOM_HANDLER takes the same sort of care.

Even if it did, it's not quite the same. swap-over-network allocates from the zone reserves *only* the memory required to writeback the pages. It can be slow but it'll make forward progress. A userspace process with special privileges could allocate any amount of memory for any reason so it would need a pre-configured and limited reserve on top of the zone reserves or run the risk of livelock.

-- Mel Gorman SUSE Labs
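(For context, the "care" Mel refers to works roughly as below. This is a simplified paraphrase of the 3.x-era pfmemalloc discipline, not a verbatim excerpt: objects backed by reserve memory are branded pfmemalloc and handed out only to tasks that are themselves on the memory-freeing path, so reserves cannot leak to ordinary consumers.)

	/*
	 * Simplified paraphrase of the pfmemalloc discipline: a
	 * reserve-backed object may only be given to a task that is part
	 * of the memory-freeing path (PF_MEMALLOC); everyone else must go
	 * back to the normal allocation path and be throttled there.
	 */
	static bool may_take_reserve_obj(struct task_struct *tsk,
					 bool obj_pfmemalloc)
	{
		if (!obj_pfmemalloc)
			return true;	/* ordinary object, no restriction */
		return tsk->flags & PF_MEMALLOC;
	}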
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves 2013-12-10 23:55 ` David Rientjes 2013-12-11 9:49 ` Mel Gorman @ 2013-12-11 12:42 ` Tejun Heo 2013-12-12 5:37 ` Tim Hockin 1 sibling, 1 reply; 39+ messages in thread From: Tejun Heo @ 2013-12-11 12:42 UTC (permalink / raw) To: David Rientjes Cc: Johannes Weiner, Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki, Mel Gorman, Rik van Riel, Pekka Enberg, Christoph Lameter, Li Zefan, linux-kernel, linux-mm, cgroups

Yo,

On Tue, Dec 10, 2013 at 03:55:48PM -0800, David Rientjes wrote: > > Well, the gotcha there is that you won't be able to do that with > > system level OOM handler either unless you create a separately > > reserved memory, which, again, can be achieved using hierarchical > > memcg setup already. Am I missing something here? > > System oom conditions would only arise when the usage of memcgs A + B > above causes the page allocator to not be able to allocate memory without > oom killing something even though the limits of both A and B may not have > been reached yet. No userspace oom handler can allocate memory with > access to memory reserves in the page allocator in such a context; it's > vital that if we are to handle system oom conditions in userspace we > give them access to memory that other processes can't allocate. You > could attach a userspace system oom handler to any memcg in this scenario > with memory.oom_reserve_in_bytes and since it has PF_OOM_HANDLER it would > be able to allocate in reserves in the page allocator and overcharge in > its memcg to handle it. This isn't possible with only a hierarchical > memcg setup unless you ensure the sum of the limits of the top level > memcgs does not equal or exceed the sum of the min watermarks of all memory > zones, and we exceed that.

Yes, exactly. If system memory is 128M, create top level memcgs w/ 120M and 8M each (well, with some slack of course) and then overcommit the descendants of 120M while putting OOM handlers and friends under 8M without overcommitting.

... > The stronger rationale is that you can't handle system oom in userspace > without this functionality and we need to do so.

You're giving yourself an unreasonable precondition - overcommitting at root level and handling system OOM from userland - and then trying to contort everything to fit that. How can "overcommitting at root level" possibly be a goal in and of itself? Please take a step back and look at and explain the *problem* you're trying to solve. You haven't explained why that *need*s to be the case at all.

I wrote this at the start of the thread but you're still doing the same thing. You're trying to create a hidden memcg level inside a memcg. At the beginning of this thread, you were trying to do that for !root memcgs and now you're arguing that you *need* that for root memcg. Because there's no other limit we can make use of, you're suggesting the use of kernel reserve memory for that purpose. It seems like an absurd thing to do to me. It could be that you might not be able to achieve exactly the same thing that way, but the right thing to do would be improving memcg in general so that it can handle this, instead of adding yet another layer of half-baked complexity, right?

Even if there are some inherent advantages of system userland OOM handling with a separate physical memory reserve, which AFAICS you haven't succeeded at showing yet, this is a very invasive change and, as you said before, something with an *extremely* narrow use case.
Wouldn't it be a better idea to improve the existing mechanisms - be that memcg in general or kernel OOM handling - to fit the niche use case better? I mean, just think about all the corner cases. How are you gonna handle priority inversion through locked pages or allocations given out to other tasks through slab? You're suggesting opening a giant can of worms for an extremely narrow benefit which doesn't even seem to actually require opening said can.

Thanks.

-- tejun
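(The split Tejun suggests can be expressed directly against the cgroup v1 memory controller. A minimal sketch follows, assuming the controller is mounted at /sys/fs/cgroup/memory and run as root; the group names are invented for illustration, and memory.limit_in_bytes accepting K/M/G suffixes is standard memcg behavior. Error handling is kept deliberately thin.)

	#include <stdio.h>
	#include <sys/stat.h>
	#include <sys/types.h>

	/* write a value into a cgroup control file */
	static int write_str(const char *path, const char *val)
	{
		FILE *f = fopen(path, "w");

		if (!f)
			return -1;
		fputs(val, f);
		return fclose(f);
	}

	int main(void)
	{
		const char *root = "/sys/fs/cgroup/memory"; /* assumed mount */
		char path[256];

		/* 120M group: its descendants may be overcommitted */
		snprintf(path, sizeof(path), "%s/jobs", root);
		mkdir(path, 0755);
		snprintf(path, sizeof(path), "%s/jobs/memory.limit_in_bytes", root);
		write_str(path, "120M");

		/* 8M group: oom handlers and friends, never overcommitted */
		snprintf(path, sizeof(path), "%s/oomd", root);
		mkdir(path, 0755);
		snprintf(path, sizeof(path), "%s/oomd/memory.limit_in_bytes", root);
		write_str(path, "8M");

		return 0;
	}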
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves 2013-12-11 12:42 ` Tejun Heo @ 2013-12-12 5:37 ` Tim Hockin 2013-12-12 14:21 ` Tejun Heo 0 siblings, 1 reply; 39+ messages in thread From: Tim Hockin @ 2013-12-12 5:37 UTC (permalink / raw) To: Tejun Heo Cc: David Rientjes, Johannes Weiner, Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki, Mel Gorman, Rik van Riel, Pekka Enberg, Christoph Lameter, Li Zefan, linux-kernel@vger.kernel.org, linux-mm, Cgroups

The immediate problem I see with setting aside reserves "off the top" is that we don't really know a priori how much memory the kernel itself is going to use, which could still land us in an overcommitted state.

In other words, if I have your 128 MB machine, and I set aside 8 MB for OOM handling, and give 120 MB for jobs, I have not accounted for the kernel. So I set aside 8 MB for OOM and 100 MB for jobs, leaving 20 MB for the kernel. That should be enough, right? Hell if I know, and nothing ensures that.

On Wed, Dec 11, 2013 at 4:42 AM, Tejun Heo <tj@kernel.org> wrote: > Yo, > > On Tue, Dec 10, 2013 at 03:55:48PM -0800, David Rientjes wrote: >> > Well, the gotcha there is that you won't be able to do that with >> > system level OOM handler either unless you create a separately >> > reserved memory, which, again, can be achieved using hierarchical >> > memcg setup already. Am I missing something here? >> >> System oom conditions would only arise when the usage of memcgs A + B >> above causes the page allocator to not be able to allocate memory without >> oom killing something even though the limits of both A and B may not have >> been reached yet. No userspace oom handler can allocate memory with >> access to memory reserves in the page allocator in such a context; it's >> vital that if we are to handle system oom conditions in userspace we >> give them access to memory that other processes can't allocate. You >> could attach a userspace system oom handler to any memcg in this scenario >> with memory.oom_reserve_in_bytes and since it has PF_OOM_HANDLER it would >> be able to allocate in reserves in the page allocator and overcharge in >> its memcg to handle it. This isn't possible with only a hierarchical >> memcg setup unless you ensure the sum of the limits of the top level >> memcgs does not equal or exceed the sum of the min watermarks of all memory >> zones, and we exceed that. > > Yes, exactly. If system memory is 128M, create top level memcgs w/ > 120M and 8M each (well, with some slack of course) and then overcommit > the descendants of 120M while putting OOM handlers and friends under > 8M without overcommitting. > > ... >> The stronger rationale is that you can't handle system oom in userspace >> without this functionality and we need to do so. > > You're giving yourself an unreasonable precondition - overcommitting > at root level and handling system OOM from userland - and then trying > to contort everything to fit that. How can "overcommitting > at root level" possibly be a goal in and of itself? Please take a step back > and look at and explain the *problem* you're trying to solve. You > haven't explained why that *need*s to be the case at all. > > I wrote this at the start of the thread but you're still doing the > same thing. You're trying to create a hidden memcg level inside a > memcg. At the beginning of this thread, you were trying to do that > for !root memcgs and now you're arguing that you *need* that for root > memcg.
> Because there's no other limit we can make use of, you're > suggesting the use of kernel reserve memory for that purpose. It > seems like an absurd thing to do to me. It could be that you might > not be able to achieve exactly the same thing that way, but the right > thing to do would be improving memcg in general so that it can handle > this, instead of adding yet another layer of half-baked complexity, right? > > Even if there are some inherent advantages of system userland OOM > handling with a separate physical memory reserve, which AFAICS you > haven't succeeded at showing yet, this is a very invasive change and, > as you said before, something with an *extremely* narrow use case. > Wouldn't it be a better idea to improve the existing mechanisms - be > that memcg in general or kernel OOM handling - to fit the niche use > case better? I mean, just think about all the corner cases. How are > you gonna handle priority inversion through locked pages or > allocations given out to other tasks through slab? You're suggesting > opening a giant can of worms for an extremely narrow benefit which > doesn't even seem to actually require opening said can. > > Thanks. > > -- > tejun
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves 2013-12-12 5:37 ` Tim Hockin @ 2013-12-12 14:21 ` Tejun Heo 2013-12-12 16:32 ` Michal Hocko [not found] ` <20131212142156.GB32683-Gd/HAXX7CRxy/B6EtB590w@public.gmane.org> 0 siblings, 2 replies; 39+ messages in thread From: Tejun Heo @ 2013-12-12 14:21 UTC (permalink / raw) To: Tim Hockin Cc: David Rientjes, Johannes Weiner, Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki, Mel Gorman, Rik van Riel, Pekka Enberg, Christoph Lameter, Li Zefan, linux-kernel@vger.kernel.org, linux-mm, Cgroups

Hey, Tim.

Sidenote: Please don't top-post with the whole body quoted below unless you're adding new cc's. Please selectively quote the original message's body to remind the readers of the context and reply below it. It's a basic lkml etiquette and one with good reasons. If you have to top-post for whatever reason - say you're typing from a machine which doesn't allow easy editing of the original message, explain so at the top of the message, or better yet, wait till you can unless it's urgent.

On Wed, Dec 11, 2013 at 09:37:46PM -0800, Tim Hockin wrote: > The immediate problem I see with setting aside reserves "off the top" > is that we don't really know a priori how much memory the kernel > itself is going to use, which could still land us in an overcommitted > state. > > In other words, if I have your 128 MB machine, and I set aside 8 MB > for OOM handling, and give 120 MB for jobs, I have not accounted for > the kernel. So I set aside 8 MB for OOM and 100 MB for jobs, leaving > 20 MB for the kernel. That should be enough, right? Hell if I know, and > nothing ensures that.

Yes, sure thing, that's the reason why I mentioned "with some slack" in the original message and also that it might not be completely the same. It doesn't allow you to aggressively use system level OOM handling as the sizing estimator for the root cgroup; however, it's more of an implementation detail than something which should guide the overall architecture - it's a problem which lessens in severity as [k]memcg improves and its coverage becomes more complete, which is the direction we should be headed no matter what.

It'd depend on the workload but with memcg fully configured it shouldn't fluctuate wildly. If it does, we need to hunt down whatever is causing such fluctuation and include it in kmemcg, right? That way, memcg as a whole improves for all use cases not just your niche one and I strongly believe that aligning as many use cases as possible along the same axis, rather than creating a large hole to stow away the exceptions, is vastly more beneficial to *everyone* in the long term.

There'd still be all the bells and whistles to configure and monitor system-level OOM and if there's justified need for improvements, we surely can and should do that; however, with the heavy lifting / hot path offloaded to the per-memcg userland OOM handlers, I believe it's reasonable to expect the burden on the system OOM handler to be noticeably less, which is the way it should be. That's the last guard against the whole system completely locking up and we can't extend its capabilities beyond that easily and we most likely don't even want to.

If I take a step back and look at the two options and their pros and cons, which path we should take is rather obvious to me. I hope you see it too.

Thanks.

-- tejun
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves 2013-12-12 14:21 ` Tejun Heo @ 2013-12-12 16:32 ` Michal Hocko 2013-12-12 16:37 ` Tejun Heo [not found] ` <20131212142156.GB32683-Gd/HAXX7CRxy/B6EtB590w@public.gmane.org> 1 sibling, 1 reply; 39+ messages in thread From: Michal Hocko @ 2013-12-12 16:32 UTC (permalink / raw) To: Tejun Heo Cc: Tim Hockin, David Rientjes, Johannes Weiner, Andrew Morton, KAMEZAWA Hiroyuki, Mel Gorman, Rik van Riel, Pekka Enberg, Christoph Lameter, Li Zefan, linux-kernel@vger.kernel.org, linux-mm, Cgroups

On Thu 12-12-13 09:21:56, Tejun Heo wrote: [...] > There'd still be all the bells and whistles to configure and monitor > system-level OOM and if there's justified need for improvements, we > surely can and should do that;

You weren't on the CC of the original thread which started here https://lkml.org/lkml/2013/11/19/191. And the original request for discussion was more about user-defined _policies_ for the global OOM rather than a user space global OOM handler. I feel that there are use cases where the current "kill a single task based on some calculations" is far from optimal, which leads to hacks which try to cope with the post-oom condition somehow gracefully.

I do agree with you that pulling oom handling into userspace sounds too dangerous even with all the code that it would need and I feel we should go a different path than (ab)using the memcg.oom_control interface for that. I still think we need to have a way to tell the global OOM killer what to do.

[...] -- Michal Hocko SUSE Labs
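(The "kill a single task based on some calculations" Michal mentions is the kernel's oom_badness() heuristic. The sketch below is paraphrased from the mm/oom_kill.c of this era, with the root-privilege discount and eligibility corner cases omitted, so treat it as a sketch rather than the exact kernel code.)

	/*
	 * Paraphrased badness heuristic: the task's memory footprint
	 * (rss, page tables, swap entries), shifted by the per-task
	 * oom_score_adj knob, where each adj point is worth 0.1% of
	 * allowed memory.  The highest-scoring task is killed.
	 */
	static unsigned long badness_sketch(struct task_struct *p,
					    unsigned long totalpages)
	{
		long points, adj = p->signal->oom_score_adj; /* -1000..1000 */

		points = get_mm_rss(p->mm) + p->mm->nr_ptes +
			 get_mm_counter(p->mm, MM_SWAPENTS);
		points += adj * (long)(totalpages / 1000);

		/* never return 0 for an eligible task */
		return points > 0 ? points : 1;
	}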
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves 2013-12-12 16:32 ` Michal Hocko @ 2013-12-12 16:37 ` Tejun Heo 0 siblings, 0 replies; 39+ messages in thread From: Tejun Heo @ 2013-12-12 16:37 UTC (permalink / raw) To: Michal Hocko Cc: Tim Hockin, David Rientjes, Johannes Weiner, Andrew Morton, KAMEZAWA Hiroyuki, Mel Gorman, Rik van Riel, Pekka Enberg, Christoph Lameter, Li Zefan, linux-kernel@vger.kernel.org, linux-mm, Cgroups

Hello, Michal.

On Thu, Dec 12, 2013 at 05:32:22PM +0100, Michal Hocko wrote: > You weren't on the CC of the original thread which started here > https://lkml.org/lkml/2013/11/19/191. And the original request for > discussion was more about user-defined _policies_ for the global > OOM rather than a user space global OOM handler. I feel that there > are use cases where the current "kill a single task based on some > calculations" is far from optimal, which leads to hacks which try to cope > with the post-oom condition somehow gracefully. > > I do agree with you that pulling oom handling into userspace sounds too dangerous > even with all the code that it would need and I feel we should go a > different path than (ab)using the memcg.oom_control interface for that. > I still think we need to have a way to tell the global OOM killer what > to do.

Oh yeah, sure, I have no fundamental objections against improving the in-kernel system OOM handler, including making it cgroup-aware which seems like a natural extension to me.

Thanks.

-- tejun
[parent not found: <20131212142156.GB32683-Gd/HAXX7CRxy/B6EtB590w@public.gmane.org>]
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves [not found] ` <20131212142156.GB32683-Gd/HAXX7CRxy/B6EtB590w@public.gmane.org> @ 2013-12-12 18:42 ` Tim Hockin 2013-12-12 19:23 ` Tejun Heo 0 siblings, 1 reply; 39+ messages in thread From: Tim Hockin @ 2013-12-12 18:42 UTC (permalink / raw) To: Tejun Heo Cc: David Rientjes, Johannes Weiner, Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki, Mel Gorman, Rik van Riel, Pekka Enberg, Christoph Lameter, Li Zefan, linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, linux-mm-Bw31MaZKKs3YtjvyW6yDsg, Cgroups

On Thu, Dec 12, 2013 at 6:21 AM, Tejun Heo <tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org> wrote: > Hey, Tim. > > Sidenote: Please don't top-post with the whole body quoted below > unless you're adding new cc's. Please selectively quote the original > message's body to remind the readers of the context and reply below > it. It's a basic lkml etiquette and one with good reasons. If you > have to top-post for whatever reason - say you're typing from a > machine which doesn't allow easy editing of the original message, > explain so at the top of the message, or better yet, wait till you can > unless it's urgent.

Yeah sorry. Replying from my phone is awkward at best. I know better :)

> On Wed, Dec 11, 2013 at 09:37:46PM -0800, Tim Hockin wrote: >> The immediate problem I see with setting aside reserves "off the top" >> is that we don't really know a priori how much memory the kernel >> itself is going to use, which could still land us in an overcommitted >> state. >> >> In other words, if I have your 128 MB machine, and I set aside 8 MB >> for OOM handling, and give 120 MB for jobs, I have not accounted for >> the kernel. So I set aside 8 MB for OOM and 100 MB for jobs, leaving >> 20 MB for the kernel. That should be enough, right? Hell if I know, and >> nothing ensures that. > > Yes, sure thing, that's the reason why I mentioned "with some slack" > in the original message and also that it might not be completely the > same. It doesn't allow you to aggressively use system level OOM > handling as the sizing estimator for the root cgroup; however, it's > more of an implementation detail than something which should guide > the overall architecture - it's a problem which lessens in severity as > [k]memcg improves and its coverage becomes more complete, which is the > direction we should be headed no matter what.

In my mind, the ONLY point of pulling system-OOM handling into userspace is to make it easier for crazy people (Google) to implement bizarre system-OOM policies. Example:

When we have a system OOM we want to do a walk of the administrative memcg tree (which is only a couple levels deep, users can make non-admin sub-memcgs), selecting the lowest priority entity at each step (where both tasks and memcgs have a priority and the priority range is much wider than the current OOM scores, and where memcg priority is sometimes a function of memcg usage), until we reach a leaf.

Once we reach a leaf, I want to log some info about the memcg doing the allocation, the memcg being terminated, and maybe some other bits about the system (depending on the priority of the selected victim, this may or may not be an "acceptable" situation). Then I want to kill *everything* under that memcg. Then I want to "publish" some information through a sane API (e.g. not dmesg scraping).

This is basically our policy as we understand it today. This is notably different than it was a year ago, and it will probably evolve further in the next year.
Teaching the kernel all of this stuff has proven to be sort of difficult to maintain and forward-port, and has been very slow to evolve because of how painful it is to test and deploy new kernels.

Maybe we can find a way to push this level of policy down to the kernel OOM killer? When this was mentioned internally I got shot down (gently, but shot down nonetheless). Assuming we had nearly-reliable (it doesn't have to be 100% guaranteed to be useful) OOM-in-userspace, I can keep the administrative memcg metadata in memory, implement killing as cruelly as I need, and do all of the logging and publication after the OOM kill is done. Most importantly I can test and deploy new policy changes pretty trivially.

Handling per-memcg OOM is a different discussion. Here is where we want to be able to extract things like heap profiles or take stats snapshots, grow memcgs (if so configured), etc. Allowing our users to have a moment of mercy before we put a bullet in their brain enables a whole new realm of debugging, as well as a lot of valuable features.

> It'd depend on the workload but with memcg fully configured it > shouldn't fluctuate wildly. If it does, we need to hunt down whatever > is causing such fluctuation and include it in kmemcg, right? That > way, memcg as a whole improves for all use cases not just your niche > one and I strongly believe that aligning as many use cases as possible > along the same axis, rather than creating a large hole to stow away > the exceptions, is vastly more beneficial to *everyone* in the long > term.

We have a long tail of kernel memory usage. If we provision machines so that the "do work here" first-level memcg excludes the average kernel usage, we have a huge number of machines that will fail to apply OOM policy because of actual overcommitment. If we provision for 95th or 99th percentile kernel usage, we're wasting large amounts of memory that could be used to schedule jobs. This is the fundamental problem we face with static apportionment (and we face it in a dozen other situations, too). Expressing this set-aside memory as "off-the-top" rather than absolute limits makes the whole system more flexible.

> There'd still be all the bells and whistles to configure and monitor > system-level OOM and if there's justified need for improvements, we > surely can and should do that; however, with the heavy lifting / hot > path offloaded to the per-memcg userland OOM handlers, I believe it's > reasonable to expect the burden on the system OOM handler to be noticeably > less, which is the way it should be. That's the last guard against > the whole system completely locking up and we can't extend its > capabilities beyond that easily and we most likely don't even want to. > > If I take a step back and look at the two options and their pros and > cons, which path we should take is rather obvious to me. I hope you > see it too. > > Thanks. > > -- > tejun
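(To make Tim's policy concrete, here is a userspace sketch of the walk he describes. The two-level administrative tree rooted at /sys/fs/cgroup/memory/admin and the memcg.priority file are hypothetical knobs invented for illustration; only the per-memcg tasks file is a real cgroup v1 interface.)

	#include <dirent.h>
	#include <limits.h>
	#include <signal.h>
	#include <stdio.h>
	#include <sys/types.h>

	/* read a hypothetical per-memcg priority; LONG_MAX if absent */
	static long memcg_prio(const char *dir)
	{
		char p[PATH_MAX];
		long v = LONG_MAX;
		FILE *f;

		snprintf(p, sizeof(p), "%s/memcg.priority", dir); /* hypothetical */
		if ((f = fopen(p, "r"))) {
			if (fscanf(f, "%ld", &v) != 1)
				v = LONG_MAX;
			fclose(f);
		}
		return v;
	}

	/* descend, picking the lowest-priority child at each level */
	static void pick_victim(const char *dir, char *victim, size_t len)
	{
		char best[PATH_MAX] = "";
		long best_prio = LONG_MAX;
		struct dirent *de;
		DIR *d = opendir(dir);

		snprintf(victim, len, "%s", dir);
		if (!d)
			return;
		while ((de = readdir(d)) != NULL) {
			char child[PATH_MAX];
			long prio;

			if (de->d_type != DT_DIR || de->d_name[0] == '.')
				continue;
			snprintf(child, sizeof(child), "%s/%s", dir, de->d_name);
			prio = memcg_prio(child);
			if (prio < best_prio) {
				best_prio = prio;
				snprintf(best, sizeof(best), "%s", child);
			}
		}
		closedir(d);
		if (best[0])
			pick_victim(best, victim, len);
	}

	/* kill *everything* attached to the selected memcg */
	static void kill_memcg(const char *dir)
	{
		char p[PATH_MAX];
		FILE *f;
		int pid;

		snprintf(p, sizeof(p), "%s/tasks", dir);
		if (!(f = fopen(p, "r")))
			return;
		while (fscanf(f, "%d", &pid) == 1)
			kill(pid, SIGKILL);
		fclose(f);
	}

	int main(void)
	{
		char victim[PATH_MAX];

		pick_victim("/sys/fs/cgroup/memory/admin", victim, sizeof(victim));
		fprintf(stderr, "oom-killing memcg %s\n", victim);
		kill_memcg(victim);
		return 0;
	}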
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves 2013-12-12 18:42 ` Tim Hockin @ 2013-12-12 19:23 ` Tejun Heo 2013-12-13 0:23 ` Tim Hockin 0 siblings, 1 reply; 39+ messages in thread From: Tejun Heo @ 2013-12-12 19:23 UTC (permalink / raw) To: Tim Hockin Cc: David Rientjes, Johannes Weiner, Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki, Mel Gorman, Rik van Riel, Pekka Enberg, Christoph Lameter, Li Zefan, linux-kernel@vger.kernel.org, linux-mm, Cgroups

Hello, Tim.

On Thu, Dec 12, 2013 at 10:42:20AM -0800, Tim Hockin wrote: > Yeah sorry. Replying from my phone is awkward at best. I know better :)

Heh, sorry about being bitchy. :)

> In my mind, the ONLY point of pulling system-OOM handling into > userspace is to make it easier for crazy people (Google) to implement > bizarre system-OOM policies. Example:

I think that's one of the places where we largely disagree. If at all possible, I'd much prefer google's workload to be supported inside the general boundaries of the upstream kernel without having to punch a large hole in it. To me, the general development history of memcg in general and this thread in particular seem to epitomize why it is a bad idea to have isolated, large and deep "crazy" use cases. Punching the initial hole is the easy part; however, we all are quite limited in anticipating future needs and sooner or later that crazy use case is bound to evolve further towards the isolated extreme it departed towards and require more and larger holes and further contortions to accommodate such progress.

The concern I have with the suggested solution is not necessarily that it's more technically complex than it looks on the surface - I'm sure it can be made to work one way or the other - but that it's a fairly large step toward an isolated extreme which memcg as a project probably should not head toward. There sure are cases where such exceptions can't be avoided and are good trade-offs but, here, we're talking about a major architectural decision which not only affects memcg but mm in general. I'm afraid this doesn't sound like a no-brainer flexibility we can afford.

> When we have a system OOM we want to do a walk of the administrative > memcg tree (which is only a couple levels deep, users can make > non-admin sub-memcgs), selecting the lowest priority entity at each > step (where both tasks and memcgs have a priority and the priority > range is much wider than the current OOM scores, and where memcg > priority is sometimes a function of memcg usage), until we reach a > leaf. > > Once we reach a leaf, I want to log some info about the memcg doing > the allocation, the memcg being terminated, and maybe some other bits > about the system (depending on the priority of the selected victim, > this may or may not be an "acceptable" situation). Then I want to > kill *everything* under that memcg. Then I want to "publish" some > information through a sane API (e.g. not dmesg scraping). > > This is basically our policy as we understand it today. This is > notably different than it was a year ago, and it will probably evolve > further in the next year.

I think per-memcg score and killing is something which makes fundamental sense.
In fact, killing a single process has never made much sense to me as that is a unit which ultimately is only meaningful to the kernel itself and not necessarily to userland, so no matter what I think we're gonna gain per-memcg behavior and it seems most, albeit not all, of what you described above should be implementable through that.

Ultimately, if the use case calls for very fine level of control, I think the right thing to do is making nesting work properly which is likely to take some time. In the meantime, even if such use case requires modifying the kernel to tailor the OOM behavior, I think sticking to kernel OOM provides a lot easier way to eventual convergence. Userland system OOM basically means giving up and would lessen the motivation towards improving the shared infrastructures while adding significant pressure towards schizophrenic diversion.

> We have a long tail of kernel memory usage. If we provision machines > so that the "do work here" first-level memcg excludes the average > kernel usage, we have a huge number of machines that will fail to > apply OOM policy because of actual overcommitment. If we provision > for 95th or 99th percentile kernel usage, we're wasting large amounts > of memory that could be used to schedule jobs. This is the > fundamental problem we face with static apportionment (and we face it > in a dozen other situations, too). Expressing this set-aside memory > as "off-the-top" rather than absolute limits makes the whole system > more flexible.

I agree that's pretty sad. Maybe I shouldn't be surprised given the far-from-perfect coverage of kmemcg at this point, but, again, *everyone* wants [k]memcg coverage to be more complete and we have and are still building the infrastructures to make that possible, so I'm still of the opinion that making [k]memcg work better is the better direction to pursue and given the short development history of kmemcg I'm fairly sure there are quite a few low hanging fruits.

Another thing which *might* be relevant is the rigidity of the upper limit and the vagueness of soft limit of the current implementation. I have a rather strong suspicion that the way memcg config knobs behave now - one finicky, the other whatever - is likely hindering the use cases from fanning out more naturally. I could be completely wrong on this but your mention of inflexibility of absolute limits reminds me of the issue.

Thanks.

-- tejun
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves 2013-12-12 19:23 ` Tejun Heo @ 2013-12-13 0:23 ` Tim Hockin 2013-12-13 11:47 ` Tejun Heo 0 siblings, 1 reply; 39+ messages in thread From: Tim Hockin @ 2013-12-13 0:23 UTC (permalink / raw) To: Tejun Heo Cc: David Rientjes, Johannes Weiner, Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki, Mel Gorman, Rik van Riel, Pekka Enberg, Christoph Lameter, Li Zefan, linux-kernel@vger.kernel.org, linux-mm, Cgroups, Victor Marmol

On Thu, Dec 12, 2013 at 11:23 AM, Tejun Heo <tj@kernel.org> wrote: > Hello, Tim. > > On Thu, Dec 12, 2013 at 10:42:20AM -0800, Tim Hockin wrote: >> Yeah sorry. Replying from my phone is awkward at best. I know better :) > > Heh, sorry about being bitchy. :) > >> In my mind, the ONLY point of pulling system-OOM handling into >> userspace is to make it easier for crazy people (Google) to implement >> bizarre system-OOM policies. Example: > > I think that's one of the places where we largely disagree. If at all

Just to be clear - I say this because it doesn't feel right to impose my craziness on others, and it sucks when we try and are met with "you're crazy, go away". And you have to admit that happens to Google. :) Punching an escape valve that allows us to be crazy without hurting anyone else sounds ideal, IF and ONLY IF that escape valve is itself maintainable.

If the escape valve is userspace it's REALLY easy to iterate on our craziness. If it is kernel space, it's somewhat less easy, but not impossible.

> possible, I'd much prefer google's workload to be supported inside the > general boundaries of the upstream kernel without having to punch a > large hole in it. To me, the general development history of memcg in > general and this thread in particular seem to epitomize why it is a > bad idea to have isolated, large and deep "crazy" use cases. Punching > the initial hole is the easy part; however, we all are quite limited > in anticipating future needs and sooner or later that crazy use case is > bound to evolve further towards the isolated extreme it departed > towards and require more and larger holes and further contortions to > accommodate such progress. > > The concern I have with the suggested solution is not necessarily that > it's more technically complex than it looks on the surface - I'm sure > it can be made to work one way or the other - but that it's a fairly > large step toward an isolated extreme which memcg as a project > probably should not head toward. > > There sure are cases where such exceptions can't be avoided and are > good trade-offs but, here, we're talking about a major architectural > decision which not only affects memcg but mm in general. I'm afraid > this doesn't sound like a no-brainer flexibility we can afford. > >> When we have a system OOM we want to do a walk of the administrative >> memcg tree (which is only a couple levels deep, users can make >> non-admin sub-memcgs), selecting the lowest priority entity at each >> step (where both tasks and memcgs have a priority and the priority >> range is much wider than the current OOM scores, and where memcg >> priority is sometimes a function of memcg usage), until we reach a >> leaf. >> >> Once we reach a leaf, I want to log some info about the memcg doing >> the allocation, the memcg being terminated, and maybe some other bits >> about the system (depending on the priority of the selected victim, >> this may or may not be an "acceptable" situation). Then I want to >> kill *everything* under that memcg.
Then I want to "publish" some >> information through a sane API (e.g. not dmesg scraping). >> >> This is basically our policy as we understand it today. This is >> notably different than it was a year ago, and it will probably evolve >> further in the next year. > > I think per-memcg score and killing is something which makes > fundamental sense. In fact, killing a single process has never made > much sense to me as that is a unit which ultimately is only meaningful > to the kernel itself and not necessraily to userland, so no matter > what I think we're gonna gain per-memcg behavior and it seems most, > albeit not all, of what you described above should be implementable > through that. Well that's an awesome start. We have or had patches to do a lot of this. I don't know how well scrubbed they are for pushing or whether they apply at all to current head, though. > Ultimately, if the use case calls for very fine level of control, I > think the right thing to do is making nesting work properly which is > likely to take some time. In the meantime, even if such use case > requires modifying the kernel to tailor the OOM behavior, I think > sticking to kernel OOM provides a lot easier way to eventual > convergence. Userland system OOM basically means giving up and would > lessen the motivation towards improving the shared infrastructures > while adding significant pressure towards schizophreic diversion. > >> We have a long tail of kernel memory usage. If we provision machines >> so that the "do work here" first-level memcg excludes the average >> kernel usage, we have a huge number of machines that will fail to >> apply OOM policy because of actual overcommitment. If we provision >> for 95th or 99th percentile kernel usage, we're wasting large amounts >> of memory that could be used to schedule jobs. This is the >> fundamental problem we face with static apportionment (and we face it >> in a dozen other situations, too). Expressing this set-aside memory >> as "off-the-top" rather than absolute limits makes the whole system >> more flexible. > > I agree that's pretty sad. Maybe I shouldn't be surprised given the > far-from-perfect coverage of kmemcg at this point, but, again, > *everyone* wants [k]memcg coverage to be more complete and we have and > are still building the infrastructures to make that possible, so I'm > still of the opinion that making [k]memcg work better is the better > direction to pursue and given the short development history of kmemcg > I'm fairly sure there are quite a few low hanging fruits. yes we should fix accounting across the board. We are hugely in favor of that. But I don't buy that we'll erase that tail. Fundamentally, we don't know what the limit is, but we know that we need to save a little "off the top". I'm very much hoping we can find a way to express that. As an aside: mucking about with extra nesting levels to achieve a stable OOM semantic sounds doable, but it certainly sucks in a unified hierarchy. We'll end up with 1, 2, or 3 (or more in esoteric cases? not sure) extra nesting levels for every other resource dimension. And lawd help us if we ever need to do something similar in a different resource dimension - the cross product is mind-bending. What we do with split-hierarchies is this but on a smaller scale. > Another thing which *might* be relevant is the rigidity of the upper > limit and the vagueness of soft limit of the current implementation. 
> I have a rather strong suspicion that the way memcg config knobs > behave now - one finicky, the other whatever - is likely hindering the > use cases from fanning out more naturally. I could be completely wrong on > this but your mention of inflexibility of absolute limits reminds me > of the issue. > > Thanks. > > -- > tejun
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves 2013-12-13 0:23 ` Tim Hockin @ 2013-12-13 11:47 ` Tejun Heo 0 siblings, 0 replies; 39+ messages in thread From: Tejun Heo @ 2013-12-13 11:47 UTC (permalink / raw) To: Tim Hockin Cc: David Rientjes, Johannes Weiner, Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki, Mel Gorman, Rik van Riel, Pekka Enberg, Christoph Lameter, Li Zefan, linux-kernel@vger.kernel.org, linux-mm, Cgroups, Victor Marmol

Hello, Tim.

On Thu, Dec 12, 2013 at 04:23:18PM -0800, Tim Hockin wrote: > Just to be clear - I say this because it doesn't feel right to impose > my craziness on others, and it sucks when we try and are met with > "you're crazy, go away". And you have to admit that happens to > Google. :) Punching an escape valve that allows us to be crazy > without hurting anyone else sounds ideal, IF and ONLY IF that escape > valve is itself maintainable.

I don't think google being considered crazy is a good thing in general, and highly likely not something to be proud of. It sure is partly indicative of the specialization that you guys need but, I suspect, is a much stronger signal of room for better engineering. I'm fairly certain the blame is abundant for everybody to share. The point I'm trying to make is "let's please stop diverging". It hurts everybody.

> If the escape valve is userspace it's REALLY easy to iterate on our > craziness. If it is kernel space, it's somewhat less easy, but not > impossible.

As I'm sure you've gathered from this thread, even punching the initial hole is a sizable burden and contortion to the general memory management and I'm sure as you guys develop further down the path you'll encounter cases where you need further support or holes from the kernel. I can't anticipate the details but the fact that those will follow is as evident as the day to me, especially given the mindset leading to the current situation in the first place.

Please note that this part of the discussion is more abstract than necessary for this particular patchset or hole. I'm quite doubtful that system-level OOM handling with a separate physical reserve is likely to survive even just on technical details. The reason why I'm keeping at this abstract point is because this seems to be a continuing trend rather than a single occurrence and I really hope it changes.

> Well that's an awesome start. We have or had patches to do a lot of > this. I don't know how well scrubbed they are for pushing or whether > they apply at all to current head, though.

Awesome, this looks like something everyone agrees on. :)

> As an aside: mucking about with extra nesting levels to achieve a > stable OOM semantic sounds doable, but it certainly sucks in a unified > hierarchy. We'll end up with 1, 2, or 3 (or more in esoteric cases? > not sure) extra nesting levels for every other resource dimension. > And lawd help us if we ever need to do something similar in a > different resource dimension - the cross product is mind-bending. > What we do with split-hierarchies is this but on a smaller scale.

Yes, agreed but I believe there are substantial benefits to having a certain level of structural constraints. It encourages people to ponder the underlying issues and make active trade-offs. Not that going to that extreme would be good either, but we've gone too far towards the other end.
This being a special issue with memcg, if this turns out to be a big enough problem, I don't think having a provision to be able to handle it without further nesting would be too crazy - e.g. the ability to mark a single cgroup at the root level as the OOM handler or whatever - as long as we stay within the boundaries of memcg and cgroup proper, but we seem to have ways to go before worrying about that one.

Thanks.

-- tejun
* [patch 8/8] mm, memcg: add memcg oom reserve documentation 2013-12-04 5:19 ` [patch 1/8] fork: collapse copy_flags into copy_process David Rientjes ` (5 preceding siblings ...) 2013-12-04 5:20 ` [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves David Rientjes @ 2013-12-04 5:20 ` David Rientjes 6 siblings, 0 replies; 39+ messages in thread From: David Rientjes @ 2013-12-04 5:20 UTC (permalink / raw) To: Andrew Morton Cc: Michal Hocko, KAMEZAWA Hiroyuki, Johannes Weiner, Mel Gorman, Rik van Riel, Pekka Enberg, Christoph Lameter, linux-kernel, linux-mm, cgroups

Add documentation on memcg oom reserves to Documentation/cgroups/memory.txt and give an example of its usage and recommended best practices.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 Documentation/cgroups/memory.txt | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -71,6 +71,7 @@ Brief summary of control files.
 			 (See sysctl's vm.swappiness)
 memory.move_charge_at_immigrate # set/show controls of moving charges
 memory.oom_control		 # set/show oom controls.
+memory.oom_reserve_in_bytes	 # set/show limit of oom memory reserves
 memory.numa_stat		 # show the number of memory usage per numa node
 memory.kmem.limit_in_bytes      # set/show hard limit for kernel memory
@@ -772,6 +773,31 @@ At reading, current status of OOM is shown.
 	under_oom	 0 or 1 (if 1, the memory cgroup is under OOM, tasks may
 			 be stopped.)
+Processes that handle oom conditions in their own memcgs or their child
+memcgs may need to allocate memory themselves to do anything useful,
+including pagefaulting their text or allocating kernel memory to read the
+memcg "tasks" file.  For this reason, memory.oom_reserve_in_bytes is
+provided; it specifies how much memory processes waiting on
+memory.oom_control can allocate above the memcg limit.
+
+The memcg that the oom handler is attached to is charged for the memory
+that it allocates against its own memory.oom_reserve_in_bytes.  This
+memory is therefore only available to processes that are waiting for
+a notification.
+
+For example, if you do
+
+	# echo 2m > memory.oom_reserve_in_bytes
+
+then any process attached to this memcg that is waiting on memcg oom
+notifications anywhere on the system can allocate an additional 2MB
+above memory.limit_in_bytes.
+
+You may still consider doing mlockall(MCL_FUTURE) for processes that
+are waiting on oom notifications to keep this value as minimal as
+possible, or allow it to be large enough so that its text can still
+be pagefaulted in under oom conditions when the value is known.
+
 11. Memory Pressure

The pressure level notifications can be used to monitor the memory
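(Written out, the registration flow this documentation describes - a handler parked on memory.oom_control via eventfd, with mlockall() so that acting on the notification does not itself page fault - looks like the following. The eventfd/cgroup.event_control mechanism is the standard cgroup v1 memcg interface; the memcg path is an example and the handler body is left as a stub.)

	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/eventfd.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(int argc, char **argv)
	{
		const char *memcg = argc > 1 ? argv[1]
				: "/sys/fs/cgroup/memory/A"; /* example path */
		char path[512], buf[64];
		uint64_t ticks;
		int efd, ofd, cfd;

		/* lock text and data now so oom handling needs no page faults */
		if (mlockall(MCL_CURRENT | MCL_FUTURE))
			perror("mlockall");

		efd = eventfd(0, 0);
		snprintf(path, sizeof(path), "%s/memory.oom_control", memcg);
		ofd = open(path, O_RDONLY);
		snprintf(path, sizeof(path), "%s/cgroup.event_control", memcg);
		cfd = open(path, O_WRONLY);
		if (efd < 0 || ofd < 0 || cfd < 0) {
			perror("setup");
			return 1;
		}

		/* register the eventfd for oom notifications on this memcg */
		snprintf(buf, sizeof(buf), "%d %d", efd, ofd);
		if (write(cfd, buf, strlen(buf)) < 0) {
			perror("cgroup.event_control");
			return 1;
		}

		for (;;) {
			if (read(efd, &ticks, sizeof(ticks)) != sizeof(ticks))
				break;
			/* the memcg is oom: act here (log, grow the limit, kill) */
			fprintf(stderr, "oom in %s\n", memcg);
		}
		return 0;
	}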
end of thread
Thread overview: 39+ messages
[not found] <20131119131400.GC20655@dhcp22.suse.cz>
[not found] ` <20131119134007.GD20655@dhcp22.suse.cz>
[not found] ` <alpine.DEB.2.02.1311192352070.20752@chino.kir.corp.google.com>
[not found] ` <20131120152251.GA18809@dhcp22.suse.cz>
[not found] ` <alpine.DEB.2.02.1311201917520.7167@chino.kir.corp.google.com>
[not found] ` <20131128115458.GK2761@dhcp22.suse.cz>
[not found] ` <alpine.DEB.2.02.1312021504170.13465@chino.kir.corp.google.com>
2013-12-04 5:19 ` [patch 1/8] fork: collapse copy_flags into copy_process David Rientjes
2013-12-04 5:19 ` [patch 2/8] mm, mempolicy: rename slab_node for clarity David Rientjes
[not found] ` <alpine.DEB.2.02.1312032117330.29733-X6Q0R45D7oAcqpCFd4KODRPsWskHk0ljAL8bYrjMMd8@public.gmane.org>
2013-12-04 15:21 ` Christoph Lameter
2013-12-04 5:20 ` [patch 3/8] mm, mempolicy: remove per-process flag David Rientjes
2013-12-04 15:24 ` Christoph Lameter
2013-12-05 0:53 ` David Rientjes
2013-12-05 19:05 ` Christoph Lameter
2013-12-05 23:53 ` David Rientjes
2013-12-06 14:46 ` Christoph Lameter
2013-12-04 5:20 ` [patch 4/8] mm, memcg: add tunable for oom reserves David Rientjes
2013-12-04 5:20 ` [patch 5/8] res_counter: remove interface for locked charging and uncharging David Rientjes
2013-12-04 5:20 ` [patch 6/8] res_counter: add interface for maximum nofail charge David Rientjes
2013-12-04 5:20 ` [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves David Rientjes
2013-12-04 5:45 ` Johannes Weiner
2013-12-05 1:49 ` David Rientjes
2013-12-05 2:50 ` Tejun Heo
[not found] ` <20131205025026.GA26777-Gd/HAXX7CRxy/B6EtB590w@public.gmane.org>
2013-12-05 23:49 ` David Rientjes
[not found] ` <alpine.DEB.2.02.1312051537550.7717-X6Q0R45D7oAcqpCFd4KODRPsWskHk0ljAL8bYrjMMd8@public.gmane.org>
2013-12-06 17:34 ` Johannes Weiner
2013-12-07 16:38 ` Tim Hockin
2013-12-07 17:40 ` Johannes Weiner
2013-12-07 18:12 ` Tim Hockin
[not found] ` <CAAAKZwvanMiz8QZVOU0-SUKYzqcaJAXn0HxYs5+=Zakmnbcfbg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2013-12-07 19:06 ` Johannes Weiner
2013-12-07 21:04 ` Tim Hockin
2013-12-06 19:01 ` Tejun Heo
2013-12-09 20:10 ` David Rientjes
[not found] ` <alpine.DEB.2.02.1312061441390.8949-X6Q0R45D7oAcqpCFd4KODRPsWskHk0ljAL8bYrjMMd8@public.gmane.org>
2013-12-09 22:37 ` Johannes Weiner
2013-12-10 21:50 ` Tejun Heo
2013-12-10 23:55 ` David Rientjes
2013-12-11 9:49 ` Mel Gorman
2013-12-11 12:42 ` Tejun Heo
2013-12-12 5:37 ` Tim Hockin
2013-12-12 14:21 ` Tejun Heo
2013-12-12 16:32 ` Michal Hocko
2013-12-12 16:37 ` Tejun Heo
[not found] ` <20131212142156.GB32683-Gd/HAXX7CRxy/B6EtB590w@public.gmane.org>
2013-12-12 18:42 ` Tim Hockin
2013-12-12 19:23 ` Tejun Heo
2013-12-13 0:23 ` Tim Hockin
2013-12-13 11:47 ` Tejun Heo
2013-12-04 5:20 ` [patch 8/8] mm, memcg: add memcg oom reserve documentation David Rientjes