* [patch 1/8] fork: collapse copy_flags into copy_process
  [not found] ` <alpine.DEB.2.02.1312021504170.13465@chino.kir.corp.google.com>
@ 2013-12-04  5:19 ` David Rientjes
  2013-12-04  5:19 ` [patch 2/8] mm, mempolicy: rename slab_node for clarity David Rientjes
  ` (6 more replies)
  0 siblings, 7 replies; 39+ messages in thread
From: David Rientjes @ 2013-12-04 5:19 UTC
To: Andrew Morton
Cc: Michal Hocko, KAMEZAWA Hiroyuki, Johannes Weiner, Mel Gorman,
    Rik van Riel, Pekka Enberg, Christoph Lameter, linux-kernel,
    linux-mm, cgroups

copy_flags() does not use its clone_flags formal parameter and can be
collapsed into copy_process() for cleaner code.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 kernel/fork.c | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1066,15 +1066,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	return 0;
 }
 
-static void copy_flags(unsigned long clone_flags, struct task_struct *p)
-{
-	unsigned long new_flags = p->flags;
-
-	new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
-	new_flags |= PF_FORKNOEXEC;
-	p->flags = new_flags;
-}
-
 SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
 {
 	current->clear_child_tid = tidptr;
@@ -1223,7 +1214,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->did_exec = 0;
 	delayacct_tsk_init(p);	/* Must remain after dup_task_struct() */
-	copy_flags(clone_flags, p);
+	p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
+	p->flags |= PF_FORKNOEXEC;
 	INIT_LIST_HEAD(&p->children);
 	INIT_LIST_HEAD(&p->sibling);
 	rcu_copy_process(p);
* Re: [patch 2/8] mm, mempolicy: rename slab_node for clarity
  2013-12-04  5:19 ` [patch 1/8] fork: collapse copy_flags into copy_process David Rientjes
@ 2013-12-04  5:19 ` David Rientjes
  [not found]   ` <alpine.DEB.2.02.1312032117330.29733-X6Q0R45D7oAcqpCFd4KODRPsWskHk0ljAL8bYrjMMd8@public.gmane.org>
  2013-12-04  5:20 ` [patch 3/8] mm, mempolicy: remove per-process flag David Rientjes
  ` (5 subsequent siblings)
  6 siblings, 1 reply; 39+ messages in thread
From: David Rientjes @ 2013-12-04 5:19 UTC
To: Andrew Morton
Cc: Michal Hocko, KAMEZAWA Hiroyuki, Johannes Weiner, Mel Gorman,
    Rik van Riel, Pekka Enberg, Christoph Lameter, linux-kernel,
    linux-mm, cgroups

slab_node() is actually a mempolicy function, so rename it to
mempolicy_slab_node() to make it clearer that it is used for processes
with mempolicies.

At the same time, clean up its code by saving numa_mem_id() in a local
variable (since we require a node with memory, not just any node) and
remove an obsolete comment that assumes the mempolicy is actually passed
into the function.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 include/linux/mempolicy.h |  2 +-
 mm/mempolicy.c            | 15 ++++++---------
 mm/slab.c                 |  4 ++--
 mm/slub.c                 |  2 +-
 4 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -151,7 +151,7 @@ extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
 extern bool init_nodemask_of_mempolicy(nodemask_t *mask);
 extern bool mempolicy_nodemask_intersects(struct task_struct *tsk,
 				const nodemask_t *mask);
-extern unsigned slab_node(void);
+extern unsigned int mempolicy_slab_node(void);
 
 extern enum zone_type policy_zone;

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1783,21 +1783,18 @@ static unsigned interleave_nodes(struct mempolicy *policy)
 /*
  * Depending on the memory policy provide a node from which to allocate the
  * next slab entry.
- * @policy must be protected by freeing by the caller.  If @policy is
- * the current task's mempolicy, this protection is implicit, as only the
- * task can change it's policy.  The system default policy requires no
- * such protection.
  */
-unsigned slab_node(void)
+unsigned int mempolicy_slab_node(void)
 {
 	struct mempolicy *policy;
+	int node = numa_mem_id();
 
 	if (in_interrupt())
-		return numa_node_id();
+		return node;
 
 	policy = current->mempolicy;
 	if (!policy || policy->flags & MPOL_F_LOCAL)
-		return numa_node_id();
+		return node;
 
 	switch (policy->mode) {
 	case MPOL_PREFERRED:
@@ -1817,11 +1814,11 @@ unsigned slab_node(void)
 		struct zonelist *zonelist;
 		struct zone *zone;
 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
-		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
+		zonelist = &NODE_DATA(node)->node_zonelists[0];
 		(void)first_zones_zonelist(zonelist, highest_zoneidx,
 							&policy->v.nodes,
 							&zone);
-		return zone ? zone->node : numa_node_id();
+		return zone ? zone->node : node;
 	}
 
 	default:

diff --git a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3042,7 +3042,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
 	if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
 		nid_alloc = cpuset_slab_spread_node();
 	else if (current->mempolicy)
-		nid_alloc = slab_node();
+		nid_alloc = mempolicy_slab_node();
 	if (nid_alloc != nid_here)
 		return ____cache_alloc_node(cachep, flags, nid_alloc);
 	return NULL;
@@ -3074,7 +3074,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
 
 retry_cpuset:
 	cpuset_mems_cookie = get_mems_allowed();
-	zonelist = node_zonelist(slab_node(), flags);
+	zonelist = node_zonelist(mempolicy_slab_node(), flags);
 
 retry:
 	/*

diff --git a/mm/slub.c b/mm/slub.c
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1663,7 +1663,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
 
 	do {
 		cpuset_mems_cookie = get_mems_allowed();
-		zonelist = node_zonelist(slab_node(), flags);
+		zonelist = node_zonelist(mempolicy_slab_node(), flags);
 		for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
 			struct kmem_cache_node *n;
* Re: [patch 2/8] mm, mempolicy: rename slab_node for clarity
  [not found] ` <alpine.DEB.2.02.1312032117330.29733-X6Q0R45D7oAcqpCFd4KODRPsWskHk0ljAL8bYrjMMd8@public.gmane.org>
@ 2013-12-04 15:21 ` Christoph Lameter
  0 siblings, 0 replies; 39+ messages in thread
From: Christoph Lameter @ 2013-12-04 15:21 UTC
To: David Rientjes
Cc: Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki, Johannes Weiner,
    Mel Gorman, Rik van Riel, Pekka Enberg,
    linux-kernel-u79uwXL29TY76Z2rM5mHXA, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
    cgroups-u79uwXL29TY76Z2rM5mHXA

On Tue, 3 Dec 2013, David Rientjes wrote:

> slab_node() is actually a mempolicy function, so rename it to
> mempolicy_slab_node() to make it clearer that it is used for processes
> with mempolicies.

Acked-by: Christoph Lameter <cl-vYTEC60ixJUAvxtiuMwx3w@public.gmane.org>
* [patch 3/8] mm, mempolicy: remove per-process flag
  2013-12-04  5:19 ` [patch 1/8] fork: collapse copy_flags into copy_process David Rientjes
  2013-12-04  5:19 ` [patch 2/8] mm, mempolicy: rename slab_node for clarity David Rientjes
@ 2013-12-04  5:20 ` David Rientjes
  2013-12-04 15:24   ` Christoph Lameter
  2013-12-04  5:20 ` [patch 4/8] mm, memcg: add tunable for oom reserves David Rientjes
  ` (4 subsequent siblings)
  6 siblings, 1 reply; 39+ messages in thread
From: David Rientjes @ 2013-12-04 5:20 UTC
To: Andrew Morton
Cc: Michal Hocko, KAMEZAWA Hiroyuki, Johannes Weiner, Mel Gorman,
    Rik van Riel, Pekka Enberg, Christoph Lameter, linux-kernel,
    linux-mm, cgroups

PF_MEMPOLICY is an unnecessary optimization for CONFIG_SLAB users.
There's no significant performance degradation to checking
current->mempolicy rather than current->flags & PF_MEMPOLICY in the
allocation path, especially since this is considered unlikely().

Per-process flags are a scarce resource, so we should free them up
whenever possible.  We'll be using this one shortly for memcg oom
reserves.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 include/linux/mempolicy.h |  5 -----
 include/linux/sched.h     |  1 -
 kernel/fork.c             |  1 -
 mm/mempolicy.c            | 31 -------------------------------
 mm/slab.c                 |  4 ++--
 5 files changed, 2 insertions(+), 40 deletions(-)

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -143,7 +143,6 @@ extern void numa_policy_init(void);
 extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
 				enum mpol_rebind_step step);
 extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);
-extern void mpol_fix_fork_child_flag(struct task_struct *p);
 
 extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
 				unsigned long addr, gfp_t gfp_flags,
@@ -266,10 +265,6 @@ static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 {
 }
 
-static inline void mpol_fix_fork_child_flag(struct task_struct *p)
-{
-}
-
 static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
 		unsigned long addr, gfp_t gfp_flags,
 		struct mempolicy **mpol, nodemask_t **nodemask)

diff --git a/include/linux/sched.h b/include/linux/sched.h
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1695,7 +1695,6 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut,
 #define PF_SPREAD_SLAB	0x02000000	/* Spread some slab caches over cpuset */
 #define PF_NO_SETAFFINITY 0x04000000	/* Userland is not allowed to meddle with cpus_allowed */
 #define PF_MCE_EARLY	0x08000000	/* Early kill for mce process policy */
-#define PF_MEMPOLICY	0x10000000	/* Non-default NUMA mempolicy */
 #define PF_MUTEX_TESTER	0x20000000	/* Thread belongs to the rt mutex tester */
 #define PF_FREEZER_SKIP	0x40000000	/* Freezer should not count it as freezable */
 #define PF_SUSPEND_TASK 0x80000000	/* this thread called freeze_processes and should not be frozen */

diff --git a/kernel/fork.c b/kernel/fork.c
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1261,7 +1261,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		p->mempolicy = NULL;
 		goto bad_fork_cleanup_cgroup;
 	}
-	mpol_fix_fork_child_flag(p);
 #endif
 #ifdef CONFIG_CPUSETS
 	p->cpuset_mem_spread_rotor = NUMA_NO_NODE;

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -796,36 +796,6 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
 	return err;
 }
 
-/*
- * Update task->flags PF_MEMPOLICY bit: set iff non-default
- * mempolicy.  Allows more rapid checking of this (combined perhaps
- * with other PF_* flag bits) on memory allocation hot code paths.
- *
- * If called from outside this file, the task 'p' should -only- be
- * a newly forked child not yet visible on the task list, because
- * manipulating the task flags of a visible task is not safe.
- *
- * The above limitation is why this routine has the funny name
- * mpol_fix_fork_child_flag().
- *
- * It is also safe to call this with a task pointer of current,
- * which the static wrapper mpol_set_task_struct_flag() does,
- * for use within this file.
- */
-
-void mpol_fix_fork_child_flag(struct task_struct *p)
-{
-	if (p->mempolicy)
-		p->flags |= PF_MEMPOLICY;
-	else
-		p->flags &= ~PF_MEMPOLICY;
-}
-
-static void mpol_set_task_struct_flag(void)
-{
-	mpol_fix_fork_child_flag(current);
-}
-
 /* Set the process memory policy */
 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 			     nodemask_t *nodes)
@@ -862,7 +832,6 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 	}
 	old = current->mempolicy;
 	current->mempolicy = new;
-	mpol_set_task_struct_flag();
 	if (new && new->mode == MPOL_INTERLEAVE &&
 	    nodes_weight(new->v.nodes))
 		current->il_next = first_node(new->v.nodes);

diff --git a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3027,7 +3027,7 @@ out:
 
 #ifdef CONFIG_NUMA
 /*
- * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY.
+ * Try allocating on another node if PF_SPREAD_SLAB or a mempolicy is set.
  *
  * If we are in_interrupt, then process context, including cpusets and
  * mempolicy, may not apply and should not be used for allocation policy.
  */
@@ -3259,7 +3259,7 @@ __do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
 {
 	void *objp;
 
-	if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
+	if (current->mempolicy || unlikely(current->flags & PF_SPREAD_SLAB)) {
 		objp = alternate_node_alloc(cache, flags);
 		if (objp)
 			goto out;
* Re: [patch 3/8] mm, mempolicy: remove per-process flag
  2013-12-04  5:20 ` [patch 3/8] mm, mempolicy: remove per-process flag David Rientjes
@ 2013-12-04 15:24 ` Christoph Lameter
  2013-12-05  0:53   ` David Rientjes
  0 siblings, 1 reply; 39+ messages in thread
From: Christoph Lameter @ 2013-12-04 15:24 UTC
To: David Rientjes
Cc: Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki, Johannes Weiner,
    Mel Gorman, Rik van Riel, Pekka Enberg, linux-kernel, linux-mm,
    cgroups

On Tue, 3 Dec 2013, David Rientjes wrote:

> PF_MEMPOLICY is an unnecessary optimization for CONFIG_SLAB users.
> There's no significant performance degradation to checking
> current->mempolicy rather than current->flags & PF_MEMPOLICY in the
> allocation path, especially since this is considered unlikely().

The use of current->mempolicy increases the cache footprint since it
sits in a rarely used cacheline.  This performance issue would occur
when memory policies are not used, since that cacheline would then have
to be touched regardless of whether memory policies are in effect or
not.  PF_MEMPOLICY was used to avoid touching the cacheline.
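Christoph's layout argument can be made concrete with a userspace sketch:
task->flags shares a cacheline with hot fields, while the mempolicy
pointer sits in rarely touched state, so testing the pointer drags in an
extra cold cacheline.  The struct below is a hypothetical stand-in, not
the kernel's task_struct:

#include <stdio.h>
#include <stddef.h>

/* Toy stand-in for task_struct: "flags" lives among hot fields while
 * "mempolicy" is buried in rarely used state, so a pointer check pulls
 * in an additional (cold) cacheline on every allocation. */
struct toy_task {
	unsigned long flags;		/* hot: read on many fast paths */
	char hot_fields[56];		/* other frequently accessed state */
	char cold_fields[1024];		/* rarely touched state */
	void *mempolicy;		/* cold: only set for NUMA policies */
};

int main(void)
{
	printf("flags:     offset %4zu -> cacheline %zu\n",
	       offsetof(struct toy_task, flags),
	       offsetof(struct toy_task, flags) / 64);
	printf("mempolicy: offset %4zu -> cacheline %zu\n",
	       offsetof(struct toy_task, mempolicy),
	       offsetof(struct toy_task, mempolicy) / 64);
	return 0;
}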
* Re: [patch 3/8] mm, mempolicy: remove per-process flag
  2013-12-04 15:24 ` Christoph Lameter
@ 2013-12-05  0:53 ` David Rientjes
  2013-12-05 19:05   ` Christoph Lameter
  0 siblings, 1 reply; 39+ messages in thread
From: David Rientjes @ 2013-12-05 0:53 UTC
To: Christoph Lameter
Cc: Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki, Johannes Weiner,
    Mel Gorman, Rik van Riel, Pekka Enberg, linux-kernel, linux-mm,
    cgroups

On Wed, 4 Dec 2013, Christoph Lameter wrote:

> > PF_MEMPOLICY is an unnecessary optimization for CONFIG_SLAB users.
> > There's no significant performance degradation to checking
> > current->mempolicy rather than current->flags & PF_MEMPOLICY in the
> > allocation path, especially since this is considered unlikely().
>
> The use of current->mempolicy increases the cache footprint since it
> sits in a rarely used cacheline.  This performance issue would occur
> when memory policies are not used, since that cacheline would then have
> to be touched regardless of whether memory policies are in effect or
> not.  PF_MEMPOLICY was used to avoid touching the cacheline.
>

Right, but it turns out not to matter in practice.  As one of the
non-default CONFIG_SLAB users, and since PF_MEMPOLICY only does
something for CONFIG_SLAB, we tested this patch and saw no degradation
on specjbb, which stresses the allocator in terms of throughput:

	with patch:    128761.54 SPECjbb2005 bops
	without patch: 127576.65 SPECjbb2005 bops

These per-process flags are a scarce resource, so I don't think
PF_MEMPOLICY warrants a bit when it's not shown to be advantageous in
configurations without mempolicy usage where it's intended to optimize,
especially for a non-default slab allocator.
* Re: [patch 3/8] mm, mempolicy: remove per-process flag
  2013-12-05  0:53 ` David Rientjes
@ 2013-12-05 19:05 ` Christoph Lameter
  2013-12-05 23:53   ` David Rientjes
  0 siblings, 1 reply; 39+ messages in thread
From: Christoph Lameter @ 2013-12-05 19:05 UTC
To: David Rientjes
Cc: Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki, Johannes Weiner,
    Mel Gorman, Rik van Riel, Pekka Enberg, linux-kernel, linux-mm,
    cgroups

On Wed, 4 Dec 2013, David Rientjes wrote:

> Right, but it turns out not to matter in practice.  As one of the
> non-default CONFIG_SLAB users, and since PF_MEMPOLICY only does
> something for CONFIG_SLAB, we tested this patch and saw no degradation
> on specjbb, which stresses the allocator in terms of throughput:
>
> 	with patch:    128761.54 SPECjbb2005 bops
> 	without patch: 127576.65 SPECjbb2005 bops

Specjbb?  What does Java have to do with this?  Can you run the
synthetic in-kernel slab benchmark, like this one:

	https://lkml.org/lkml/2009/10/13/459

> These per-process flags are a scarce resource, so I don't think
> PF_MEMPOLICY warrants a bit when it's not shown to be advantageous in
> configurations without mempolicy usage where it's intended to optimize,
> especially for a non-default slab allocator.

PF_MEMPOLICY was advantageous when Paul Jackson introduced and
benchmarked it.

SLUB supports mempolicies through alloc_pages(), but it will allocate
all objects out of one slab page before retrieving another page
following the policy.  That's why PF_MEMPOLICY and the other per-object
handling can be avoided in its fastpath.  Thus PF_MEMPOLICY is not that
important there.

However, SLAB is still the allocator in use for RHEL, which puts some
importance on still supporting SLAB.
* Re: [patch 3/8] mm, mempolicy: remove per-process flag
  2013-12-05 19:05 ` Christoph Lameter
@ 2013-12-05 23:53 ` David Rientjes
  2013-12-06 14:46   ` Christoph Lameter
  0 siblings, 1 reply; 39+ messages in thread
From: David Rientjes @ 2013-12-05 23:53 UTC
To: Christoph Lameter
Cc: Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki, Johannes Weiner,
    Mel Gorman, Rik van Riel, Pekka Enberg, linux-kernel, linux-mm,
    cgroups

On Thu, 5 Dec 2013, Christoph Lameter wrote:

> Specjbb?  What does Java have to do with this?  Can you run the
> synthetic in-kernel slab benchmark, like this one:
>
> 	https://lkml.org/lkml/2009/10/13/459
>

We actually carry that in our production kernel and have updated it to
build on 3.11; I'll run it and netperf TCP_RR as well, thanks.

> However, SLAB is still the allocator in use for RHEL, which puts some
> importance on still supporting SLAB.
>

Google also uses it exclusively, so I'm definitely not saying we can
ignore it just because it's not the default.  I haven't seen any
performance regression from removing it, but I'll post the numbers on
the slab benchmark and netperf TCP_RR when I have them.
* Re: [patch 3/8] mm, mempolicy: remove per-process flag
  2013-12-05 23:53 ` David Rientjes
@ 2013-12-06 14:46 ` Christoph Lameter
  0 siblings, 0 replies; 39+ messages in thread
From: Christoph Lameter @ 2013-12-06 14:46 UTC
To: David Rientjes
Cc: Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki, Johannes Weiner,
    Mel Gorman, Rik van Riel, Pekka Enberg, linux-kernel, linux-mm,
    cgroups

On Thu, 5 Dec 2013, David Rientjes wrote:

> We actually carry that in our production kernel and have updated it to
> build on 3.11; I'll run it and netperf TCP_RR as well, thanks.

If you get around to it then please post the updated version.  Maybe we
can get that merged at some point; it keeps floating around after all.
* [patch 4/8] mm, memcg: add tunable for oom reserves
  2013-12-04  5:19 ` [patch 1/8] fork: collapse copy_flags into copy_process David Rientjes
  ` (2 preceding siblings ...)
@ 2013-12-04  5:20 ` David Rientjes
  2013-12-04  5:20 ` [patch 5/8] res_counter: remove interface for locked charging and uncharging David Rientjes
  ` (3 subsequent siblings)
  6 siblings, 0 replies; 39+ messages in thread
From: David Rientjes @ 2013-12-04 5:20 UTC
To: Andrew Morton
Cc: Michal Hocko, KAMEZAWA Hiroyuki, Johannes Weiner, Mel Gorman,
    Rik van Riel, Pekka Enberg, Christoph Lameter, linux-kernel,
    linux-mm, cgroups

Userspace needs a way to define the amount of memory reserves that
processes handling oom conditions may utilize.  This patch adds a
per-memcg oom reserve field and a file, memory.oom_reserve_in_bytes, to
manipulate its value.

If an attempt is made to reduce currently utilized memory reserves by
writing a smaller value to memory.oom_reserve_in_bytes, it will fail
with -EBUSY until some memory is uncharged.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 mm/memcontrol.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -274,6 +274,9 @@ struct mem_cgroup {
 	/* OOM-Killer disable */
 	int		oom_kill_disable;
 
+	/* reserves for handling oom conditions, protected by res.lock */
+	unsigned long long oom_reserve;
+
 	/* set when res.limit == memsw.limit */
 	bool		memsw_is_minimum;
 
@@ -5893,6 +5896,51 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
 	return 0;
 }
 
+static int mem_cgroup_resize_oom_reserve(struct mem_cgroup *memcg,
+					 unsigned long long new_limit)
+{
+	struct res_counter *res = &memcg->res;
+	u64 limit, usage;
+	int ret = 0;
+
+	spin_lock(&res->lock);
+	limit = res->limit;
+	usage = res->usage;
+
+	if (usage > limit && usage - limit > new_limit) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	memcg->oom_reserve = new_limit;
+out:
+	spin_unlock(&res->lock);
+	return ret;
+}
+
+static u64 mem_cgroup_oom_reserve_read(struct cgroup_subsys_state *css,
+				       struct cftype *cft)
+{
+	return mem_cgroup_from_css(css)->oom_reserve;
+}
+
+static int mem_cgroup_oom_reserve_write(struct cgroup_subsys_state *css,
+					struct cftype *cft, const char *buffer)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+	unsigned long long val;
+	int ret;
+
+	if (mem_cgroup_is_root(memcg))
+		return -EINVAL;
+
+	ret = res_counter_memparse_write_strategy(buffer, &val);
+	if (ret)
+		return ret;
+
+	return mem_cgroup_resize_oom_reserve(memcg, val);
+}
+
 #ifdef CONFIG_MEMCG_KMEM
 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 {
@@ -6024,6 +6072,11 @@ static struct cftype mem_cgroup_files[] = {
 		.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
 	},
 	{
+		.name = "oom_reserve_in_bytes",
+		.read_u64 = mem_cgroup_oom_reserve_read,
+		.write_string = mem_cgroup_oom_reserve_write,
+	},
+	{
 		.name = "pressure_level",
 		.register_event = vmpressure_register_event,
 		.unregister_event = vmpressure_unregister_event,
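To make the -EBUSY rule concrete, here is a userspace model of the check
in mem_cgroup_resize_oom_reserve() above; the numbers are illustrative
only:

#include <stdio.h>

/* Model of the resize check: shrinking the reserve fails while the
 * charges already sitting above the limit still exceed the new value. */
static int resize_oom_reserve(unsigned long long usage,
			      unsigned long long limit,
			      unsigned long long new_reserve)
{
	if (usage > limit && usage - limit > new_reserve)
		return -1;	/* -EBUSY in the patch */
	return 0;		/* new reserve accepted */
}

int main(void)
{
	/* limit of 128M with 8M of reserve currently charged above it */
	unsigned long long limit = 128ULL << 20;
	unsigned long long usage = limit + (8ULL << 20);

	printf("shrink reserve to 4M:  %s\n",
	       resize_oom_reserve(usage, limit, 4ULL << 20) ? "-EBUSY" : "ok");
	printf("shrink reserve to 16M: %s\n",
	       resize_oom_reserve(usage, limit, 16ULL << 20) ? "-EBUSY" : "ok");
	return 0;
}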
* [patch 5/8] res_counter: remove interface for locked charging and uncharging
  2013-12-04  5:19 ` [patch 1/8] fork: collapse copy_flags into copy_process David Rientjes
  ` (2 preceding siblings ...)
  2013-12-04  5:20 ` [patch 4/8] mm, memcg: add tunable for oom reserves David Rientjes
@ 2013-12-04  5:20 ` David Rientjes
  2013-12-04  5:20 ` [patch 6/8] res_counter: add interface for maximum nofail charge David Rientjes
  ` (2 subsequent siblings)
  6 siblings, 0 replies; 39+ messages in thread
From: David Rientjes @ 2013-12-04 5:20 UTC
To: Andrew Morton
Cc: Michal Hocko, KAMEZAWA Hiroyuki, Johannes Weiner, Mel Gorman,
    Rik van Riel, Pekka Enberg, Christoph Lameter, linux-kernel,
    linux-mm, cgroups

The res_counter_{charge,uncharge}_locked() variants are not used in the
kernel outside of the resource counter code itself, so remove the
interface.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 Documentation/cgroups/resource_counter.txt | 14 ++------------
 include/linux/res_counter.h                |  6 +-----
 kernel/res_counter.c                       | 23 ++++++++++++-----------
 3 files changed, 15 insertions(+), 28 deletions(-)

diff --git a/Documentation/cgroups/resource_counter.txt b/Documentation/cgroups/resource_counter.txt
--- a/Documentation/cgroups/resource_counter.txt
+++ b/Documentation/cgroups/resource_counter.txt
@@ -76,24 +76,14 @@ to work with it.
 	limit_fail_at parameter is set to the particular res_counter element
 	where the charging failed.
 
- d. int res_counter_charge_locked
-			(struct res_counter *rc, unsigned long val, bool force)
-
-	The same as res_counter_charge(), but it must not acquire/release the
-	res_counter->lock internally (it must be called with res_counter->lock
-	held). The force parameter indicates whether we can bypass the limit.
-
- e. u64 res_counter_uncharge[_locked]
-			(struct res_counter *rc, unsigned long val)
+ d. u64 res_counter_uncharge(struct res_counter *rc, unsigned long val)
 
 	When a resource is released (freed) it should be de-accounted
 	from the resource counter it was accounted to.  This is called
 	"uncharging". The return value of this function indicate the amount
 	of charges still present in the counter.
 
-	The _locked routines imply that the res_counter->lock is taken.
-
- f. u64 res_counter_uncharge_until
+ e. u64 res_counter_uncharge_until
 		(struct res_counter *rc, struct res_counter *top,
 		 unsinged long val)

diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h
--- a/include/linux/res_counter.h
+++ b/include/linux/res_counter.h
@@ -104,15 +104,13 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent);
  * units, e.g. numbers, bytes, Kbytes, etc
  *
  * returns 0 on success and <0 if the counter->usage will exceed the
- * counter->limit _locked call expects the counter->lock to be taken
+ * counter->limit
  *
  * charge_nofail works the same, except that it charges the resource
  * counter unconditionally, and returns < 0 if the after the current
  * charge we are over limit.
  */
 
-int __must_check res_counter_charge_locked(struct res_counter *counter,
-		unsigned long val, bool force);
 int __must_check res_counter_charge(struct res_counter *counter,
 		unsigned long val, struct res_counter **limit_fail_at);
 int res_counter_charge_nofail(struct res_counter *counter,
@@ -125,12 +123,10 @@ int res_counter_charge_nofail(struct res_counter *counter,
  * @val: the amount of the resource
  *
  * these calls check for usage underflow and show a warning on the console
- * _locked call expects the counter->lock to be taken
  *
  * returns the total charges still present in @counter.
  */
 
-u64 res_counter_uncharge_locked(struct res_counter *counter, unsigned long val);
 u64 res_counter_uncharge(struct res_counter *counter, unsigned long val);
 
 u64 res_counter_uncharge_until(struct res_counter *counter,

diff --git a/kernel/res_counter.c b/kernel/res_counter.c
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -22,8 +22,18 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent)
 	counter->parent = parent;
 }
 
-int res_counter_charge_locked(struct res_counter *counter, unsigned long val,
-			      bool force)
+static u64 res_counter_uncharge_locked(struct res_counter *counter,
+				       unsigned long val)
+{
+	if (WARN_ON(counter->usage < val))
+		val = counter->usage;
+
+	counter->usage -= val;
+	return counter->usage;
+}
+
+static int res_counter_charge_locked(struct res_counter *counter,
+				     unsigned long val, bool force)
 {
 	int ret = 0;
 
@@ -86,15 +96,6 @@ int res_counter_charge_nofail(struct res_counter *counter, unsigned long val,
 	return __res_counter_charge(counter, val, limit_fail_at, true);
 }
 
-u64 res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
-{
-	if (WARN_ON(counter->usage < val))
-		val = counter->usage;
-
-	counter->usage -= val;
-	return counter->usage;
-}
-
 u64 res_counter_uncharge_until(struct res_counter *counter,
 			       struct res_counter *top,
 			       unsigned long val)
* [patch 6/8] res_counter: add interface for maximum nofail charge
  2013-12-04  5:19 ` [patch 1/8] fork: collapse copy_flags into copy_process David Rientjes
  ` (3 preceding siblings ...)
  2013-12-04  5:20 ` [patch 5/8] res_counter: remove interface for locked charging and uncharging David Rientjes
@ 2013-12-04  5:20 ` David Rientjes
  2013-12-04  5:20 ` [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves David Rientjes
  2013-12-04  5:20 ` [patch 8/8] mm, memcg: add memcg oom reserve documentation David Rientjes
  6 siblings, 0 replies; 39+ messages in thread
From: David Rientjes @ 2013-12-04 5:20 UTC
To: Andrew Morton
Cc: Michal Hocko, KAMEZAWA Hiroyuki, Johannes Weiner, Mel Gorman,
    Rik van Riel, Pekka Enberg, Christoph Lameter, linux-kernel,
    linux-mm, cgroups

For memcg oom reserves, we'll need a resource counter interface that,
like res_counter_charge_nofail(), will not fail when exceeding the memcg
limit, but that may only exceed it up to a ceiling.

This patch adds res_counter_charge_nofail_max(), which will exceed the
resource counter limit, but only up to a maximum defined value.  If it
fails to charge the resource, it returns -ENOMEM.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 include/linux/res_counter.h | 10 +++++++++-
 kernel/res_counter.c        | 27 +++++++++++++++++++++------
 2 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h
--- a/include/linux/res_counter.h
+++ b/include/linux/res_counter.h
@@ -107,14 +107,22 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent);
  * counter->limit
  *
  * charge_nofail works the same, except that it charges the resource
- * counter unconditionally, and returns < 0 if the after the current
+ * counter unconditionally, and returns < 0 if after the current
  * charge we are over limit.
+ *
+ * charge_nofail_max is the same as charge_nofail, except that the
+ * resource counter usage can only exceed the limit by the max
+ * difference.  Unlike charge_nofail, charge_nofail_max returns < 0
+ * only if the current charge fails because of the max difference.
  */
 
 int __must_check res_counter_charge(struct res_counter *counter,
 		unsigned long val, struct res_counter **limit_fail_at);
 int res_counter_charge_nofail(struct res_counter *counter,
 		unsigned long val, struct res_counter **limit_fail_at);
+int res_counter_charge_nofail_max(struct res_counter *counter,
+		unsigned long val, struct res_counter **limit_fail_at,
+		unsigned long max);
 
 /*
  * uncharge - tell that some portion of the resource is released

diff --git a/kernel/res_counter.c b/kernel/res_counter.c
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -33,15 +33,19 @@ static u64 res_counter_uncharge_locked(struct res_counter *counter,
 }
 
 static int res_counter_charge_locked(struct res_counter *counter,
-				     unsigned long val, bool force)
+				     unsigned long val, bool force,
+				     unsigned long max)
 {
 	int ret = 0;
 
 	if (counter->usage + val > counter->limit) {
 		counter->failcnt++;
-		ret = -ENOMEM;
+		if (max == ULONG_MAX)
+			ret = -ENOMEM;
 		if (!force)
 			return ret;
+		if (counter->usage + val - counter->limit > max)
+			return -ENOMEM;
 	}
 
 	counter->usage += val;
@@ -51,7 +55,8 @@ static int res_counter_charge_locked(struct res_counter *counter,
 }
 
 static int __res_counter_charge(struct res_counter *counter, unsigned long val,
-				struct res_counter **limit_fail_at, bool force)
+				struct res_counter **limit_fail_at, bool force,
+				unsigned long max)
 {
 	int ret, r;
 	unsigned long flags;
@@ -62,7 +67,7 @@ static int __res_counter_charge(struct res_counter *counter, unsigned long val,
 	local_irq_save(flags);
 	for (c = counter; c != NULL; c = c->parent) {
 		spin_lock(&c->lock);
-		r = res_counter_charge_locked(c, val, force);
+		r = res_counter_charge_locked(c, val, force, max);
 		spin_unlock(&c->lock);
 		if (r < 0 && !ret) {
 			ret = r;
@@ -87,13 +92,23 @@ static int __res_counter_charge(struct res_counter *counter, unsigned long val,
 int res_counter_charge(struct res_counter *counter, unsigned long val,
 			struct res_counter **limit_fail_at)
 {
-	return __res_counter_charge(counter, val, limit_fail_at, false);
+	return __res_counter_charge(counter, val, limit_fail_at, false,
+				    ULONG_MAX);
 }
 
 int res_counter_charge_nofail(struct res_counter *counter, unsigned long val,
 			      struct res_counter **limit_fail_at)
 {
-	return __res_counter_charge(counter, val, limit_fail_at, true);
+	return __res_counter_charge(counter, val, limit_fail_at, true,
+				    ULONG_MAX);
+}
+
+int res_counter_charge_nofail_max(struct res_counter *counter,
+				  unsigned long val,
+				  struct res_counter **limit_fail_at,
+				  unsigned long max)
+{
+	return __res_counter_charge(counter, val, limit_fail_at, true, max);
 }
 
 u64 res_counter_uncharge_until(struct res_counter *counter,
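The semantics of the new charge mode can be modeled in userspace.  A
sketch of the rule res_counter_charge_locked() implements after this
patch, with simplified field types and made-up values:

#include <stdio.h>
#include <limits.h>

struct counter { unsigned long usage, limit, failcnt; };

/* Forced charges may exceed the limit, but only by at most "max". */
static int charge(struct counter *c, unsigned long val, int force,
		  unsigned long max)
{
	int ret = 0;

	if (c->usage + val > c->limit) {
		c->failcnt++;
		if (max == ULONG_MAX)
			ret = -1;		/* -ENOMEM */
		if (!force)
			return ret;
		if (c->usage + val - c->limit > max)
			return -1;		/* reserve exhausted */
	}
	c->usage += val;
	return ret;
}

int main(void)
{
	struct counter c = { .usage = 100, .limit = 128 };

	/* nofail_max with a reserve of 32: the first charge fits within
	 * the reserve, the second would exceed it and fails. */
	printf("charge 40: %d (usage %lu)\n", charge(&c, 40, 1, 32), c.usage);
	printf("charge 40: %d (usage %lu)\n", charge(&c, 40, 1, 32), c.usage);
	return 0;
}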
* [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves
  2013-12-04  5:19 ` [patch 1/8] fork: collapse copy_flags into copy_process David Rientjes
  ` (4 preceding siblings ...)
  2013-12-04  5:20 ` [patch 6/8] res_counter: add interface for maximum nofail charge David Rientjes
@ 2013-12-04  5:20 ` David Rientjes
  2013-12-04  5:45   ` Johannes Weiner
  2013-12-04  5:20 ` [patch 8/8] mm, memcg: add memcg oom reserve documentation David Rientjes
  6 siblings, 1 reply; 39+ messages in thread
From: David Rientjes @ 2013-12-04 5:20 UTC
To: Andrew Morton
Cc: Michal Hocko, KAMEZAWA Hiroyuki, Johannes Weiner, Mel Gorman,
    Rik van Riel, Pekka Enberg, Christoph Lameter, linux-kernel,
    linux-mm, cgroups

Now that a per-process flag is available, define it for processes that
handle userspace oom notifications.  This is an optimization to avoid
maintaining a list of such processes attached to a memcg at any given
time and iterating it at charge time.

This flag gets set whenever a process has registered for an oom
notification and is cleared whenever it unregisters.

When memcg reclaim has failed to free any memory, it is necessary for
userspace oom handlers to be able to dip into reserves to pagefault
text, allocate kernel memory to read the "tasks" file, allocate heap,
etc.

System oom conditions are not addressed at this time, but the same
per-process flag can be used in the page allocator to determine whether
userspace oom handlers should be given access to per-zone memory
reserves at a later time, once there is consensus.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 include/linux/sched.h |  1 +
 mm/memcontrol.c       | 47 ++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1695,6 +1695,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut,
 #define PF_SPREAD_SLAB	0x02000000	/* Spread some slab caches over cpuset */
 #define PF_NO_SETAFFINITY 0x04000000	/* Userland is not allowed to meddle with cpus_allowed */
 #define PF_MCE_EARLY	0x08000000	/* Early kill for mce process policy */
+#define PF_OOM_HANDLER	0x10000000	/* Userspace process handling oom conditions */
 #define PF_MUTEX_TESTER	0x20000000	/* Thread belongs to the rt mutex tester */
 #define PF_FREEZER_SKIP	0x40000000	/* Freezer should not count it as freezable */
 #define PF_SUSPEND_TASK 0x80000000	/* this thread called freeze_processes and should not be frozen */

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2590,6 +2590,33 @@ enum {
 	CHARGE_WOULDBLOCK,	/* GFP_WAIT wasn't set and no enough res. */
 };
 
+/*
+ * Processes handling oom conditions are allowed to utilize memory reserves so
+ * that they may handle the condition.
+ */
+static int mem_cgroup_oom_handler_charge(struct mem_cgroup *memcg,
+					 unsigned long csize,
+					 struct mem_cgroup **mem_over_limit)
+{
+	struct res_counter *fail_res;
+	int ret;
+
+	ret = res_counter_charge_nofail_max(&memcg->res, csize, &fail_res,
+					    memcg->oom_reserve);
+	if (!ret && do_swap_account) {
+		ret = res_counter_charge_nofail_max(&memcg->memsw, csize,
+						    &fail_res,
+						    memcg->oom_reserve);
+		if (ret) {
+			res_counter_uncharge(&memcg->res, csize);
+			*mem_over_limit = mem_cgroup_from_res_counter(fail_res,
+								      memsw);
+		}
+	}
+	return !ret ? CHARGE_OK : CHARGE_NOMEM;
+}
+
 static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 				unsigned int nr_pages, unsigned int min_pages,
 				bool invoke_oom)
@@ -2649,6 +2676,13 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	if (mem_cgroup_wait_acct_move(mem_over_limit))
 		return CHARGE_RETRY;
 
+	if (current->flags & PF_OOM_HANDLER) {
+		ret = mem_cgroup_oom_handler_charge(memcg, csize,
+						    &mem_over_limit);
+		if (ret == CHARGE_OK)
+			return CHARGE_OK;
+	}
+
 	if (invoke_oom)
 		mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize));
 
@@ -2696,7 +2730,8 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 		     || fatal_signal_pending(current)))
 		goto bypass;
 
-	if (unlikely(task_in_memcg_oom(current)))
+	if (unlikely(task_in_memcg_oom(current)) &&
+	    !(current->flags & PF_OOM_HANDLER))
 		goto bypass;
 
 	/*
@@ -5825,6 +5860,11 @@ static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css,
 	if (!event)
 		return -ENOMEM;
 
+	/*
+	 * Setting PF_OOM_HANDLER before taking memcg_oom_lock ensures it is
+	 * set before getting added to memcg->oom_notify.
+	 */
+	current->flags |= PF_OOM_HANDLER;
 	spin_lock(&memcg_oom_lock);
 
 	event->eventfd = eventfd;
@@ -5856,6 +5896,11 @@ static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css,
 		}
 	}
 
+	/*
+	 * Clearing PF_OOM_HANDLER before dropping memcg_oom_lock ensures it is
+	 * cleared before receiving another notification.
+	 */
+	current->flags &= ~PF_OOM_HANDLER;
 	spin_unlock(&memcg_oom_lock);
 }
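For reference, a process becomes an oom handler by registering for oom
notifications through the existing cgroup v1 eventfd interface; with
this patch applied, registration is also what sets PF_OOM_HANDLER.  A
minimal sketch, assuming a memcg mounted at /sys/fs/cgroup/memory/mygroup
(the path is hypothetical):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
	uint64_t count;
	char buf[32];
	int efd = eventfd(0, 0);
	int ofd = open("/sys/fs/cgroup/memory/mygroup/memory.oom_control",
		       O_RDONLY);
	int cfd = open("/sys/fs/cgroup/memory/mygroup/cgroup.event_control",
		       O_WRONLY);

	if (efd < 0 || ofd < 0 || cfd < 0) {
		perror("setup");
		return 1;
	}

	/* Writing "<eventfd> <oom_control fd>" registers for oom
	 * notifications; with this series, the kernel would set
	 * PF_OOM_HANDLER on this task here. */
	snprintf(buf, sizeof(buf), "%d %d", efd, ofd);
	if (write(cfd, buf, strlen(buf)) < 0) {
		perror("register");
		return 1;
	}

	while (read(efd, &count, sizeof(count)) == sizeof(count))
		printf("oom event in mygroup, handling it\n");
	return 0;
}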
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves
  2013-12-04  5:20 ` [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves David Rientjes
@ 2013-12-04  5:45 ` Johannes Weiner
  2013-12-05  1:49   ` David Rientjes
  0 siblings, 1 reply; 39+ messages in thread
From: Johannes Weiner @ 2013-12-04 5:45 UTC
To: David Rientjes
Cc: Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki, Mel Gorman,
    Rik van Riel, Pekka Enberg, Christoph Lameter, Tejun Heo, Li Zefan,
    linux-kernel, linux-mm, cgroups

On Tue, Dec 03, 2013 at 09:20:17PM -0800, David Rientjes wrote:

> Now that a per-process flag is available, define it for processes that
> handle userspace oom notifications.  This is an optimization to avoid
> maintaining a list of such processes attached to a memcg at any given
> time and iterating it at charge time.
>
> This flag gets set whenever a process has registered for an oom
> notification and is cleared whenever it unregisters.
>
> When memcg reclaim has failed to free any memory, it is necessary for
> userspace oom handlers to be able to dip into reserves to pagefault
> text, allocate kernel memory to read the "tasks" file, allocate heap,
> etc.

The task handling the OOM of a memcg can obviously not be part of that
same memcg.

I've said this many times in the past, but here is the most recent
thread from Tejun, me, and Li on this topic:

---
On Tue, 3 Dec 2013 at 15:35:48 +0800, Li Zefan wrote:
> On Mon, 2 Dec 2013 at 11:44:06 -0500, Johannes Weiner wrote:
> > On Fri, Nov 29, 2013 at 03:05:25PM -0500, Tejun Heo wrote:
> > > Whoa, so we support oom handler inside the memcg that it handles?
> > > Does that work reliably?  Changing the above detail in this patch
> > > isn't difficult (and we'll later need to update kernfs too) but
> > > supporting such setup properly would be a *lot* of commitment and I'm
> > > very doubtful we'd be able to achieve that by just carefully avoiding
> > > memory allocation in the operations that userland oom handler uses -
> > > that set is destined to expand over time, extremely fragile and will
> > > be hellish to maintain.
> > >
> > > So, I'm not at all excited about committing to this guarantee.  This
> > > one is an easy one but it looks like the first step onto dizzying
> > > slippery slope.
> > >
> > > Am I misunderstanding something here?  Are you and Johannes firm on
> > > supporting this?
> >
> > Handling a memcg OOM from userspace running inside that OOM memcg is
> > completely crazy.  I mean, think about this for just two seconds...
> > Really?
> >
> > I get that people are doing it right now, and if you can get away with
> > it for now, good for you.  But you have to be aware how crazy this is
> > and if it breaks you get to keep the pieces and we are not going to
> > accommodate this in the kernel.  Fix your crazy userspace.
>
> +1
---
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves
  2013-12-04  5:45 ` Johannes Weiner
@ 2013-12-05  1:49 ` David Rientjes
  2013-12-05  2:50   ` Tejun Heo
  0 siblings, 1 reply; 39+ messages in thread
From: David Rientjes @ 2013-12-05 1:49 UTC
To: Johannes Weiner
Cc: Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki, Mel Gorman,
    Rik van Riel, Pekka Enberg, Christoph Lameter, Tejun Heo, Li Zefan,
    linux-kernel, linux-mm, cgroups

On Wed, 4 Dec 2013, Johannes Weiner wrote:

> > Now that a per-process flag is available, define it for processes that
> > handle userspace oom notifications.  This is an optimization to avoid
> > maintaining a list of such processes attached to a memcg at any given
> > time and iterating it at charge time.
> >
> > This flag gets set whenever a process has registered for an oom
> > notification and is cleared whenever it unregisters.
> >
> > When memcg reclaim has failed to free any memory, it is necessary for
> > userspace oom handlers to be able to dip into reserves to pagefault
> > text, allocate kernel memory to read the "tasks" file, allocate heap,
> > etc.
>
> The task handling the OOM of a memcg can obviously not be part of that
> same memcg.
>

Not without memory.oom_reserve_in_bytes that this series adds, that's
true.  Michal expressed interest in the idea of memcg oom reserves in
the past, so I thought I'd share the series.

> On Tue, 3 Dec 2013 at 15:35:48 +0800, Li Zefan wrote:
> > On Mon, 2 Dec 2013 at 11:44:06 -0500, Johannes Weiner wrote:
> > > On Fri, Nov 29, 2013 at 03:05:25PM -0500, Tejun Heo wrote:
> > > > Whoa, so we support oom handler inside the memcg that it handles?
> > > > Does that work reliably?  Changing the above detail in this patch
> > > > isn't difficult (and we'll later need to update kernfs too) but
> > > > supporting such setup properly would be a *lot* of commitment and I'm
> > > > very doubtful we'd be able to achieve that by just carefully avoiding
> > > > memory allocation in the operations that userland oom handler uses -
> > > > that set is destined to expand over time, extremely fragile and will
> > > > be hellish to maintain.
> > > >

It works reliably with this patch series, yes.  I'm not sure what change
this is referring to that would avoid memory allocation for userspace
oom handlers, and I'd agree that it would be difficult to maintain a
no-allocation policy for a subset of processes that are destined to
handle oom conditions.

That's not what this series is addressing, though, and in fact it's
quite the opposite.  It acknowledges that userspace oom handlers need to
allocate and that anything else would be too difficult to maintain
(thereby agreeing with the above), so we must set aside memory that they
are exclusively allowed to access.  For the vast majority of users who
will not use userspace oom handlers, they can just use the default value
of memory.oom_reserve_in_bytes == 0 and they incur absolutely no
side-effects as a result of this series.

For those who do use userspace oom handlers, like Google, this allows us
to set aside memory to allow the userspace oom handlers to kill a
process, dump the heap, send a signal, drop caches, etc. when waking up.

> > > > So, I'm not at all excited about committing to this guarantee.  This
> > > > one is an easy one but it looks like the first step onto dizzying
> > > > slippery slope.
> > > >
> > > > Am I misunderstanding something here?  Are you and Johannes firm on
> > > > supporting this?
> > >
> > > Handling a memcg OOM from userspace running inside that OOM memcg is
> > > completely crazy.  I mean, think about this for just two seconds...
> > > Really?
> > >
> > > I get that people are doing it right now, and if you can get away with
> > > it for now, good for you.  But you have to be aware how crazy this is
> > > and if it breaks you get to keep the pieces and we are not going to
> > > accommodate this in the kernel.  Fix your crazy userspace.

The rest of this email communicates only one thing: someone thinks it's
crazy.  And I agree it would be crazy if we don't allow that class of
process to have access to a pre-defined amount of memory to handle the
situation, which this series adds.
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves
  2013-12-05  1:49 ` David Rientjes
@ 2013-12-05  2:50 ` Tejun Heo
  [not found]   ` <20131205025026.GA26777-Gd/HAXX7CRxy/B6EtB590w@public.gmane.org>
  0 siblings, 1 reply; 39+ messages in thread
From: Tejun Heo @ 2013-12-05 2:50 UTC
To: David Rientjes
Cc: Johannes Weiner, Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki,
    Mel Gorman, Rik van Riel, Pekka Enberg, Christoph Lameter,
    Li Zefan, linux-kernel, linux-mm, cgroups

Hello,

On Wed, Dec 04, 2013 at 05:49:04PM -0800, David Rientjes wrote:

> That's not what this series is addressing, though, and in fact it's
> quite the opposite.  It acknowledges that userspace oom handlers need
> to allocate and that anything else would be too difficult to maintain
> (thereby agreeing with the above), so we must set aside memory that
> they are exclusively allowed to access.  For the vast majority of users
> who will not use userspace oom handlers, they can just use the default
> value of memory.oom_reserve_in_bytes == 0 and they incur absolutely no
> side-effects as a result of this series.

Umm.. without delving into details, aren't you basically creating a
memory cgroup inside a memory cgroup?  Doesn't sound like a
particularly well thought-out plan to me.

> For those who do use userspace oom handlers, like Google, this allows
> us to set aside memory to allow the userspace oom handlers to kill a
> process, dump the heap, send a signal, drop caches, etc. when waking
> up.

Seems kinda obvious.  Put it in a separate cgroup?  You're basically
saying it doesn't want to be under the same memory limit as the
processes that it's looking over.  That's like the definition of being
in a different cgroup.

Thanks.

-- 
tejun
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves
  [not found] ` <20131205025026.GA26777-Gd/HAXX7CRxy/B6EtB590w@public.gmane.org>
@ 2013-12-05 23:49 ` David Rientjes
  [not found]   ` <alpine.DEB.2.02.1312051537550.7717-X6Q0R45D7oAcqpCFd4KODRPsWskHk0ljAL8bYrjMMd8@public.gmane.org>
  0 siblings, 1 reply; 39+ messages in thread
From: David Rientjes @ 2013-12-05 23:49 UTC
To: Tejun Heo
Cc: Johannes Weiner, Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki,
    Mel Gorman, Rik van Riel, Pekka Enberg, Christoph Lameter,
    Li Zefan, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
    linux-mm-Bw31MaZKKs3YtjvyW6yDsg, cgroups-u79uwXL29TY76Z2rM5mHXA

On Wed, 4 Dec 2013, Tejun Heo wrote:

> Hello,
>

Tejun, how are you?

> Umm.. without delving into details, aren't you basically creating a
> memory cgroup inside a memory cgroup?  Doesn't sound like a
> particularly well thought-out plan to me.
>

I agree that we wouldn't need such support if we are only addressing
memcg oom conditions.  We could do things like
A/memory.limit_in_bytes == 128M and A/b/memory.limit_in_bytes == 126M
and then attach the process waiting on A/b/memory.oom_control to A, and
that would work perfectly.

However, we also need to discuss system oom handling.  We have an
interest in being able to allow userspace to handle system oom
conditions since the policy will differ depending on machine and we
can't encode every possible mechanism into the kernel.  For example, on
system oom we want to kill a process from the lowest priority top-level
memcg.  We lack that ability entirely in the kernel, and since the sum
of our top-level memcgs' memory.limit_in_bytes exceeds the amount of
present RAM, we run into these oom conditions a _lot_.

So the first step, in my opinion, is to add a system oom notification on
the root memcg's memory.oom_control, which currently allows registering
an eventfd() notification but never actually triggers.  I did that in a
patch and it was merged into -mm but was pulled out for later
discussion.

Then, we need to ensure that the userspace that is registered to handle
such events can actually run, and that is difficult to do when the
system is oom.  The proposal is to allow such processes, now marked as
PF_OOM_HANDLER, to be able to access pre-defined per-zone memory
reserves in the page allocator.  The only special handling for
PF_OOM_HANDLER in the page allocator itself would be under such oom
conditions (memcg oom conditions have no problem allocating the memory,
only charging it).  The amount of reserves would be defined as
memory.oom_reserve_in_bytes from within the root memcg as defined by
this patch, i.e. allow this amount of memory to be allocated in the page
allocator for PF_OOM_HANDLER below the per-zone min watermarks.

This, I believe, is the cleanest interface for users who choose to use a
non-default policy by setting memory.oom_reserve_in_bytes, and it
constrains all of the code to memcg, which you have to configure for
such support.

The system oom condition is not addressed in this patch series, although
the PF_OOM_HANDLER bit can be used for that purpose.  I didn't post that
patch because the notification on the root memcg's memory.oom_control in
such conditions is currently being debated, so we need to solve that
issue first.

Your opinions and suggestions are more than helpful, thanks.
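The page-allocator side David describes is not part of this series; as a
sketch of the proposed behavior, a PF_OOM_HANDLER task would be allowed
below the min watermark by up to the configured reserve.  A userspace
model of that check, where all names and numbers are hypothetical:

#include <stdio.h>

/* Toy watermark check: ordinary tasks must stay above the min
 * watermark; an oom-handler task may dip below it by "reserve". */
static int allocation_allowed(unsigned long free_pages,
			      unsigned long min_wmark,
			      int oom_handler, unsigned long reserve)
{
	unsigned long floor = min_wmark;

	if (oom_handler)
		floor = (min_wmark > reserve) ? min_wmark - reserve : 0;
	return free_pages > floor;
}

int main(void)
{
	unsigned long min_wmark = 1000, reserve = 256, free_pages = 900;

	printf("ordinary task:  %s\n",
	       allocation_allowed(free_pages, min_wmark, 0, reserve) ?
	       "allocates" : "fails (below min watermark)");
	printf("PF_OOM_HANDLER: %s\n",
	       allocation_allowed(free_pages, min_wmark, 1, reserve) ?
	       "allocates" : "fails");
	return 0;
}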
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves
  [not found] ` <alpine.DEB.2.02.1312051537550.7717-X6Q0R45D7oAcqpCFd4KODRPsWskHk0ljAL8bYrjMMd8@public.gmane.org>
@ 2013-12-06 17:34 ` Johannes Weiner
  2013-12-07 16:38   ` Tim Hockin
  2013-12-06 19:01 ` Tejun Heo
  1 sibling, 1 reply; 39+ messages in thread
From: Johannes Weiner @ 2013-12-06 17:34 UTC
To: David Rientjes
Cc: Tejun Heo, Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki,
    Mel Gorman, Rik van Riel, Pekka Enberg, Christoph Lameter,
    Li Zefan, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
    linux-mm-Bw31MaZKKs3YtjvyW6yDsg, cgroups-u79uwXL29TY76Z2rM5mHXA

On Thu, Dec 05, 2013 at 03:49:57PM -0800, David Rientjes wrote:

> On Wed, 4 Dec 2013, Tejun Heo wrote:
>
> > Hello,
> >
>
> Tejun, how are you?
>
> > Umm.. without delving into details, aren't you basically creating a
> > memory cgroup inside a memory cgroup?  Doesn't sound like a
> > particularly well thought-out plan to me.
> >
>
> I agree that we wouldn't need such support if we are only addressing
> memcg oom conditions.  We could do things like
> A/memory.limit_in_bytes == 128M and A/b/memory.limit_in_bytes == 126M
> and then attach the process waiting on A/b/memory.oom_control to A, and
> that would work perfectly.
>
> However, we also need to discuss system oom handling.  We have an
> interest in being able to allow userspace to handle system oom
> conditions since the policy will differ depending on machine and we
> can't encode every possible mechanism into the kernel.  For example, on
> system oom we want to kill a process from the lowest priority top-level
> memcg.  We lack that ability entirely in the kernel, and since the sum
> of our top-level memcgs' memory.limit_in_bytes exceeds the amount of
> present RAM, we run into these oom conditions a _lot_.

A simple and natural solution to this is to have the global OOM killer
respect cgroups.  You go through all the effort of carefully grouping
tasks into bigger entities that you then arrange hierarchically.  The
global OOM killer should not just treat all tasks as equal peers.

We can add a per-cgroup OOM priority knob and have the global OOM
handler pick victim tasks from the one or more groups that have the
lowest priority.

Out of the box, every cgroup has the same priority, which means we can
add this feature without changing the default behavior.

> So the first step, in my opinion, is to add a system oom notification
> on the root memcg's memory.oom_control, which currently allows
> registering an eventfd() notification but never actually triggers.  I
> did that in a patch and it was merged into -mm but was pulled out for
> later discussion.
>
> Then, we need to ensure that the userspace that is registered to handle
> such events can actually run, and that is difficult to do when the
> system is oom.  The proposal is to allow such processes, now marked as
> PF_OOM_HANDLER, to be able to access pre-defined per-zone memory
> reserves in the page allocator.  The only special handling for
> PF_OOM_HANDLER in the page allocator itself would be under such oom
> conditions (memcg oom conditions have no problem allocating the memory,
> only charging it).  The amount of reserves would be defined as
> memory.oom_reserve_in_bytes from within the root memcg as defined by
> this patch, i.e. allow this amount of memory to be allocated in the
> page allocator for PF_OOM_HANDLER below the per-zone min watermarks.
>
> This, I believe, is the cleanest interface for users who choose to use
> a non-default policy by setting memory.oom_reserve_in_bytes, and it
> constrains all of the code to memcg, which you have to configure for
> such support.
>
> The system oom condition is not addressed in this patch series,
> although the PF_OOM_HANDLER bit can be used for that purpose.  I didn't
> post that patch because the notification on the root memcg's
> memory.oom_control in such conditions is currently being debated, so we
> need to solve that issue first.
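Johannes' proposal, sketched as a userspace model: pick the victim group
by lowest priority, breaking ties by usage.  The knob, the struct, and
the values below are hypothetical; no such interface exists yet:

#include <stdio.h>

struct group {
	const char *name;
	int oom_priority;	/* hypothetical knob: lower = killed first */
	unsigned long usage_mb;
};

int main(void)
{
	struct group groups[] = {
		{ "batch",   1, 3072 },
		{ "serving", 5, 4096 },
		{ "system",  9,  512 },
	};
	struct group *victim = &groups[0];
	int i;

	/* Victims come from the lowest-priority top-level group; within
	 * a priority level, prefer the biggest memory consumer. */
	for (i = 1; i < 3; i++) {
		if (groups[i].oom_priority < victim->oom_priority ||
		    (groups[i].oom_priority == victim->oom_priority &&
		     groups[i].usage_mb > victim->usage_mb))
			victim = &groups[i];
	}
	printf("oom: selecting victim from /%s\n", victim->name);
	return 0;
}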
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves 2013-12-06 17:34 ` Johannes Weiner @ 2013-12-07 16:38 ` Tim Hockin 2013-12-07 17:40 ` Johannes Weiner 0 siblings, 1 reply; 39+ messages in thread From: Tim Hockin @ 2013-12-07 16:38 UTC (permalink / raw) To: Johannes Weiner Cc: Michal Hocko, Li Zefan, KAMEZAWA Hiroyuki, Tejun Heo, Christoph Lameter, David Rientjes, linux-mm, Rik van Riel, Pekka Enberg, cgroups, Mel Gorman, Andrew Morton, linux-kernel We actually started with kernel patches along these lines - per-memcg scores and all of our crazy policy requirements. It turns out that changing policies is hard. When David offered the opportunity to manage it all in user space it sounded like a great idea. If this can be made to work as a high prio daemon with access to reserves, we would like it. Tim On Dec 6, 2013 9:36 AM, "Johannes Weiner" <hannes@cmpxchg.org> wrote: > On Thu, Dec 05, 2013 at 03:49:57PM -0800, David Rientjes wrote: > > On Wed, 4 Dec 2013, Tejun Heo wrote: > > > > > Hello, > > > > > > > Tejun, how are you? > > > > > Umm.. without delving into details, aren't you basically creating a > > > memory cgroup inside a memory cgroup? Doesn't sound like a > > > particularly well thought-out plan to me. > > > > > > > I agree that we wouldn't need such support if we are only addressing > memcg > > oom conditions. We could do things like A/memory.limit_in_bytes == 128M > > and A/b/memory.limit_in_bytes == 126MB and then attach the process > waiting > > on A/b/memory.oom_control to A and that would work perfectly. > > > > However, we also need to discuss system oom handling. We have an > interest > > in being able to allow userspace to handle system oom conditions since > the > > policy will differ depending on machine and we can't encode every > possible > > mechanism into the kernel. For example, on system oom we want to kill a > > process from the lowest priority top-level memcg. We lack that ability > > entirely in the kernel and since the sum of our top-level memcgs' > > memory.limit_in_bytes exceeds the amount of present RAM, we run into > these > > oom conditions a _lot_. > > A simple and natural solution to this is to have the global OOM killer > respect cgroups. You go through all the effort of carefully grouping > tasks into bigger entities that you then arrange hierarchically. The > global OOM killer should not just treat all tasks as equal peers. > > We can add a per-cgroup OOM priority knob and have the global OOM > handler pick victim tasks from the one or more groups that have the > lowest priority. > > Out of the box, every cgroup has the same priority, which means we can > add this feature without changing the default behavior. > > > So the first step, in my opinion, is to add a system oom notification on > > the root memcg's memory.oom_control which currently allows registering an > > eventfd() notification but never actually triggers. I did that in a > patch > > and it was merged into -mm but was pulled out for later discussion. > > > > Then, we need to ensure that the userspace that is registered to handle > > such events can actually allocate memory, and that is difficult to do when the system is oom. The > > proposal is to allow such processes, now marked as PF_OOM_HANDLER, to be > > able to access pre-defined per-zone memory reserves in the page > allocator.
> > The only special handling for PF_OOM_HANDLER in the page allocator itself > > would be under such oom conditions (memcg oom conditions have no problem > > allocating the memory, only charging it). The amount of reserves would > be > > defined as memory.oom_reserve_in_bytes from within the root memcg as > > defined by this patch, i.e. allow this amount of memory to be allocated > in > > the page allocator for PF_OOM_HANDLER below the per-zone min watermarks. > > > > This, I believe, is the cleanest interface for users who choose to use a > > non-default policy by setting memory.oom_reserve_in_bytes and constrains > > all of the code to memcg which you have to configure for such support. > > > > The system oom condition is not addressed in this patch series, although > > the PF_OOM_HANDLER bit can be used for that purpose. I didn't post that > > patch because the notification on the root memcg's memory.oom_control in > > such conditions is currently being debated, so we need to solve that > issue > > first. ^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves 2013-12-07 16:38 ` Tim Hockin @ 2013-12-07 17:40 ` Johannes Weiner 2013-12-07 18:12 ` Tim Hockin 0 siblings, 1 reply; 39+ messages in thread From: Johannes Weiner @ 2013-12-07 17:40 UTC (permalink / raw) To: Tim Hockin Cc: Michal Hocko, Li Zefan, KAMEZAWA Hiroyuki, Tejun Heo, Christoph Lameter, David Rientjes, linux-mm, Rik van Riel, Pekka Enberg, cgroups, Mel Gorman, Andrew Morton, linux-kernel Hello Tim! On Sat, Dec 07, 2013 at 08:38:20AM -0800, Tim Hockin wrote: > We actually started with kernel patches along these lines - per-memcg > scores and all of our crazy policy requirements. > > It turns out that changing policies is hard. > > When David offered the opportunity to manage it all in user space it > sounded like a great idea. > > If this can be made to work as a high prio daemon with access to reserves, > we would like it. We cannot talk solutions if you won't describe the problem. It's understandable that you can't talk about internal details, but it's possible to describe a technical problem in a portable fashion such that people can understand and evaluate it without knowing your whole application. Companies do this all the time. "The way our blackbox works makes it really hard to hook it up to the Linux kernel" is not a very convincing technical argument to change the Linux kernel. Thanks! ^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves 2013-12-07 17:40 ` Johannes Weiner @ 2013-12-07 18:12 ` Tim Hockin [not found] ` <CAAAKZwvanMiz8QZVOU0-SUKYzqcaJAXn0HxYs5+=Zakmnbcfbg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org> 0 siblings, 1 reply; 39+ messages in thread From: Tim Hockin @ 2013-12-07 18:12 UTC (permalink / raw) To: Johannes Weiner Cc: Michal Hocko, KAMEZAWA Hiroyuki, Li Zefan, Tejun Heo, David Rientjes, Christoph Lameter, linux-mm, Rik van Riel, Pekka Enberg, Andrew Morton, Mel Gorman, cgroups, linux-kernel You more or less described the fundamental change - a score per memcg, with a recursive OOM killer which evaluates scores between siblings at the same level. It gets a bit complicated because we have need of wider scoring ranges than are provided by default and because we score PIDs against memcgs at a given scope. We also have some tiebreaker heuristic (age). We also have a handful of features that depend on OOM handling like the aforementioned automatically growing and changing the actual OOM score depending on usage in relation to various thresholds (e.g. we sold you X, and we allow you to go over X but if you do, your likelihood of death in case of system OOM goes up). Do you really want us to teach the kernel policies like this? It would be way easier to do and test in userspace. Tim ^ permalink raw reply [flat|nested] 39+ messages in thread
[parent not found: <CAAAKZwvanMiz8QZVOU0-SUKYzqcaJAXn0HxYs5+=Zakmnbcfbg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>]
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves [not found] ` <CAAAKZwvanMiz8QZVOU0-SUKYzqcaJAXn0HxYs5+=Zakmnbcfbg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org> @ 2013-12-07 19:06 ` Johannes Weiner 2013-12-07 21:04 ` Tim Hockin 0 siblings, 1 reply; 39+ messages in thread From: Johannes Weiner @ 2013-12-07 19:06 UTC (permalink / raw) To: Tim Hockin Cc: Michal Hocko, KAMEZAWA Hiroyuki, Li Zefan, Tejun Heo, David Rientjes, Christoph Lameter, linux-mm-Bw31MaZKKs3YtjvyW6yDsg, Rik van Riel, Pekka Enberg, Andrew Morton, Mel Gorman, cgroups-u79uwXL29TY76Z2rM5mHXA, linux-kernel-u79uwXL29TY76Z2rM5mHXA On Sat, Dec 07, 2013 at 10:12:19AM -0800, Tim Hockin wrote: > You more or less described the fundamental change - a score per memcg, with > a recursive OOM killer which evaluates scores between siblings at the same > level. > > It gets a bit complicated because we have need of wider scoring ranges than > are provided by default If so, I'm sure you can make a convincing case to widen the internal per-task score ranges. The per-memcg score ranges have not even been defined, so this is even easier. > and because we score PIDs against memcgs at a given scope. You are describing bits of a solution, not a problem. And I can't possibly infer a problem from this. > We also have some tiebreaker heuristic (age). Either periodically update the per-memcg score from userspace or implement this in the kernel. We have considered CPU usage history/runtime etc. in the past when picking an OOM victim task. But I'm again just speculating what your problem is, so this may or may not be a feasible solution. > We also have a handful of features that depend on OOM handling like the > aforementioned automatically growing and changing the actual OOM score > depending on usage in relation to various thresholds (e.g. we sold you X, > and we allow you to go over X but if you do, your likelihood of death in > case of system OOM goes up). You can trivially monitor threshold events from userspace with the existing infrastructure and accordingly update the per-memcg score. > Do you really want us to teach the kernel policies like this? It would be > way easier to do and test in userspace. Maybe. Providing fragments of your solution is not an efficient way to communicate the problem. And you have to sell the problem before anybody can be expected to even consider your proposal as one of the possible solutions. ^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves 2013-12-07 19:06 ` Johannes Weiner @ 2013-12-07 21:04 ` Tim Hockin 0 siblings, 0 replies; 39+ messages in thread From: Tim Hockin @ 2013-12-07 21:04 UTC (permalink / raw) To: Johannes Weiner Cc: Michal Hocko, Li Zefan, KAMEZAWA Hiroyuki, Tejun Heo, Christoph Lameter, David Rientjes, linux-mm, Rik van Riel, Pekka Enberg, cgroups, Mel Gorman, Andrew Morton, linux-kernel We have hierarchical "containers". Jobs exist in these containers. The containers can hold sub-containers. In case of system OOM we want to kill in strict priority order. From the root of the hierarchy, choose the lowest priority. This could be a task or a memcg. If a memcg, recurse. We CAN do it in kernel (in fact we do, and I argued for that, and David acquiesced). But doing it in kernel means changes are slow and risky. What we really have is a bunch of features that we offer to our users that need certain OOM-time behaviors and guarantees to be implemented. I don't expect that most of our changes are useful for anyone outside of Google, really. They come with a lot of environmental assumptions. This is why David finally convinced me it was easier to release changes, to fix bugs, and to update kernels if we do this in userspace. I apologize if I am not giving you what you want. I am typing on a phone at the moment. If this still doesn't help I can try from a computer later. Tim On Dec 7, 2013 11:07 AM, "Johannes Weiner" <hannes@cmpxchg.org> wrote: > On Sat, Dec 07, 2013 at 10:12:19AM -0800, Tim Hockin wrote: > > You more or less described the fundamental change - a score per memcg, > with > > a recursive OOM killer which evaluates scores between siblings at the > same > > level. > > > > It gets a bit complicated because we have need of wider scoring ranges > than > > are provided by default > > If so, I'm sure you can make a convincing case to widen the internal > per-task score ranges. The per-memcg score ranges have not even been > defined, so this is even easier. > > > and because we score PIDs against memcgs at a given scope. > > You are describing bits of a solution, not a problem. And I can't > possibly infer a problem from this. > > > We also have some tiebreaker heuristic (age). > > Either periodically update the per-memcg score from userspace or > implement this in the kernel. We have considered CPU usage > history/runtime etc. in the past when picking an OOM victim task. > > But I'm again just speculating what your problem is, so this may or > may not be a feasible solution. > > > We also have a handful of features that depend on OOM handling like the > > aforementioned automatically growing and changing the actual OOM score > > depending on usage in relation to various thresholds (e.g. we sold you > X, > > and we allow you to go over X but if you do, your likelihood of death in > > case of system OOM goes up). > > You can trivially monitor threshold events from userspace with the > existing infrastructure and accordingly update the per-memcg score. > > > Do you really want us to teach the kernel policies like this? It would > be > > way easier to do and test in userspace. > > Maybe. Providing fragments of your solution is not an efficient way > to communicate the problem. And you have to sell the problem before > anybody can be expected to even consider your proposal as one of the > possible solutions.
^ permalink raw reply [flat|nested] 39+ messages in thread
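Tim's kill policy above is concise enough to sketch. The following illustrates the recursion he describes and is not code from their kernel; the entity structure, its fields, and pick_victim() are all invented for this example:

    #include <stdbool.h>
    #include <stddef.h>

    /* An entity is either a task or a memcg; memcgs have children. */
    struct entity {
            int priority;              /* lower value = killed first */
            bool is_memcg;
            struct entity *children;   /* valid only when is_memcg */
            int nr_children;
    };

    /* From the root of the hierarchy, choose the lowest-priority child;
     * if it is a memcg, recurse into it until a task is found. */
    static struct entity *pick_victim(struct entity *group)
    {
            struct entity *victim = NULL;

            for (int i = 0; i < group->nr_children; i++) {
                    struct entity *e = &group->children[i];

                    if (!victim || e->priority < victim->priority)
                            victim = e;  /* ties broken by age in their setup */
            }
            return (victim && victim->is_memcg) ? pick_victim(victim) : victim;
    }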
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves [not found] ` <alpine.DEB.2.02.1312051537550.7717-X6Q0R45D7oAcqpCFd4KODRPsWskHk0ljAL8bYrjMMd8@public.gmane.org> 2013-12-06 17:34 ` Johannes Weiner @ 2013-12-06 19:01 ` Tejun Heo 2013-12-09 20:10 ` David Rientjes 1 sibling, 1 reply; 39+ messages in thread From: Tejun Heo @ 2013-12-06 19:01 UTC (permalink / raw) To: David Rientjes Cc: Johannes Weiner, Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki, Mel Gorman, Rik van Riel, Pekka Enberg, Christoph Lameter, Li Zefan, linux-kernel-u79uwXL29TY76Z2rM5mHXA, linux-mm-Bw31MaZKKs3YtjvyW6yDsg, cgroups-u79uwXL29TY76Z2rM5mHXA Yo, David. On Thu, Dec 05, 2013 at 03:49:57PM -0800, David Rientjes wrote: > Tejun, how are you? Doing pretty good. How's yourself? :) > > Umm.. without delving into details, aren't you basically creating a > > memory cgroup inside a memory cgroup? Doesn't sound like a > > particularly well thought-out plan to me. > > I agree that we wouldn't need such support if we are only addressing memcg > oom conditions. We could do things like A/memory.limit_in_bytes == 128M > and A/b/memory.limit_in_bytes == 126MB and then attach the process waiting > on A/b/memory.oom_control to A and that would work perfectly. Or even just create a separate parallel cgroup A/memory.limit_in_bytes == 126M A-oom/memory.limit_in_bytes = 2M and avoid the extra layer of nesting. > However, we also need to discuss system oom handling. We have an interest > in being able to allow userspace to handle system oom conditions since the > policy will differ depending on machine and we can't encode every possible > mechanism into the kernel. For example, on system oom we want to kill a > process from the lowest priority top-level memcg. We lack that ability > entirely in the kernel and since the sum of our top-level memcgs' > memory.limit_in_bytes exceeds the amount of present RAM, we run into these > oom conditions a _lot_. > > So the first step, in my opinion, is to add a system oom notification on > the root memcg's memory.oom_control which currently allows registering an > eventfd() notification but never actually triggers. I did that in a patch > and it was merged into -mm but was pulled out for later discussion. Hmmm... this seems to be a different topic. You're saying that it'd be beneficial to add userland oom handling at the system level and if that happens having per-memcg oom reserve would be consistent with the system-wide one, right? While I can see some merit in that argument, the whole thing is predicated on system level userland oom handling being justified && even then I'm not quite sure whether "consistent interface" is enough to have oom reserve in all memory cgroups. It feels a bit backwards because, here, the root memcg is the exception, not the other way around. Root is the only one which can't put oom handler in a separate cgroup, so it could make more sense to special case that rather than spreading the interface for global userland oom to everyone else. But, before that, system level userland OOM handling sounds scary to me. I thought about userland OOM handling for memcgs and it does make some sense. ie. there is a different action that userland oom handler can take which kernel oom handler can't - it can expand the limit of the offending cgroup, effectively using OOM handler as a sizing estimator. I'm not sure whether that in itself is a good idea but then again it might not be possible to clearly separate out sizing from oom conditions.
Anyways, but for system level OOM handling, there's no other action userland handler can take. It's not like the OOM handler paging the admin to install more memory is a reasonable mode of operation to support. The *only* action userland OOM handler can take is killing something. Now, if that's the case and we have kernel OOM handler anyway, I think the best course of action is improving kernel OOM handler and teach it to make the decisions that the userland handler would consider good. That should be doable, right? The thing is OOM handling in userland is an inherently fragile thing and it can *never* replace kernel OOM handling. You may reserve any amount of memory you want but there would still be cases that it may fail. It's not like we have owner-based allocation all through the kernel or are willing to pay overhead for such thing. Even if that part can be guaranteed somehow (no idea how), the kernel still can NEVER trust the userland OOM handler. No matter what we do, we need a kernel OOM handler with no resource dependency. So, there isn't anything userland OOM handler can inherently do better and we can't do away with kernel handler no matter what. On both accounts, it seems like the best course of action is making the system-wide kernel OOM handler make better decisions if possible at all. If that's impossible, let's first think about why that's the case before hastily opening this new can of worms. Thanks! -- tejun ^ permalink raw reply [flat|nested] 39+ messages in thread
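To make the parallel-cgroup layout Tejun suggested above concrete: under cgroup v1 it is pure userspace configuration. A sketch follows, where the mount point, group names, byte values, and the handler pid are all illustrative and error handling is omitted:

    #include <stdio.h>
    #include <sys/stat.h>

    static void write_str(const char *path, const char *val)
    {
            FILE *f = fopen(path, "w");

            if (f) {
                    fputs(val, f);
                    fclose(f);
            }
    }

    int main(void)
    {
            /* workloads overcommitted under A, the oom handler under A-oom */
            mkdir("/sys/fs/cgroup/memory/A", 0755);
            mkdir("/sys/fs/cgroup/memory/A-oom", 0755);
            write_str("/sys/fs/cgroup/memory/A/memory.limit_in_bytes", "132120576");   /* 126M */
            write_str("/sys/fs/cgroup/memory/A-oom/memory.limit_in_bytes", "2097152"); /* 2M */
            /* move the (hypothetical) handler pid into the reserved group */
            write_str("/sys/fs/cgroup/memory/A-oom/tasks", "12345");
            return 0;
    }

The disagreement that follows is about whether such a layout can also cover the page allocator under system-wide oom, which no memcg limit controls.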
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves 2013-12-06 19:01 ` Tejun Heo @ 2013-12-09 20:10 ` David Rientjes [not found] ` <alpine.DEB.2.02.1312061441390.8949-X6Q0R45D7oAcqpCFd4KODRPsWskHk0ljAL8bYrjMMd8@public.gmane.org> 0 siblings, 1 reply; 39+ messages in thread From: David Rientjes @ 2013-12-09 20:10 UTC (permalink / raw) To: Tejun Heo Cc: Johannes Weiner, Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki, Mel Gorman, Rik van Riel, Pekka Enberg, Christoph Lameter, Li Zefan, linux-kernel, linux-mm, cgroups On Fri, 6 Dec 2013, Tejun Heo wrote: > > Tejun, how are you? > > Doing pretty good. How's yourself? :) > Not bad, busy with holidays and all that. > > I agree that we wouldn't need such support if we are only addressing memcg > > oom conditions. We could do things like A/memory.limit_in_bytes == 128M > > and A/b/memory.limit_in_bytes == 126MB and then attach the process waiting > > on A/b/memory.oom_control to A and that would work perfectly. > > Or even just create a separate parallel cgroup A/memory.limit_in_bytes > == 126M A-oom/memory.limit_in_bytes = 2M and avoid the extra layer of > nesting. > Indeed. The setup I'm specifically trying to attack is where the sum of the limits of all non-oom handling memcgs (A/b in my model, A in yours) exceed the amount of RAM. If the system has 256MB, /=256MB A=126MB A-oom=2MB B=188MB B-oom=4MB or /=256MB C=128MB D=192MB C/a=126M D/a=188MB then it's possible for A + B or C/a + D/a to cause a system oom condition and meanwhile A-oom/tasks, B-oom/tasks, C/tasks, and D/tasks cannot allocate memory to handle it. > > However, we also need to discuss system oom handling. We have an interest > > in being able to allow userspace to handle system oom conditions since the > > policy will differ depending on machine and we can't encode every possible > > mechanism into the kernel. For example, on system oom we want to kill a > > process from the lowest priority top-level memcg. We lack that ability > > entirely in the kernel and since the sum of our top-level memcgs' > > memory.limit_in_bytes exceeds the amount of present RAM, we run into these > > oom conditions a _lot_. > > > > So the first step, in my opinion, is to add a system oom notification on > > the root memcg's memory.oom_control which currently allows registering an > > eventfd() notification but never actually triggers. I did that in a patch > > and it was merged into -mm but was pulled out for later discussion. > > Hmmm... this seems to be a different topic. You're saying that it'd > be beneficial to add userland oom handling at the system level and if > that happens having per-memcg oom reserve would be consistent with the > system-wide one, right? Right, and apologies for not discussing the system oom handling here since its notification on the root memcg is currently being debated as well. The idea is that admins and users aren't going to be concerned about memory allocation through the page allocator vs memory charging through the memory controller; they simply want memory for their userspace oom handling. And since the notification would be tied to the root memcg, it makes sense to make the amount of memory allowed to allocate exclusively for these handlers a memcg interface. So the cleanest solution, in my opinion, was to add the interface as part of memcg.
> While I can see some merit in that argument, > the whole thing is predicated on system level userland oom handling > being justified && even then I'm not quite sure whether "consistent > interface" is enough to have oom reserve in all memory cgroups. It > feels a bit backwards because, here, the root memcg is the exception, > not the other way around. Root is the only one which can't put oom > handler in a separate cgroup, so it could make more sense to special > case that rather than spreading the interface for global userland oom > to everyone else. > It's really the same thing, though, from the user perspective. They don't care about page allocation failure vs memcg charge failure, they simply want to ensure that the memory set aside for memory.oom_reserve_in_bytes is available in oom conditions. With the suggested alternatives: /=256MB A=126MB A-oom=2MB B=188MB B-oom=4MB or /=256MB C=128MB D=192MB C/a=126M D/a=188MB we can't distinguish between what is able to allocate below per-zone min watermarks in the page allocator as the oom reserve. The key point is that the root memcg is not the only memcg concerned with page allocator memory reserves, it's any oom reserve. If A's usage is 124MB and B's usage is 132MB, we can't specify that processes attached to B-oom should be able to bypass per-zone min watermarks without an interface such as that being proposed. > But, before that, system level userland OOM handling sounds scary to > me. I thought about userland OOM handling for memcgs and it does make > some sense. ie. there is a different action that userland oom handler > can take which kernel oom handler can't - it can expand the limit of > the offending cgroup, effectively using OOM handler as a sizing > estimator. I'm not sure whether that in itself is a good idea but > then again it might not be possible to clearly separate out sizing > from oom conditions. > > Anyways, but for system level OOM handling, there's no other action > userland handler can take. It's not like the OOM handler paging the > admin to install more memory is a reasonable mode of operation to > support. The *only* action userland OOM handler can take is killing > something. Now, if that's the case and we have kernel OOM handler > anyway, I think the best course of action is improving kernel OOM > handler and teach it to make the decisions that the userland handler > would consider good. That should be doable, right? > It's much more powerful than that; you're referring to the mechanism to guarantee future memory freeing so the system or memcg is no longer oom, and that's only one case of possible handling. I have a customer who wants to save heap profiles at the time of oom as well, for example, and their sole desire is to be able to capture memory statistics before the oom kill takes place. The sine qua non is that memory reserves allow something to be done in such conditions: if you try to do a "ps" or "ls" or cat a file in an oom memcg, you hang. We need better functionality to ensure that we can do some action prior to the oom kill itself, whether that comes from userspace or the kernel. We simply cannot rely on things like memory thresholds or vmpressure to grab these heap profiles, there is no guarantee that memory will not be exhausted and the oom kill would already have taken place before the process handling the notification wakes up. 
(And any argument that it is possible by simply making the threshold happen early enough is a non-starter: it does not guarantee the heaps are collected for oom conditions and the oom kill can still occur prematurely in machines that overcommit their memcg limits, as we do.) > The thing is OOM handling in userland is an inherently fragile thing > and it can *never* replace kernel OOM handling. You may reserve any > amount of memory you want but there would still be cases that it may > fail. It's not like we have owner-based allocation all through the > kernel or are willing to pay overhead for such thing. Even if that > part can be guaranteed somehow (no idea how), the kernel still can > NEVER trust the userland OOM handler. No matter what we do, we need a > kernel OOM handler with no resource dependency. > I was never an advocate for the current memory.oom_control behavior that allows you to disable the oom killer indefinitely for a memcg and I agree that it is dangerous if userspace will not cause future memory freeing or toggle the value such that the kernel will kill something. So I agree with you about today's functionality, not with the functionality that this patchset, and the notification on the root memcg for system oom conditions, provides. I also proposed a memory.oom_delay_millisecs that we have used for several years dating back even to cpusets, which simply delays the oom kill such that userspace can do "something" like send a kill itself, collect heap profiles, send a signal to our malloc() implementation to free arena memory, etc. prior to the kernel oom kill. > So, there isn't anything userland OOM handler can inherently do better > and we can't do away with kernel handler no matter what. On both > accounts, it seems like the best course of action is making the > system-wide kernel OOM handler make better decisions if possible at > all. If that's impossible, let's first think about why that's the > case before hastily opening this new can of worms. > We certainly can get away with the kernel oom killer in 99% of cases with this functionality for users who choose to have their own oom handling implementations. We also can't possibly code every single handling policy into the kernel: we can't guarantee that our version of malloc() is able to free memory back to the kernel when waking up on a memory.oom_control notification prior to the memcg oom killer killing something, for example, without this functionality. ^ permalink raw reply [flat|nested] 39+ messages in thread
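The vmpressure notifications David dismisses above use the same cgroup v1 eventfd plumbing as memory.oom_control, registered against memory.pressure_level. A hedged sketch for context; the memcg path is illustrative and error handling is omitted:

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/eventfd.h>

    int main(void)
    {
            char line[64];
            uint64_t count;
            int efd = eventfd(0, 0);
            int pfd = open("/sys/fs/cgroup/memory/A/memory.pressure_level", O_RDONLY);
            int cfd = open("/sys/fs/cgroup/memory/A/cgroup.event_control", O_WRONLY);

            /* "<event_fd> <fd of memory.pressure_level> <level>" */
            snprintf(line, sizeof(line), "%d %d low", efd, pfd);
            write(cfd, line, strlen(line));

            while (read(efd, &count, sizeof(count)) == sizeof(count)) {
                    /* e.g. tell malloc() to drop its queue of freed regions */
            }
            return 0;
    }

Nothing serializes this wakeup against allocation progress, which is exactly David's objection: the oom kill can land before the handler ever runs.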
[parent not found: <alpine.DEB.2.02.1312061441390.8949-X6Q0R45D7oAcqpCFd4KODRPsWskHk0ljAL8bYrjMMd8@public.gmane.org>]
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves [not found] ` <alpine.DEB.2.02.1312061441390.8949-X6Q0R45D7oAcqpCFd4KODRPsWskHk0ljAL8bYrjMMd8@public.gmane.org> @ 2013-12-09 22:37 ` Johannes Weiner 2013-12-10 21:50 ` Tejun Heo 1 sibling, 0 replies; 39+ messages in thread From: Johannes Weiner @ 2013-12-09 22:37 UTC (permalink / raw) To: David Rientjes Cc: Tejun Heo, Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki, Mel Gorman, Rik van Riel, Pekka Enberg, Christoph Lameter, Li Zefan, linux-kernel-u79uwXL29TY76Z2rM5mHXA, linux-mm-Bw31MaZKKs3YtjvyW6yDsg, cgroups-u79uwXL29TY76Z2rM5mHXA On Mon, Dec 09, 2013 at 12:10:44PM -0800, David Rientjes wrote: > On Fri, 6 Dec 2013, Tejun Heo wrote: > > > > Tejun, how are you? > > > > Doing pretty good. How's yourself? :) > > > > Not bad, busy with holidays and all that. > > > > > I agree that we wouldn't need such support if we are only addressing memcg > > > oom conditions. We could do things like A/memory.limit_in_bytes == 128M > > > and A/b/memory.limit_in_bytes == 126MB and then attach the process waiting > > > on A/b/memory.oom_control to A and that would work perfectly. > > > > Or even just create a separate parallel cgroup A/memory.limit_in_bytes > > == 126M A-oom/memory.limit_in_bytes = 2M and avoid the extra layer of > > nesting. > > > > Indeed. The setup I'm specifically trying to attack is where the sum of > the limits of all non-oom handling memcgs (A/b in my model, A in yours) > exceed the amount of RAM. If the system has 256MB, > > /=256MB > A=126MB A-oom=2MB B=188MB B-oom=4MB > > or > > /=256MB > C=128MB D=192MB > C/a=126M D/a=188MB > > then it's possible for A + B or C/a + D/a to cause a system oom condition > and meanwhile A-oom/tasks, B-oom/tasks, C/tasks, and D/tasks cannot > allocate memory to handle it. So your per-memcg handlers want access to PHYSICAL MEMORY reserves during system-wide OOM, but this patch implements MEMORY CHARGE reserves only, which are obviously meaningless during system-wide OOM. In other words, this is an entirely different use case than what this patchset is really about. You have to sell us on the problem first, then we can discuss a solution. Instead, you insist on the solution and keep changing the problem whenever we find it no longer justifies your proposal. > > > However, we also need to discuss system oom handling. We have an interest > > > in being able to allow userspace to handle system oom conditions since the > > > policy will differ depending on machine and we can't encode every possible > > > mechanism into the kernel. For example, on system oom we want to kill a > > > process from the lowest priority top-level memcg. We lack that ability > > > entirely in the kernel and since the sum of our top-level memcgs' > > > memory.limit_in_bytes exceeds the amount of present RAM, we run into these > > > oom conditions a _lot_. > > > > > > So the first step, in my opinion, is to add a system oom notification on > > > the root memcg's memory.oom_control which currently allows registering an > > > eventfd() notification but never actually triggers. I did that in a patch > > > and it was merged into -mm but was pulled out for later discussion. > > > > Hmmm... this seems to be a different topic. You're saying that it'd > > be beneficial to add userland oom handling at the system level and if > > that happens having per-memcg oom reserve would be consistent with the > > system-wide one, right?
> > Right, and apologies for not discussing the system oom handling here since > its notification on the root memcg is currently being debated as well. > The idea is that admins and users aren't going to be concerned about > memory allocation through the page allocator vs memory charging through > the memory controller; they simply want memory for their userspace oom > handling. And since the notification would be tied to the root memcg, it > makes sense to make the amount of memory allowed to allocate exclusively > for these handlers a memcg interface. So the cleanest solution, in my > opinion, was to add the interface as part of memcg. > > > While I can see some merit in that argument, > > the whole thing is predicated on system level userland oom handling > > being justified && even then I'm not quite sure whether "consistent > > interface" is enough to have oom reserve in all memory cgroups. It > > feels a bit backwards because, here, the root memcg is the exception, > > not the other way around. Root is the only one which can't put oom > > handler in a separate cgroup, so it could make more sense to special > > case that rather than spreading the interface for global userland oom > > to everyone else. > > > > It's really the same thing, though, from the user perspective. They don't > care about page allocation failure vs memcg charge failure, they simply > want to ensure that the memory set aside for memory.oom_reserve_in_bytes > is available in oom conditions. With the suggested alternatives: > > /=256MB > A=126MB A-oom=2MB B=188MB B-oom=4MB > > or > > /=256MB > C=128MB D=192MB > C/a=126M D/a=188MB > > we can't distinguish between what is able to allocate below per-zone min > watermarks in the page allocator as the oom reserve. The key point is > that the root memcg is not the only memcg concerned with page allocator > memory reserves, it's any oom reserve. If A's usage is 124MB and B's > usage is 132MB, we can't specify that processes attached to B-oom should > be able to bypass per-zone min watermarks without an interface such as > that being proposed. The per-zone min watermarks are there to allow rudimentary OOM handling inside the kernel to prevent a complete deadlock. You want to hand them out to an indefinite number of (untrusted?) userspace tasks in the hope that they handle the situation? Also, the following concerns from Tejun still apply: > > The thing is OOM handling in userland is an inherently fragile thing > > and it can *never* replace kernel OOM handling. You may reserve any > > amount of memory you want but there would still be cases that it may > > fail. It's not like we have owner-based allocation all through the > > kernel or are willing to pay overhead for such thing. Even if that > > part can be guaranteed somehow (no idea how), the kernel still can > > NEVER trust the userland OOM handler. No matter what we do, we need a > > kernel OOM handler with no resource dependency. Your userspace handler may very much fail, but it may have squandered all the resources for the kernel fallback handling to actually perform its job. I don't know if you are actually allowing every PF_OOM_HANDLER to simply bypass the watermarks in your kernels, but this seems way too fragile for upstream. > > But, before that, system level userland OOM handling sounds scary to > > me. I thought about userland OOM handling for memcgs and it does make > > some sense. ie. 
there is a different action that userland oom handler > > can take which kernel oom handler can't - it can expand the limit of > > the offending cgroup, effectively using OOM handler as a sizing > > estimator. I'm not sure whether that in itself is a good idea but > > then again it might not be possible to clearly separate out sizing > > from oom conditions. > > > > Anyways, but for system level OOM handling, there's no other action > > userland handler can take. It's not like the OOM handler paging the > > admin to install more memory is a reasonable mode of operation to > > support. The *only* action userland OOM handler can take is killing > > something. Now, if that's the case and we have kernel OOM handler > > anyway, I think the best course of action is improving kernel OOM > > handler and teach it to make the decisions that the userland handler > > would consider good. That should be doable, right? > > > > It's much more powerful than that; you're referring to the mechanism to > guarantee future memory freeing so the system or memcg is no longer oom, > and that's only one case of possible handling. I have a customer who > wants to save heap profiles at the time of oom as well, for example, and > their sole desire is to be able to capture memory statistics before the > oom kill takes place. The sine qua non is that memory reserves allow > something to be done in such conditions: if you try to do a "ps" or "ls" > or cat a file in an oom memcg, you hang. This is conflating per-memcg OOM handling and global OOM handling. You can always ps or ls from outside to analyze a memcg OOM and we have established that there is no good reason to try doing it from inside the OOM group. > We need better functionality to ensure that we can do some action > prior to the oom kill itself, whether that comes from userspace or > the kernel. We simply cannot rely on things like memory thresholds > or vmpressure to grab these heap profiles, there is no guarantee > that memory will not be exhausted and the oom kill would already > have taken place before the process handling the notification wakes > up. (And any argument that it is possible by simply making the > threshold happen early enough is a non-starter: it does not > guarantee the heaps are collected for oom conditions and the oom > kill can still occur prematurely in machines that overcommit their > memcg limits, as we do.) > > > The thing is OOM handling in userland is an inherently fragile thing > > and it can *never* replace kernel OOM handling. You may reserve any > > amount of memory you want but there would still be cases that it may > > fail. It's not like we have owner-based allocation all through the > > kernel or are willing to pay overhead for such thing. Even if that > > part can be guaranteed somehow (no idea how), the kernel still can > > NEVER trust the userland OOM handler. No matter what we do, we need a > > kernel OOM handler with no resource dependency. > > > > I was never an advocate for the current memory.oom_control behavior that > allows you to disable the oom killer indefinitely for a memcg and I agree > that it is dangerous if userspace will not cause future memory freeing or > toggle the value such that the kernel will kill something. This is again confusing system-wide OOM with per-memcg OOM. Disabling the per-memcg OOM handler is perfectly fine because any memory demand from higher up the hierarchy will still kill in such a group. 
The problems Tejun describes are only existent in userspace handling of system-wide OOM situations. Which is the thing you are advocating, not what we currently have. > So I agree with you about today's functionality, not with the > functionality that this patchset, and the notification on the root memcg > for system oom conditions, provides. I also proposed a > memory.oom_delay_millisecs that we have used for several years > dating back even to cpusets, which simply delays the oom kill such > that userspace can do "something" like send a kill itself, collect > heap profiles, send a signal to our malloc() implementation to free > arena memory, etc. prior to the kernel oom kill. > > > So, there isn't anything userland OOM handler can inherently do better > > and we can't do away with kernel handler no matter what. On both > > accounts, it seems like the best course of action is making the > > system-wide kernel OOM handler make better decisions if possible at > > all. If that's impossible, let's first think about why that's the > > case before hastily opening this new can of worms. > > > > We certainly can get away with the kernel oom killer in 99% of cases with > this functionality for users who choose to have their own oom handling > implementations. We also can't possibly code every single handling policy > into the kernel: we can't guarantee that our version of malloc() is > able to free memory back to the kernel when waking up on > a memory.oom_control notification prior to the memcg oom killer killing > something, for example, without this functionality. If you have discardable anonymous memory lying around, the volatile memory patches are a much more reliable way of getting rid of it than to wake up a userspace task and wait & pray a few seconds. Page reclaim has been *the* tool to facilitate overcommit for decades while OOM killing has always been a last-resort measure. Why is this not good enough anymore and why is the only solution to give up and do it all in userspace? ^ permalink raw reply [flat|nested] 39+ messages in thread
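For context, the disabling Johannes refers to is an existing cgroup v1 knob: writing "1" to a memcg's memory.oom_control leaves tasks in that group blocked on charge instead of being killed, until userspace frees memory, raises the limit, or re-enables the killer. A minimal sketch with an illustrative path and no error handling:

    #include <fcntl.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/sys/fs/cgroup/memory/A/memory.oom_control", O_WRONLY);

            write(fd, "1", 1);   /* oom_kill_disable = 1, for this memcg only */
            close(fd);
            /* reading the file back reports "oom_kill_disable 1" and
             * "under_oom 0|1", which a handler can poll */
            return 0;
    }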
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves [not found] ` <alpine.DEB.2.02.1312061441390.8949-X6Q0R45D7oAcqpCFd4KODRPsWskHk0ljAL8bYrjMMd8@public.gmane.org> 2013-12-09 22:37 ` Johannes Weiner @ 2013-12-10 21:50 ` Tejun Heo 2013-12-10 23:55 ` David Rientjes 1 sibling, 1 reply; 39+ messages in thread From: Tejun Heo @ 2013-12-10 21:50 UTC (permalink / raw) To: David Rientjes Cc: Johannes Weiner, Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki, Mel Gorman, Rik van Riel, Pekka Enberg, Christoph Lameter, Li Zefan, linux-kernel-u79uwXL29TY76Z2rM5mHXA, linux-mm-Bw31MaZKKs3YtjvyW6yDsg, cgroups-u79uwXL29TY76Z2rM5mHXA Hey, David. On Mon, Dec 09, 2013 at 12:10:44PM -0800, David Rientjes wrote: > Indeed. The setup I'm specifically trying to attack is where the sum of > the limits of all non-oom handling memcgs (A/b in my model, A in yours) > exceed the amount of RAM. If the system has 256MB, > > /=256MB > A=126MB A-oom=2MB B=188MB B-oom=4MB > > or > > /=256MB > C=128MB D=192MB > C/a=126M D/a=188MB > > then it's possible for A + B or C/a + D/a to cause a system oom condition > and meanwhile A-oom/tasks, B-oom/tasks, C/tasks, and D/tasks cannot > allocate memory to handle it. "tasks"? You mean that tasks can't be read reliably once system-OOM is hit regardless of memcg configuration? > Right, and apologies for not discussing the system oom handling here since > its notification on the root memcg is currently being debated as well. > The idea is that admins and users aren't going to be concerned about > memory allocation through the page allocator vs memory charging through > the memory controller; they simply want memory for their userspace oom > handling. And since the notification would be tied to the root memcg, it > makes sense to make the amount of memory allowed to allocate exclusively > for these handlers a memcg interface. So the cleanest solution, in my > opinion, was to add the interface as part of memcg. I'm still not quite following the reasoning. Can you please elaborate on what the distinction between "page allocator" and "charges through memory controller" has to do with this interface? > It's really the same thing, though, from the user perspective. They don't > care about page allocation failure vs memcg charge failure, they simply > want to ensure that the memory set aside for memory.oom_reserve_in_bytes > is available in oom conditions. With the suggested alternatives: > > /=256MB > A=126MB A-oom=2MB B=188MB B-oom=4MB > > or > > /=256MB > C=128MB D=192MB > C/a=126M D/a=188MB > > we can't distinguish between what is able to allocate below per-zone min > watermarks in the page allocator as the oom reserve. The key point is > that the root memcg is not the only memcg concerned with page allocator > memory reserves, it's any oom reserve. If A's usage is 124MB and B's > usage is 132MB, we can't specify that processes attached to B-oom should > be able to bypass per-zone min watermarks without an interface such as > that being proposed. Okay, are you saying that userland OOM handlers will be able to dip into kernel reserve memory? Maybe I'm mistaken but you realize that that reserve is there to make things like task exits work under OOM conditions, right? The only way userland OOM handlers as you describe would work would be creating a separate reserve for them. Aren't you basically suggesting two memcg domains - one which is overcommitted and the other which isn't? 
But if you want to do that, wouldn't that be something which is a natural fit for memcg hierarchy? Not only that, such hierarchical setup would make sense for other controllers too - you're really creating two fundamentally different resource groups. > It's much more powerful than that; you're referring to the mechanism to > guarantee future memory freeing so the system or memcg is no longer oom, > and that's only one case of possible handling. I have a customer who > wants to save heap profiles at the time of oom as well, for example, and > their sole desire is to be able to capture memory statistics before the > oom kill takes place. The sine qua non is that memory reserves allow > something to be done in such conditions: if you try to do a "ps" or "ls" > or cat a file in an oom memcg, you hang. We need better functionality to > ensure that we can do some action prior to the oom kill itself, whether > that comes from userspace or the kernel. We simply cannot rely on things Well, the gotcha there is that you won't be able to do that with system level OOM handler either unless you create a separately reserved memory, which, again, can be achieved using hierarchical memcg setup already. Am I missing something here? > like memory thresholds or vmpressure to grab these heap profiles, there is > no guarantee that memory will not be exhausted and the oom kill would > already have taken place before the process handling the notification > wakes up. (And any argument that it is possible by simply making the > threshold happen early enough is a non-starter: it does not guarantee the > heaps are collected for oom conditions and the oom kill can still occur > prematurely in machines that overcommit their memcg limits, as we do.) I don't really follow your "guarantee" argument regarding OOM. It's not like we have a mathematically concrete definition of OOM conditions. That'd be nice to have but we simply don't have them. As it currently is defined, it's just "oh well, we tried hard enough but nothing seems to give in. whatever". As currently defined, it's an inherently fuzzy and racy thing. Sure, it *could* be meaningful to try to decrease the raciness if the difference is significant but using absolute terms like guarantee is just misleading, IMHO. You can't guarantee much with something which is racy to begin with. ... > conditions, provides. I also proposed a memory.oom_delay_millisecs that > we have used for several years dating back even to cpusets, which simply > delays the oom kill such that userspace can do "something" like send a > kill itself, collect heap profiles, send a signal to our malloc() > implementation to free arena memory, etc. prior to the kernel oom kill. All the above would require a separately reserved memory, right? Also, a curiosity, how would "sending a signal to our malloc()" work? If you mean sending a signal to malloc() in a different process, that's not gonna work. How is that process gonna have memory to process the signal and free memory from malloc() under OOM condition? > We certainly can get away with the kernel oom killer in 99% of cases with > this functionality for users who choose to have their own oom handling > implementations. We also can't possibly code every single handling policy > into the kernel: we can't guarantee that our version of malloc() is > able to free memory back to the kernel when waking up on > a memory.oom_control notification prior to the memcg oom killer killing > something, for example, without this functionality.
So, malloc() is mapped into the same process as the OOM handler which is gonna be able to tap into physically reserved memory? Also, while freeing, it won't need to coordinate with other processes? If I'm not mistaken, we're talking about a lot of additional complexities throughout the whole mm layer for something which seems, to me, achievable through proper memcg configuration without any modification to the kernel and doesn't seem all that necessary for 99% of use cases, as you said. Unless I'm missing something major (quite possible, of course), I think you'd need stronger rationale. Thanks. -- tejun ^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves 2013-12-10 21:50 ` Tejun Heo @ 2013-12-10 23:55 ` David Rientjes 2013-12-11 9:49 ` Mel Gorman 2013-12-11 12:42 ` Tejun Heo 0 siblings, 2 replies; 39+ messages in thread From: David Rientjes @ 2013-12-10 23:55 UTC (permalink / raw) To: Tejun Heo Cc: Johannes Weiner, Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki, Mel Gorman, Rik van Riel, Pekka Enberg, Christoph Lameter, Li Zefan, linux-kernel, linux-mm, cgroups On Tue, 10 Dec 2013, Tejun Heo wrote: > > Indeed. The setup I'm specifically trying to attack is where the sum of > > the limits of all non-oom handling memcgs (A/b in my model, A in yours) > > exceed the amount of RAM. If the system has 256MB, > > > > /=256MB > > A=126MB A-oom=2MB B=188MB B-oom=4MB > > > > or > > > > /=256MB > > C=128MB D=192MB > > C/a=126M D/a=188MB > > > > then it's possible for A + B or C/a + D/a to cause a system oom condition > > and meanwhile A-oom/tasks, B-oom/tasks, C/tasks, and D/tasks cannot > > allocate memory to handle it. > > "tasks"? You mean that tasks can't be read reliably once system-OOM > is hit regardless of memcg configuration? > Not referring to the files themselves, rather the processes listed by those files, sorry. Those processes would not be able to do a ps, ls, or anything useful even if they are mlocked into memory because they cannot allocate memory in oom conditions. > > Right, and apologies for not discussing the system oom handling here since > > its notification on the root memcg is currently being debated as well. > > The idea is that admins and users aren't going to be concerned about > > memory allocation through the page allocator vs memory charging through > > the memory controller; they simply want memory for their userspace oom > > handling. And since the notification would be tied to the root memcg, it > > makes sense to make the amount of memory allowed to allocate exclusively > > for these handlers a memcg interface. So the cleanest solution, in my > > opinion, was to add the interface as part of memcg. > > I'm still not quite following the reasoning. Can you please elaborate > on what the distinction between "page allocator" and "charges through > memory controller" has to do with this interface? > The interface would allow both access to memory reserves through the page allocator and charging above the memcg limit; it is the only way to guarantee that memory can be allocated by processes attached to the memcg in oom conditions. We must be able to do both; otherwise, no matter what overcharge we allow them via memcg, it is still possible for the allocation itself to fail in the page allocator before we even get to that point. The confusion here is because the access to memory reserves in the page allocator is not presented in this series: there is another ongoing discussion about when to notify processes waiting on the root memcg's memory.oom_control about system oom conditions. I can certainly post that patch as well, but it wouldn't apply without resolving that side-thread first. The high order bit is that we need to be able to address system oom conditions as well as memcg oom conditions in userspace and system oom conditions require us to specify the processes that are allowed access to a special memory reserve. We can't do that with sibling or parent memcgs without some new tunable like memory.allow_page_alloc_reserves, but we would also have to specify the amount of reserves allowed.
It seemed clean and straight-forward to specify this as both the system oom memory reserve amount and memcg limit overcharge amount within the same file, memory.oom_reserve_in_bytes as this patch does. > > It's really the same thing, though, from the user perspective. They don't > > care about page allocation failure vs memcg charge failure, they simply > > want to ensure that the memory set aside for memory.oom_reserve_in_bytes > > is available in oom conditions. With the suggested alternatives: > > > > /=256MB > > A=126MB A-oom=2MB B=188MB B-oom=4MB > > > > or > > > > /=256MB > > C=128MB D=192MB > > C/a=126M D/a=188MB > > > > we can't distinguish between what is able to allocate below per-zone min > > watermarks in the page allocator as the oom reserve. The key point is > > that the root memcg is not the only memcg concerned with page allocator > > memory reserves, it's any oom reserve. If A's usage is 124MB and B's > > usage is 132MB, we can't specify that processes attached to B-oom should > > be able to bypass per-zone min watermarks without an interface such as > > that being proposed. > > Okay, are you saying that userland OOM handlers will be able to dip > into kernel reserve memory? Maybe I'm mistaken but you realize that > that reserve is there to make things like task exits work under OOM > conditions, right? The only way userland OOM handlers as you describe > would work would be creating a separate reserve for them. > Yes, PF_OOM_HANDLER processes would be able to allocate this amount as specified by memory.oom_reserve_in_bytes below the per-zone watermarks and the amount of reserves can already be controlled via min_free_kbytes, which we already increase internally for thp. This could obviously be limited to some sane value that is a fraction of the smallest zone's min watermark; that's not a problem: I've never had a memcg or system oom reserve larger than 2MB and most users would probably get away with 256KB or 512KB. > > It's much more powerful than that; you're referring to the mechanism to > > guarantee future memory freeing so the system or memcg is no longer oom, > > and that's only one case of possible handling. I have a customer who > > wants to save heap profiles at the time of oom as well, for example, and > > their sole desire is to be able to capture memory statistics before the > > oom kill takes place. The sine qua non is that memory reserves allow > > something to be done in such conditions: if you try to do a "ps" or "ls" > > or cat a file in an oom memcg, you hang. We need better functionality to > > ensure that we can do some action prior to the oom kill itself, whether > > that comes from userspace or the kernel. We simply cannot rely on things > > Well, the gotcha there is that you won't be able to do that with > system level OOM handler either unless you create a separately > reserved memory, which, again, can be achieved using hierarchical > memcg setup already. Am I missing something here? > System oom conditions would only arise when the usage of memcgs A + B above causes the page allocator to not be able to allocate memory without oom killing something even though the limits of both A and B may not have been reached yet. No userspace oom handler can allocate memory with access to memory reserves in the page allocator in such a context; it's vital that if we are to handle system oom conditions in userspace we give them access to memory that other processes can't allocate.
You could attach a userspace system oom handler to any memcg in this scenario with memory.oom_reserve_in_bytes and since it has PF_OOM_HANDLER it would be able to allocate in reserves in the page allocator and overcharge in its memcg to handle it. This isn't possible only with a hierarchical memcg setup unless you ensure the sum of the limits of the top level memcgs do not equal or exceed the sum of the min watermarks of all memory zones, and we exceed that. > > conditions, provides. I also proposed a memory.oom_delay_millisecs that > > we have used for several years dating back even to cpusets, which simply > > delays the oom kill such that userspace can do "something" like send a > > kill itself, collect heap profiles, send a signal to our malloc() > > implementation to free arena memory, etc. prior to the kernel oom kill. > > All the above would require a separately reserved memory, right? > Also, a curiosity, how would "sending a signal to our malloc()" work? > If you mean sending a signal to malloc() in a different process, > that's not gonna work. How is that process gonna have memory to > process the signal and free memory from malloc() under OOM condition? > The signal is actually a wakeup from vmpressure; we don't want to wait until reclaim is completely exhausted before freeing this memory, we want to do it at VMPRESSURE_LOW. We simply needed a way to avoid the immediate oom kill unless it has a chance to free excess memory from malloc() first. We can also avoid oom killing entirely if, upon memcg oom notification, we can simply increase its limit instead of freeing memory at all: we have internally the notion of "overlimit" memcgs that are the first memcgs to kill on system oom but are allowed to exceed their reservation if memory is available. It's advantageous to require them to aggressively reclaim up to their reservation and then only increase the memcg limit as a last resort. If we hit system oom later, they get killed first. With this functionality, it does not require more than a few pages of memory.oom_reserve_in_bytes to write to memory.limit_in_bytes. > So, malloc() is mapped into the same process as the OOM handler which > is gonna be able to tap into physically reserved memory? Also, while > freeing, it won't need to coordinate with other processes? > This is only one example and our reasoning for it is somewhat convoluted: we require thp's max_ptes_none to be 0 rather than the default HPAGE_PMD_NR-1 because we don't overcharge anonymous memory that isn't used purely for the sake of thp. This causes all of malloc()'s MADV_DONTNEED to force a split of every thp page because the number of pte_none()'s > 0. Instead, it's better to queue these free()'s and perhaps recycle them by zeroing out the memory and returning it on a subsequent malloc() rather than actually doing the MADV_DONTNEED and causing the thp split. We want to do the split under memory pressure, however, and so there's no coordination required other than malloc() dropping its queue of freed regions. > If I'm not mistaken, we're talking about a lot of additional > complexities throughout the whole mm layer for something which seems, > to me, achievable through proper memcg configuration without any > modification to the kernel and doesn't seem all that necessary for 99% > of use cases, as you said. Unless I'm missing something major (quite > possible, of course), I think you'd need stronger rationale.
The stronger rationale is that you can't handle system oom in userspace without this functionality and we need to do so.
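(Patch 7/8 itself is not quoted in this part of the thread, so as a reading aid, here is a rough sketch of the allocator-side hook David describes above. The PF_OOM_HANDLER test and its placement are assumptions reconstructed from the discussion, not the actual patch; ALLOC_NO_WATERMARKS is the existing allocator flag that e.g. TIF_MEMDIE tasks use to dip below the per-zone min watermarks.)

	/*
	 * Sketch only -- not the actual patch 7/8.  A task flagged as an
	 * oom handler is allowed to allocate below the per-zone min
	 * watermarks, bounded in practice at the memcg level by
	 * memory.oom_reserve_in_bytes as described above.
	 */
	static inline int oom_handler_alloc_flags(int alloc_flags)
	{
		if (unlikely(current->flags & PF_OOM_HANDLER))
			alloc_flags |= ALLOC_NO_WATERMARKS;
		return alloc_flags;
	}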
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves 2013-12-10 23:55 ` David Rientjes @ 2013-12-11 9:49 ` Mel Gorman 2013-12-11 12:42 ` Tejun Heo 1 sibling, 0 replies; 39+ messages in thread From: Mel Gorman @ 2013-12-11 9:49 UTC (permalink / raw) To: David Rientjes Cc: Tejun Heo, Johannes Weiner, Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki, Rik van Riel, Pekka Enberg, Christoph Lameter, Li Zefan, linux-kernel, linux-mm, cgroups

On Tue, Dec 10, 2013 at 03:55:48PM -0800, David Rientjes wrote: > > Okay, are you saying that userland OOM handlers will be able to dip > > into kernel reserve memory? Maybe I'm mistaken but you realize that > > that reserve is there to make things like task exits work under OOM > > conditions, right? The only way userland OOM handlers as you describe > > would work would be creating a separate reserve for them. > > > > Yes, PF_OOM_HANDLER processes would be able to allocate this amount as > specified by memory.oom_reserve_in_bytes below the per-zone watermarks, and > the amount of reserves can already be controlled via min_free_kbytes, > which we already increase internally for thp.

THP increased min_free_kbytes for external fragmentation control as it reduces the amount of mixing of the different migrate types within pageblocks. It was not about reserves; increasing reserves was just the most straightforward way of handling the problem. This discussion is closer to swap-over-network than to anything THP did.

Swap-over-network takes care to only allocate memory from reserves if the allocation was required for swapping, and rejects all other allocation requests to the extent that they can get throttled in throttle_direct_reclaim. Once allocated from reserves for swapping, care is taken that the allocations are not leaked to other users (e.g. is_obj_pfmemalloc checks in slab). It does not look like PF_OOM_HANDLER takes the same sort of care.

Even if it did, it's not quite the same. swap-over-network allocates from the zone reserves *only* the memory required to writeback the pages. It can be slow but it'll make forward progress. A userspace process with special privileges could allocate any amount of memory for any reason so it would need a pre-configured and limited reserve on top of the zone reserves or run the risk of livelock.

-- Mel Gorman SUSE Labs
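(For context, the "care" Mel refers to works roughly as below. This is a simplified paraphrase of the 3.x-era pfmemalloc discipline, not a verbatim excerpt: objects backed by reserve memory are branded pfmemalloc and handed out only to tasks that are themselves on the memory-freeing path, so reserves cannot leak to ordinary consumers.)

	/*
	 * Simplified paraphrase of the pfmemalloc discipline: a
	 * reserve-backed object may only be given to a task that is part
	 * of the memory-freeing path (PF_MEMALLOC); everyone else must go
	 * back to the normal allocation path and be throttled there.
	 */
	static bool may_take_reserve_obj(struct task_struct *tsk,
					 bool obj_pfmemalloc)
	{
		if (!obj_pfmemalloc)
			return true;	/* ordinary object, no restriction */
		return tsk->flags & PF_MEMALLOC;
	}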
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves 2013-12-10 23:55 ` David Rientjes 2013-12-11 9:49 ` Mel Gorman @ 2013-12-11 12:42 ` Tejun Heo 2013-12-12 5:37 ` Tim Hockin 1 sibling, 1 reply; 39+ messages in thread From: Tejun Heo @ 2013-12-11 12:42 UTC (permalink / raw) To: David Rientjes Cc: Johannes Weiner, Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki, Mel Gorman, Rik van Riel, Pekka Enberg, Christoph Lameter, Li Zefan, linux-kernel, linux-mm, cgroups

Yo,

On Tue, Dec 10, 2013 at 03:55:48PM -0800, David Rientjes wrote: > > Well, the gotcha there is that you won't be able to do that with > > system level OOM handler either unless you create a separately > > reserved memory, which, again, can be achieved using hierarchical > > memcg setup already. Am I missing something here? > > System oom conditions would only arise when the usage of memcgs A + B > above causes the page allocator to not be able to allocate memory without > oom killing something even though the limits of both A and B may not have > been reached yet. No userspace oom handler can allocate memory with > access to memory reserves in the page allocator in such a context; it's > vital that if we are to handle system oom conditions in userspace we > give them access to memory that other processes can't allocate. You > could attach a userspace system oom handler to any memcg in this scenario > with memory.oom_reserve_in_bytes and since it has PF_OOM_HANDLER it would > be able to allocate in reserves in the page allocator and overcharge in > its memcg to handle it. This isn't possible with only a hierarchical > memcg setup unless you ensure the sum of the limits of the top level > memcgs does not equal or exceed the sum of the min watermarks of all memory > zones, and we exceed that.

Yes, exactly. If system memory is 128M, create top level memcgs w/ 120M and 8M each (well, with some slack of course) and then overcommit the descendants of 120M while putting OOM handlers and friends under 8M without overcommitting.

... > The stronger rationale is that you can't handle system oom in userspace > without this functionality and we need to do so.

You're giving yourself an unreasonable precondition - overcommitting at root level and handling system OOM from userland - and then trying to contort everything to fit that. How can "overcommitting at root level" possibly be a goal in and of itself? Please take a step back and look at and explain the *problem* you're trying to solve. You haven't explained why that *need*s to be the case at all.

I wrote this at the start of the thread but you're still doing the same thing. You're trying to create a hidden memcg level inside a memcg. At the beginning of this thread, you were trying to do that for !root memcgs and now you're arguing that you *need* that for root memcg. Because there's no other limit we can make use of, you're suggesting the use of kernel reserve memory for that purpose. It seems like an absurd thing to do to me. It could be that you might not be able to achieve exactly the same thing that way, but the right thing to do would be improving memcg in general so that it can handle this, instead of adding yet another layer of half-baked complexity, right?

Even if there are some inherent advantages of system userland OOM handling with a separate physical memory reserve, which AFAICS you haven't succeeded at showing yet, this is a very invasive change and, as you said before, something with an *extremely* narrow use case.
Wouldn't it be a better idea to improve the existing mechanisms - be that memcg in general or kernel OOM handling - to fit the niche use case better? I mean, just think about all the corner cases. How are you gonna handle priority inversion through locked pages or allocations given out to other tasks through slab? You're suggesting opening a giant can of worms for an extremely narrow benefit which doesn't even seem to actually require opening said can.

Thanks.

-- tejun
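(The split Tejun suggests can be expressed directly against the cgroup v1 memory controller. A minimal sketch follows, assuming the controller is mounted at /sys/fs/cgroup/memory and run as root; the group names are invented for illustration, and memory.limit_in_bytes accepting K/M/G suffixes is standard memcg behavior. Error handling is kept deliberately thin.)

	#include <stdio.h>
	#include <sys/stat.h>
	#include <sys/types.h>

	/* write a value into a cgroup control file */
	static int write_str(const char *path, const char *val)
	{
		FILE *f = fopen(path, "w");

		if (!f)
			return -1;
		fputs(val, f);
		return fclose(f);
	}

	int main(void)
	{
		const char *root = "/sys/fs/cgroup/memory"; /* assumed mount */
		char path[256];

		/* 120M group: its descendants may be overcommitted */
		snprintf(path, sizeof(path), "%s/jobs", root);
		mkdir(path, 0755);
		snprintf(path, sizeof(path), "%s/jobs/memory.limit_in_bytes", root);
		write_str(path, "120M");

		/* 8M group: oom handlers and friends, never overcommitted */
		snprintf(path, sizeof(path), "%s/oomd", root);
		mkdir(path, 0755);
		snprintf(path, sizeof(path), "%s/oomd/memory.limit_in_bytes", root);
		write_str(path, "8M");

		return 0;
	}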
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves 2013-12-11 12:42 ` Tejun Heo @ 2013-12-12 5:37 ` Tim Hockin 2013-12-12 14:21 ` Tejun Heo 0 siblings, 1 reply; 39+ messages in thread From: Tim Hockin @ 2013-12-12 5:37 UTC (permalink / raw) To: Tejun Heo Cc: David Rientjes, Johannes Weiner, Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki, Mel Gorman, Rik van Riel, Pekka Enberg, Christoph Lameter, Li Zefan, linux-kernel@vger.kernel.org, linux-mm, Cgroups

The immediate problem I see with setting aside reserves "off the top" is that we don't really know a priori how much memory the kernel itself is going to use, which could still land us in an overcommitted state.

In other words, if I have your 128 MB machine, and I set aside 8 MB for OOM handling, and give 120 MB for jobs, I have not accounted for the kernel. So I set aside 8 MB for OOM and 100 MB for jobs, leaving 20 MB for the kernel. That should be enough, right? Hell if I know, and nothing ensures that.

On Wed, Dec 11, 2013 at 4:42 AM, Tejun Heo <tj@kernel.org> wrote: > Yo, > > On Tue, Dec 10, 2013 at 03:55:48PM -0800, David Rientjes wrote: >> > Well, the gotcha there is that you won't be able to do that with >> > system level OOM handler either unless you create a separately >> > reserved memory, which, again, can be achieved using hierarchical >> > memcg setup already. Am I missing something here? >> >> System oom conditions would only arise when the usage of memcgs A + B >> above causes the page allocator to not be able to allocate memory without >> oom killing something even though the limits of both A and B may not have >> been reached yet. No userspace oom handler can allocate memory with >> access to memory reserves in the page allocator in such a context; it's >> vital that if we are to handle system oom conditions in userspace we >> give them access to memory that other processes can't allocate. You >> could attach a userspace system oom handler to any memcg in this scenario >> with memory.oom_reserve_in_bytes and since it has PF_OOM_HANDLER it would >> be able to allocate in reserves in the page allocator and overcharge in >> its memcg to handle it. This isn't possible with only a hierarchical >> memcg setup unless you ensure the sum of the limits of the top level >> memcgs does not equal or exceed the sum of the min watermarks of all memory >> zones, and we exceed that. > > Yes, exactly. If system memory is 128M, create top level memcgs w/ > 120M and 8M each (well, with some slack of course) and then overcommit > the descendants of 120M while putting OOM handlers and friends under > 8M without overcommitting. > > ... >> The stronger rationale is that you can't handle system oom in userspace >> without this functionality and we need to do so. > > You're giving yourself an unreasonable precondition - overcommitting > at root level and handling system OOM from userland - and then trying > to contort everything to fit that. How can "overcommitting > at root level" possibly be a goal in and of itself? Please take a step back > and look at and explain the *problem* you're trying to solve. You > haven't explained why that *need*s to be the case at all. > > I wrote this at the start of the thread but you're still doing the > same thing. You're trying to create a hidden memcg level inside a > memcg. At the beginning of this thread, you were trying to do that > for !root memcgs and now you're arguing that you *need* that for root > memcg.
> Because there's no other limit we can make use of, you're > suggesting the use of kernel reserve memory for that purpose. It > seems like an absurd thing to do to me. It could be that you might > not be able to achieve exactly the same thing that way, but the right > thing to do would be improving memcg in general so that it can handle > this, instead of adding yet another layer of half-baked complexity, right? > > Even if there are some inherent advantages of system userland OOM > handling with a separate physical memory reserve, which AFAICS you > haven't succeeded at showing yet, this is a very invasive change and, > as you said before, something with an *extremely* narrow use case. > Wouldn't it be a better idea to improve the existing mechanisms - be > that memcg in general or kernel OOM handling - to fit the niche use > case better? I mean, just think about all the corner cases. How are > you gonna handle priority inversion through locked pages or > allocations given out to other tasks through slab? You're suggesting > opening a giant can of worms for an extremely narrow benefit which > doesn't even seem to actually require opening said can. > > Thanks. > > -- > tejun
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves 2013-12-12 5:37 ` Tim Hockin @ 2013-12-12 14:21 ` Tejun Heo 2013-12-12 16:32 ` Michal Hocko [not found] ` <20131212142156.GB32683-Gd/HAXX7CRxy/B6EtB590w@public.gmane.org> 0 siblings, 2 replies; 39+ messages in thread From: Tejun Heo @ 2013-12-12 14:21 UTC (permalink / raw) To: Tim Hockin Cc: David Rientjes, Johannes Weiner, Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki, Mel Gorman, Rik van Riel, Pekka Enberg, Christoph Lameter, Li Zefan, linux-kernel@vger.kernel.org, linux-mm, Cgroups

Hey, Tim.

Sidenote: Please don't top-post with the whole body quoted below unless you're adding new cc's. Please selectively quote the original message's body to remind the readers of the context and reply below it. It's a basic lkml etiquette and one with good reasons. If you have to top-post for whatever reason - say you're typing from a machine which doesn't allow easy editing of the original message, explain so at the top of the message, or better yet, wait till you can unless it's urgent.

On Wed, Dec 11, 2013 at 09:37:46PM -0800, Tim Hockin wrote: > The immediate problem I see with setting aside reserves "off the top" > is that we don't really know a priori how much memory the kernel > itself is going to use, which could still land us in an overcommitted > state. > > In other words, if I have your 128 MB machine, and I set aside 8 MB > for OOM handling, and give 120 MB for jobs, I have not accounted for > the kernel. So I set aside 8 MB for OOM and 100 MB for jobs, leaving > 20 MB for the kernel. That should be enough, right? Hell if I know, and > nothing ensures that.

Yes, sure thing, that's the reason why I mentioned "with some slack" in the original message and also that it might not be completely the same. It doesn't allow you to aggressively use system level OOM handling as the sizing estimator for the root cgroup; however, it's more of an implementation detail than something which should guide the overall architecture - it's a problem which lessens in severity as [k]memcg improves and its coverage becomes more complete, which is the direction we should be headed no matter what.

It'd depend on the workload but with memcg fully configured it shouldn't fluctuate wildly. If it does, we need to hunt down whatever is causing such fluctuation and include it in kmemcg, right? That way, memcg as a whole improves for all use cases not just your niche one and I strongly believe that aligning as many use cases as possible along the same axis, rather than creating a large hole to stow away the exceptions, is vastly more beneficial to *everyone* in the long term.

There'd still be all the bells and whistles to configure and monitor system-level OOM and if there's justified need for improvements, we surely can and should do that; however, with the heavy lifting / hot path offloaded to the per-memcg userland OOM handlers, I believe it's reasonable to expect the burden on the system OOM handler to be noticeably less, which is the way it should be. That's the last guard against the whole system completely locking up and we can't extend its capabilities beyond that easily and we most likely don't even want to.

If I take a step back and look at the two options and their pros and cons, which path we should take is rather obvious to me. I hope you see it too.

Thanks.

-- tejun
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves 2013-12-12 14:21 ` Tejun Heo @ 2013-12-12 16:32 ` Michal Hocko 2013-12-12 16:37 ` Tejun Heo [not found] ` <20131212142156.GB32683-Gd/HAXX7CRxy/B6EtB590w@public.gmane.org> 1 sibling, 1 reply; 39+ messages in thread From: Michal Hocko @ 2013-12-12 16:32 UTC (permalink / raw) To: Tejun Heo Cc: Tim Hockin, David Rientjes, Johannes Weiner, Andrew Morton, KAMEZAWA Hiroyuki, Mel Gorman, Rik van Riel, Pekka Enberg, Christoph Lameter, Li Zefan, linux-kernel@vger.kernel.org, linux-mm, Cgroups

On Thu 12-12-13 09:21:56, Tejun Heo wrote: [...] > There'd still be all the bells and whistles to configure and monitor > system-level OOM and if there's justified need for improvements, we > surely can and should do that;

You weren't on the CC of the original thread which started here https://lkml.org/lkml/2013/11/19/191. And the original request for discussion was more about user-defined _policies_ for the global OOM rather than a user space global OOM handler. I feel that there are use cases where the current "kill a single task based on some calculations" is far from optimal, which leads to hacks which try to cope with the post-oom condition somehow gracefully.

I do agree with you that pulling oom handling into userspace sounds too dangerous even with all the code that it would need and I feel we should go a different path than (ab)using the memcg.oom_control interface for that. I still think we need to have a way to tell the global OOM killer what to do.

[...] -- Michal Hocko SUSE Labs
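(The "kill a single task based on some calculations" Michal mentions is the kernel's oom_badness() heuristic. The sketch below is paraphrased from the mm/oom_kill.c of this era, with the root-privilege discount and eligibility corner cases omitted, so treat it as a sketch rather than the exact kernel code.)

	/*
	 * Paraphrased badness heuristic: the task's memory footprint
	 * (rss, page tables, swap entries), shifted by the per-task
	 * oom_score_adj knob, where each adj point is worth 0.1% of
	 * allowed memory.  The highest-scoring task is killed.
	 */
	static unsigned long badness_sketch(struct task_struct *p,
					    unsigned long totalpages)
	{
		long points, adj = p->signal->oom_score_adj; /* -1000..1000 */

		points = get_mm_rss(p->mm) + p->mm->nr_ptes +
			 get_mm_counter(p->mm, MM_SWAPENTS);
		points += adj * (long)(totalpages / 1000);

		/* never return 0 for an eligible task */
		return points > 0 ? points : 1;
	}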
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves 2013-12-12 16:32 ` Michal Hocko @ 2013-12-12 16:37 ` Tejun Heo 0 siblings, 0 replies; 39+ messages in thread From: Tejun Heo @ 2013-12-12 16:37 UTC (permalink / raw) To: Michal Hocko Cc: Tim Hockin, David Rientjes, Johannes Weiner, Andrew Morton, KAMEZAWA Hiroyuki, Mel Gorman, Rik van Riel, Pekka Enberg, Christoph Lameter, Li Zefan, linux-kernel@vger.kernel.org, linux-mm, Cgroups

Hello, Michal.

On Thu, Dec 12, 2013 at 05:32:22PM +0100, Michal Hocko wrote: > You weren't on the CC of the original thread which started here > https://lkml.org/lkml/2013/11/19/191. And the original request for > discussion was more about user-defined _policies_ for the global > OOM rather than a user space global OOM handler. I feel that there > are use cases where the current "kill a single task based on some > calculations" is far from optimal, which leads to hacks which try to cope > with the post-oom condition somehow gracefully. > > I do agree with you that pulling oom handling into userspace sounds too dangerous > even with all the code that it would need and I feel we should go a > different path than (ab)using the memcg.oom_control interface for that. > I still think we need to have a way to tell the global OOM killer what > to do.

Oh yeah, sure, I have no fundamental objections against improving the in-kernel system OOM handler, including making it cgroup-aware which seems like a natural extension to me.

Thanks.

-- tejun
[parent not found: <20131212142156.GB32683-Gd/HAXX7CRxy/B6EtB590w@public.gmane.org>]
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves [not found] ` <20131212142156.GB32683-Gd/HAXX7CRxy/B6EtB590w@public.gmane.org> @ 2013-12-12 18:42 ` Tim Hockin 2013-12-12 19:23 ` Tejun Heo 0 siblings, 1 reply; 39+ messages in thread From: Tim Hockin @ 2013-12-12 18:42 UTC (permalink / raw) To: Tejun Heo Cc: David Rientjes, Johannes Weiner, Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki, Mel Gorman, Rik van Riel, Pekka Enberg, Christoph Lameter, Li Zefan, linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, linux-mm-Bw31MaZKKs3YtjvyW6yDsg, Cgroups

On Thu, Dec 12, 2013 at 6:21 AM, Tejun Heo <tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org> wrote: > Hey, Tim. > > Sidenote: Please don't top-post with the whole body quoted below > unless you're adding new cc's. Please selectively quote the original > message's body to remind the readers of the context and reply below > it. It's a basic lkml etiquette and one with good reasons. If you > have to top-post for whatever reason - say you're typing from a > machine which doesn't allow easy editing of the original message, > explain so at the top of the message, or better yet, wait till you can > unless it's urgent.

Yeah sorry. Replying from my phone is awkward at best. I know better :)

> On Wed, Dec 11, 2013 at 09:37:46PM -0800, Tim Hockin wrote: >> The immediate problem I see with setting aside reserves "off the top" >> is that we don't really know a priori how much memory the kernel >> itself is going to use, which could still land us in an overcommitted >> state. >> >> In other words, if I have your 128 MB machine, and I set aside 8 MB >> for OOM handling, and give 120 MB for jobs, I have not accounted for >> the kernel. So I set aside 8 MB for OOM and 100 MB for jobs, leaving >> 20 MB for the kernel. That should be enough, right? Hell if I know, and >> nothing ensures that. > > Yes, sure thing, that's the reason why I mentioned "with some slack" > in the original message and also that it might not be completely the > same. It doesn't allow you to aggressively use system level OOM > handling as the sizing estimator for the root cgroup; however, it's > more of an implementation detail than something which should guide > the overall architecture - it's a problem which lessens in severity as > [k]memcg improves and its coverage becomes more complete, which is the > direction we should be headed no matter what.

In my mind, the ONLY point of pulling system-OOM handling into userspace is to make it easier for crazy people (Google) to implement bizarre system-OOM policies. Example:

When we have a system OOM we want to do a walk of the administrative memcg tree (which is only a couple levels deep, users can make non-admin sub-memcgs), selecting the lowest priority entity at each step (where both tasks and memcgs have a priority and the priority range is much wider than the current OOM scores, and where memcg priority is sometimes a function of memcg usage), until we reach a leaf.

Once we reach a leaf, I want to log some info about the memcg doing the allocation, the memcg being terminated, and maybe some other bits about the system (depending on the priority of the selected victim, this may or may not be an "acceptable" situation). Then I want to kill *everything* under that memcg. Then I want to "publish" some information through a sane API (e.g. not dmesg scraping).

This is basically our policy as we understand it today. This is notably different than it was a year ago, and it will probably evolve further in the next year.
Teaching the kernel all of this stuff has proven to be sort of difficult to maintain and forward-port, and has been very slow to evolve because of how painful it is to test and deploy new kernels.

Maybe we can find a way to push this level of policy down to the kernel OOM killer? When this was mentioned internally I got shot down (gently, but shot down nonetheless). Assuming we had nearly-reliable (it doesn't have to be 100% guaranteed to be useful) OOM-in-userspace, I can keep the administrative memcg metadata in memory, implement killing as cruelly as I need, and do all of the logging and publication after the OOM kill is done. Most importantly I can test and deploy new policy changes pretty trivially.

Handling per-memcg OOM is a different discussion. Here is where we want to be able to extract things like heap profiles or take stats snapshots, grow memcgs (if so configured), etc. Allowing our users to have a moment of mercy before we put a bullet in their brain enables a whole new realm of debugging, as well as a lot of valuable features.

> It'd depend on the workload but with memcg fully configured it > shouldn't fluctuate wildly. If it does, we need to hunt down whatever > is causing such fluctuation and include it in kmemcg, right? That > way, memcg as a whole improves for all use cases not just your niche > one and I strongly believe that aligning as many use cases as possible > along the same axis, rather than creating a large hole to stow away > the exceptions, is vastly more beneficial to *everyone* in the long > term.

We have a long tail of kernel memory usage. If we provision machines so that the "do work here" first-level memcg excludes the average kernel usage, we have a huge number of machines that will fail to apply OOM policy because of actual overcommitment. If we provision for 95th or 99th percentile kernel usage, we're wasting large amounts of memory that could be used to schedule jobs. This is the fundamental problem we face with static apportionment (and we face it in a dozen other situations, too). Expressing this set-aside memory as "off-the-top" rather than absolute limits makes the whole system more flexible.

> There'd still be all the bells and whistles to configure and monitor > system-level OOM and if there's justified need for improvements, we > surely can and should do that; however, with the heavy lifting / hot > path offloaded to the per-memcg userland OOM handlers, I believe it's > reasonable to expect the burden on the system OOM handler to be noticeably > less, which is the way it should be. That's the last guard against > the whole system completely locking up and we can't extend its > capabilities beyond that easily and we most likely don't even want to. > > If I take a step back and look at the two options and their pros and > cons, which path we should take is rather obvious to me. I hope you > see it too. > > Thanks. > > -- > tejun
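(To make Tim's policy concrete, here is a userspace sketch of the walk he describes. The two-level administrative tree rooted at /sys/fs/cgroup/memory/admin and the memcg.priority file are hypothetical knobs invented for illustration; only the per-memcg tasks file is a real cgroup v1 interface.)

	#include <dirent.h>
	#include <limits.h>
	#include <signal.h>
	#include <stdio.h>
	#include <sys/types.h>

	/* read a hypothetical per-memcg priority; LONG_MAX if absent */
	static long memcg_prio(const char *dir)
	{
		char p[PATH_MAX];
		long v = LONG_MAX;
		FILE *f;

		snprintf(p, sizeof(p), "%s/memcg.priority", dir); /* hypothetical */
		if ((f = fopen(p, "r"))) {
			if (fscanf(f, "%ld", &v) != 1)
				v = LONG_MAX;
			fclose(f);
		}
		return v;
	}

	/* descend, picking the lowest-priority child at each level */
	static void pick_victim(const char *dir, char *victim, size_t len)
	{
		char best[PATH_MAX] = "";
		long best_prio = LONG_MAX;
		struct dirent *de;
		DIR *d = opendir(dir);

		snprintf(victim, len, "%s", dir);
		if (!d)
			return;
		while ((de = readdir(d)) != NULL) {
			char child[PATH_MAX];
			long prio;

			if (de->d_type != DT_DIR || de->d_name[0] == '.')
				continue;
			snprintf(child, sizeof(child), "%s/%s", dir, de->d_name);
			prio = memcg_prio(child);
			if (prio < best_prio) {
				best_prio = prio;
				snprintf(best, sizeof(best), "%s", child);
			}
		}
		closedir(d);
		if (best[0])
			pick_victim(best, victim, len);
	}

	/* kill *everything* attached to the selected memcg */
	static void kill_memcg(const char *dir)
	{
		char p[PATH_MAX];
		FILE *f;
		int pid;

		snprintf(p, sizeof(p), "%s/tasks", dir);
		if (!(f = fopen(p, "r")))
			return;
		while (fscanf(f, "%d", &pid) == 1)
			kill(pid, SIGKILL);
		fclose(f);
	}

	int main(void)
	{
		char victim[PATH_MAX];

		pick_victim("/sys/fs/cgroup/memory/admin", victim, sizeof(victim));
		fprintf(stderr, "oom-killing memcg %s\n", victim);
		kill_memcg(victim);
		return 0;
	}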
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves 2013-12-12 18:42 ` Tim Hockin @ 2013-12-12 19:23 ` Tejun Heo 2013-12-13 0:23 ` Tim Hockin 0 siblings, 1 reply; 39+ messages in thread From: Tejun Heo @ 2013-12-12 19:23 UTC (permalink / raw) To: Tim Hockin Cc: David Rientjes, Johannes Weiner, Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki, Mel Gorman, Rik van Riel, Pekka Enberg, Christoph Lameter, Li Zefan, linux-kernel@vger.kernel.org, linux-mm, Cgroups

Hello, Tim.

On Thu, Dec 12, 2013 at 10:42:20AM -0800, Tim Hockin wrote: > Yeah sorry. Replying from my phone is awkward at best. I know better :)

Heh, sorry about being bitchy. :)

> In my mind, the ONLY point of pulling system-OOM handling into > userspace is to make it easier for crazy people (Google) to implement > bizarre system-OOM policies. Example:

I think that's one of the places where we largely disagree. If at all possible, I'd much prefer google's workload to be supported inside the general boundaries of the upstream kernel without having to punch a large hole in it. To me, the general development history of memcg in general and this thread in particular seem to epitomize why it is a bad idea to have isolated, large and deep "crazy" use cases. Punching the initial hole is the easy part; however, we all are quite limited in anticipating future needs and sooner or later that crazy use case is bound to evolve further towards the isolated extreme it departed towards and require more and larger holes and further contortions to accommodate such progress.

The concern I have with the suggested solution is not necessarily that it's more technically complex than it looks on the surface - I'm sure it can be made to work one way or the other - but that it's a fairly large step toward an isolated extreme which memcg as a project probably should not head toward. There sure are cases where such exceptions can't be avoided and are good trade-offs but, here, we're talking about a major architectural decision which not only affects memcg but mm in general. I'm afraid this doesn't sound like a no-brainer flexibility we can afford.

> When we have a system OOM we want to do a walk of the administrative > memcg tree (which is only a couple levels deep, users can make > non-admin sub-memcgs), selecting the lowest priority entity at each > step (where both tasks and memcgs have a priority and the priority > range is much wider than the current OOM scores, and where memcg > priority is sometimes a function of memcg usage), until we reach a > leaf. > > Once we reach a leaf, I want to log some info about the memcg doing > the allocation, the memcg being terminated, and maybe some other bits > about the system (depending on the priority of the selected victim, > this may or may not be an "acceptable" situation). Then I want to > kill *everything* under that memcg. Then I want to "publish" some > information through a sane API (e.g. not dmesg scraping). > > This is basically our policy as we understand it today. This is > notably different than it was a year ago, and it will probably evolve > further in the next year.

I think per-memcg score and killing is something which makes fundamental sense.
In fact, killing a single process has never made much sense to me as that is a unit which ultimately is only meaningful to the kernel itself and not necessarily to userland, so no matter what I think we're gonna gain per-memcg behavior and it seems most, albeit not all, of what you described above should be implementable through that.

Ultimately, if the use case calls for very fine level of control, I think the right thing to do is making nesting work properly which is likely to take some time. In the meantime, even if such use case requires modifying the kernel to tailor the OOM behavior, I think sticking to kernel OOM provides a lot easier way to eventual convergence. Userland system OOM basically means giving up and would lessen the motivation towards improving the shared infrastructures while adding significant pressure towards schizophrenic diversion.

> We have a long tail of kernel memory usage. If we provision machines > so that the "do work here" first-level memcg excludes the average > kernel usage, we have a huge number of machines that will fail to > apply OOM policy because of actual overcommitment. If we provision > for 95th or 99th percentile kernel usage, we're wasting large amounts > of memory that could be used to schedule jobs. This is the > fundamental problem we face with static apportionment (and we face it > in a dozen other situations, too). Expressing this set-aside memory > as "off-the-top" rather than absolute limits makes the whole system > more flexible.

I agree that's pretty sad. Maybe I shouldn't be surprised given the far-from-perfect coverage of kmemcg at this point, but, again, *everyone* wants [k]memcg coverage to be more complete and we have and are still building the infrastructures to make that possible, so I'm still of the opinion that making [k]memcg work better is the better direction to pursue and given the short development history of kmemcg I'm fairly sure there are quite a few low hanging fruits.

Another thing which *might* be relevant is the rigidity of the upper limit and the vagueness of soft limit of the current implementation. I have a rather strong suspicion that the way memcg config knobs behave now - one finicky, the other whatever - is likely hindering the use cases from fanning out more naturally. I could be completely wrong on this but your mention of inflexibility of absolute limits reminds me of the issue.

Thanks.

-- tejun
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves 2013-12-12 19:23 ` Tejun Heo @ 2013-12-13 0:23 ` Tim Hockin 2013-12-13 11:47 ` Tejun Heo 0 siblings, 1 reply; 39+ messages in thread From: Tim Hockin @ 2013-12-13 0:23 UTC (permalink / raw) To: Tejun Heo Cc: David Rientjes, Johannes Weiner, Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki, Mel Gorman, Rik van Riel, Pekka Enberg, Christoph Lameter, Li Zefan, linux-kernel@vger.kernel.org, linux-mm, Cgroups, Victor Marmol

On Thu, Dec 12, 2013 at 11:23 AM, Tejun Heo <tj@kernel.org> wrote: > Hello, Tim. > > On Thu, Dec 12, 2013 at 10:42:20AM -0800, Tim Hockin wrote: >> Yeah sorry. Replying from my phone is awkward at best. I know better :) > > Heh, sorry about being bitchy. :) > >> In my mind, the ONLY point of pulling system-OOM handling into >> userspace is to make it easier for crazy people (Google) to implement >> bizarre system-OOM policies. Example: > > I think that's one of the places where we largely disagree. If at all

Just to be clear - I say this because it doesn't feel right to impose my craziness on others, and it sucks when we try and are met with "you're crazy, go away". And you have to admit that happens to Google. :) Punching an escape valve that allows us to be crazy without hurting anyone else sounds ideal, IF and ONLY IF that escape valve is itself maintainable.

If the escape valve is userspace it's REALLY easy to iterate on our craziness. If it is kernel space, it's somewhat less easy, but not impossible.

> possible, I'd much prefer google's workload to be supported inside the > general boundaries of the upstream kernel without having to punch a > large hole in it. To me, the general development history of memcg in > general and this thread in particular seem to epitomize why it is a > bad idea to have isolated, large and deep "crazy" use cases. Punching > the initial hole is the easy part; however, we all are quite limited > in anticipating future needs and sooner or later that crazy use case is > bound to evolve further towards the isolated extreme it departed > towards and require more and larger holes and further contortions to > accommodate such progress. > > The concern I have with the suggested solution is not necessarily that > it's more technically complex than it looks on the surface - I'm sure > it can be made to work one way or the other - but that it's a fairly > large step toward an isolated extreme which memcg as a project > probably should not head toward. > > There sure are cases where such exceptions can't be avoided and are > good trade-offs but, here, we're talking about a major architectural > decision which not only affects memcg but mm in general. I'm afraid > this doesn't sound like a no-brainer flexibility we can afford. > >> When we have a system OOM we want to do a walk of the administrative >> memcg tree (which is only a couple levels deep, users can make >> non-admin sub-memcgs), selecting the lowest priority entity at each >> step (where both tasks and memcgs have a priority and the priority >> range is much wider than the current OOM scores, and where memcg >> priority is sometimes a function of memcg usage), until we reach a >> leaf. >> >> Once we reach a leaf, I want to log some info about the memcg doing >> the allocation, the memcg being terminated, and maybe some other bits >> about the system (depending on the priority of the selected victim, >> this may or may not be an "acceptable" situation). Then I want to >> kill *everything* under that memcg.
Then I want to "publish" some >> information through a sane API (e.g. not dmesg scraping). >> >> This is basically our policy as we understand it today. This is >> notably different than it was a year ago, and it will probably evolve >> further in the next year. > > I think per-memcg score and killing is something which makes > fundamental sense. In fact, killing a single process has never made > much sense to me as that is a unit which ultimately is only meaningful > to the kernel itself and not necessraily to userland, so no matter > what I think we're gonna gain per-memcg behavior and it seems most, > albeit not all, of what you described above should be implementable > through that. Well that's an awesome start. We have or had patches to do a lot of this. I don't know how well scrubbed they are for pushing or whether they apply at all to current head, though. > Ultimately, if the use case calls for very fine level of control, I > think the right thing to do is making nesting work properly which is > likely to take some time. In the meantime, even if such use case > requires modifying the kernel to tailor the OOM behavior, I think > sticking to kernel OOM provides a lot easier way to eventual > convergence. Userland system OOM basically means giving up and would > lessen the motivation towards improving the shared infrastructures > while adding significant pressure towards schizophreic diversion. > >> We have a long tail of kernel memory usage. If we provision machines >> so that the "do work here" first-level memcg excludes the average >> kernel usage, we have a huge number of machines that will fail to >> apply OOM policy because of actual overcommitment. If we provision >> for 95th or 99th percentile kernel usage, we're wasting large amounts >> of memory that could be used to schedule jobs. This is the >> fundamental problem we face with static apportionment (and we face it >> in a dozen other situations, too). Expressing this set-aside memory >> as "off-the-top" rather than absolute limits makes the whole system >> more flexible. > > I agree that's pretty sad. Maybe I shouldn't be surprised given the > far-from-perfect coverage of kmemcg at this point, but, again, > *everyone* wants [k]memcg coverage to be more complete and we have and > are still building the infrastructures to make that possible, so I'm > still of the opinion that making [k]memcg work better is the better > direction to pursue and given the short development history of kmemcg > I'm fairly sure there are quite a few low hanging fruits. yes we should fix accounting across the board. We are hugely in favor of that. But I don't buy that we'll erase that tail. Fundamentally, we don't know what the limit is, but we know that we need to save a little "off the top". I'm very much hoping we can find a way to express that. As an aside: mucking about with extra nesting levels to achieve a stable OOM semantic sounds doable, but it certainly sucks in a unified hierarchy. We'll end up with 1, 2, or 3 (or more in esoteric cases? not sure) extra nesting levels for every other resource dimension. And lawd help us if we ever need to do something similar in a different resource dimension - the cross product is mind-bending. What we do with split-hierarchies is this but on a smaller scale. > Another thing which *might* be relevant is the rigidity of the upper > limit and the vagueness of soft limit of the current implementation. 
> I have a rather strong suspicion that the way memcg config knobs > behave now - one finicky, the other whatever - is likely hindering the > use cases from fanning out more naturally. I could be completely wrong on > this but your mention of inflexibility of absolute limits reminds me > of the issue. > > Thanks. > > -- > tejun
* Re: [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves 2013-12-13 0:23 ` Tim Hockin @ 2013-12-13 11:47 ` Tejun Heo 0 siblings, 0 replies; 39+ messages in thread From: Tejun Heo @ 2013-12-13 11:47 UTC (permalink / raw) To: Tim Hockin Cc: David Rientjes, Johannes Weiner, Andrew Morton, Michal Hocko, KAMEZAWA Hiroyuki, Mel Gorman, Rik van Riel, Pekka Enberg, Christoph Lameter, Li Zefan, linux-kernel@vger.kernel.org, linux-mm, Cgroups, Victor Marmol

Hello, Tim.

On Thu, Dec 12, 2013 at 04:23:18PM -0800, Tim Hockin wrote: > Just to be clear - I say this because it doesn't feel right to impose > my craziness on others, and it sucks when we try and are met with > "you're crazy, go away". And you have to admit that happens to > Google. :) Punching an escape valve that allows us to be crazy > without hurting anyone else sounds ideal, IF and ONLY IF that escape > valve is itself maintainable.

I don't think google being considered crazy is a good thing in general, and highly likely not something to be proud of. It sure is partly indicative of the specialization that you guys need but, I suspect, is a much stronger signal of room for better engineering. I'm fairly certain the blame is abundant for everybody to share. The point I'm trying to make is "let's please stop diverging". It hurts everybody.

> If the escape valve is userspace it's REALLY easy to iterate on our > craziness. If it is kernel space, it's somewhat less easy, but not > impossible.

As I'm sure you've gathered from this thread, even punching the initial hole is a sizable burden and contortion to the general memory management and I'm sure as you guys develop further down the path you'll encounter cases where you need further support or holes from the kernel. I can't anticipate the details but the fact that those will follow is as evident as the day to me, especially given the mindset leading to the current situation in the first place.

Please note that this part of the discussion is more abstract than necessary for this particular patchset or hole. I'm quite doubtful that system-level OOM handling with a separate physical reserve is likely to survive even just on technical details. The reason why I'm keeping at this abstract point is because this seems to be a continuing trend rather than a single occurrence and I really hope it changes.

> Well that's an awesome start. We have or had patches to do a lot of > this. I don't know how well scrubbed they are for pushing or whether > they apply at all to current head, though.

Awesome, this looks like something everyone agrees on. :)

> As an aside: mucking about with extra nesting levels to achieve a > stable OOM semantic sounds doable, but it certainly sucks in a unified > hierarchy. We'll end up with 1, 2, or 3 (or more in esoteric cases? > not sure) extra nesting levels for every other resource dimension. > And lawd help us if we ever need to do something similar in a > different resource dimension - the cross product is mind-bending. > What we do with split-hierarchies is this but on a smaller scale.

Yes, agreed but I believe there are substantial benefits to having a certain level of structural constraints. It encourages people to ponder the underlying issues and make active trade-offs. Not that going to that extreme would be good either, but we've gone too far towards the other end.
This being a special issue with memcg, if this turns out to be a big enough problem, I don't think having a provision to be able to handle it without further nesting would be too crazy - e.g. the ability to mark a single cgroup at the root level as the OOM handler or whatever - as long as we stay within the boundaries of memcg and cgroup proper, but we seem to have ways to go before worrying about that one.

Thanks.

-- tejun
* [patch 8/8] mm, memcg: add memcg oom reserve documentation 2013-12-04 5:19 ` [patch 1/8] fork: collapse copy_flags into copy_process David Rientjes ` (5 preceding siblings ...) 2013-12-04 5:20 ` [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves David Rientjes @ 2013-12-04 5:20 ` David Rientjes 6 siblings, 0 replies; 39+ messages in thread From: David Rientjes @ 2013-12-04 5:20 UTC (permalink / raw) To: Andrew Morton Cc: Michal Hocko, KAMEZAWA Hiroyuki, Johannes Weiner, Mel Gorman, Rik van Riel, Pekka Enberg, Christoph Lameter, linux-kernel, linux-mm, cgroups

Add documentation on memcg oom reserves to Documentation/cgroups/memory.txt and give an example of its usage and recommended best practices.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 Documentation/cgroups/memory.txt | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -71,6 +71,7 @@ Brief summary of control files.
 			 (See sysctl's vm.swappiness)
 memory.move_charge_at_immigrate # set/show controls of moving charges
 memory.oom_control		 # set/show oom controls.
+memory.oom_reserve_in_bytes	 # set/show limit of oom memory reserves
 memory.numa_stat		 # show the number of memory usage per numa node
 memory.kmem.limit_in_bytes      # set/show hard limit for kernel memory
@@ -772,6 +773,31 @@ At reading, current status of OOM is shown.
 	under_oom	 0 or 1 (if 1, the memory cgroup is under OOM, tasks may
 			 be stopped.)
+Processes that handle oom conditions in their own memcgs or their child
+memcgs may need to allocate memory themselves to do anything useful,
+including pagefaulting their text or allocating kernel memory to read the
+memcg "tasks" file.  For this reason, memory.oom_reserve_in_bytes is
+provided; it specifies how much memory processes waiting on
+memory.oom_control can allocate above the memcg limit.
+
+The memcg that the oom handler is attached to is charged for the memory
+that it allocates against its own memory.oom_reserve_in_bytes.  This
+memory is therefore only available to processes that are waiting for
+a notification.
+
+For example, if you do
+
+	# echo 2m > memory.oom_reserve_in_bytes
+
+then any process attached to this memcg that is waiting on memcg oom
+notifications anywhere on the system can allocate an additional 2MB
+above memory.limit_in_bytes.
+
+You may still consider doing mlockall(MCL_FUTURE) for processes that
+are waiting on oom notifications to keep this value as minimal as
+possible, or allow it to be large enough so that its text can still
+be pagefaulted in under oom conditions when the value is known.
+
 11. Memory Pressure

The pressure level notifications can be used to monitor the memory
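(Written out, the registration flow this documentation describes - a handler parked on memory.oom_control via eventfd, with mlockall() so that acting on the notification does not itself page fault - looks like the following. The eventfd/cgroup.event_control mechanism is the standard cgroup v1 memcg interface; the memcg path is an example and the handler body is left as a stub.)

	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/eventfd.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(int argc, char **argv)
	{
		const char *memcg = argc > 1 ? argv[1]
				: "/sys/fs/cgroup/memory/A"; /* example path */
		char path[512], buf[64];
		uint64_t ticks;
		int efd, ofd, cfd;

		/* lock text and data now so oom handling needs no page faults */
		if (mlockall(MCL_CURRENT | MCL_FUTURE))
			perror("mlockall");

		efd = eventfd(0, 0);
		snprintf(path, sizeof(path), "%s/memory.oom_control", memcg);
		ofd = open(path, O_RDONLY);
		snprintf(path, sizeof(path), "%s/cgroup.event_control", memcg);
		cfd = open(path, O_WRONLY);
		if (efd < 0 || ofd < 0 || cfd < 0) {
			perror("setup");
			return 1;
		}

		/* register the eventfd for oom notifications on this memcg */
		snprintf(buf, sizeof(buf), "%d %d", efd, ofd);
		if (write(cfd, buf, strlen(buf)) < 0) {
			perror("cgroup.event_control");
			return 1;
		}

		for (;;) {
			if (read(efd, &ticks, sizeof(ticks)) != sizeof(ticks))
				break;
			/* the memcg is oom: act here (log, grow the limit, kill) */
			fprintf(stderr, "oom in %s\n", memcg);
		}
		return 0;
	}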
end of thread
Thread overview: 39+ messages
[not found] <20131119131400.GC20655@dhcp22.suse.cz>
[not found] ` <20131119134007.GD20655@dhcp22.suse.cz>
[not found] ` <alpine.DEB.2.02.1311192352070.20752@chino.kir.corp.google.com>
[not found] ` <20131120152251.GA18809@dhcp22.suse.cz>
[not found] ` <alpine.DEB.2.02.1311201917520.7167@chino.kir.corp.google.com>
[not found] ` <20131128115458.GK2761@dhcp22.suse.cz>
[not found] ` <alpine.DEB.2.02.1312021504170.13465@chino.kir.corp.google.com>
2013-12-04 5:19 ` [patch 1/8] fork: collapse copy_flags into copy_process David Rientjes
2013-12-04 5:19 ` [patch 2/8] mm, mempolicy: rename slab_node for clarity David Rientjes
[not found] ` <alpine.DEB.2.02.1312032117330.29733-X6Q0R45D7oAcqpCFd4KODRPsWskHk0ljAL8bYrjMMd8@public.gmane.org>
2013-12-04 15:21 ` Christoph Lameter
2013-12-04 5:20 ` [patch 3/8] mm, mempolicy: remove per-process flag David Rientjes
2013-12-04 15:24 ` Christoph Lameter
2013-12-05 0:53 ` David Rientjes
2013-12-05 19:05 ` Christoph Lameter
2013-12-05 23:53 ` David Rientjes
2013-12-06 14:46 ` Christoph Lameter
2013-12-04 5:20 ` [patch 4/8] mm, memcg: add tunable for oom reserves David Rientjes
2013-12-04 5:20 ` [patch 5/8] res_counter: remove interface for locked charging and uncharging David Rientjes
2013-12-04 5:20 ` [patch 6/8] res_counter: add interface for maximum nofail charge David Rientjes
2013-12-04 5:20 ` [patch 7/8] mm, memcg: allow processes handling oom notifications to access reserves David Rientjes
2013-12-04 5:45 ` Johannes Weiner
2013-12-05 1:49 ` David Rientjes
2013-12-05 2:50 ` Tejun Heo
[not found] ` <20131205025026.GA26777-Gd/HAXX7CRxy/B6EtB590w@public.gmane.org>
2013-12-05 23:49 ` David Rientjes
[not found] ` <alpine.DEB.2.02.1312051537550.7717-X6Q0R45D7oAcqpCFd4KODRPsWskHk0ljAL8bYrjMMd8@public.gmane.org>
2013-12-06 17:34 ` Johannes Weiner
2013-12-07 16:38 ` Tim Hockin
2013-12-07 17:40 ` Johannes Weiner
2013-12-07 18:12 ` Tim Hockin
[not found] ` <CAAAKZwvanMiz8QZVOU0-SUKYzqcaJAXn0HxYs5+=Zakmnbcfbg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2013-12-07 19:06 ` Johannes Weiner
2013-12-07 21:04 ` Tim Hockin
2013-12-06 19:01 ` Tejun Heo
2013-12-09 20:10 ` David Rientjes
[not found] ` <alpine.DEB.2.02.1312061441390.8949-X6Q0R45D7oAcqpCFd4KODRPsWskHk0ljAL8bYrjMMd8@public.gmane.org>
2013-12-09 22:37 ` Johannes Weiner
2013-12-10 21:50 ` Tejun Heo
2013-12-10 23:55 ` David Rientjes
2013-12-11 9:49 ` Mel Gorman
2013-12-11 12:42 ` Tejun Heo
2013-12-12 5:37 ` Tim Hockin
2013-12-12 14:21 ` Tejun Heo
2013-12-12 16:32 ` Michal Hocko
2013-12-12 16:37 ` Tejun Heo
[not found] ` <20131212142156.GB32683-Gd/HAXX7CRxy/B6EtB590w@public.gmane.org>
2013-12-12 18:42 ` Tim Hockin
2013-12-12 19:23 ` Tejun Heo
2013-12-13 0:23 ` Tim Hockin
2013-12-13 11:47 ` Tejun Heo
2013-12-04 5:20 ` [patch 8/8] mm, memcg: add memcg oom reserve documentation David Rientjes