- * Re: [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators
  2012-11-26 18:47 ` [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators Michal Hocko
@ 2012-11-28  8:47   ` Kamezawa Hiroyuki
  2012-11-28  9:17     ` Michal Hocko
  2012-11-30  4:07   ` Kamezawa Hiroyuki
                     ` (3 subsequent siblings)
  4 siblings, 1 reply; 57+ messages in thread
From: Kamezawa Hiroyuki @ 2012-11-28  8:47 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-mm, linux-kernel, Johannes Weiner, Ying Han, Tejun Heo,
	Glauber Costa, Li Zefan
(2012/11/27 3:47), Michal Hocko wrote:
> mem_cgroup_iter curently relies on css->id when walking down a group
> hierarchy tree. This is really awkward because the tree walk depends on
> the groups creation ordering. The only guarantee is that a parent node
> is visited before its children.
> Example
>   1) mkdir -p a a/d a/b/c
>   2) mkdir -a a/b/c a/d
> Will create the same trees but the tree walks will be different:
>   1) a, d, b, c
>   2) a, b, c, d
> 
> 574bd9f7 (cgroup: implement generic child / descendant walk macros) has
> introduced generic cgroup tree walkers which provide either pre-order
> or post-order tree walk. This patch converts css->id based iteration
> to pre-order tree walk to keep the semantic with the original iterator
> where parent is always visited before its subtree.
> 
> cgroup_for_each_descendant_pre suggests using post_create and
> pre_destroy for proper synchronization with groups addidition resp.
> removal. This implementation doesn't use those because a new memory
> cgroup is fully initialized in mem_cgroup_create and css reference
> counting enforces that the group is alive for both the last seen cgroup
> and the found one resp. it signals that the group is dead and it should
> be skipped.
> 
> If the reclaim cookie is used we need to store the last visited group
> into the iterator so we have to be careful that it doesn't disappear in
> the mean time. Elevated reference count on the css keeps it alive even
> though the group have been removed (parked waiting for the last dput so
> that it can be freed).
> 
> V2
> - use css_{get,put} for iter->last_visited rather than
>    mem_cgroup_{get,put} because it is stronger wrt. cgroup life cycle
> - cgroup_next_descendant_pre expects NULL pos for the first iterartion
>    otherwise it might loop endlessly for intermediate node without any
>    children.
> 
> Signed-off-by: Michal Hocko <mhocko@suse.cz>
> ---
>   mm/memcontrol.c |   74 ++++++++++++++++++++++++++++++++++++++++++-------------
>   1 file changed, 57 insertions(+), 17 deletions(-)
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 1f5528d..6bcc97b 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -144,8 +144,8 @@ struct mem_cgroup_stat_cpu {
>   };
>   
>   struct mem_cgroup_reclaim_iter {
> -	/* css_id of the last scanned hierarchy member */
> -	int position;
> +	/* last scanned hierarchy member with elevated css ref count */
> +	struct mem_cgroup *last_visited;
>   	/* scan generation, increased every round-trip */
>   	unsigned int generation;
>   	/* lock to protect the position and generation */
> @@ -1066,7 +1066,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
>   				   struct mem_cgroup_reclaim_cookie *reclaim)
>   {
>   	struct mem_cgroup *memcg = NULL;
> -	int id = 0;
> +	struct mem_cgroup *last_visited = NULL;
>   
>   	if (mem_cgroup_disabled())
>   		return NULL;
> @@ -1075,7 +1075,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
>   		root = root_mem_cgroup;
>   
>   	if (prev && !reclaim)
> -		id = css_id(&prev->css);
> +		last_visited = prev;
>   
>   	if (!root->use_hierarchy && root != root_mem_cgroup) {
>   		if (prev)
> @@ -1083,9 +1083,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
>   		return root;
>   	}
>   
> +	rcu_read_lock();
>   	while (!memcg) {
>   		struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
> -		struct cgroup_subsys_state *css;
> +		struct cgroup_subsys_state *css = NULL;
>   
>   		if (reclaim) {
>   			int nid = zone_to_nid(reclaim->zone);
> @@ -1095,34 +1096,73 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
>   			mz = mem_cgroup_zoneinfo(root, nid, zid);
>   			iter = &mz->reclaim_iter[reclaim->priority];
>   			spin_lock(&iter->iter_lock);
> +			last_visited = iter->last_visited;
>   			if (prev && reclaim->generation != iter->generation) {
> +				if (last_visited) {
> +					css_put(&last_visited->css);
> +					iter->last_visited = NULL;
> +				}
>   				spin_unlock(&iter->iter_lock);
> -				goto out_css_put;
> +				goto out_unlock;
>   			}
> -			id = iter->position;
>   		}
>   
> -		rcu_read_lock();
> -		css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);
> -		if (css) {
> -			if (css == &root->css || css_tryget(css))
> -				memcg = mem_cgroup_from_css(css);
> -		} else
> -			id = 0;
> -		rcu_read_unlock();
> +		/*
> +		 * Root is not visited by cgroup iterators so it needs an
> +		 * explicit visit.
> +		 */
> +		if (!last_visited) {
> +			css = &root->css;
> +		} else {
> +			struct cgroup *prev_cgroup, *next_cgroup;
> +
> +			prev_cgroup = (last_visited == root) ? NULL
> +				: last_visited->css.cgroup;
> +			next_cgroup = cgroup_next_descendant_pre(prev_cgroup,
> +					root->css.cgroup);
> +			if (next_cgroup)
> +				css = cgroup_subsys_state(next_cgroup,
> +						mem_cgroup_subsys_id);
> +		}
> +
> +		/*
> +		 * Even if we found a group we have to make sure it is alive.
> +		 * css && !memcg means that the groups should be skipped and
> +		 * we should continue the tree walk.
> +		 * last_visited css is safe to use because it is protected by
> +		 * css_get and the tree walk is rcu safe.
> +		 */
> +		if (css == &root->css || (css && css_tryget(css)))
> +			memcg = mem_cgroup_from_css(css);
Could you note that this iterator will never visit dangling(removed) memcg, somewhere ?
Hmm, I'm not sure but it may be trouble at shrkinking dangling kmem_cache(slab).
Costa, how do you think ?
I guess there is no problem with swap and not against the way you go.
Thanks,
-Kame
>   
>   		if (reclaim) {
> -			iter->position = id;
> +			struct mem_cgroup *curr = memcg;
> +
> +			if (last_visited)
> +				css_put(&last_visited->css);
> +
> +			if (css && !memcg)
> +				curr = mem_cgroup_from_css(css);
> +
> +			/* make sure that the cached memcg is not removed */
> +			if (curr)
> +				css_get(&curr->css);
> +			iter->last_visited = curr;
> +
>   			if (!css)
>   				iter->generation++;
>   			else if (!prev && memcg)
>   				reclaim->generation = iter->generation;
>   			spin_unlock(&iter->iter_lock);
> +		} else if (css && !memcg) {
> +			last_visited = mem_cgroup_from_css(css);
>   		}
>   
>   		if (prev && !css)
> -			goto out_css_put;
> +			goto out_unlock;
>   	}
> +out_unlock:
> +	rcu_read_unlock();
>   out_css_put:
>   	if (prev && prev != root)
>   		css_put(&prev->css);
> 
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 57+ messages in thread
- * Re: [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators
  2012-11-28  8:47   ` Kamezawa Hiroyuki
@ 2012-11-28  9:17     ` Michal Hocko
  2012-11-28  9:23       ` Glauber Costa
  0 siblings, 1 reply; 57+ messages in thread
From: Michal Hocko @ 2012-11-28  9:17 UTC (permalink / raw)
  To: Kamezawa Hiroyuki
  Cc: linux-mm, linux-kernel, Johannes Weiner, Ying Han, Tejun Heo,
	Glauber Costa, Li Zefan
On Wed 28-11-12 17:47:59, KAMEZAWA Hiroyuki wrote:
> (2012/11/27 3:47), Michal Hocko wrote:
[...]
> > +		/*
> > +		 * Even if we found a group we have to make sure it is alive.
> > +		 * css && !memcg means that the groups should be skipped and
> > +		 * we should continue the tree walk.
> > +		 * last_visited css is safe to use because it is protected by
> > +		 * css_get and the tree walk is rcu safe.
> > +		 */
> > +		if (css == &root->css || (css && css_tryget(css)))
> > +			memcg = mem_cgroup_from_css(css);
> 
> Could you note that this iterator will never visit dangling(removed)
> memcg, somewhere ?
OK, I can add it to the function comment but the behavior hasn't changed
so I wouldn't like to confuse anybody.
> Hmm, I'm not sure but it may be trouble at shrkinking dangling
> kmem_cache(slab).
We do not shrink slab at all. Those objects that are in a dead memcg
wait for their owner tho release them which will make the dangling group
eventually go away
> 
> Costa, how do you think ?
> 
> I guess there is no problem with swap and not against the way you go.
Yes, swap should be OK. Pages charged against removed memcg will
fallback to the the current's mm (try_get_mem_cgroup_from_page and
__mem_cgroup_try_charge_swapin)
[...]
-- 
Michal Hocko
SUSE Labs
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 57+ messages in thread 
- * Re: [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators
  2012-11-28  9:17     ` Michal Hocko
@ 2012-11-28  9:23       ` Glauber Costa
  2012-11-28  9:33         ` Michal Hocko
  0 siblings, 1 reply; 57+ messages in thread
From: Glauber Costa @ 2012-11-28  9:23 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Kamezawa Hiroyuki, linux-mm, linux-kernel, Johannes Weiner,
	Ying Han, Tejun Heo, Li Zefan
On 11/28/2012 01:17 PM, Michal Hocko wrote:
> On Wed 28-11-12 17:47:59, KAMEZAWA Hiroyuki wrote:
>> (2012/11/27 3:47), Michal Hocko wrote:
> [...]
>>> +		/*
>>> +		 * Even if we found a group we have to make sure it is alive.
>>> +		 * css && !memcg means that the groups should be skipped and
>>> +		 * we should continue the tree walk.
>>> +		 * last_visited css is safe to use because it is protected by
>>> +		 * css_get and the tree walk is rcu safe.
>>> +		 */
>>> +		if (css == &root->css || (css && css_tryget(css)))
>>> +			memcg = mem_cgroup_from_css(css);
>>
>> Could you note that this iterator will never visit dangling(removed)
>> memcg, somewhere ?
> 
> OK, I can add it to the function comment but the behavior hasn't changed
> so I wouldn't like to confuse anybody.
> 
>> Hmm, I'm not sure but it may be trouble at shrkinking dangling
>> kmem_cache(slab).
> 
> We do not shrink slab at all. 
yet. However...
> Those objects that are in a dead memcg
> wait for their owner tho release them which will make the dangling group
> eventually go away
> 
>>
>> Costa, how do you think ?
>>
In general, I particularly believe it is a good idea to skip dead memcgs
in the iterator. I don't anticipate any problems with shrinking at all.
Basically, we will only ever actively shrink when the memcg is dying, at
which point it is still alive.
After this, it's better to just leave it to vmscan. Whenever there is
pressure, it will go away.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 57+ messages in thread 
- * Re: [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators
  2012-11-28  9:23       ` Glauber Costa
@ 2012-11-28  9:33         ` Michal Hocko
  2012-11-28  9:35           ` Glauber Costa
  0 siblings, 1 reply; 57+ messages in thread
From: Michal Hocko @ 2012-11-28  9:33 UTC (permalink / raw)
  To: Glauber Costa
  Cc: Kamezawa Hiroyuki, linux-mm, linux-kernel, Johannes Weiner,
	Ying Han, Tejun Heo, Li Zefan
On Wed 28-11-12 13:23:57, Glauber Costa wrote:
> On 11/28/2012 01:17 PM, Michal Hocko wrote:
> > On Wed 28-11-12 17:47:59, KAMEZAWA Hiroyuki wrote:
> >> (2012/11/27 3:47), Michal Hocko wrote:
> > [...]
> >>> +		/*
> >>> +		 * Even if we found a group we have to make sure it is alive.
> >>> +		 * css && !memcg means that the groups should be skipped and
> >>> +		 * we should continue the tree walk.
> >>> +		 * last_visited css is safe to use because it is protected by
> >>> +		 * css_get and the tree walk is rcu safe.
> >>> +		 */
> >>> +		if (css == &root->css || (css && css_tryget(css)))
> >>> +			memcg = mem_cgroup_from_css(css);
> >>
> >> Could you note that this iterator will never visit dangling(removed)
> >> memcg, somewhere ?
> > 
> > OK, I can add it to the function comment but the behavior hasn't changed
> > so I wouldn't like to confuse anybody.
> > 
> >> Hmm, I'm not sure but it may be trouble at shrkinking dangling
> >> kmem_cache(slab).
> > 
> > We do not shrink slab at all. 
> 
> yet. However...
> 
> > Those objects that are in a dead memcg
> > wait for their owner tho release them which will make the dangling group
> > eventually go away
> > 
> >>
> >> Costa, how do you think ?
> >>
> 
> In general, I particularly believe it is a good idea to skip dead memcgs
> in the iterator. I don't anticipate any problems with shrinking at all.
We even cannot iterate dead ones because their cgroups are gone and so
you do not have any way to iterate. So either make them alive by css_get
or we cannot iterate them.
> Basically, we will only ever actively shrink when the memcg is dying, at
> which point it is still alive.
> After this, it's better to just leave it to vmscan. Whenever there is
> pressure, it will go away.
-- 
Michal Hocko
SUSE Labs
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 57+ messages in thread 
- * Re: [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators
  2012-11-28  9:33         ` Michal Hocko
@ 2012-11-28  9:35           ` Glauber Costa
  0 siblings, 0 replies; 57+ messages in thread
From: Glauber Costa @ 2012-11-28  9:35 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Kamezawa Hiroyuki, linux-mm, linux-kernel, Johannes Weiner,
	Ying Han, Tejun Heo, Li Zefan
On 11/28/2012 01:33 PM, Michal Hocko wrote:
> On Wed 28-11-12 13:23:57, Glauber Costa wrote:
>> On 11/28/2012 01:17 PM, Michal Hocko wrote:
>>> On Wed 28-11-12 17:47:59, KAMEZAWA Hiroyuki wrote:
>>>> (2012/11/27 3:47), Michal Hocko wrote:
>>> [...]
>>>>> +		/*
>>>>> +		 * Even if we found a group we have to make sure it is alive.
>>>>> +		 * css && !memcg means that the groups should be skipped and
>>>>> +		 * we should continue the tree walk.
>>>>> +		 * last_visited css is safe to use because it is protected by
>>>>> +		 * css_get and the tree walk is rcu safe.
>>>>> +		 */
>>>>> +		if (css == &root->css || (css && css_tryget(css)))
>>>>> +			memcg = mem_cgroup_from_css(css);
>>>>
>>>> Could you note that this iterator will never visit dangling(removed)
>>>> memcg, somewhere ?
>>>
>>> OK, I can add it to the function comment but the behavior hasn't changed
>>> so I wouldn't like to confuse anybody.
>>>
>>>> Hmm, I'm not sure but it may be trouble at shrkinking dangling
>>>> kmem_cache(slab).
>>>
>>> We do not shrink slab at all. 
>>
>> yet. However...
>>
>>> Those objects that are in a dead memcg
>>> wait for their owner tho release them which will make the dangling group
>>> eventually go away
>>>
>>>>
>>>> Costa, how do you think ?
>>>>
>>
>> In general, I particularly believe it is a good idea to skip dead memcgs
>> in the iterator. I don't anticipate any problems with shrinking at all.
> 
> We even cannot iterate dead ones because their cgroups are gone and so
> you do not have any way to iterate. So either make them alive by css_get
> or we cannot iterate them.
> 
We are in full agreement.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 57+ messages in thread 
 
 
 
 
- * Re: [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators
  2012-11-26 18:47 ` [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators Michal Hocko
  2012-11-28  8:47   ` Kamezawa Hiroyuki
@ 2012-11-30  4:07   ` Kamezawa Hiroyuki
  2012-12-07  3:39   ` Ying Han
                     ` (2 subsequent siblings)
  4 siblings, 0 replies; 57+ messages in thread
From: Kamezawa Hiroyuki @ 2012-11-30  4:07 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-mm, linux-kernel, Johannes Weiner, Ying Han, Tejun Heo,
	Glauber Costa, Li Zefan
(2012/11/27 3:47), Michal Hocko wrote:
> mem_cgroup_iter curently relies on css->id when walking down a group
> hierarchy tree. This is really awkward because the tree walk depends on
> the groups creation ordering. The only guarantee is that a parent node
> is visited before its children.
> Example
>   1) mkdir -p a a/d a/b/c
>   2) mkdir -a a/b/c a/d
> Will create the same trees but the tree walks will be different:
>   1) a, d, b, c
>   2) a, b, c, d
> 
> 574bd9f7 (cgroup: implement generic child / descendant walk macros) has
> introduced generic cgroup tree walkers which provide either pre-order
> or post-order tree walk. This patch converts css->id based iteration
> to pre-order tree walk to keep the semantic with the original iterator
> where parent is always visited before its subtree.
> 
> cgroup_for_each_descendant_pre suggests using post_create and
> pre_destroy for proper synchronization with groups addidition resp.
> removal. This implementation doesn't use those because a new memory
> cgroup is fully initialized in mem_cgroup_create and css reference
> counting enforces that the group is alive for both the last seen cgroup
> and the found one resp. it signals that the group is dead and it should
> be skipped.
> 
> If the reclaim cookie is used we need to store the last visited group
> into the iterator so we have to be careful that it doesn't disappear in
> the mean time. Elevated reference count on the css keeps it alive even
> though the group have been removed (parked waiting for the last dput so
> that it can be freed).
> 
> V2
> - use css_{get,put} for iter->last_visited rather than
>    mem_cgroup_{get,put} because it is stronger wrt. cgroup life cycle
> - cgroup_next_descendant_pre expects NULL pos for the first iterartion
>    otherwise it might loop endlessly for intermediate node without any
>    children.
> 
> Signed-off-by: Michal Hocko <mhocko@suse.cz>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 57+ messages in thread
- * Re: [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators
  2012-11-26 18:47 ` [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators Michal Hocko
  2012-11-28  8:47   ` Kamezawa Hiroyuki
  2012-11-30  4:07   ` Kamezawa Hiroyuki
@ 2012-12-07  3:39   ` Ying Han
  2012-12-07  3:43     ` Ying Han
  2012-12-07  9:01     ` Michal Hocko
  2012-12-09 16:59   ` Ying Han
  2012-12-09 19:39   ` Ying Han
  4 siblings, 2 replies; 57+ messages in thread
From: Ying Han @ 2012-12-07  3:39 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-mm, linux-kernel, KAMEZAWA Hiroyuki, Johannes Weiner,
	Tejun Heo, Glauber Costa, Li Zefan
On Mon, Nov 26, 2012 at 10:47 AM, Michal Hocko <mhocko@suse.cz> wrote:
> mem_cgroup_iter curently relies on css->id when walking down a group
> hierarchy tree. This is really awkward because the tree walk depends on
> the groups creation ordering. The only guarantee is that a parent node
> is visited before its children.
> Example
>  1) mkdir -p a a/d a/b/c
>  2) mkdir -a a/b/c a/d
> Will create the same trees but the tree walks will be different:
>  1) a, d, b, c
>  2) a, b, c, d
>
> 574bd9f7 (cgroup: implement generic child / descendant walk macros) has
> introduced generic cgroup tree walkers which provide either pre-order
> or post-order tree walk. This patch converts css->id based iteration
> to pre-order tree walk to keep the semantic with the original iterator
> where parent is always visited before its subtree.
>
> cgroup_for_each_descendant_pre suggests using post_create and
> pre_destroy for proper synchronization with groups addidition resp.
> removal. This implementation doesn't use those because a new memory
> cgroup is fully initialized in mem_cgroup_create and css reference
> counting enforces that the group is alive for both the last seen cgroup
> and the found one resp. it signals that the group is dead and it should
> be skipped.
>
> If the reclaim cookie is used we need to store the last visited group
> into the iterator so we have to be careful that it doesn't disappear in
> the mean time. Elevated reference count on the css keeps it alive even
> though the group have been removed (parked waiting for the last dput so
> that it can be freed).
>
> V2
> - use css_{get,put} for iter->last_visited rather than
>   mem_cgroup_{get,put} because it is stronger wrt. cgroup life cycle
> - cgroup_next_descendant_pre expects NULL pos for the first iterartion
>   otherwise it might loop endlessly for intermediate node without any
>   children.
>
> Signed-off-by: Michal Hocko <mhocko@suse.cz>
> ---
>  mm/memcontrol.c |   74 ++++++++++++++++++++++++++++++++++++++++++-------------
>  1 file changed, 57 insertions(+), 17 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 1f5528d..6bcc97b 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -144,8 +144,8 @@ struct mem_cgroup_stat_cpu {
>  };
>
>  struct mem_cgroup_reclaim_iter {
> -       /* css_id of the last scanned hierarchy member */
> -       int position;
> +       /* last scanned hierarchy member with elevated css ref count */
> +       struct mem_cgroup *last_visited;
>         /* scan generation, increased every round-trip */
>         unsigned int generation;
>         /* lock to protect the position and generation */
> @@ -1066,7 +1066,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
>                                    struct mem_cgroup_reclaim_cookie *reclaim)
>  {
>         struct mem_cgroup *memcg = NULL;
> -       int id = 0;
> +       struct mem_cgroup *last_visited = NULL;
>
>         if (mem_cgroup_disabled())
>                 return NULL;
> @@ -1075,7 +1075,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
>                 root = root_mem_cgroup;
>
>         if (prev && !reclaim)
> -               id = css_id(&prev->css);
> +               last_visited = prev;
>
>         if (!root->use_hierarchy && root != root_mem_cgroup) {
>                 if (prev)
> @@ -1083,9 +1083,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
>                 return root;
>         }
>
> +       rcu_read_lock();
>         while (!memcg) {
>                 struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
> -               struct cgroup_subsys_state *css;
> +               struct cgroup_subsys_state *css = NULL;
>
>                 if (reclaim) {
>                         int nid = zone_to_nid(reclaim->zone);
> @@ -1095,34 +1096,73 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
>                         mz = mem_cgroup_zoneinfo(root, nid, zid);
>                         iter = &mz->reclaim_iter[reclaim->priority];
>                         spin_lock(&iter->iter_lock);
> +                       last_visited = iter->last_visited;
>                         if (prev && reclaim->generation != iter->generation) {
> +                               if (last_visited) {
> +                                       css_put(&last_visited->css);
> +                                       iter->last_visited = NULL;
> +                               }
>                                 spin_unlock(&iter->iter_lock);
> -                               goto out_css_put;
> +                               goto out_unlock;
>                         }
> -                       id = iter->position;
>                 }
>
> -               rcu_read_lock();
> -               css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);
> -               if (css) {
> -                       if (css == &root->css || css_tryget(css))
> -                               memcg = mem_cgroup_from_css(css);
> -               } else
> -                       id = 0;
> -               rcu_read_unlock();
> +               /*
> +                * Root is not visited by cgroup iterators so it needs an
> +                * explicit visit.
> +                */
> +               if (!last_visited) {
> +                       css = &root->css;
> +               } else {
> +                       struct cgroup *prev_cgroup, *next_cgroup;
> +
> +                       prev_cgroup = (last_visited == root) ? NULL
> +                               : last_visited->css.cgroup;
> +                       next_cgroup = cgroup_next_descendant_pre(prev_cgroup,
> +                                       root->css.cgroup);
> +                       if (next_cgroup)
> +                               css = cgroup_subsys_state(next_cgroup,
> +                                               mem_cgroup_subsys_id);
> +               }
> +
> +               /*
> +                * Even if we found a group we have to make sure it is alive.
> +                * css && !memcg means that the groups should be skipped and
> +                * we should continue the tree walk.
> +                * last_visited css is safe to use because it is protected by
> +                * css_get and the tree walk is rcu safe.
> +                */
> +               if (css == &root->css || (css && css_tryget(css)))
> +                       memcg = mem_cgroup_from_css(css);
>
>                 if (reclaim) {
> -                       iter->position = id;
> +                       struct mem_cgroup *curr = memcg;
> +
> +                       if (last_visited)
> +                               css_put(&last_visited->css);
> +
> +                       if (css && !memcg)
> +                               curr = mem_cgroup_from_css(css);
> +
> +                       /* make sure that the cached memcg is not removed */
> +                       if (curr)
> +                               css_get(&curr->css);
> +                       iter->last_visited = curr;
> +
>                         if (!css)
>                                 iter->generation++;
>                         else if (!prev && memcg)
>                                 reclaim->generation = iter->generation;
>                         spin_unlock(&iter->iter_lock);
> +               } else if (css && !memcg) {
> +                       last_visited = mem_cgroup_from_css(css);
>                 }
>
>                 if (prev && !css)
> -                       goto out_css_put;
> +                       goto out_unlock;
>         }
> +out_unlock:
> +       rcu_read_unlock();
>  out_css_put:
>         if (prev && prev != root)
>                 css_put(&prev->css);
> --
> 1.7.10.4
>
Michal,
I got some trouble while running this patch with my test. The test
creates hundreds of memcgs which each runs some workload to generate
global pressure. At the last, it removes all the memcgs by rmdir. Then
the cmd "ls /dev/cgroup/memory/" hangs afterwards.
I studied a bit of the patch, but not spending too much time on it
yet. Looks like that the v2 has something different from your last
post, where you replaces the mem_cgroup_get() with css_get() on the
iter->last_visited. Didn't follow why we made that change, but after
restoring the behavior a bit seems passed my test. Here is the patch I
applied on top of this one:
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f2eeee6..4aadb9f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1003,12 +1003,16 @@ struct mem_cgroup *mem_cgroup_iter(struct
mem_cgroup *root,
                        last_visited = iter->last_visited;
                        if (prev && reclaim->generation != iter->generation) {
                                if (last_visited) {
-                                       css_put(&last_visited->css);
+                                       mem_cgroup_put(last_visited);
                                        iter->last_visited = NULL;
                                }
                                spin_unlock(&iter->iter_lock);
                                goto out_unlock;
                        }
+                       if (last_visited && !css_tryget(&last_visited->css)) {
+                               mem_cgroup_put(last_visited);
+                               last_visited = NULL;
+                       }
                }
                /*
@@ -1041,15 +1045,17 @@ struct mem_cgroup *mem_cgroup_iter(struct
mem_cgroup *root,
                if (reclaim) {
                        struct mem_cgroup *curr = memcg;
-                       if (last_visited)
+                       if (last_visited) {
                                css_put(&last_visited->css);
+                               mem_cgroup_put(last_visited);
+                       }
                        if (css && !memcg)
                                curr = container_of(css, struct
mem_cgroup, css);
                        /* make sure that the cached memcg is not removed */
                        if (curr)
-                               css_get(&curr->css);
+                               mem_cgroup_get(curr);
                        iter->last_visited = curr;
                        if (!css)
I will probably look into why next, but like to bring it up in case it
rings the bell on your side :)
--Ying
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 57+ messages in thread
- * Re: [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators
  2012-12-07  3:39   ` Ying Han
@ 2012-12-07  3:43     ` Ying Han
  2012-12-07  8:58       ` Michal Hocko
  2012-12-07  9:01     ` Michal Hocko
  1 sibling, 1 reply; 57+ messages in thread
From: Ying Han @ 2012-12-07  3:43 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-mm, linux-kernel, KAMEZAWA Hiroyuki, Johannes Weiner,
	Tejun Heo, Glauber Costa, Li Zefan
On Thu, Dec 6, 2012 at 7:39 PM, Ying Han <yinghan@google.com> wrote:
> On Mon, Nov 26, 2012 at 10:47 AM, Michal Hocko <mhocko@suse.cz> wrote:
>> mem_cgroup_iter curently relies on css->id when walking down a group
>> hierarchy tree. This is really awkward because the tree walk depends on
>> the groups creation ordering. The only guarantee is that a parent node
>> is visited before its children.
>> Example
>>  1) mkdir -p a a/d a/b/c
>>  2) mkdir -a a/b/c a/d
>> Will create the same trees but the tree walks will be different:
>>  1) a, d, b, c
>>  2) a, b, c, d
>>
>> 574bd9f7 (cgroup: implement generic child / descendant walk macros) has
>> introduced generic cgroup tree walkers which provide either pre-order
>> or post-order tree walk. This patch converts css->id based iteration
>> to pre-order tree walk to keep the semantic with the original iterator
>> where parent is always visited before its subtree.
>>
>> cgroup_for_each_descendant_pre suggests using post_create and
>> pre_destroy for proper synchronization with groups addidition resp.
>> removal. This implementation doesn't use those because a new memory
>> cgroup is fully initialized in mem_cgroup_create and css reference
>> counting enforces that the group is alive for both the last seen cgroup
>> and the found one resp. it signals that the group is dead and it should
>> be skipped.
>>
>> If the reclaim cookie is used we need to store the last visited group
>> into the iterator so we have to be careful that it doesn't disappear in
>> the mean time. Elevated reference count on the css keeps it alive even
>> though the group have been removed (parked waiting for the last dput so
>> that it can be freed).
>>
>> V2
>> - use css_{get,put} for iter->last_visited rather than
>>   mem_cgroup_{get,put} because it is stronger wrt. cgroup life cycle
>> - cgroup_next_descendant_pre expects NULL pos for the first iterartion
>>   otherwise it might loop endlessly for intermediate node without any
>>   children.
>>
>> Signed-off-by: Michal Hocko <mhocko@suse.cz>
>> ---
>>  mm/memcontrol.c |   74 ++++++++++++++++++++++++++++++++++++++++++-------------
>>  1 file changed, 57 insertions(+), 17 deletions(-)
>>
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index 1f5528d..6bcc97b 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -144,8 +144,8 @@ struct mem_cgroup_stat_cpu {
>>  };
>>
>>  struct mem_cgroup_reclaim_iter {
>> -       /* css_id of the last scanned hierarchy member */
>> -       int position;
>> +       /* last scanned hierarchy member with elevated css ref count */
>> +       struct mem_cgroup *last_visited;
>>         /* scan generation, increased every round-trip */
>>         unsigned int generation;
>>         /* lock to protect the position and generation */
>> @@ -1066,7 +1066,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
>>                                    struct mem_cgroup_reclaim_cookie *reclaim)
>>  {
>>         struct mem_cgroup *memcg = NULL;
>> -       int id = 0;
>> +       struct mem_cgroup *last_visited = NULL;
>>
>>         if (mem_cgroup_disabled())
>>                 return NULL;
>> @@ -1075,7 +1075,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
>>                 root = root_mem_cgroup;
>>
>>         if (prev && !reclaim)
>> -               id = css_id(&prev->css);
>> +               last_visited = prev;
>>
>>         if (!root->use_hierarchy && root != root_mem_cgroup) {
>>                 if (prev)
>> @@ -1083,9 +1083,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
>>                 return root;
>>         }
>>
>> +       rcu_read_lock();
>>         while (!memcg) {
>>                 struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
>> -               struct cgroup_subsys_state *css;
>> +               struct cgroup_subsys_state *css = NULL;
>>
>>                 if (reclaim) {
>>                         int nid = zone_to_nid(reclaim->zone);
>> @@ -1095,34 +1096,73 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
>>                         mz = mem_cgroup_zoneinfo(root, nid, zid);
>>                         iter = &mz->reclaim_iter[reclaim->priority];
>>                         spin_lock(&iter->iter_lock);
>> +                       last_visited = iter->last_visited;
>>                         if (prev && reclaim->generation != iter->generation) {
>> +                               if (last_visited) {
>> +                                       css_put(&last_visited->css);
>> +                                       iter->last_visited = NULL;
>> +                               }
>>                                 spin_unlock(&iter->iter_lock);
>> -                               goto out_css_put;
>> +                               goto out_unlock;
>>                         }
>> -                       id = iter->position;
>>                 }
>>
>> -               rcu_read_lock();
>> -               css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);
>> -               if (css) {
>> -                       if (css == &root->css || css_tryget(css))
>> -                               memcg = mem_cgroup_from_css(css);
>> -               } else
>> -                       id = 0;
>> -               rcu_read_unlock();
>> +               /*
>> +                * Root is not visited by cgroup iterators so it needs an
>> +                * explicit visit.
>> +                */
>> +               if (!last_visited) {
>> +                       css = &root->css;
>> +               } else {
>> +                       struct cgroup *prev_cgroup, *next_cgroup;
>> +
>> +                       prev_cgroup = (last_visited == root) ? NULL
>> +                               : last_visited->css.cgroup;
>> +                       next_cgroup = cgroup_next_descendant_pre(prev_cgroup,
>> +                                       root->css.cgroup);
>> +                       if (next_cgroup)
>> +                               css = cgroup_subsys_state(next_cgroup,
>> +                                               mem_cgroup_subsys_id);
>> +               }
>> +
>> +               /*
>> +                * Even if we found a group we have to make sure it is alive.
>> +                * css && !memcg means that the groups should be skipped and
>> +                * we should continue the tree walk.
>> +                * last_visited css is safe to use because it is protected by
>> +                * css_get and the tree walk is rcu safe.
>> +                */
>> +               if (css == &root->css || (css && css_tryget(css)))
>> +                       memcg = mem_cgroup_from_css(css);
>>
>>                 if (reclaim) {
>> -                       iter->position = id;
>> +                       struct mem_cgroup *curr = memcg;
>> +
>> +                       if (last_visited)
>> +                               css_put(&last_visited->css);
>> +
>> +                       if (css && !memcg)
>> +                               curr = mem_cgroup_from_css(css);
>> +
>> +                       /* make sure that the cached memcg is not removed */
>> +                       if (curr)
>> +                               css_get(&curr->css);
>> +                       iter->last_visited = curr;
>> +
>>                         if (!css)
>>                                 iter->generation++;
>>                         else if (!prev && memcg)
>>                                 reclaim->generation = iter->generation;
>>                         spin_unlock(&iter->iter_lock);
>> +               } else if (css && !memcg) {
>> +                       last_visited = mem_cgroup_from_css(css);
>>                 }
>>
>>                 if (prev && !css)
>> -                       goto out_css_put;
>> +                       goto out_unlock;
>>         }
>> +out_unlock:
>> +       rcu_read_unlock();
>>  out_css_put:
>>         if (prev && prev != root)
>>                 css_put(&prev->css);
>> --
>> 1.7.10.4
>>
>
> Michal,
>
> I got some trouble while running this patch with my test. The test
> creates hundreds of memcgs which each runs some workload to generate
> global pressure. At the last, it removes all the memcgs by rmdir. Then
> the cmd "ls /dev/cgroup/memory/" hangs afterwards.
>
> I studied a bit of the patch, but not spending too much time on it
> yet. Looks like that the v2 has something different from your last
> post, where you replaces the mem_cgroup_get() with css_get() on the
> iter->last_visited. Didn't follow why we made that change, but after
> restoring the behavior a bit seems passed my test. Here is the patch I
> applied on top of this one:
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index f2eeee6..4aadb9f 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1003,12 +1003,16 @@ struct mem_cgroup *mem_cgroup_iter(struct
> mem_cgroup *root,
>                         last_visited = iter->last_visited;
>                         if (prev && reclaim->generation != iter->generation) {
>                                 if (last_visited) {
> -                                       css_put(&last_visited->css);
> +                                       mem_cgroup_put(last_visited);
>                                         iter->last_visited = NULL;
>                                 }
>                                 spin_unlock(&iter->iter_lock);
>                                 goto out_unlock;
>                         }
> +                       if (last_visited && !css_tryget(&last_visited->css)) {
> +                               mem_cgroup_put(last_visited);
> +                               last_visited = NULL;
> +                       }
>                 }
>
>                 /*
> @@ -1041,15 +1045,17 @@ struct mem_cgroup *mem_cgroup_iter(struct
> mem_cgroup *root,
>                 if (reclaim) {
>                         struct mem_cgroup *curr = memcg;
>
> -                       if (last_visited)
> +                       if (last_visited) {
>                                 css_put(&last_visited->css);
> +                               mem_cgroup_put(last_visited);
> +                       }
>
>                         if (css && !memcg)
>                                 curr = container_of(css, struct
> mem_cgroup, css);
>
>                         /* make sure that the cached memcg is not removed */
>                         if (curr)
> -                               css_get(&curr->css);
> +                               mem_cgroup_get(curr);
>                         iter->last_visited = curr;
>
>                         if (!css)
>
>
> I will probably look into why next, but like to bring it up in case it
> rings the bell on your side :)
Forgot to mention, I was testing 3.7-rc6 with the two cgroup changes :
cgroup: Use rculist ops for cgroup->children
cgroup: implement generic child / descendant walk macros
> --Ying
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 57+ messages in thread
- * Re: [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators
  2012-12-07  3:43     ` Ying Han
@ 2012-12-07  8:58       ` Michal Hocko
  2012-12-07 17:12         ` Ying Han
  0 siblings, 1 reply; 57+ messages in thread
From: Michal Hocko @ 2012-12-07  8:58 UTC (permalink / raw)
  To: Ying Han
  Cc: linux-mm, linux-kernel, KAMEZAWA Hiroyuki, Johannes Weiner,
	Tejun Heo, Glauber Costa, Li Zefan
On Thu 06-12-12 19:43:52, Ying Han wrote:
[...]
> Forgot to mention, I was testing 3.7-rc6 with the two cgroup changes :
Could you give a try to -mm tree as well. There are some changes for
memcgs removal in that tree which are not in Linus's tree.
-- 
Michal Hocko
SUSE Labs
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 57+ messages in thread 
- * Re: [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators
  2012-12-07  8:58       ` Michal Hocko
@ 2012-12-07 17:12         ` Ying Han
  2012-12-07 17:27           ` Michal Hocko
  0 siblings, 1 reply; 57+ messages in thread
From: Ying Han @ 2012-12-07 17:12 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-mm, linux-kernel, KAMEZAWA Hiroyuki, Johannes Weiner,
	Tejun Heo, Glauber Costa, Li Zefan
On Fri, Dec 7, 2012 at 12:58 AM, Michal Hocko <mhocko@suse.cz> wrote:
> On Thu 06-12-12 19:43:52, Ying Han wrote:
> [...]
>> Forgot to mention, I was testing 3.7-rc6 with the two cgroup changes :
>
> Could you give a try to -mm tree as well. There are some changes for
> memcgs removal in that tree which are not in Linus's tree.
I will give a try, which patchset you have in mind so i can double check?
I didn't find the place where the css pins the memcg, which either
something i missed or not in place in my tree. I twisted the patch a
bit to make it closer to your v2 version, but instead keep the
mem_cgroup_put() as well as using css_tryget(). Again, my test is
happy with it:
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f2eeee6..acec05a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1004,6 +1004,7 @@ struct mem_cgroup *mem_cgroup_iter(struct
mem_cgroup *root,
                        if (prev && reclaim->generation != iter->generation) {
                                if (last_visited) {
                                        css_put(&last_visited->css);
+                                       mem_cgroup_put(last_visited);
                                        iter->last_visited = NULL;
                                }
                                spin_unlock(&iter->iter_lock);
@@ -1041,15 +1042,22 @@ struct mem_cgroup *mem_cgroup_iter(struct
mem_cgroup *root,
                if (reclaim) {
                        struct mem_cgroup *curr = memcg;
-                       if (last_visited)
+                       if (last_visited) {
                                css_put(&last_visited->css);
+                               mem_cgroup_put(last_visited);
+                       }
                        if (css && !memcg)
                                curr = container_of(css, struct
mem_cgroup, css);
                        /* make sure that the cached memcg is not removed */
-                       if (curr)
-                               css_get(&curr->css);
+                       if (curr) {
+                               mem_cgroup_get(curr);
+                               if (!css_tryget(&curr->css)) {
+                                       mem_cgroup_put(curr);
+                                       curr = NULL;
+                               }
+                       }
                        iter->last_visited = curr;
                        if (!css)
--Ying
> --
> Michal Hocko
> SUSE Labs
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 57+ messages in thread
- * Re: [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators
  2012-12-07 17:12         ` Ying Han
@ 2012-12-07 17:27           ` Michal Hocko
  2012-12-07 19:16             ` Ying Han
  0 siblings, 1 reply; 57+ messages in thread
From: Michal Hocko @ 2012-12-07 17:27 UTC (permalink / raw)
  To: Ying Han
  Cc: linux-mm, linux-kernel, KAMEZAWA Hiroyuki, Johannes Weiner,
	Tejun Heo, Glauber Costa, Li Zefan
On Fri 07-12-12 09:12:25, Ying Han wrote:
> On Fri, Dec 7, 2012 at 12:58 AM, Michal Hocko <mhocko@suse.cz> wrote:
> > On Thu 06-12-12 19:43:52, Ying Han wrote:
> > [...]
> >> Forgot to mention, I was testing 3.7-rc6 with the two cgroup changes :
> >
> > Could you give a try to -mm tree as well. There are some changes for
> > memcgs removal in that tree which are not in Linus's tree.
> 
> I will give a try, which patchset you have in mind so i can double check?
Have a look at ba5e0e6be1c76fd37508b2825372b28a90a5b729 in my tree.
 
> I didn't find the place where the css pins the memcg, which either
> something i missed or not in place in my tree.
Yeah, it is carefully hidden ;).
css pins cgroup (last css_put will call dput on the cgroup dentry - via
css_dput_fn) and the last reference to memcg is dropped from ->destroy
callback (mem_cgroup_destroy) called from cgroup_diput.
> I twisted the patch a bit to make it closer to your v2 version,
> but instead keep the mem_cgroup_put() as well as using
> css_tryget(). Again, my test is happy with it:
This is really strange, there must be something weird with ref counting
going on.
Anyway, thanks for your testing! I will try to enahance my testing some
more next week.
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index f2eeee6..acec05a 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1004,6 +1004,7 @@ struct mem_cgroup *mem_cgroup_iter(struct
> mem_cgroup *root,
>                         if (prev && reclaim->generation != iter->generation) {
>                                 if (last_visited) {
>                                         css_put(&last_visited->css);
> +                                       mem_cgroup_put(last_visited);
>                                         iter->last_visited = NULL;
>                                 }
>                                 spin_unlock(&iter->iter_lock);
> @@ -1041,15 +1042,22 @@ struct mem_cgroup *mem_cgroup_iter(struct
> mem_cgroup *root,
>                 if (reclaim) {
>                         struct mem_cgroup *curr = memcg;
> 
> -                       if (last_visited)
> +                       if (last_visited) {
>                                 css_put(&last_visited->css);
> +                               mem_cgroup_put(last_visited);
> +                       }
> 
>                         if (css && !memcg)
>                                 curr = container_of(css, struct
> mem_cgroup, css);
> 
>                         /* make sure that the cached memcg is not removed */
> -                       if (curr)
> -                               css_get(&curr->css);
> +                       if (curr) {
> +                               mem_cgroup_get(curr);
> +                               if (!css_tryget(&curr->css)) {
> +                                       mem_cgroup_put(curr);
> +                                       curr = NULL;
> +                               }
> +                       }
>                         iter->last_visited = curr;
> 
>                         if (!css)
> 
> 
> --Ying
> 
> > --
> > Michal Hocko
> > SUSE Labs
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
-- 
Michal Hocko
SUSE Labs
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 57+ messages in thread
- * Re: [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators
  2012-12-07 17:27           ` Michal Hocko
@ 2012-12-07 19:16             ` Ying Han
  2012-12-07 19:35               ` Michal Hocko
  0 siblings, 1 reply; 57+ messages in thread
From: Ying Han @ 2012-12-07 19:16 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-mm, linux-kernel, KAMEZAWA Hiroyuki, Johannes Weiner,
	Tejun Heo, Glauber Costa, Li Zefan
On Fri, Dec 7, 2012 at 9:27 AM, Michal Hocko <mhocko@suse.cz> wrote:
> On Fri 07-12-12 09:12:25, Ying Han wrote:
>> On Fri, Dec 7, 2012 at 12:58 AM, Michal Hocko <mhocko@suse.cz> wrote:
>> > On Thu 06-12-12 19:43:52, Ying Han wrote:
>> > [...]
>> >> Forgot to mention, I was testing 3.7-rc6 with the two cgroup changes :
>> >
>> > Could you give a try to -mm tree as well. There are some changes for
>> > memcgs removal in that tree which are not in Linus's tree.
>>
>> I will give a try, which patchset you have in mind so i can double check?
>
> Have a look at ba5e0e6be1c76fd37508b2825372b28a90a5b729 in my tree.
Tried the tag: mmotm-2012-12-05-16-59 which includes the commit above.
The test runs better. Thank you for the pointer.
Looking into the patch itself, it includes 9 patchset where 6 from
cgroup and 3 from memcg.
    Michal Hocko (3):
          memcg: make mem_cgroup_reparent_charges non failing
          hugetlb: do not fail in hugetlb_cgroup_pre_destroy
          Merge remote-tracking branch
'tj-cgroups/cgroup-rmdir-updates' into mmotm
    Tejun Heo (6):
          cgroup: kill cgroup_subsys->__DEPRECATED_clear_css_refs
          cgroup: kill CSS_REMOVED
          cgroup: use cgroup_lock_live_group(parent) in cgroup_create()
          cgroup: deactivate CSS's and mark cgroup dead before
invoking ->pre_destroy()
          cgroup: remove CGRP_WAIT_ON_RMDIR, cgroup_exclude_rmdir()
and cgroup_release_and_wakeup_rmdir()
          cgroup: make ->pre_destroy() return void
Any suggestion of the minimal patchset I need to apply for testing
this patchset? (hopefully not all of them)
--Ying
>
>> I didn't find the place where the css pins the memcg, which either
>> something i missed or not in place in my tree.
>
> Yeah, it is carefully hidden ;).
> css pins cgroup (last css_put will call dput on the cgroup dentry - via
> css_dput_fn) and the last reference to memcg is dropped from ->destroy
> callback (mem_cgroup_destroy) called from cgroup_diput.
>
>> I twisted the patch a bit to make it closer to your v2 version,
>> but instead keep the mem_cgroup_put() as well as using
>> css_tryget(). Again, my test is happy with it:
>
> This is really strange, there must be something weird with ref counting
> going on.
> Anyway, thanks for your testing! I will try to enahance my testing some
> more next week.
>
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index f2eeee6..acec05a 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -1004,6 +1004,7 @@ struct mem_cgroup *mem_cgroup_iter(struct
>> mem_cgroup *root,
>>                         if (prev && reclaim->generation != iter->generation) {
>>                                 if (last_visited) {
>>                                         css_put(&last_visited->css);
>> +                                       mem_cgroup_put(last_visited);
>>                                         iter->last_visited = NULL;
>>                                 }
>>                                 spin_unlock(&iter->iter_lock);
>> @@ -1041,15 +1042,22 @@ struct mem_cgroup *mem_cgroup_iter(struct
>> mem_cgroup *root,
>>                 if (reclaim) {
>>                         struct mem_cgroup *curr = memcg;
>>
>> -                       if (last_visited)
>> +                       if (last_visited) {
>>                                 css_put(&last_visited->css);
>> +                               mem_cgroup_put(last_visited);
>> +                       }
>>
>>                         if (css && !memcg)
>>                                 curr = container_of(css, struct
>> mem_cgroup, css);
>>
>>                         /* make sure that the cached memcg is not removed */
>> -                       if (curr)
>> -                               css_get(&curr->css);
>> +                       if (curr) {
>> +                               mem_cgroup_get(curr);
>> +                               if (!css_tryget(&curr->css)) {
>> +                                       mem_cgroup_put(curr);
>> +                                       curr = NULL;
>> +                               }
>> +                       }
>>                         iter->last_visited = curr;
>>
>>                         if (!css)
>>
>>
>> --Ying
>>
>> > --
>> > Michal Hocko
>> > SUSE Labs
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at  http://www.tux.org/lkml/
>
> --
> Michal Hocko
> SUSE Labs
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 57+ messages in thread
- * Re: [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators
  2012-12-07 19:16             ` Ying Han
@ 2012-12-07 19:35               ` Michal Hocko
  0 siblings, 0 replies; 57+ messages in thread
From: Michal Hocko @ 2012-12-07 19:35 UTC (permalink / raw)
  To: Ying Han
  Cc: linux-mm, linux-kernel, KAMEZAWA Hiroyuki, Johannes Weiner,
	Tejun Heo, Glauber Costa, Li Zefan
On Fri 07-12-12 11:16:23, Ying Han wrote:
> On Fri, Dec 7, 2012 at 9:27 AM, Michal Hocko <mhocko@suse.cz> wrote:
> > On Fri 07-12-12 09:12:25, Ying Han wrote:
> >> On Fri, Dec 7, 2012 at 12:58 AM, Michal Hocko <mhocko@suse.cz> wrote:
> >> > On Thu 06-12-12 19:43:52, Ying Han wrote:
> >> > [...]
> >> >> Forgot to mention, I was testing 3.7-rc6 with the two cgroup changes :
> >> >
> >> > Could you give a try to -mm tree as well. There are some changes for
> >> > memcgs removal in that tree which are not in Linus's tree.
> >>
> >> I will give a try, which patchset you have in mind so i can double check?
> >
> > Have a look at ba5e0e6be1c76fd37508b2825372b28a90a5b729 in my tree.
> 
> Tried the tag: mmotm-2012-12-05-16-59 which includes the commit above.
> The test runs better. Thank you for the pointer.
Interesting.
> Looking into the patch itself, it includes 9 patchset where 6 from
> cgroup and 3 from memcg.
> 
>     Michal Hocko (3):
>           memcg: make mem_cgroup_reparent_charges non failing
>           hugetlb: do not fail in hugetlb_cgroup_pre_destroy
>           Merge remote-tracking branch
> 'tj-cgroups/cgroup-rmdir-updates' into mmotm
These are just follow up fixes. The core memcg changes were merged
earlier cad5c694dce67d8aa307a919d247c6a7e1354264. The commit I referred
to above is the finish of that effort.
>     Tejun Heo (6):
>           cgroup: kill cgroup_subsys->__DEPRECATED_clear_css_refs
>           cgroup: kill CSS_REMOVED
>           cgroup: use cgroup_lock_live_group(parent) in cgroup_create()
>           cgroup: deactivate CSS's and mark cgroup dead before
> invoking ->pre_destroy()
>           cgroup: remove CGRP_WAIT_ON_RMDIR, cgroup_exclude_rmdir()
> and cgroup_release_and_wakeup_rmdir()
>           cgroup: make ->pre_destroy() return void
> 
> Any suggestion of the minimal patchset I need to apply for testing
> this patchset? (hopefully not all of them)
The patches shouldn't make a difference but maybe there was a hidden
bug in the previous code which got visible by the iterators rework (we
stored only css id into the cached cookie so if the group went away in
the meantime would just skip it without noticing). Dunno...
Myabe you can start with cad5c694dce67d8aa307a919d247c6a7e1354264 and
move to cgroup changes after that?
[...]
Thanks!
-- 
Michal Hocko
SUSE Labs
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 57+ messages in thread 
 
 
 
 
 
- * Re: [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators
  2012-12-07  3:39   ` Ying Han
  2012-12-07  3:43     ` Ying Han
@ 2012-12-07  9:01     ` Michal Hocko
  1 sibling, 0 replies; 57+ messages in thread
From: Michal Hocko @ 2012-12-07  9:01 UTC (permalink / raw)
  To: Ying Han
  Cc: linux-mm, linux-kernel, KAMEZAWA Hiroyuki, Johannes Weiner,
	Tejun Heo, Glauber Costa, Li Zefan
On Thu 06-12-12 19:39:41, Ying Han wrote:
[...]
> Michal,
> 
> I got some trouble while running this patch with my test. The test
> creates hundreds of memcgs which each runs some workload to generate
> global pressure. At the last, it removes all the memcgs by rmdir. Then
> the cmd "ls /dev/cgroup/memory/" hangs afterwards.
>
> I studied a bit of the patch, but not spending too much time on it
> yet. Looks like that the v2 has something different from your last
> post, where you replaces the mem_cgroup_get() with css_get() on the
> iter->last_visited. Didn't follow why we made that change, but after
> restoring the behavior a bit seems passed my test.
Hmm, strange. css reference counting should be stronger than mem_cgroup
one because it pins css thus cgroup which in turn keeps memcg alive.
> Here is the patch I applied on top of this one:
[...]
-- 
Michal Hocko
SUSE Labs
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 57+ messages in thread 
 
- * Re: [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators
  2012-11-26 18:47 ` [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators Michal Hocko
                     ` (2 preceding siblings ...)
  2012-12-07  3:39   ` Ying Han
@ 2012-12-09 16:59   ` Ying Han
  2012-12-11 15:50     ` Michal Hocko
  2012-12-09 19:39   ` Ying Han
  4 siblings, 1 reply; 57+ messages in thread
From: Ying Han @ 2012-12-09 16:59 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-mm, linux-kernel, KAMEZAWA Hiroyuki, Johannes Weiner,
	Tejun Heo, Glauber Costa, Li Zefan
On Mon, Nov 26, 2012 at 10:47 AM, Michal Hocko <mhocko@suse.cz> wrote:
> mem_cgroup_iter curently relies on css->id when walking down a group
> hierarchy tree. This is really awkward because the tree walk depends on
> the groups creation ordering. The only guarantee is that a parent node
> is visited before its children.
> Example
>  1) mkdir -p a a/d a/b/c
>  2) mkdir -a a/b/c a/d
> Will create the same trees but the tree walks will be different:
>  1) a, d, b, c
>  2) a, b, c, d
>
> 574bd9f7 (cgroup: implement generic child / descendant walk macros) has
> introduced generic cgroup tree walkers which provide either pre-order
> or post-order tree walk. This patch converts css->id based iteration
> to pre-order tree walk to keep the semantic with the original iterator
> where parent is always visited before its subtree.
>
> cgroup_for_each_descendant_pre suggests using post_create and
> pre_destroy for proper synchronization with groups addidition resp.
> removal. This implementation doesn't use those because a new memory
> cgroup is fully initialized in mem_cgroup_create and css reference
> counting enforces that the group is alive for both the last seen cgroup
> and the found one resp. it signals that the group is dead and it should
> be skipped.
>
> If the reclaim cookie is used we need to store the last visited group
> into the iterator so we have to be careful that it doesn't disappear in
> the mean time. Elevated reference count on the css keeps it alive even
> though the group have been removed (parked waiting for the last dput so
> that it can be freed).
>
> V2
> - use css_{get,put} for iter->last_visited rather than
>   mem_cgroup_{get,put} because it is stronger wrt. cgroup life cycle
> - cgroup_next_descendant_pre expects NULL pos for the first iterartion
>   otherwise it might loop endlessly for intermediate node without any
>   children.
>
> Signed-off-by: Michal Hocko <mhocko@suse.cz>
> ---
>  mm/memcontrol.c |   74 ++++++++++++++++++++++++++++++++++++++++++-------------
>  1 file changed, 57 insertions(+), 17 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 1f5528d..6bcc97b 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -144,8 +144,8 @@ struct mem_cgroup_stat_cpu {
>  };
>
>  struct mem_cgroup_reclaim_iter {
> -       /* css_id of the last scanned hierarchy member */
> -       int position;
> +       /* last scanned hierarchy member with elevated css ref count */
> +       struct mem_cgroup *last_visited;
>         /* scan generation, increased every round-trip */
>         unsigned int generation;
>         /* lock to protect the position and generation */
> @@ -1066,7 +1066,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
>                                    struct mem_cgroup_reclaim_cookie *reclaim)
>  {
>         struct mem_cgroup *memcg = NULL;
> -       int id = 0;
> +       struct mem_cgroup *last_visited = NULL;
>
>         if (mem_cgroup_disabled())
>                 return NULL;
> @@ -1075,7 +1075,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
>                 root = root_mem_cgroup;
>
>         if (prev && !reclaim)
> -               id = css_id(&prev->css);
> +               last_visited = prev;
>
>         if (!root->use_hierarchy && root != root_mem_cgroup) {
>                 if (prev)
> @@ -1083,9 +1083,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
>                 return root;
>         }
>
> +       rcu_read_lock();
>         while (!memcg) {
>                 struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
> -               struct cgroup_subsys_state *css;
> +               struct cgroup_subsys_state *css = NULL;
>
>                 if (reclaim) {
>                         int nid = zone_to_nid(reclaim->zone);
> @@ -1095,34 +1096,73 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
>                         mz = mem_cgroup_zoneinfo(root, nid, zid);
>                         iter = &mz->reclaim_iter[reclaim->priority];
>                         spin_lock(&iter->iter_lock);
> +                       last_visited = iter->last_visited;
>                         if (prev && reclaim->generation != iter->generation) {
> +                               if (last_visited) {
> +                                       css_put(&last_visited->css);
> +                                       iter->last_visited = NULL;
> +                               }
>                                 spin_unlock(&iter->iter_lock);
> -                               goto out_css_put;
> +                               goto out_unlock;
>                         }
> -                       id = iter->position;
>                 }
>
> -               rcu_read_lock();
> -               css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);
> -               if (css) {
> -                       if (css == &root->css || css_tryget(css))
> -                               memcg = mem_cgroup_from_css(css);
> -               } else
> -                       id = 0;
> -               rcu_read_unlock();
> +               /*
> +                * Root is not visited by cgroup iterators so it needs an
> +                * explicit visit.
> +                */
> +               if (!last_visited) {
> +                       css = &root->css;
> +               } else {
> +                       struct cgroup *prev_cgroup, *next_cgroup;
> +
> +                       prev_cgroup = (last_visited == root) ? NULL
> +                               : last_visited->css.cgroup;
> +                       next_cgroup = cgroup_next_descendant_pre(prev_cgroup,
> +                                       root->css.cgroup);
> +                       if (next_cgroup)
> +                               css = cgroup_subsys_state(next_cgroup,
> +                                               mem_cgroup_subsys_id);
> +               }
> +
> +               /*
> +                * Even if we found a group we have to make sure it is alive.
> +                * css && !memcg means that the groups should be skipped and
> +                * we should continue the tree walk.
> +                * last_visited css is safe to use because it is protected by
> +                * css_get and the tree walk is rcu safe.
> +                */
> +               if (css == &root->css || (css && css_tryget(css)))
> +                       memcg = mem_cgroup_from_css(css);
>
>                 if (reclaim) {
> -                       iter->position = id;
> +                       struct mem_cgroup *curr = memcg;
> +
> +                       if (last_visited)
> +                               css_put(&last_visited->css);
> +
> +                       if (css && !memcg)
> +                               curr = mem_cgroup_from_css(css);
In this case, the css_tryget() failed which implies the css is on the
way to be removed. (refcnt ==0) If so, why it is safe to call
css_get() directly on it below? It seems not preventing the css to be
removed by doing so.
> +                       /* make sure that the cached memcg is not removed */
> +                       if (curr)
> +                               css_get(&curr->css);
--Ying
> +                       iter->last_visited = curr;
> +
>                         if (!css)
>                                 iter->generation++;
>                         else if (!prev && memcg)
>                                 reclaim->generation = iter->generation;
>                         spin_unlock(&iter->iter_lock);
> +               } else if (css && !memcg) {
> +                       last_visited = mem_cgroup_from_css(css);
>                 }
>
>                 if (prev && !css)
> -                       goto out_css_put;
> +                       goto out_unlock;
>         }
> +out_unlock:
> +       rcu_read_unlock();
>  out_css_put:
>         if (prev && prev != root)
>                 css_put(&prev->css);
> --
> 1.7.10.4
>
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 57+ messages in thread
- * Re: [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators
  2012-12-09 16:59   ` Ying Han
@ 2012-12-11 15:50     ` Michal Hocko
  2012-12-11 16:15       ` Michal Hocko
  2012-12-11 22:31       ` Ying Han
  0 siblings, 2 replies; 57+ messages in thread
From: Michal Hocko @ 2012-12-11 15:50 UTC (permalink / raw)
  To: Ying Han
  Cc: linux-mm, linux-kernel, KAMEZAWA Hiroyuki, Johannes Weiner,
	Tejun Heo, Glauber Costa, Li Zefan
On Sun 09-12-12 08:59:54, Ying Han wrote:
> On Mon, Nov 26, 2012 at 10:47 AM, Michal Hocko <mhocko@suse.cz> wrote:
[...]
> > +               /*
> > +                * Even if we found a group we have to make sure it is alive.
> > +                * css && !memcg means that the groups should be skipped and
> > +                * we should continue the tree walk.
> > +                * last_visited css is safe to use because it is protected by
> > +                * css_get and the tree walk is rcu safe.
> > +                */
> > +               if (css == &root->css || (css && css_tryget(css)))
> > +                       memcg = mem_cgroup_from_css(css);
> >
> >                 if (reclaim) {
> > -                       iter->position = id;
> > +                       struct mem_cgroup *curr = memcg;
> > +
> > +                       if (last_visited)
> > +                               css_put(&last_visited->css);
> > +
> > +                       if (css && !memcg)
> > +                               curr = mem_cgroup_from_css(css);
> 
> In this case, the css_tryget() failed which implies the css is on the
> way to be removed. (refcnt ==0) If so, why it is safe to call
> css_get() directly on it below? It seems not preventing the css to be
> removed by doing so.
Well, I do not remember exactly but I guess the code is meant to say
that we need to store a half-dead memcg because the loop has to be
retried. As we are under RCU hood it is just half dead.
Now that you brought this up I think this is not safe as well because
another thread could have seen the cached value while we tried to retry
and his RCU is not protecting the group anymore. The follow up patch
fixes this by retrying within the loop. I will bring that part into
this patch already and then leave only css clean up in the other patch.
Thanks for spotting this Ying!
> 
> > +                       /* make sure that the cached memcg is not removed */
> > +                       if (curr)
> > +                               css_get(&curr->css);
> 
> --Ying
-- 
Michal Hocko
SUSE Labs
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 57+ messages in thread
- * Re: [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators
  2012-12-11 15:50     ` Michal Hocko
@ 2012-12-11 16:15       ` Michal Hocko
  2012-12-11 18:10         ` Michal Hocko
  2012-12-11 22:43         ` Ying Han
  2012-12-11 22:31       ` Ying Han
  1 sibling, 2 replies; 57+ messages in thread
From: Michal Hocko @ 2012-12-11 16:15 UTC (permalink / raw)
  To: Ying Han
  Cc: linux-mm, linux-kernel, KAMEZAWA Hiroyuki, Johannes Weiner,
	Tejun Heo, Glauber Costa, Li Zefan
On Tue 11-12-12 16:50:25, Michal Hocko wrote:
> On Sun 09-12-12 08:59:54, Ying Han wrote:
> > On Mon, Nov 26, 2012 at 10:47 AM, Michal Hocko <mhocko@suse.cz> wrote:
> [...]
> > > +               /*
> > > +                * Even if we found a group we have to make sure it is alive.
> > > +                * css && !memcg means that the groups should be skipped and
> > > +                * we should continue the tree walk.
> > > +                * last_visited css is safe to use because it is protected by
> > > +                * css_get and the tree walk is rcu safe.
> > > +                */
> > > +               if (css == &root->css || (css && css_tryget(css)))
> > > +                       memcg = mem_cgroup_from_css(css);
> > >
> > >                 if (reclaim) {
> > > -                       iter->position = id;
> > > +                       struct mem_cgroup *curr = memcg;
> > > +
> > > +                       if (last_visited)
> > > +                               css_put(&last_visited->css);
> > > +
> > > +                       if (css && !memcg)
> > > +                               curr = mem_cgroup_from_css(css);
> > 
> > In this case, the css_tryget() failed which implies the css is on the
> > way to be removed. (refcnt ==0) If so, why it is safe to call
> > css_get() directly on it below? It seems not preventing the css to be
> > removed by doing so.
> 
> Well, I do not remember exactly but I guess the code is meant to say
> that we need to store a half-dead memcg because the loop has to be
> retried. As we are under RCU hood it is just half dead.
> Now that you brought this up I think this is not safe as well because
> another thread could have seen the cached value while we tried to retry
> and his RCU is not protecting the group anymore.
Hmm, thinking about it some more, it _is_ be safe in the end.
We are safe because we are under RCU. And even if somebody else looked
at the half-dead memcg from iter->last_visited it cannot disappear
because the current one will retry without dropping RCU so the grace
period couldn't have been finished.
		CPU0					CPU1
rcu_read_lock()						rcu_read_lock()
while(!memcg) {						while(!memcg)
[...]
spin_lock(&iter->iter_lock)
[...]
if (css == &root->css ||
		(css && css_tryget(css)))
	memcg = mem_cgroup_from_css(css)
[...]
if (css && !memcg)
	curr = mem_cgroup_from_css(css)
if (curr)
	css_get(curr);
spin_unlock(&iter->iter_lock)
							spin_lock(&iter->iter_lock)
							/* sees the half dead memcg but its cgroup is still valid */ 
							[...]
							spin_unlock(&iter->iter_lock)
/* we do retry */
}
rcu_read_unlock()
so the css_get will just helps to prevent from further code obfuscation.
Makes sense? The code gets much simplified later in the series,
fortunately.
-- 
Michal Hocko
SUSE Labs
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 57+ messages in thread
- * Re: [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators
  2012-12-11 16:15       ` Michal Hocko
@ 2012-12-11 18:10         ` Michal Hocko
  2012-12-11 22:43         ` Ying Han
  1 sibling, 0 replies; 57+ messages in thread
From: Michal Hocko @ 2012-12-11 18:10 UTC (permalink / raw)
  To: Ying Han
  Cc: linux-mm, linux-kernel, KAMEZAWA Hiroyuki, Johannes Weiner,
	Tejun Heo, Glauber Costa, Li Zefan
On Tue 11-12-12 17:15:59, Michal Hocko wrote:
> On Tue 11-12-12 16:50:25, Michal Hocko wrote:
> > On Sun 09-12-12 08:59:54, Ying Han wrote:
> > > On Mon, Nov 26, 2012 at 10:47 AM, Michal Hocko <mhocko@suse.cz> wrote:
> > [...]
> > > > +               /*
> > > > +                * Even if we found a group we have to make sure it is alive.
> > > > +                * css && !memcg means that the groups should be skipped and
> > > > +                * we should continue the tree walk.
> > > > +                * last_visited css is safe to use because it is protected by
> > > > +                * css_get and the tree walk is rcu safe.
> > > > +                */
> > > > +               if (css == &root->css || (css && css_tryget(css)))
> > > > +                       memcg = mem_cgroup_from_css(css);
> > > >
> > > >                 if (reclaim) {
> > > > -                       iter->position = id;
> > > > +                       struct mem_cgroup *curr = memcg;
> > > > +
> > > > +                       if (last_visited)
> > > > +                               css_put(&last_visited->css);
> > > > +
> > > > +                       if (css && !memcg)
> > > > +                               curr = mem_cgroup_from_css(css);
> > > 
> > > In this case, the css_tryget() failed which implies the css is on the
> > > way to be removed. (refcnt ==0) If so, why it is safe to call
> > > css_get() directly on it below? It seems not preventing the css to be
> > > removed by doing so.
> > 
> > Well, I do not remember exactly but I guess the code is meant to say
> > that we need to store a half-dead memcg because the loop has to be
> > retried. As we are under RCU hood it is just half dead.
> > Now that you brought this up I think this is not safe as well because
> > another thread could have seen the cached value while we tried to retry
> > and his RCU is not protecting the group anymore.
> 
> Hmm, thinking about it some more, it _is_ be safe in the end.
> 
> We are safe because we are under RCU.
And I've just realized that one sentence vanished while I was writing
this.
So either we retry (while(!memcg)) and see the half-dead memcg with a
valid cgroup because we are under rcu so cgroup iterator will find a
next one. Or we race with somebody else on the iterator and that is
described bellow.
> And even if somebody else looked
> at the half-dead memcg from iter->last_visited it cannot disappear
> because the current one will retry without dropping RCU so the grace
> period couldn't have been finished.
> 
> 		CPU0					CPU1
> rcu_read_lock()						rcu_read_lock()
> while(!memcg) {						while(!memcg)
> [...]
> spin_lock(&iter->iter_lock)
> [...]
> if (css == &root->css ||
> 		(css && css_tryget(css)))
> 	memcg = mem_cgroup_from_css(css)
> [...]
> if (css && !memcg)
> 	curr = mem_cgroup_from_css(css)
> if (curr)
> 	css_get(curr);
> spin_unlock(&iter->iter_lock)
> 							spin_lock(&iter->iter_lock)
> 							/* sees the half dead memcg but its cgroup is still valid */ 
> 							[...]
> 							spin_unlock(&iter->iter_lock)
> /* we do retry */
> }
> rcu_read_unlock()
> 
> so the css_get will just helps to prevent from further code obfuscation.
> 
> Makes sense? The code gets much simplified later in the series,
> fortunately.
> -- 
> Michal Hocko
> SUSE Labs
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
-- 
Michal Hocko
SUSE Labs
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 57+ messages in thread
- * Re: [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators
  2012-12-11 16:15       ` Michal Hocko
  2012-12-11 18:10         ` Michal Hocko
@ 2012-12-11 22:43         ` Ying Han
  2012-12-12  8:55           ` Michal Hocko
  1 sibling, 1 reply; 57+ messages in thread
From: Ying Han @ 2012-12-11 22:43 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-mm, linux-kernel, KAMEZAWA Hiroyuki, Johannes Weiner,
	Tejun Heo, Glauber Costa, Li Zefan
On Tue, Dec 11, 2012 at 8:15 AM, Michal Hocko <mhocko@suse.cz> wrote:
> On Tue 11-12-12 16:50:25, Michal Hocko wrote:
>> On Sun 09-12-12 08:59:54, Ying Han wrote:
>> > On Mon, Nov 26, 2012 at 10:47 AM, Michal Hocko <mhocko@suse.cz> wrote:
>> [...]
>> > > +               /*
>> > > +                * Even if we found a group we have to make sure it is alive.
>> > > +                * css && !memcg means that the groups should be skipped and
>> > > +                * we should continue the tree walk.
>> > > +                * last_visited css is safe to use because it is protected by
>> > > +                * css_get and the tree walk is rcu safe.
>> > > +                */
>> > > +               if (css == &root->css || (css && css_tryget(css)))
>> > > +                       memcg = mem_cgroup_from_css(css);
>> > >
>> > >                 if (reclaim) {
>> > > -                       iter->position = id;
>> > > +                       struct mem_cgroup *curr = memcg;
>> > > +
>> > > +                       if (last_visited)
>> > > +                               css_put(&last_visited->css);
>> > > +
>> > > +                       if (css && !memcg)
>> > > +                               curr = mem_cgroup_from_css(css);
>> >
>> > In this case, the css_tryget() failed which implies the css is on the
>> > way to be removed. (refcnt ==0) If so, why it is safe to call
>> > css_get() directly on it below? It seems not preventing the css to be
>> > removed by doing so.
>>
>> Well, I do not remember exactly but I guess the code is meant to say
>> that we need to store a half-dead memcg because the loop has to be
>> retried. As we are under RCU hood it is just half dead.
>> Now that you brought this up I think this is not safe as well because
>> another thread could have seen the cached value while we tried to retry
>> and his RCU is not protecting the group anymore.
>
> Hmm, thinking about it some more, it _is_ be safe in the end.
>
> We are safe because we are under RCU. And even if somebody else looked
> at the half-dead memcg from iter->last_visited it cannot disappear
> because the current one will retry without dropping RCU so the grace
> period couldn't have been finished.
>
>                 CPU0                                    CPU1
> rcu_read_lock()                                         rcu_read_lock()
> while(!memcg) {                                         while(!memcg)
> [...]
> spin_lock(&iter->iter_lock)
> [...]
> if (css == &root->css ||
>                 (css && css_tryget(css)))
>         memcg = mem_cgroup_from_css(css)
> [...]
> if (css && !memcg)
>         curr = mem_cgroup_from_css(css)
> if (curr)
>         css_get(curr);
> spin_unlock(&iter->iter_lock)
>                                                         spin_lock(&iter->iter_lock)
>                                                         /* sees the half dead memcg but its cgroup is still valid */
>                                                         [...]
>                                                         spin_unlock(&iter->iter_lock)
> /* we do retry */
> }
> rcu_read_unlock()
>
> so the css_get will just helps to prevent from further code obfuscation.
>
> Makes sense? The code gets much simplified later in the series,
> fortunately.
My understanding on this is that we should never call css_get()
without calling css_tryget() and it succeed. Whether or not it is
*safe* to do so, that seems conflicts with the assumption of the
cgroup_rmdir().
I would rather make the change to do the retry after css_tryget()
failed. The patch I have on my local tree:
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f2eeee6..e2af02d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -991,6 +991,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
        while (!memcg) {
                struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
                struct cgroup_subsys_state *css = NULL;
+               struct cgroup *prev_cgroup, *next_cgroup;
                if (reclaim) {
                        int nid = zone_to_nid(reclaim->zone);
@@ -1018,10 +1019,9 @@ struct mem_cgroup *mem_cgroup_iter(struct
mem_cgroup *root,
                if (!last_visited) {
                        css = &root->css;
                } else {
-                       struct cgroup *prev_cgroup, *next_cgroup;
-
                        prev_cgroup = (last_visited == root) ? NULL
                                : last_visited->css.cgroup;
+skip_node:
                        next_cgroup = cgroup_next_descendant_pre(
                                        prev_cgroup,
                                        root->css.cgroup);
@@ -1038,15 +1038,17 @@ struct mem_cgroup *mem_cgroup_iter(struct
mem_cgroup *root,
                if (css == &root->css || (css && css_tryget(css)))
                        memcg = container_of(css, struct mem_cgroup, css);
+               if (css && !memcg) {
+                       prev_cgroup = next_cgroup;
+                       goto skip_node;
+               }
+
                if (reclaim) {
                        struct mem_cgroup *curr = memcg;
                        if (last_visited)
                                css_put(&last_visited->css);
-                       if (css && !memcg)
-                               curr = container_of(css, struct
mem_cgroup, css);
-
                        /* make sure that the cached memcg is not removed */
                        if (curr)
                                css_get(&curr->css);
@@ -1057,8 +1059,6 @@ struct mem_cgroup *mem_cgroup_iter(struct
mem_cgroup *root,
                        else if (!prev && memcg)
                                reclaim->generation = iter->generation;
                        spin_unlock(&iter->iter_lock);
--Ying
> --
> Michal Hocko
> SUSE Labs
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 57+ messages in thread
- * Re: [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators
  2012-12-11 22:43         ` Ying Han
@ 2012-12-12  8:55           ` Michal Hocko
  2012-12-12 17:57             ` Ying Han
  0 siblings, 1 reply; 57+ messages in thread
From: Michal Hocko @ 2012-12-12  8:55 UTC (permalink / raw)
  To: Ying Han
  Cc: linux-mm, linux-kernel, KAMEZAWA Hiroyuki, Johannes Weiner,
	Tejun Heo, Glauber Costa, Li Zefan
On Tue 11-12-12 14:43:37, Ying Han wrote:
> On Tue, Dec 11, 2012 at 8:15 AM, Michal Hocko <mhocko@suse.cz> wrote:
> > On Tue 11-12-12 16:50:25, Michal Hocko wrote:
> >> On Sun 09-12-12 08:59:54, Ying Han wrote:
> >> > On Mon, Nov 26, 2012 at 10:47 AM, Michal Hocko <mhocko@suse.cz> wrote:
> >> [...]
> >> > > +               /*
> >> > > +                * Even if we found a group we have to make sure it is alive.
> >> > > +                * css && !memcg means that the groups should be skipped and
> >> > > +                * we should continue the tree walk.
> >> > > +                * last_visited css is safe to use because it is protected by
> >> > > +                * css_get and the tree walk is rcu safe.
> >> > > +                */
> >> > > +               if (css == &root->css || (css && css_tryget(css)))
> >> > > +                       memcg = mem_cgroup_from_css(css);
> >> > >
> >> > >                 if (reclaim) {
> >> > > -                       iter->position = id;
> >> > > +                       struct mem_cgroup *curr = memcg;
> >> > > +
> >> > > +                       if (last_visited)
> >> > > +                               css_put(&last_visited->css);
> >> > > +
> >> > > +                       if (css && !memcg)
> >> > > +                               curr = mem_cgroup_from_css(css);
> >> >
> >> > In this case, the css_tryget() failed which implies the css is on the
> >> > way to be removed. (refcnt ==0) If so, why it is safe to call
> >> > css_get() directly on it below? It seems not preventing the css to be
> >> > removed by doing so.
> >>
> >> Well, I do not remember exactly but I guess the code is meant to say
> >> that we need to store a half-dead memcg because the loop has to be
> >> retried. As we are under RCU hood it is just half dead.
> >> Now that you brought this up I think this is not safe as well because
> >> another thread could have seen the cached value while we tried to retry
> >> and his RCU is not protecting the group anymore.
> >
> > Hmm, thinking about it some more, it _is_ be safe in the end.
> >
> > We are safe because we are under RCU. And even if somebody else looked
> > at the half-dead memcg from iter->last_visited it cannot disappear
> > because the current one will retry without dropping RCU so the grace
> > period couldn't have been finished.
> >
> >                 CPU0                                    CPU1
> > rcu_read_lock()                                         rcu_read_lock()
> > while(!memcg) {                                         while(!memcg)
> > [...]
> > spin_lock(&iter->iter_lock)
> > [...]
> > if (css == &root->css ||
> >                 (css && css_tryget(css)))
> >         memcg = mem_cgroup_from_css(css)
> > [...]
> > if (css && !memcg)
> >         curr = mem_cgroup_from_css(css)
> > if (curr)
> >         css_get(curr);
> > spin_unlock(&iter->iter_lock)
> >                                                         spin_lock(&iter->iter_lock)
> >                                                         /* sees the half dead memcg but its cgroup is still valid */
> >                                                         [...]
> >                                                         spin_unlock(&iter->iter_lock)
> > /* we do retry */
> > }
> > rcu_read_unlock()
> >
> > so the css_get will just helps to prevent from further code obfuscation.
> >
> > Makes sense? The code gets much simplified later in the series,
> > fortunately.
> 
> My understanding on this is that we should never call css_get()
> without calling css_tryget() and it succeed.
Hmm, what would be the point of using css_get then?
> Whether or not it is *safe* to do so, that seems conflicts with the
> assumption of the cgroup_rmdir().
> 
> I would rather make the change to do the retry after css_tryget()
> failed. The patch I have on my local tree:
OK, I am not against, the retry is just nicer and that is the reason
I changed that in the follow up patch. Just note that this is an
intermediate patch and the code is changed significantly in the later
patches so the question is whether it is worth changing that.
This surely couldn't have caused your testing issue, right?
So I can refactor the two patches and move the retry from the later to
this one if you or anybody else really insist ;)
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index f2eeee6..e2af02d 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -991,6 +991,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
>         while (!memcg) {
>                 struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
>                 struct cgroup_subsys_state *css = NULL;
> +               struct cgroup *prev_cgroup, *next_cgroup;
> 
>                 if (reclaim) {
>                         int nid = zone_to_nid(reclaim->zone);
> @@ -1018,10 +1019,9 @@ struct mem_cgroup *mem_cgroup_iter(struct
> mem_cgroup *root,
>                 if (!last_visited) {
>                         css = &root->css;
>                 } else {
> -                       struct cgroup *prev_cgroup, *next_cgroup;
> -
>                         prev_cgroup = (last_visited == root) ? NULL
>                                 : last_visited->css.cgroup;
> +skip_node:
>                         next_cgroup = cgroup_next_descendant_pre(
>                                         prev_cgroup,
>                                         root->css.cgroup);
> @@ -1038,15 +1038,17 @@ struct mem_cgroup *mem_cgroup_iter(struct
> mem_cgroup *root,
>                 if (css == &root->css || (css && css_tryget(css)))
>                         memcg = container_of(css, struct mem_cgroup, css);
> 
> +               if (css && !memcg) {
> +                       prev_cgroup = next_cgroup;
> +                       goto skip_node;
> +               }
> +
>                 if (reclaim) {
>                         struct mem_cgroup *curr = memcg;
> 
>                         if (last_visited)
>                                 css_put(&last_visited->css);
> 
> -                       if (css && !memcg)
> -                               curr = container_of(css, struct
> mem_cgroup, css);
> -
>                         /* make sure that the cached memcg is not removed */
>                         if (curr)
>                                 css_get(&curr->css);
> @@ -1057,8 +1059,6 @@ struct mem_cgroup *mem_cgroup_iter(struct
> mem_cgroup *root,
>                         else if (!prev && memcg)
>                                 reclaim->generation = iter->generation;
>                         spin_unlock(&iter->iter_lock);
> 
> 
> --Ying
> 
> > --
> > Michal Hocko
> > SUSE Labs
-- 
Michal Hocko
SUSE Labs
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 57+ messages in thread
- * Re: [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators
  2012-12-12  8:55           ` Michal Hocko
@ 2012-12-12 17:57             ` Ying Han
  2012-12-12 18:08               ` Michal Hocko
  0 siblings, 1 reply; 57+ messages in thread
From: Ying Han @ 2012-12-12 17:57 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-mm, linux-kernel, KAMEZAWA Hiroyuki, Johannes Weiner,
	Tejun Heo, Glauber Costa, Li Zefan
On Wed, Dec 12, 2012 at 12:55 AM, Michal Hocko <mhocko@suse.cz> wrote:
> On Tue 11-12-12 14:43:37, Ying Han wrote:
>> On Tue, Dec 11, 2012 at 8:15 AM, Michal Hocko <mhocko@suse.cz> wrote:
>> > On Tue 11-12-12 16:50:25, Michal Hocko wrote:
>> >> On Sun 09-12-12 08:59:54, Ying Han wrote:
>> >> > On Mon, Nov 26, 2012 at 10:47 AM, Michal Hocko <mhocko@suse.cz> wrote:
>> >> [...]
>> >> > > +               /*
>> >> > > +                * Even if we found a group we have to make sure it is alive.
>> >> > > +                * css && !memcg means that the groups should be skipped and
>> >> > > +                * we should continue the tree walk.
>> >> > > +                * last_visited css is safe to use because it is protected by
>> >> > > +                * css_get and the tree walk is rcu safe.
>> >> > > +                */
>> >> > > +               if (css == &root->css || (css && css_tryget(css)))
>> >> > > +                       memcg = mem_cgroup_from_css(css);
>> >> > >
>> >> > >                 if (reclaim) {
>> >> > > -                       iter->position = id;
>> >> > > +                       struct mem_cgroup *curr = memcg;
>> >> > > +
>> >> > > +                       if (last_visited)
>> >> > > +                               css_put(&last_visited->css);
>> >> > > +
>> >> > > +                       if (css && !memcg)
>> >> > > +                               curr = mem_cgroup_from_css(css);
>> >> >
>> >> > In this case, the css_tryget() failed which implies the css is on the
>> >> > way to be removed. (refcnt ==0) If so, why it is safe to call
>> >> > css_get() directly on it below? It seems not preventing the css to be
>> >> > removed by doing so.
>> >>
>> >> Well, I do not remember exactly but I guess the code is meant to say
>> >> that we need to store a half-dead memcg because the loop has to be
>> >> retried. As we are under RCU hood it is just half dead.
>> >> Now that you brought this up I think this is not safe as well because
>> >> another thread could have seen the cached value while we tried to retry
>> >> and his RCU is not protecting the group anymore.
>> >
>> > Hmm, thinking about it some more, it _is_ be safe in the end.
>> >
>> > We are safe because we are under RCU. And even if somebody else looked
>> > at the half-dead memcg from iter->last_visited it cannot disappear
>> > because the current one will retry without dropping RCU so the grace
>> > period couldn't have been finished.
>> >
>> >                 CPU0                                    CPU1
>> > rcu_read_lock()                                         rcu_read_lock()
>> > while(!memcg) {                                         while(!memcg)
>> > [...]
>> > spin_lock(&iter->iter_lock)
>> > [...]
>> > if (css == &root->css ||
>> >                 (css && css_tryget(css)))
>> >         memcg = mem_cgroup_from_css(css)
>> > [...]
>> > if (css && !memcg)
>> >         curr = mem_cgroup_from_css(css)
>> > if (curr)
>> >         css_get(curr);
>> > spin_unlock(&iter->iter_lock)
>> >                                                         spin_lock(&iter->iter_lock)
>> >                                                         /* sees the half dead memcg but its cgroup is still valid */
>> >                                                         [...]
>> >                                                         spin_unlock(&iter->iter_lock)
>> > /* we do retry */
>> > }
>> > rcu_read_unlock()
>> >
>> > so the css_get will just helps to prevent from further code obfuscation.
>> >
>> > Makes sense? The code gets much simplified later in the series,
>> > fortunately.
>>
>> My understanding on this is that we should never call css_get()
>> without calling css_tryget() and it succeed.
>
> Hmm, what would be the point of using css_get then?
Only css_tryget() will fail if the cgroup is under removal, but not
css_get(). AFAIK there is logic in cgroup_rmdir() rely on that. (The
CSS_DEACT_BIAS will block new css_tryget(), and then fail all further
css_get(). )
>
>> Whether or not it is *safe* to do so, that seems conflicts with the
>> assumption of the cgroup_rmdir().
>>
>> I would rather make the change to do the retry after css_tryget()
>> failed. The patch I have on my local tree:
>
> OK, I am not against, the retry is just nicer and that is the reason
> I changed that in the follow up patch. Just note that this is an
> intermediate patch and the code is changed significantly in the later
> patches so the question is whether it is worth changing that.
> This surely couldn't have caused your testing issue, right?
I haven't tested separately, but the retry logic +
mem_cgroup_iter_break() change cure my testcase.
--Ying
>
> So I can refactor the two patches and move the retry from the later to
> this one if you or anybody else really insist ;)
>
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index f2eeee6..e2af02d 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -991,6 +991,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
>>         while (!memcg) {
>>                 struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
>>                 struct cgroup_subsys_state *css = NULL;
>> +               struct cgroup *prev_cgroup, *next_cgroup;
>>
>>                 if (reclaim) {
>>                         int nid = zone_to_nid(reclaim->zone);
>> @@ -1018,10 +1019,9 @@ struct mem_cgroup *mem_cgroup_iter(struct
>> mem_cgroup *root,
>>                 if (!last_visited) {
>>                         css = &root->css;
>>                 } else {
>> -                       struct cgroup *prev_cgroup, *next_cgroup;
>> -
>>                         prev_cgroup = (last_visited == root) ? NULL
>>                                 : last_visited->css.cgroup;
>> +skip_node:
>>                         next_cgroup = cgroup_next_descendant_pre(
>>                                         prev_cgroup,
>>                                         root->css.cgroup);
>> @@ -1038,15 +1038,17 @@ struct mem_cgroup *mem_cgroup_iter(struct
>> mem_cgroup *root,
>>                 if (css == &root->css || (css && css_tryget(css)))
>>                         memcg = container_of(css, struct mem_cgroup, css);
>>
>> +               if (css && !memcg) {
>> +                       prev_cgroup = next_cgroup;
>> +                       goto skip_node;
>> +               }
>> +
>>                 if (reclaim) {
>>                         struct mem_cgroup *curr = memcg;
>>
>>                         if (last_visited)
>>                                 css_put(&last_visited->css);
>>
>> -                       if (css && !memcg)
>> -                               curr = container_of(css, struct
>> mem_cgroup, css);
>> -
>>                         /* make sure that the cached memcg is not removed */
>>                         if (curr)
>>                                 css_get(&curr->css);
>> @@ -1057,8 +1059,6 @@ struct mem_cgroup *mem_cgroup_iter(struct
>> mem_cgroup *root,
>>                         else if (!prev && memcg)
>>                                 reclaim->generation = iter->generation;
>>                         spin_unlock(&iter->iter_lock);
>>
>>
>> --Ying
>>
>> > --
>> > Michal Hocko
>> > SUSE Labs
>
> --
> Michal Hocko
> SUSE Labs
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 57+ messages in thread
- * Re: [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators
  2012-12-12 17:57             ` Ying Han
@ 2012-12-12 18:08               ` Michal Hocko
  0 siblings, 0 replies; 57+ messages in thread
From: Michal Hocko @ 2012-12-12 18:08 UTC (permalink / raw)
  To: Ying Han
  Cc: linux-mm, linux-kernel, KAMEZAWA Hiroyuki, Johannes Weiner,
	Tejun Heo, Glauber Costa, Li Zefan
On Wed 12-12-12 09:57:56, Ying Han wrote:
> On Wed, Dec 12, 2012 at 12:55 AM, Michal Hocko <mhocko@suse.cz> wrote:
> > On Tue 11-12-12 14:43:37, Ying Han wrote:
> >> On Tue, Dec 11, 2012 at 8:15 AM, Michal Hocko <mhocko@suse.cz> wrote:
> >> > On Tue 11-12-12 16:50:25, Michal Hocko wrote:
> >> >> On Sun 09-12-12 08:59:54, Ying Han wrote:
> >> >> > On Mon, Nov 26, 2012 at 10:47 AM, Michal Hocko <mhocko@suse.cz> wrote:
> >> >> [...]
> >> >> > > +               /*
> >> >> > > +                * Even if we found a group we have to make sure it is alive.
> >> >> > > +                * css && !memcg means that the groups should be skipped and
> >> >> > > +                * we should continue the tree walk.
> >> >> > > +                * last_visited css is safe to use because it is protected by
> >> >> > > +                * css_get and the tree walk is rcu safe.
> >> >> > > +                */
> >> >> > > +               if (css == &root->css || (css && css_tryget(css)))
> >> >> > > +                       memcg = mem_cgroup_from_css(css);
> >> >> > >
> >> >> > >                 if (reclaim) {
> >> >> > > -                       iter->position = id;
> >> >> > > +                       struct mem_cgroup *curr = memcg;
> >> >> > > +
> >> >> > > +                       if (last_visited)
> >> >> > > +                               css_put(&last_visited->css);
> >> >> > > +
> >> >> > > +                       if (css && !memcg)
> >> >> > > +                               curr = mem_cgroup_from_css(css);
> >> >> >
> >> >> > In this case, the css_tryget() failed which implies the css is on the
> >> >> > way to be removed. (refcnt ==0) If so, why it is safe to call
> >> >> > css_get() directly on it below? It seems not preventing the css to be
> >> >> > removed by doing so.
> >> >>
> >> >> Well, I do not remember exactly but I guess the code is meant to say
> >> >> that we need to store a half-dead memcg because the loop has to be
> >> >> retried. As we are under RCU hood it is just half dead.
> >> >> Now that you brought this up I think this is not safe as well because
> >> >> another thread could have seen the cached value while we tried to retry
> >> >> and his RCU is not protecting the group anymore.
> >> >
> >> > Hmm, thinking about it some more, it _is_ be safe in the end.
> >> >
> >> > We are safe because we are under RCU. And even if somebody else looked
> >> > at the half-dead memcg from iter->last_visited it cannot disappear
> >> > because the current one will retry without dropping RCU so the grace
> >> > period couldn't have been finished.
> >> >
> >> >                 CPU0                                    CPU1
> >> > rcu_read_lock()                                         rcu_read_lock()
> >> > while(!memcg) {                                         while(!memcg)
> >> > [...]
> >> > spin_lock(&iter->iter_lock)
> >> > [...]
> >> > if (css == &root->css ||
> >> >                 (css && css_tryget(css)))
> >> >         memcg = mem_cgroup_from_css(css)
> >> > [...]
> >> > if (css && !memcg)
> >> >         curr = mem_cgroup_from_css(css)
> >> > if (curr)
> >> >         css_get(curr);
> >> > spin_unlock(&iter->iter_lock)
> >> >                                                         spin_lock(&iter->iter_lock)
> >> >                                                         /* sees the half dead memcg but its cgroup is still valid */
> >> >                                                         [...]
> >> >                                                         spin_unlock(&iter->iter_lock)
> >> > /* we do retry */
> >> > }
> >> > rcu_read_unlock()
> >> >
> >> > so the css_get will just helps to prevent from further code obfuscation.
> >> >
> >> > Makes sense? The code gets much simplified later in the series,
> >> > fortunately.
> >>
> >> My understanding on this is that we should never call css_get()
> >> without calling css_tryget() and it succeed.
> >
> > Hmm, what would be the point of using css_get then?
> 
> Only css_tryget() will fail if the cgroup is under removal, but not
> css_get(). AFAIK there is logic in cgroup_rmdir() rely on that. (The
> CSS_DEACT_BIAS will block new css_tryget(), and then fail all further
> css_get(). )
> 
> >
> >> Whether or not it is *safe* to do so, that seems conflicts with the
> >> assumption of the cgroup_rmdir().
> >>
> >> I would rather make the change to do the retry after css_tryget()
> >> failed. The patch I have on my local tree:
> >
> > OK, I am not against, the retry is just nicer and that is the reason
> > I changed that in the follow up patch. Just note that this is an
> > intermediate patch and the code is changed significantly in the later
> > patches so the question is whether it is worth changing that.
> > This surely couldn't have caused your testing issue, right?
> 
> I haven't tested separately, but the retry logic +
> mem_cgroup_iter_break() change cure my testcase.
I bet that what you are seeing is the stale cgroup due to cached
memcg. 
Retry logic doesn't help with that as the elevated ref count is just
temporal but your mem_cgroup_iter_break change might help for targeted
reclaim as it doesn't leave the memcg in last_visited (it still
shouldn't help the global reclaim case though). But this is not correct
because it will break the concurent reclaim as I mentioned previously.
I will try to post my pending patch to heal this ASAP.
Thanks
-- 
Michal Hocko
SUSE Labs
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 57+ messages in thread
 
 
 
 
- * Re: [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators
  2012-12-11 15:50     ` Michal Hocko
  2012-12-11 16:15       ` Michal Hocko
@ 2012-12-11 22:31       ` Ying Han
  1 sibling, 0 replies; 57+ messages in thread
From: Ying Han @ 2012-12-11 22:31 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-mm, linux-kernel, KAMEZAWA Hiroyuki, Johannes Weiner,
	Tejun Heo, Glauber Costa, Li Zefan
On Tue, Dec 11, 2012 at 7:50 AM, Michal Hocko <mhocko@suse.cz> wrote:
> On Sun 09-12-12 08:59:54, Ying Han wrote:
>> On Mon, Nov 26, 2012 at 10:47 AM, Michal Hocko <mhocko@suse.cz> wrote:
> [...]
>> > +               /*
>> > +                * Even if we found a group we have to make sure it is alive.
>> > +                * css && !memcg means that the groups should be skipped and
>> > +                * we should continue the tree walk.
>> > +                * last_visited css is safe to use because it is protected by
>> > +                * css_get and the tree walk is rcu safe.
>> > +                */
>> > +               if (css == &root->css || (css && css_tryget(css)))
>> > +                       memcg = mem_cgroup_from_css(css);
>> >
>> >                 if (reclaim) {
>> > -                       iter->position = id;
>> > +                       struct mem_cgroup *curr = memcg;
>> > +
>> > +                       if (last_visited)
>> > +                               css_put(&last_visited->css);
>> > +
>> > +                       if (css && !memcg)
>> > +                               curr = mem_cgroup_from_css(css);
>>
>> In this case, the css_tryget() failed which implies the css is on the
>> way to be removed. (refcnt ==0) If so, why it is safe to call
>> css_get() directly on it below? It seems not preventing the css to be
>> removed by doing so.
>
> Well, I do not remember exactly but I guess the code is meant to say
> that we need to store a half-dead memcg because the loop has to be
> retried. As we are under RCU hood it is just half dead.
> Now that you brought this up I think this is not safe as well because
> another thread could have seen the cached value while we tried to retry
> and his RCU is not protecting the group anymore. The follow up patch
> fixes this by retrying within the loop. I will bring that part into
> this patch already and then leave only css clean up in the other patch.
>
> Thanks for spotting this Ying!
I understand the intention here where we want to move on to the next
css if the css_tryget() failed. But css_get() won't hold on it in that
case.
I fixed that on my local branch which do the retry after css_tryget()
failed, just like what you talked about. And I will wait for you fix
on that.
--Ying
>
>>
>> > +                       /* make sure that the cached memcg is not removed */
>> > +                       if (curr)
>> > +                               css_get(&curr->css);
>>
>> --Ying
> --
> Michal Hocko
> SUSE Labs
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 57+ messages in thread
 
 
- * Re: [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators
  2012-11-26 18:47 ` [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators Michal Hocko
                     ` (3 preceding siblings ...)
  2012-12-09 16:59   ` Ying Han
@ 2012-12-09 19:39   ` Ying Han
  2012-12-11 15:54     ` Michal Hocko
  4 siblings, 1 reply; 57+ messages in thread
From: Ying Han @ 2012-12-09 19:39 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-mm, linux-kernel, KAMEZAWA Hiroyuki, Johannes Weiner,
	Tejun Heo, Glauber Costa, Li Zefan
On Mon, Nov 26, 2012 at 10:47 AM, Michal Hocko <mhocko@suse.cz> wrote:
> mem_cgroup_iter curently relies on css->id when walking down a group
> hierarchy tree. This is really awkward because the tree walk depends on
> the groups creation ordering. The only guarantee is that a parent node
> is visited before its children.
> Example
>  1) mkdir -p a a/d a/b/c
>  2) mkdir -a a/b/c a/d
> Will create the same trees but the tree walks will be different:
>  1) a, d, b, c
>  2) a, b, c, d
>
> 574bd9f7 (cgroup: implement generic child / descendant walk macros) has
> introduced generic cgroup tree walkers which provide either pre-order
> or post-order tree walk. This patch converts css->id based iteration
> to pre-order tree walk to keep the semantic with the original iterator
> where parent is always visited before its subtree.
>
> cgroup_for_each_descendant_pre suggests using post_create and
> pre_destroy for proper synchronization with groups addidition resp.
> removal. This implementation doesn't use those because a new memory
> cgroup is fully initialized in mem_cgroup_create and css reference
> counting enforces that the group is alive for both the last seen cgroup
> and the found one resp. it signals that the group is dead and it should
> be skipped.
>
> If the reclaim cookie is used we need to store the last visited group
> into the iterator so we have to be careful that it doesn't disappear in
> the mean time. Elevated reference count on the css keeps it alive even
> though the group have been removed (parked waiting for the last dput so
> that it can be freed).
>
> V2
> - use css_{get,put} for iter->last_visited rather than
>   mem_cgroup_{get,put} because it is stronger wrt. cgroup life cycle
> - cgroup_next_descendant_pre expects NULL pos for the first iterartion
>   otherwise it might loop endlessly for intermediate node without any
>   children.
>
> Signed-off-by: Michal Hocko <mhocko@suse.cz>
> ---
>  mm/memcontrol.c |   74 ++++++++++++++++++++++++++++++++++++++++++-------------
>  1 file changed, 57 insertions(+), 17 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 1f5528d..6bcc97b 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -144,8 +144,8 @@ struct mem_cgroup_stat_cpu {
>  };
>
>  struct mem_cgroup_reclaim_iter {
> -       /* css_id of the last scanned hierarchy member */
> -       int position;
> +       /* last scanned hierarchy member with elevated css ref count */
> +       struct mem_cgroup *last_visited;
>         /* scan generation, increased every round-trip */
>         unsigned int generation;
>         /* lock to protect the position and generation */
> @@ -1066,7 +1066,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
>                                    struct mem_cgroup_reclaim_cookie *reclaim)
>  {
>         struct mem_cgroup *memcg = NULL;
> -       int id = 0;
> +       struct mem_cgroup *last_visited = NULL;
>
>         if (mem_cgroup_disabled())
>                 return NULL;
> @@ -1075,7 +1075,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
>                 root = root_mem_cgroup;
>
>         if (prev && !reclaim)
> -               id = css_id(&prev->css);
> +               last_visited = prev;
>
>         if (!root->use_hierarchy && root != root_mem_cgroup) {
>                 if (prev)
> @@ -1083,9 +1083,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
>                 return root;
>         }
>
> +       rcu_read_lock();
>         while (!memcg) {
>                 struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
> -               struct cgroup_subsys_state *css;
> +               struct cgroup_subsys_state *css = NULL;
>
>                 if (reclaim) {
>                         int nid = zone_to_nid(reclaim->zone);
> @@ -1095,34 +1096,73 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
>                         mz = mem_cgroup_zoneinfo(root, nid, zid);
>                         iter = &mz->reclaim_iter[reclaim->priority];
>                         spin_lock(&iter->iter_lock);
> +                       last_visited = iter->last_visited;
>                         if (prev && reclaim->generation != iter->generation) {
> +                               if (last_visited) {
> +                                       css_put(&last_visited->css);
> +                                       iter->last_visited = NULL;
> +                               }
>                                 spin_unlock(&iter->iter_lock);
> -                               goto out_css_put;
> +                               goto out_unlock;
>                         }
> -                       id = iter->position;
>                 }
>
> -               rcu_read_lock();
> -               css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);
> -               if (css) {
> -                       if (css == &root->css || css_tryget(css))
> -                               memcg = mem_cgroup_from_css(css);
> -               } else
> -                       id = 0;
> -               rcu_read_unlock();
> +               /*
> +                * Root is not visited by cgroup iterators so it needs an
> +                * explicit visit.
> +                */
> +               if (!last_visited) {
> +                       css = &root->css;
> +               } else {
> +                       struct cgroup *prev_cgroup, *next_cgroup;
> +
> +                       prev_cgroup = (last_visited == root) ? NULL
> +                               : last_visited->css.cgroup;
> +                       next_cgroup = cgroup_next_descendant_pre(prev_cgroup,
> +                                       root->css.cgroup);
> +                       if (next_cgroup)
> +                               css = cgroup_subsys_state(next_cgroup,
> +                                               mem_cgroup_subsys_id);
> +               }
> +
> +               /*
> +                * Even if we found a group we have to make sure it is alive.
> +                * css && !memcg means that the groups should be skipped and
> +                * we should continue the tree walk.
> +                * last_visited css is safe to use because it is protected by
> +                * css_get and the tree walk is rcu safe.
> +                */
> +               if (css == &root->css || (css && css_tryget(css)))
> +                       memcg = mem_cgroup_from_css(css);
>
>                 if (reclaim) {
> -                       iter->position = id;
> +                       struct mem_cgroup *curr = memcg;
> +
> +                       if (last_visited)
> +                               css_put(&last_visited->css);
> +
> +                       if (css && !memcg)
> +                               curr = mem_cgroup_from_css(css);
> +
> +                       /* make sure that the cached memcg is not removed */
> +                       if (curr)
> +                               css_get(&curr->css);
> +                       iter->last_visited = curr;
Here we take extra refcnt for last_visited, and assume it is under
target reclaim which then calls mem_cgroup_iter_break() and we leaked
a refcnt of the
target memcg css. Sorry if i missed the change in iter_break(),
otherwise we might need something like this:
 void mem_cgroup_iter_break(struct mem_cgroup *root,
-                          struct mem_cgroup *prev)
+                          struct mem_cgroup *prev,
+                          struct mem_cgroup_reclaim_cookie *reclaim)
 {
        if (!root)
                root = root_mem_cgroup;
        if (prev && prev != root)
                css_put(&prev->css);
+
+       if (reclaim) {
+               int nid = zone_to_nid(reclaim->zone);
+               int zid = zone_idx(reclaim->zone);
+               struct mem_cgroup *last_visited = NULL;
+               struct mem_cgroup_per_zone *mz;
+               struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
+
+               mz = mem_cgroup_zoneinfo(root, nid, zid);
+               iter = &mz->reclaim_iter[reclaim->priority];
+               spin_lock(&iter->iter_lock);
+               last_visited = iter->last_visited;
+               if (last_visited) {
+                       css_put(&last_visited->css);
+                       iter->last_visited = NULL;
+               }
+               spin_unlock(&iter->iter_lock);
+       }
 }
--Ying
>                         if (!css)
>                                 iter->generation++;
>                         else if (!prev && memcg)
>                                 reclaim->generation = iter->generation;
>                         spin_unlock(&iter->iter_lock);
> +               } else if (css && !memcg) {
> +                       last_visited = mem_cgroup_from_css(css);
>                 }
>
>                 if (prev && !css)
> -                       goto out_css_put;
> +                       goto out_unlock;
>         }
> +out_unlock:
> +       rcu_read_unlock();
>  out_css_put:
>         if (prev && prev != root)
>                 css_put(&prev->css);
> --
> 1.7.10.4
>
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 57+ messages in thread
- * Re: [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators
  2012-12-09 19:39   ` Ying Han
@ 2012-12-11 15:54     ` Michal Hocko
  2012-12-11 22:36       ` Ying Han
  0 siblings, 1 reply; 57+ messages in thread
From: Michal Hocko @ 2012-12-11 15:54 UTC (permalink / raw)
  To: Ying Han
  Cc: linux-mm, linux-kernel, KAMEZAWA Hiroyuki, Johannes Weiner,
	Tejun Heo, Glauber Costa, Li Zefan
On Sun 09-12-12 11:39:50, Ying Han wrote:
> On Mon, Nov 26, 2012 at 10:47 AM, Michal Hocko <mhocko@suse.cz> wrote:
[...]
> >                 if (reclaim) {
> > -                       iter->position = id;
> > +                       struct mem_cgroup *curr = memcg;
> > +
> > +                       if (last_visited)
> > +                               css_put(&last_visited->css);
			    ^^^^^^^^^^^
			    here
> > +
> > +                       if (css && !memcg)
> > +                               curr = mem_cgroup_from_css(css);
> > +
> > +                       /* make sure that the cached memcg is not removed */
> > +                       if (curr)
> > +                               css_get(&curr->css);
> > +                       iter->last_visited = curr;
> 
> Here we take extra refcnt for last_visited, and assume it is under
> target reclaim which then calls mem_cgroup_iter_break() and we leaked
> a refcnt of the target memcg css.
I think you are not right here. The extra reference is kept for
iter->last_visited and it will be dropped the next time somebody sees
the same zone-priority iter. See above.
Or have I missed your question?
[...]
-- 
Michal Hocko
SUSE Labs
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 57+ messages in thread
- * Re: [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators
  2012-12-11 15:54     ` Michal Hocko
@ 2012-12-11 22:36       ` Ying Han
  2012-12-12  9:06         ` Michal Hocko
  0 siblings, 1 reply; 57+ messages in thread
From: Ying Han @ 2012-12-11 22:36 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-mm, linux-kernel, KAMEZAWA Hiroyuki, Johannes Weiner,
	Tejun Heo, Glauber Costa, Li Zefan
On Tue, Dec 11, 2012 at 7:54 AM, Michal Hocko <mhocko@suse.cz> wrote:
> On Sun 09-12-12 11:39:50, Ying Han wrote:
>> On Mon, Nov 26, 2012 at 10:47 AM, Michal Hocko <mhocko@suse.cz> wrote:
> [...]
>> >                 if (reclaim) {
>> > -                       iter->position = id;
>> > +                       struct mem_cgroup *curr = memcg;
>> > +
>> > +                       if (last_visited)
>> > +                               css_put(&last_visited->css);
>                             ^^^^^^^^^^^
>                             here
>> > +
>> > +                       if (css && !memcg)
>> > +                               curr = mem_cgroup_from_css(css);
>> > +
>> > +                       /* make sure that the cached memcg is not removed */
>> > +                       if (curr)
>> > +                               css_get(&curr->css);
>> > +                       iter->last_visited = curr;
>>
>> Here we take extra refcnt for last_visited, and assume it is under
>> target reclaim which then calls mem_cgroup_iter_break() and we leaked
>> a refcnt of the target memcg css.
>
> I think you are not right here. The extra reference is kept for
> iter->last_visited and it will be dropped the next time somebody sees
> the same zone-priority iter. See above.
>
> Or have I missed your question?
Hmm, question remains.
My understanding of the mem_cgroup_iter() is that each call path
should close the loop itself, in the sense that no *leaked* css refcnt
after that loop finished. It is the case for all the caller today
where the loop terminates at memcg == NULL, where all the refcnt have
been dropped by then.
One exception is mem_cgroup_iter_break(), where the loop terminates
with *leaked* refcnt and that is what the iter_break() needs to clean
up. We can not rely on the next caller of the loop since it might
never happen.
It makes sense to drop the refcnt of last_visited, the same reason as
drop refcnt of prev. I don't see why it makes different.
--Ying
>
> [...]
> --
> Michal Hocko
> SUSE Labs
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 57+ messages in thread
- * Re: [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators
  2012-12-11 22:36       ` Ying Han
@ 2012-12-12  9:06         ` Michal Hocko
  2012-12-12 18:09           ` Ying Han
  2012-12-12 19:24           ` [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators Michal Hocko
  0 siblings, 2 replies; 57+ messages in thread
From: Michal Hocko @ 2012-12-12  9:06 UTC (permalink / raw)
  To: Ying Han
  Cc: linux-mm, linux-kernel, KAMEZAWA Hiroyuki, Johannes Weiner,
	Tejun Heo, Glauber Costa, Li Zefan
On Tue 11-12-12 14:36:10, Ying Han wrote:
> On Tue, Dec 11, 2012 at 7:54 AM, Michal Hocko <mhocko@suse.cz> wrote:
> > On Sun 09-12-12 11:39:50, Ying Han wrote:
> >> On Mon, Nov 26, 2012 at 10:47 AM, Michal Hocko <mhocko@suse.cz> wrote:
> > [...]
> >> >                 if (reclaim) {
> >> > -                       iter->position = id;
> >> > +                       struct mem_cgroup *curr = memcg;
> >> > +
> >> > +                       if (last_visited)
> >> > +                               css_put(&last_visited->css);
> >                             ^^^^^^^^^^^
> >                             here
> >> > +
> >> > +                       if (css && !memcg)
> >> > +                               curr = mem_cgroup_from_css(css);
> >> > +
> >> > +                       /* make sure that the cached memcg is not removed */
> >> > +                       if (curr)
> >> > +                               css_get(&curr->css);
> >> > +                       iter->last_visited = curr;
> >>
> >> Here we take extra refcnt for last_visited, and assume it is under
> >> target reclaim which then calls mem_cgroup_iter_break() and we leaked
> >> a refcnt of the target memcg css.
> >
> > I think you are not right here. The extra reference is kept for
> > iter->last_visited and it will be dropped the next time somebody sees
> > the same zone-priority iter. See above.
> >
> > Or have I missed your question?
> 
> Hmm, question remains.
> 
> My understanding of the mem_cgroup_iter() is that each call path
> should close the loop itself, in the sense that no *leaked* css refcnt
> after that loop finished. It is the case for all the caller today
> where the loop terminates at memcg == NULL, where all the refcnt have
> been dropped by then.
Now I am not sure I understand you. mem_cgroup_iter_break will always
drop the reference of the last returned memcg. So far so good. But if
the last memcg got cached in per-zone-priority last_visited then we
_have_ to keep a reference to it regardless we broke out of the loop.
The last_visited thingy is shared between all parallel reclaimers so we
cannot just drop a reference to it.
> One exception is mem_cgroup_iter_break(), where the loop terminates
> with *leaked* refcnt and that is what the iter_break() needs to clean
> up. We can not rely on the next caller of the loop since it might
> never happen.
Yes, this is true and I already have a half baked patch for that. I
haven't posted it yet but it basically checks all node-zone-prio
last_visited and removes itself from them on the way out in pre_destroy
callback (I just need to cleanup "find a new last_visited" part and will
post it).
> It makes sense to drop the refcnt of last_visited, the same reason as
> drop refcnt of prev. I don't see why it makes different.
Because then it might vanish when somebody else wants to access it. If
we just did mem_cgroup_get which would be enough to keep only memcg part
in memory then what can we do at the time we visit it? css_tryget would
tell us "no your buddy is gone", you do not have any links to the tree
so you would need to start from the beginning. That is what I have
implemented in the first version. Then I've realized that this could
make a bigger pressure on the groups created earlier which doesn't seem
to be right. With css pinning we are sure that there is a link to a next
node in the tree.
Thanks!
-- 
Michal Hocko
SUSE Labs
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 57+ messages in thread
- * Re: [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators
  2012-12-12  9:06         ` Michal Hocko
@ 2012-12-12 18:09           ` Ying Han
  2012-12-12 18:34             ` Michal Hocko
  2012-12-12 19:24           ` [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators Michal Hocko
  1 sibling, 1 reply; 57+ messages in thread
From: Ying Han @ 2012-12-12 18:09 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-mm, linux-kernel, KAMEZAWA Hiroyuki, Johannes Weiner,
	Tejun Heo, Glauber Costa, Li Zefan
On Wed, Dec 12, 2012 at 1:06 AM, Michal Hocko <mhocko@suse.cz> wrote:
> On Tue 11-12-12 14:36:10, Ying Han wrote:
>> On Tue, Dec 11, 2012 at 7:54 AM, Michal Hocko <mhocko@suse.cz> wrote:
>> > On Sun 09-12-12 11:39:50, Ying Han wrote:
>> >> On Mon, Nov 26, 2012 at 10:47 AM, Michal Hocko <mhocko@suse.cz> wrote:
>> > [...]
>> >> >                 if (reclaim) {
>> >> > -                       iter->position = id;
>> >> > +                       struct mem_cgroup *curr = memcg;
>> >> > +
>> >> > +                       if (last_visited)
>> >> > +                               css_put(&last_visited->css);
>> >                             ^^^^^^^^^^^
>> >                             here
>> >> > +
>> >> > +                       if (css && !memcg)
>> >> > +                               curr = mem_cgroup_from_css(css);
>> >> > +
>> >> > +                       /* make sure that the cached memcg is not removed */
>> >> > +                       if (curr)
>> >> > +                               css_get(&curr->css);
>> >> > +                       iter->last_visited = curr;
>> >>
>> >> Here we take extra refcnt for last_visited, and assume it is under
>> >> target reclaim which then calls mem_cgroup_iter_break() and we leaked
>> >> a refcnt of the target memcg css.
>> >
>> > I think you are not right here. The extra reference is kept for
>> > iter->last_visited and it will be dropped the next time somebody sees
>> > the same zone-priority iter. See above.
>> >
>> > Or have I missed your question?
>>
>> Hmm, question remains.
>>
>> My understanding of the mem_cgroup_iter() is that each call path
>> should close the loop itself, in the sense that no *leaked* css refcnt
>> after that loop finished. It is the case for all the caller today
>> where the loop terminates at memcg == NULL, where all the refcnt have
>> been dropped by then.
>
> Now I am not sure I understand you. mem_cgroup_iter_break will always
> drop the reference of the last returned memcg. So far so good.
Yes, and the patch doesn't change that.
But if
> the last memcg got cached in per-zone-priority last_visited then we
> _have_ to keep a reference to it regardless we broke out of the loop.
> The last_visited thingy is shared between all parallel reclaimers so we
> cannot just drop a reference to it.
Agree that the last_visited is shared between all the memcgs accessing
the per-zone-per-iterator.
Also agree that we don't want to drop reference of it if last_visited
is cached after the loop.
But If i look at the callers of mem_cgroup_iter(), they all look like
the following:
memcg = mem_cgroup_iter(root, NULL, &reclaim);
do {
    // do something
    memcg = mem_cgroup_iter(root, memcg, &reclaim);
} while (memcg);
So we get out of the loop when memcg returns as NULL, where the
last_visited is cached as NULL as well thus no css_get(). That is what
I meant by "each reclaim thread closes the loop". If that is true, the
current implementation of mem_cgroup_iter_break() changes that.
>
>> One exception is mem_cgroup_iter_break(), where the loop terminates
>> with *leaked* refcnt and that is what the iter_break() needs to clean
>> up. We can not rely on the next caller of the loop since it might
>> never happen.
>
> Yes, this is true and I already have a half baked patch for that. I
> haven't posted it yet but it basically checks all node-zone-prio
> last_visited and removes itself from them on the way out in pre_destroy
> callback (I just need to cleanup "find a new last_visited" part and will
> post it).
Not sure whether that or just change the mem_cgroup_iter_break() by
dropping the refcnt of last_visited.
--Ying
>
>> It makes sense to drop the refcnt of last_visited, the same reason as
>> drop refcnt of prev. I don't see why it makes different.
>
> Because then it might vanish when somebody else wants to access it. If
> we just did mem_cgroup_get which would be enough to keep only memcg part
> in memory then what can we do at the time we visit it? css_tryget would
> tell us "no your buddy is gone", you do not have any links to the tree
> so you would need to start from the beginning. That is what I have
> implemented in the first version. Then I've realized that this could
> make a bigger pressure on the groups created earlier which doesn't seem
> to be right. With css pinning we are sure that there is a link to a next
> node in the tree.
>
> Thanks!
> --
> Michal Hocko
> SUSE Labs
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 57+ messages in thread
- * Re: [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators
  2012-12-12 18:09           ` Ying Han
@ 2012-12-12 18:34             ` Michal Hocko
  2012-12-12 18:42               ` Michal Hocko
  0 siblings, 1 reply; 57+ messages in thread
From: Michal Hocko @ 2012-12-12 18:34 UTC (permalink / raw)
  To: Ying Han
  Cc: linux-mm, linux-kernel, KAMEZAWA Hiroyuki, Johannes Weiner,
	Tejun Heo, Glauber Costa, Li Zefan
On Wed 12-12-12 10:09:43, Ying Han wrote:
[...]
> But If i look at the callers of mem_cgroup_iter(), they all look like
> the following:
> 
> memcg = mem_cgroup_iter(root, NULL, &reclaim);
> do {
> 
>     // do something
> 
>     memcg = mem_cgroup_iter(root, memcg, &reclaim);
> } while (memcg);
> 
> So we get out of the loop when memcg returns as NULL, where the
> last_visited is cached as NULL as well thus no css_get(). That is what
> I meant by "each reclaim thread closes the loop".
OK
> If that is true, the current implementation of mem_cgroup_iter_break()
> changes that.
I do not understand this though. Why should we touch the zone-iter
there?  Just consider, if we did that then all the parallel targeted
reclaimers (! global case) would hammer the first node (root) as they
wouldn't continue where the last one finished.
[...]
Thanks!
-- 
Michal Hocko
SUSE Labs
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 57+ messages in thread
- * Re: [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators
  2012-12-12 18:34             ` Michal Hocko
@ 2012-12-12 18:42               ` Michal Hocko
  2012-12-14  1:06                 ` Ying Han
  0 siblings, 1 reply; 57+ messages in thread
From: Michal Hocko @ 2012-12-12 18:42 UTC (permalink / raw)
  To: Ying Han
  Cc: linux-mm, linux-kernel, KAMEZAWA Hiroyuki, Johannes Weiner,
	Tejun Heo, Glauber Costa, Li Zefan
On Wed 12-12-12 19:34:46, Michal Hocko wrote:
> On Wed 12-12-12 10:09:43, Ying Han wrote:
> [...]
> > But If i look at the callers of mem_cgroup_iter(), they all look like
> > the following:
> > 
> > memcg = mem_cgroup_iter(root, NULL, &reclaim);
> > do {
> > 
> >     // do something
> > 
> >     memcg = mem_cgroup_iter(root, memcg, &reclaim);
> > } while (memcg);
> > 
> > So we get out of the loop when memcg returns as NULL, where the
> > last_visited is cached as NULL as well thus no css_get(). That is what
> > I meant by "each reclaim thread closes the loop".
> 
> OK
> 
> > If that is true, the current implementation of mem_cgroup_iter_break()
> > changes that.
> 
> I do not understand this though. Why should we touch the zone-iter
> there?  Just consider, if we did that then all the parallel targeted
Bahh, parallel is only confusing here. Say first child triggers a hard
limit reclaim then root of the hierarchy will be reclaimed first.
iter_break would reset iter->last_visited. Then B triggers the same
reclaim but we will start again from root rather than the first child
because it doesn't know where the other one stopped.
Hope this clarifies it and sorry for all the confusion.
> reclaimers (! global case) would hammer the first node (root) as they
> wouldn't continue where the last one finished.
> 
> [...]
> 
> Thanks!
-- 
Michal Hocko
SUSE Labs
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 57+ messages in thread
- * Re: [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators
  2012-12-12 18:42               ` Michal Hocko
@ 2012-12-14  1:06                 ` Ying Han
  2012-12-14 10:56                   ` [PATCH] memcg,vmscan: do not break out targeted reclaim without reclaimed pages Michal Hocko
  0 siblings, 1 reply; 57+ messages in thread
From: Ying Han @ 2012-12-14  1:06 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-mm, linux-kernel, KAMEZAWA Hiroyuki, Johannes Weiner,
	Tejun Heo, Glauber Costa, Li Zefan
On Wed, Dec 12, 2012 at 10:42 AM, Michal Hocko <mhocko@suse.cz> wrote:
> On Wed 12-12-12 19:34:46, Michal Hocko wrote:
>> On Wed 12-12-12 10:09:43, Ying Han wrote:
>> [...]
>> > But If i look at the callers of mem_cgroup_iter(), they all look like
>> > the following:
>> >
>> > memcg = mem_cgroup_iter(root, NULL, &reclaim);
>> > do {
>> >
>> >     // do something
>> >
>> >     memcg = mem_cgroup_iter(root, memcg, &reclaim);
>> > } while (memcg);
>> >
>> > So we get out of the loop when memcg returns as NULL, where the
>> > last_visited is cached as NULL as well thus no css_get(). That is what
>> > I meant by "each reclaim thread closes the loop".
>>
>> OK
>>
>> > If that is true, the current implementation of mem_cgroup_iter_break()
>> > changes that.
>>
>> I do not understand this though. Why should we touch the zone-iter
>> there?  Just consider, if we did that then all the parallel targeted
>
> Bahh, parallel is only confusing here. Say first child triggers a hard
> limit reclaim then root of the hierarchy will be reclaimed first.
> iter_break would reset iter->last_visited. Then B triggers the same
> reclaim but we will start again from root rather than the first child
> because it doesn't know where the other one stopped.
>
> Hope this clarifies it and sorry for all the confusion.
Yes it does.
I missed the point of how the target reclaim are currently
implemented, and part of the reason is because I don't understand why
that is the case from the beginning.
Off topic of the following discussion.
Take the following hierarchy as example:
                root
              /  |   \
            a   b     c
                        |  \
                        d   e
                        |      \
                        g      h
Let's say c hits its hardlimit and then triggers target reclaim. There
are two reclaimers at the moment and reclaimer_1 starts earlier. The
cgroup_next_descendant_pre() returns in order : c->d->g->e->h
Then we might get the reclaim result as the following where each
reclaimer keep hitting one node of the sub-tree for all the priorities
like the following:
                reclaimer_1  reclaimer_2
priority 12  c                 d
...             c                 d
...             c                 d
...             c                 d
           0   c                 d
However, this is not how global reclaim works:
the cgroup_next_descendant_pre returns in order: root->a->b->c->d->g->e->h
                reclaimer_1  reclaimer_1 reclaimer_1  reclaimer_2
priority 12  root                 a            b                 c
...             root                 a            b                 c
...
...
0
There is no reason for me to think of why target reclaim behave
differently from global reclaim, which the later one is just the
target reclaim of root cgroup.
--Ying
>
>> reclaimers (! global case) would hammer the first node (root) as they
>> wouldn't continue where the last one finished.
>>
>> [...]
>>
>> Thanks!
> --
> Michal Hocko
> SUSE Labs
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 57+ messages in thread
- * [PATCH] memcg,vmscan: do not break out targeted reclaim without reclaimed pages
  2012-12-14  1:06                 ` Ying Han
@ 2012-12-14 10:56                   ` Michal Hocko
  0 siblings, 0 replies; 57+ messages in thread
From: Michal Hocko @ 2012-12-14 10:56 UTC (permalink / raw)
  To: Ying Han
  Cc: linux-mm, linux-kernel, KAMEZAWA Hiroyuki, Johannes Weiner,
	Tejun Heo, Glauber Costa, Li Zefan
On Thu 13-12-12 17:06:38, Ying Han wrote:
[...]
> Off topic of the following discussion.
> Take the following hierarchy as example:
> 
>                 root
>               /  |   \
>             a   b     c
>                         |  \
>                         d   e
>                         |      \
>                         g      h
> 
> Let's say c hits its hardlimit and then triggers target reclaim. There
> are two reclaimers at the moment and reclaimer_1 starts earlier. The
> cgroup_next_descendant_pre() returns in order : c->d->g->e->h
> 
> Then we might get the reclaim result as the following where each
> reclaimer keep hitting one node of the sub-tree for all the priorities
> like the following:
> 
>                 reclaimer_1  reclaimer_2
> priority 12  c                 d
> ...             c                 d
> ...             c                 d
> ...             c                 d
>            0   c                 d
> 
> However, this is not how global reclaim works:
> 
> the cgroup_next_descendant_pre returns in order: root->a->b->c->d->g->e->h
> 
>                 reclaimer_1  reclaimer_1 reclaimer_1  reclaimer_2
> priority 12  root                 a            b                 c
> ...             root                 a            b                 c
> ...
> ...
> 0
> 
> There is no reason for me to think of why target reclaim behave
> differently from global reclaim, which the later one is just the
> target reclaim of root cgroup.
Well, this is not a fair comparison because global reclaim is not just
targeted reclaim of the root cgroup. The difference is that global
reclaim balances zones while targeted reclaim only tries to get bellow
a threshold (hard or soft limit). So we cannot really do the same thing
for both.
On the other hand you are right that targeted reclaim iteration can be
weird, especially when nodes higher in the hierarchy do not have any
pages to reclaim (if they do not have any tasks then only re-parented
are on the list). Then we would drop the priority rather quickly and
hammering the same group again and again until we exhaust all priorities
and come back to the shrinker which finds out that nothing changed so it
will try again and we will slowly get to something to reclaim (always
starting with DEF_PRIORITY). So true we are doing a lot of work without
any point.
Maybe we shouldn't break out of the loop if we didn't reclaim enough for
targeted reclaim. Something like:
---
^ permalink raw reply	[flat|nested] 57+ messages in thread 
 
 
 
 
- * Re: [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators
  2012-12-12  9:06         ` Michal Hocko
  2012-12-12 18:09           ` Ying Han
@ 2012-12-12 19:24           ` Michal Hocko
  2012-12-14  1:14             ` Ying Han
  2012-12-14 12:37             ` Michal Hocko
  1 sibling, 2 replies; 57+ messages in thread
From: Michal Hocko @ 2012-12-12 19:24 UTC (permalink / raw)
  To: Ying Han
  Cc: linux-mm, linux-kernel, KAMEZAWA Hiroyuki, Johannes Weiner,
	Tejun Heo, Glauber Costa, Li Zefan
On Wed 12-12-12 10:06:52, Michal Hocko wrote:
> On Tue 11-12-12 14:36:10, Ying Han wrote:
[...]
> > One exception is mem_cgroup_iter_break(), where the loop terminates
> > with *leaked* refcnt and that is what the iter_break() needs to clean
> > up. We can not rely on the next caller of the loop since it might
> > never happen.
> 
> Yes, this is true and I already have a half baked patch for that. I
> haven't posted it yet but it basically checks all node-zone-prio
> last_visited and removes itself from them on the way out in pre_destroy
> callback (I just need to cleanup "find a new last_visited" part and will
> post it).
And a half baked patch - just compile tested
---
^ permalink raw reply	[flat|nested] 57+ messages in thread 
- * Re: [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators
  2012-12-12 19:24           ` [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators Michal Hocko
@ 2012-12-14  1:14             ` Ying Han
  2012-12-14 12:07               ` Michal Hocko
  2012-12-14 12:37             ` Michal Hocko
  1 sibling, 1 reply; 57+ messages in thread
From: Ying Han @ 2012-12-14  1:14 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-mm, linux-kernel, KAMEZAWA Hiroyuki, Johannes Weiner,
	Tejun Heo, Glauber Costa, Li Zefan
On Wed, Dec 12, 2012 at 11:24 AM, Michal Hocko <mhocko@suse.cz> wrote:
> On Wed 12-12-12 10:06:52, Michal Hocko wrote:
>> On Tue 11-12-12 14:36:10, Ying Han wrote:
> [...]
>> > One exception is mem_cgroup_iter_break(), where the loop terminates
>> > with *leaked* refcnt and that is what the iter_break() needs to clean
>> > up. We can not rely on the next caller of the loop since it might
>> > never happen.
>>
>> Yes, this is true and I already have a half baked patch for that. I
>> haven't posted it yet but it basically checks all node-zone-prio
>> last_visited and removes itself from them on the way out in pre_destroy
>> callback (I just need to cleanup "find a new last_visited" part and will
>> post it).
>
> And a half baked patch - just compile tested
> ---
> From 1c976c079c383175c679e00115aee0ab8e215bf2 Mon Sep 17 00:00:00 2001
> From: Michal Hocko <mhocko@suse.cz>
> Date: Tue, 11 Dec 2012 21:02:39 +0100
> Subject: [PATCH] NOT READY YET - just compile tested
>
> memcg: remove memcg from the reclaim iterators
>
> Now that per-node-zone-priority iterator caches memory cgroups rather
> than their css ids we have to be careful and remove them from the
> iterator when they are on the way out otherwise they might hang for
> unbounded amount of time (until the global reclaim triggers the zone
> under priority to find out the group is dead and let it to find the
> final rest).
>
> This is solved by hooking into mem_cgroup_pre_destroy and checking all
> per-node-zone-priority iterators. If the current memcg is found in
> iter->last_visited then it is replaced by its left sibling or its parent
> otherwise. This guarantees that no group gets more reclaiming than
> necessary and the next iteration will continue seemingly.
>
> Spotted-by: Ying Han <yinghan@google.com>
> Not-signed-off-by-yet: Michal Hocko <mhocko@suse.cz>
> ---
>  mm/memcontrol.c |   38 ++++++++++++++++++++++++++++++++++++++
>  1 file changed, 38 insertions(+)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 7134148..286db74 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -6213,12 +6213,50 @@ free_out:
>         return ERR_PTR(error);
>  }
>
> +static void mem_cgroup_remove_cached(struct mem_cgroup *memcg)
> +{
> +       int node, zone;
> +
> +       for_each_node(node) {
> +               struct mem_cgroup_per_node *pn = memcg->info.nodeinfo[node];
> +               int prio;
> +
> +               for (zone = 0; zone < MAX_NR_ZONES; zone++) {
> +                       struct mem_cgroup_per_zone *mz;
> +
> +                       mz = &pn->zoneinfo[zone];
> +                       for (prio = 0; prio < DEF_PRIORITY + 1; prio++) {
> +                               struct mem_cgroup_reclaim_iter *iter;
> +
> +                               iter = &mz->reclaim_iter[prio];
> +                               rcu_read_lock();
> +                               spin_lock(&iter->iter_lock);
> +                               if (iter->last_visited == memcg) {
> +                                       struct cgroup *cgroup, *prev;
> +
> +                                       cgroup = memcg->css.cgroup;
> +                                       prev = list_entry_rcu(cgroup->sibling.prev, struct cgroup, sibling);
> +                                       if (&prev->sibling == &prev->parent->children)
> +                                               prev = prev->parent;
> +                                       iter->last_visited = mem_cgroup_from_cont(prev);
> +
> +                                       /* TODO can we do this? */
> +                                       css_put(&memcg->css);
> +                               }
> +                               spin_unlock(&iter->iter_lock);
> +                               rcu_read_unlock();
> +                       }
> +               }
> +       }
> +}
> +
>  static void mem_cgroup_pre_destroy(struct cgroup *cont)
>  {
>         struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
>
>         mem_cgroup_reparent_charges(memcg);
>         mem_cgroup_destroy_all_caches(memcg);
> +       mem_cgroup_remove_cached(memcg);
>  }
>
>  static void mem_cgroup_destroy(struct cgroup *cont)
> --
> 1.7.10.4
>
> --
> Michal Hocko
> SUSE Labs
I haven't tried this patch set yet. Before I am doing that, I am
curious whether changing the target reclaim to be consistent with
global reclaim something worthy to consider based my last reply:
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 53dcde9..3f158c5 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1911,20 +1911,6 @@ static void shrink_zone(struct zone *zone,
struct scan_control *sc)
                shrink_lruvec(lruvec, sc);
-               /*
-                * Limit reclaim has historically picked one memcg and
-                * scanned it with decreasing priority levels until
-                * nr_to_reclaim had been reclaimed.  This priority
-                * cycle is thus over after a single memcg.
-                *
-                * Direct reclaim and kswapd, on the other hand, have
-                * to scan all memory cgroups to fulfill the overall
-                * scan target for the zone.
-                */
-               if (!global_reclaim(sc)) {
-                       mem_cgroup_iter_break(root, memcg);
-                       break;
-               }
                memcg = mem_cgroup_iter(root, memcg, &reclaim);
        } while (memcg);
 }
--Ying
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 57+ messages in thread
- * Re: [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators
  2012-12-14  1:14             ` Ying Han
@ 2012-12-14 12:07               ` Michal Hocko
  2012-12-14 23:08                 ` Ying Han
  0 siblings, 1 reply; 57+ messages in thread
From: Michal Hocko @ 2012-12-14 12:07 UTC (permalink / raw)
  To: Ying Han
  Cc: linux-mm, linux-kernel, KAMEZAWA Hiroyuki, Johannes Weiner,
	Tejun Heo, Glauber Costa, Li Zefan
On Thu 13-12-12 17:14:13, Ying Han wrote:
[...]
> I haven't tried this patch set yet. Before I am doing that, I am
> curious whether changing the target reclaim to be consistent with
> global reclaim something worthy to consider based my last reply:
> 
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 53dcde9..3f158c5 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1911,20 +1911,6 @@ static void shrink_zone(struct zone *zone,
> struct scan_control *sc)
> 
>                 shrink_lruvec(lruvec, sc);
> 
> -               /*
> -                * Limit reclaim has historically picked one memcg and
> -                * scanned it with decreasing priority levels until
> -                * nr_to_reclaim had been reclaimed.  This priority
> -                * cycle is thus over after a single memcg.
> -                *
> -                * Direct reclaim and kswapd, on the other hand, have
> -                * to scan all memory cgroups to fulfill the overall
> -                * scan target for the zone.
> -                */
> -               if (!global_reclaim(sc)) {
> -                       mem_cgroup_iter_break(root, memcg);
> -                       break;
> -               }
>                 memcg = mem_cgroup_iter(root, memcg, &reclaim);
This wouldn't work because you would over-reclaim proportionally to the
number of groups in the hierarchy.
>         } while (memcg);
>  }
> 
> --Ying
-- 
Michal Hocko
SUSE Labs
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 57+ messages in thread
- * Re: [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators
  2012-12-14 12:07               ` Michal Hocko
@ 2012-12-14 23:08                 ` Ying Han
  0 siblings, 0 replies; 57+ messages in thread
From: Ying Han @ 2012-12-14 23:08 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-mm, linux-kernel, KAMEZAWA Hiroyuki, Johannes Weiner,
	Tejun Heo, Glauber Costa, Li Zefan
On Fri, Dec 14, 2012 at 4:07 AM, Michal Hocko <mhocko@suse.cz> wrote:
> On Thu 13-12-12 17:14:13, Ying Han wrote:
> [...]
>> I haven't tried this patch set yet. Before I am doing that, I am
>> curious whether changing the target reclaim to be consistent with
>> global reclaim something worthy to consider based my last reply:
>>
>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> index 53dcde9..3f158c5 100644
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -1911,20 +1911,6 @@ static void shrink_zone(struct zone *zone,
>> struct scan_control *sc)
>>
>>                 shrink_lruvec(lruvec, sc);
>>
>> -               /*
>> -                * Limit reclaim has historically picked one memcg and
>> -                * scanned it with decreasing priority levels until
>> -                * nr_to_reclaim had been reclaimed.  This priority
>> -                * cycle is thus over after a single memcg.
>> -                *
>> -                * Direct reclaim and kswapd, on the other hand, have
>> -                * to scan all memory cgroups to fulfill the overall
>> -                * scan target for the zone.
>> -                */
>> -               if (!global_reclaim(sc)) {
>> -                       mem_cgroup_iter_break(root, memcg);
>> -                       break;
>> -               }
>>                 memcg = mem_cgroup_iter(root, memcg, &reclaim);
>
> This wouldn't work because you would over-reclaim proportionally to the
> number of groups in the hierarchy.
Don't get it and especially of why it is different from global
reclaim? I view the global reclaim should be viewed as target reclaim,
just a matter of the root cgroup is under memory pressure.
Anyway, don't want to distract you from working on the next post. So
feel free to not follow up on this.
--Ying
>
>>         } while (memcg);
>>  }
>>
>> --Ying
>
> --
> Michal Hocko
> SUSE Labs
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 57+ messages in thread
 
 
- * Re: [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators
  2012-12-12 19:24           ` [patch v2 3/6] memcg: rework mem_cgroup_iter to use cgroup iterators Michal Hocko
  2012-12-14  1:14             ` Ying Han
@ 2012-12-14 12:37             ` Michal Hocko
  1 sibling, 0 replies; 57+ messages in thread
From: Michal Hocko @ 2012-12-14 12:37 UTC (permalink / raw)
  To: Ying Han
  Cc: linux-mm, linux-kernel, KAMEZAWA Hiroyuki, Johannes Weiner,
	Tejun Heo, Glauber Costa, Li Zefan
On Wed 12-12-12 20:24:41, Michal Hocko wrote:
> On Wed 12-12-12 10:06:52, Michal Hocko wrote:
> > On Tue 11-12-12 14:36:10, Ying Han wrote:
> [...]
> > > One exception is mem_cgroup_iter_break(), where the loop terminates
> > > with *leaked* refcnt and that is what the iter_break() needs to clean
> > > up. We can not rely on the next caller of the loop since it might
> > > never happen.
> > 
> > Yes, this is true and I already have a half baked patch for that. I
> > haven't posted it yet but it basically checks all node-zone-prio
> > last_visited and removes itself from them on the way out in pre_destroy
> > callback (I just need to cleanup "find a new last_visited" part and will
> > post it).
> 
> And a half baked patch - just compile tested
please ignore this patch. It is totally bogus.
> ---
> From 1c976c079c383175c679e00115aee0ab8e215bf2 Mon Sep 17 00:00:00 2001
> From: Michal Hocko <mhocko@suse.cz>
> Date: Tue, 11 Dec 2012 21:02:39 +0100
> Subject: [PATCH] NOT READY YET - just compile tested
> 
> memcg: remove memcg from the reclaim iterators
> 
> Now that per-node-zone-priority iterator caches memory cgroups rather
> than their css ids we have to be careful and remove them from the
> iterator when they are on the way out otherwise they might hang for
> unbounded amount of time (until the global reclaim triggers the zone
> under priority to find out the group is dead and let it to find the
> final rest).
> 
> This is solved by hooking into mem_cgroup_pre_destroy and checking all
> per-node-zone-priority iterators. If the current memcg is found in
> iter->last_visited then it is replaced by its left sibling or its parent
> otherwise. This guarantees that no group gets more reclaiming than
> necessary and the next iteration will continue seemingly.
> 
> Spotted-by: Ying Han <yinghan@google.com>
> Not-signed-off-by-yet: Michal Hocko <mhocko@suse.cz>
> ---
>  mm/memcontrol.c |   38 ++++++++++++++++++++++++++++++++++++++
>  1 file changed, 38 insertions(+)
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 7134148..286db74 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -6213,12 +6213,50 @@ free_out:
>  	return ERR_PTR(error);
>  }
>  
> +static void mem_cgroup_remove_cached(struct mem_cgroup *memcg)
> +{
> +	int node, zone;
> +
> +	for_each_node(node) {
> +		struct mem_cgroup_per_node *pn = memcg->info.nodeinfo[node];
> +		int prio;
> +
> +		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
> +			struct mem_cgroup_per_zone *mz;
> +
> +			mz = &pn->zoneinfo[zone];
> +			for (prio = 0; prio < DEF_PRIORITY + 1; prio++) {
> +				struct mem_cgroup_reclaim_iter *iter;
> +
> +				iter = &mz->reclaim_iter[prio];
> +				rcu_read_lock();
> +				spin_lock(&iter->iter_lock);
> +				if (iter->last_visited == memcg) {
> +					struct cgroup *cgroup, *prev;
> +
> +					cgroup = memcg->css.cgroup;
> +					prev = list_entry_rcu(cgroup->sibling.prev, struct cgroup, sibling);
> +					if (&prev->sibling == &prev->parent->children)
> +						prev = prev->parent;
> +					iter->last_visited = mem_cgroup_from_cont(prev);
> +
> +					/* TODO can we do this? */
> +					css_put(&memcg->css);
> +				}
> +				spin_unlock(&iter->iter_lock);
> +				rcu_read_unlock();
> +			}
> +		}
> +	}
> +}
> +
>  static void mem_cgroup_pre_destroy(struct cgroup *cont)
>  {
>  	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
>  
>  	mem_cgroup_reparent_charges(memcg);
>  	mem_cgroup_destroy_all_caches(memcg);
> +	mem_cgroup_remove_cached(memcg);
>  }
>  
>  static void mem_cgroup_destroy(struct cgroup *cont)
> -- 
> 1.7.10.4
> 
> -- 
> Michal Hocko
> SUSE Labs
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
-- 
Michal Hocko
SUSE Labs
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 57+ messages in thread