* [patch] memcg: skip scanning active lists based on individual size
@ 2011-08-31 9:08 Johannes Weiner
2011-08-31 10:13 ` Minchan Kim
` (2 more replies)
0 siblings, 3 replies; 12+ messages in thread
From: Johannes Weiner @ 2011-08-31 9:08 UTC (permalink / raw)
To: Andrew Morton
Cc: Rik van Riel, KOSAKI Motohiro, KAMEZAWA Hiroyuki,
Daisuke Nishimura, Balbir Singh, linux-mm, linux-kernel
Reclaim decides to skip scanning an active list when the corresponding
inactive list is above a certain size in comparison to leave the
assumed working set alone while there are still enough reclaim
candidates around.
The memcg implementation of comparing those lists instead reports
whether the whole memcg is low on the requested type of inactive
pages, considering all nodes and zones.
This can lead to an oversized active list not being scanned because of
the state of the other lists in the memcg, as well as an active list
being scanned while its corresponding inactive list has enough pages.
Not only is this wrong, it's also a scalability hazard, because the
global memory state over all nodes and zones has to be gathered for
each memcg and zone scanned.
Make these calculations purely based on the size of the two LRU lists
that are actually affected by the outcome of the decision.
Signed-off-by: Johannes Weiner <jweiner@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <bsingharora@gmail.com>
---
include/linux/memcontrol.h | 10 +++++---
mm/memcontrol.c | 51 ++++++++++++++-----------------------------
mm/vmscan.c | 4 +-
3 files changed, 25 insertions(+), 40 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 343bd76..cbb45ce 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -105,8 +105,10 @@ extern void mem_cgroup_end_migration(struct mem_cgroup *mem,
/*
* For memory reclaim.
*/
-int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg);
-int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg);
+int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg,
+ struct zone *zone);
+int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg,
+ struct zone *zone);
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg,
int nid, int zid, unsigned int lrumask);
@@ -292,13 +294,13 @@ static inline bool mem_cgroup_disabled(void)
}
static inline int
-mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
+mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone)
{
return 1;
}
static inline int
-mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
+mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone)
{
return 1;
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3508777..d63dfb2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1101,15 +1101,19 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
return ret;
}
-static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
+int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone)
{
- unsigned long active;
+ unsigned long inactive_ratio;
+ int nid = zone_to_nid(zone);
+ int zid = zone_idx(zone);
unsigned long inactive;
+ unsigned long active;
unsigned long gb;
- unsigned long inactive_ratio;
- inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));
- active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON));
+ inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
+ BIT(LRU_INACTIVE_ANON));
+ active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
+ BIT(LRU_ACTIVE_ANON));
gb = (inactive + active) >> (30 - PAGE_SHIFT);
if (gb)
@@ -1117,39 +1121,20 @@ static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_
else
inactive_ratio = 1;
- if (present_pages) {
- present_pages[0] = inactive;
- present_pages[1] = active;
- }
-
- return inactive_ratio;
+ return inactive * inactive_ratio < active;
}
-int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
-{
- unsigned long active;
- unsigned long inactive;
- unsigned long present_pages[2];
- unsigned long inactive_ratio;
-
- inactive_ratio = calc_inactive_ratio(memcg, present_pages);
-
- inactive = present_pages[0];
- active = present_pages[1];
-
- if (inactive * inactive_ratio < active)
- return 1;
-
- return 0;
-}
-
-int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
+int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone)
{
unsigned long active;
unsigned long inactive;
+ int zid = zone_idx(zone);
+ int nid = zone_to_nid(zone);
- inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));
- active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE));
+ inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
+ BIT(LRU_INACTIVE_FILE));
+ active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
+ BIT(LRU_ACTIVE_FILE));
return (active > inactive);
}
@@ -4188,8 +4173,6 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
}
#ifdef CONFIG_DEBUG_VM
- cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
-
{
int nid, zid;
struct mem_cgroup_per_zone *mz;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6588746..a023778 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1699,7 +1699,7 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
if (scanning_global_lru(sc))
low = inactive_anon_is_low_global(zone);
else
- low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup);
+ low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup, zone);
return low;
}
#else
@@ -1742,7 +1742,7 @@ static int inactive_file_is_low(struct zone *zone, struct scan_control *sc)
if (scanning_global_lru(sc))
low = inactive_file_is_low_global(zone);
else
- low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup);
+ low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup, zone);
return low;
}
--
1.7.6
* Re: [patch] memcg: skip scanning active lists based on individual size
2011-08-31 9:08 [patch] memcg: skip scanning active lists based on individual size Johannes Weiner
@ 2011-08-31 10:13 ` Minchan Kim
2011-08-31 12:30 ` Johannes Weiner
2011-09-01 0:09 ` KAMEZAWA Hiroyuki
2011-08-31 17:19 ` Ying Han
2011-08-31 18:27 ` Rik van Riel
2 siblings, 2 replies; 12+ messages in thread
From: Minchan Kim @ 2011-08-31 10:13 UTC (permalink / raw)
To: Johannes Weiner
Cc: Andrew Morton, Rik van Riel, KOSAKI Motohiro, KAMEZAWA Hiroyuki,
Daisuke Nishimura, Balbir Singh, linux-mm, linux-kernel
On Wed, Aug 31, 2011 at 6:08 PM, Johannes Weiner <jweiner@redhat.com> wrote:
> Reclaim decides to skip scanning an active list when the corresponding
> inactive list is above a certain size in comparison to leave the
> assumed working set alone while there are still enough reclaim
> candidates around.
>
> The memcg implementation of comparing those lists instead reports
> whether the whole memcg is low on the requested type of inactive
> pages, considering all nodes and zones.
>
> This can lead to an oversized active list not being scanned because of
> the state of the other lists in the memcg, as well as an active list
> being scanned while its corresponding inactive list has enough pages.
>
> Not only is this wrong, it's also a scalability hazard, because the
> global memory state over all nodes and zones has to be gathered for
> each memcg and zone scanned.
>
> Make these calculations purely based on the size of the two LRU lists
> that are actually affected by the outcome of the decision.
>
> Signed-off-by: Johannes Weiner <jweiner@redhat.com>
> Cc: Rik van Riel <riel@redhat.com>
> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
> Cc: Balbir Singh <bsingharora@gmail.com>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
I can't understand why memcg is designed for considering all nodes and zones.
Is it a mistake or on purpose?
Maybe Kame or Balbir can answer it.
Anyway, this change does make sense to me.
Nitpick: Please remove inactive_ratio in Documentation/cgroups/memory.txt.
I think it would be better to separate it into another patch.
--
Kind regards,
Minchan Kim
* Re: [patch] memcg: skip scanning active lists based on individual size
2011-08-31 10:13 ` Minchan Kim
@ 2011-08-31 12:30 ` Johannes Weiner
2011-09-01 0:09 ` KAMEZAWA Hiroyuki
1 sibling, 0 replies; 12+ messages in thread
From: Johannes Weiner @ 2011-08-31 12:30 UTC (permalink / raw)
To: Minchan Kim
Cc: Andrew Morton, Rik van Riel, KOSAKI Motohiro, KAMEZAWA Hiroyuki,
Daisuke Nishimura, Balbir Singh, linux-mm, linux-kernel
On Wed, Aug 31, 2011 at 07:13:34PM +0900, Minchan Kim wrote:
> On Wed, Aug 31, 2011 at 6:08 PM, Johannes Weiner <jweiner@redhat.com> wrote:
> > Reclaim decides to skip scanning an active list when the corresponding
> > inactive list is above a certain size in comparison to leave the
> > assumed working set alone while there are still enough reclaim
> > candidates around.
> >
> > The memcg implementation of comparing those lists instead reports
> > whether the whole memcg is low on the requested type of inactive
> > pages, considering all nodes and zones.
> >
> > This can lead to an oversized active list not being scanned because of
> > the state of the other lists in the memcg, as well as an active list
> > being scanned while its corresponding inactive list has enough pages.
> >
> > Not only is this wrong, it's also a scalability hazard, because the
> > global memory state over all nodes and zones has to be gathered for
> > each memcg and zone scanned.
> >
> > Make these calculations purely based on the size of the two LRU lists
> > that are actually affected by the outcome of the decision.
> >
> > Signed-off-by: Johannes Weiner <jweiner@redhat.com>
> > Cc: Rik van Riel <riel@redhat.com>
> > Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
> > Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> > Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
> > Cc: Balbir Singh <bsingharora@gmail.com>
>
> Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Thank you.
> I can't understand why memcg is designed for considering all nodes and zones.
> Is it a mistake or on purpose?
> Maybe Kame or Balbir can answer it.
>
> Anyway, this change does make sense to me.
>
> Nitpick: Please remove inactive_ratio in Documentation/cgroups/memory.txt.
> I think it would be better to separate it into another patch.
Good catch.
---
Subject: [patch] memcg: skip scanning active lists based on individual size fix
Also ditch the documentation note for the removed stats value.
Signed-off-by: Johannes Weiner <jweiner@redhat.com>
---
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 06eb6d9..cc0ebc5 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -418,7 +418,6 @@ total_unevictable - sum of all children's "unevictable"
# The following additional stats are dependent on CONFIG_DEBUG_VM.
-inactive_ratio - VM internal parameter. (see mm/page_alloc.c)
recent_rotated_anon - VM internal parameter. (see mm/vmscan.c)
recent_rotated_file - VM internal parameter. (see mm/vmscan.c)
recent_scanned_anon - VM internal parameter. (see mm/vmscan.c)
* Re: [patch] memcg: skip scanning active lists based on individual size
2011-08-31 10:13 ` Minchan Kim
2011-08-31 12:30 ` Johannes Weiner
@ 2011-09-01 0:09 ` KAMEZAWA Hiroyuki
2011-09-01 6:15 ` Johannes Weiner
1 sibling, 1 reply; 12+ messages in thread
From: KAMEZAWA Hiroyuki @ 2011-09-01 0:09 UTC (permalink / raw)
To: Minchan Kim
Cc: Johannes Weiner, Andrew Morton, Rik van Riel, KOSAKI Motohiro,
Daisuke Nishimura, Balbir Singh, linux-mm, linux-kernel
On Wed, 31 Aug 2011 19:13:34 +0900
Minchan Kim <minchan.kim@gmail.com> wrote:
> On Wed, Aug 31, 2011 at 6:08 PM, Johannes Weiner <jweiner@redhat.com> wrote:
> > Reclaim decides to skip scanning an active list when the corresponding
> > inactive list is above a certain size in comparison to leave the
> > assumed working set alone while there are still enough reclaim
> > candidates around.
> >
> > The memcg implementation of comparing those lists instead reports
> > whether the whole memcg is low on the requested type of inactive
> > pages, considering all nodes and zones.
> >
> > This can lead to an oversized active list not being scanned because of
> > the state of the other lists in the memcg, as well as an active list
> > being scanned while its corresponding inactive list has enough pages.
> >
> > Not only is this wrong, it's also a scalability hazard, because the
> > global memory state over all nodes and zones has to be gathered for
> > each memcg and zone scanned.
> >
> > Make these calculations purely based on the size of the two LRU lists
> > that are actually affected by the outcome of the decision.
> >
> > Signed-off-by: Johannes Weiner <jweiner@redhat.com>
> > Cc: Rik van Riel <riel@redhat.com>
> > Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
> > Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> > Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
> > Cc: Balbir Singh <bsingharora@gmail.com>
>
> Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
>
> I can't understand why memcg is designed for considering all nodes and zones.
> Is it a mistake or on purpose?
It's on purpose. memcg just takes care of the amount of pages.
Then, any performance numbers?
But, hmm, this change may be good for softlimit and your work.
I'll ack when you add performance numbers in changelog.
Thanks,
-Kame
* Re: [patch] memcg: skip scanning active lists based on individual size
2011-09-01 0:09 ` KAMEZAWA Hiroyuki
@ 2011-09-01 6:15 ` Johannes Weiner
2011-09-01 6:31 ` KAMEZAWA Hiroyuki
0 siblings, 1 reply; 12+ messages in thread
From: Johannes Weiner @ 2011-09-01 6:15 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: Minchan Kim, Andrew Morton, Rik van Riel, KOSAKI Motohiro,
Daisuke Nishimura, Balbir Singh, linux-mm, linux-kernel
On Thu, Sep 01, 2011 at 09:09:31AM +0900, KAMEZAWA Hiroyuki wrote:
> On Wed, 31 Aug 2011 19:13:34 +0900
> Minchan Kim <minchan.kim@gmail.com> wrote:
>
> > On Wed, Aug 31, 2011 at 6:08 PM, Johannes Weiner <jweiner@redhat.com> wrote:
> > > Reclaim decides to skip scanning an active list when the corresponding
> > > inactive list is above a certain size in comparison to leave the
> > > assumed working set alone while there are still enough reclaim
> > > candidates around.
> > >
> > > The memcg implementation of comparing those lists instead reports
> > > whether the whole memcg is low on the requested type of inactive
> > > pages, considering all nodes and zones.
> > >
> > > This can lead to an oversized active list not being scanned because of
> > > the state of the other lists in the memcg, as well as an active list
> > > being scanned while its corresponding inactive list has enough pages.
> > >
> > > Not only is this wrong, it's also a scalability hazard, because the
> > > global memory state over all nodes and zones has to be gathered for
> > > each memcg and zone scanned.
> > >
> > > Make these calculations purely based on the size of the two LRU lists
> > > that are actually affected by the outcome of the decision.
> > >
> > > Signed-off-by: Johannes Weiner <jweiner@redhat.com>
> > > Cc: Rik van Riel <riel@redhat.com>
> > > Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
> > > Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> > > Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
> > > Cc: Balbir Singh <bsingharora@gmail.com>
> >
> > Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
> >
> > I can't understand why memcg is designed for considering all nodes and zones.
> > Is it a mistake or on purpose?
>
> It's on purpose. memcg just takes care of the amount of pages.
This mechanism isn't about memcg at all, it's an aging decision at a
much lower level. Can you tell me how the old implementation is
supposed to work?
> But, hmm, this change may be good for softlimit and your work.
Yes, I noticed those paths showing up in a profile with my patches.
Lots of memcgs on a multi-node machine will trigger it too. But it's
secondary, my primary reasoning was: this does not make sense at all.
> I'll ack when you add performance numbers in changelog.
It's not exactly a performance optimization but I'll happily run some
workloads. Do you have suggestions what to test for? I.e. where
would you expect regressions?
* Re: [patch] memcg: skip scanning active lists based on individual size
2011-09-01 6:15 ` Johannes Weiner
@ 2011-09-01 6:31 ` KAMEZAWA Hiroyuki
2011-09-05 18:25 ` Johannes Weiner
0 siblings, 1 reply; 12+ messages in thread
From: KAMEZAWA Hiroyuki @ 2011-09-01 6:31 UTC (permalink / raw)
To: Johannes Weiner
Cc: Minchan Kim, Andrew Morton, Rik van Riel, KOSAKI Motohiro,
Daisuke Nishimura, Balbir Singh, linux-mm, linux-kernel
On Thu, 1 Sep 2011 08:15:40 +0200
Johannes Weiner <jweiner@redhat.com> wrote:
> On Thu, Sep 01, 2011 at 09:09:31AM +0900, KAMEZAWA Hiroyuki wrote:
> > On Wed, 31 Aug 2011 19:13:34 +0900
> > Minchan Kim <minchan.kim@gmail.com> wrote:
> >
> > > On Wed, Aug 31, 2011 at 6:08 PM, Johannes Weiner <jweiner@redhat.com> wrote:
> > > > Reclaim decides to skip scanning an active list when the corresponding
> > > > inactive list is above a certain size in comparison to leave the
> > > > assumed working set alone while there are still enough reclaim
> > > > candidates around.
> > > >
> > > > The memcg implementation of comparing those lists instead reports
> > > > whether the whole memcg is low on the requested type of inactive
> > > > pages, considering all nodes and zones.
> > > >
> > > > This can lead to an oversized active list not being scanned because of
> > > > the state of the other lists in the memcg, as well as an active list
> > > > being scanned while its corresponding inactive list has enough pages.
> > > >
> > > > Not only is this wrong, it's also a scalability hazard, because the
> > > > global memory state over all nodes and zones has to be gathered for
> > > > each memcg and zone scanned.
> > > >
> > > > Make these calculations purely based on the size of the two LRU lists
> > > > that are actually affected by the outcome of the decision.
> > > >
> > > > Signed-off-by: Johannes Weiner <jweiner@redhat.com>
> > > > Cc: Rik van Riel <riel@redhat.com>
> > > > Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
> > > > Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> > > > Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
> > > > Cc: Balbir Singh <bsingharora@gmail.com>
> > >
> > > Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
> > >
> > > I can't understand why memcg is designed for considering all nodes and zones.
> > > Is it a mistake or on purpose?
> >
> > It's on purpose. memcg just takes care of the amount of pages.
>
> This mechanism isn't about memcg at all, it's an aging decision at a
> much lower level. Can you tell me how the old implementation is
> supposed to work?
>
The old implementation was supposed to make vmscan see only the memcg and
ignore zones. memcg doesn't take care of any zones. Then, it uses
global numbers rather than zones.
Assume a system with 2 nodes and the whole memcg's inactive/active ratio
is unbalanced.
Node        0      1
Active      800M   30M
Inactive    100M   200M
If we judge 'unbalance' based on zones, Node1's Active will not rotate
even if it's not accessed for a while.
If we judge unbalance based on total stat, Both of Node0 and Node 1
will be rotated.
Hmm, old one doesn't work as I expected?
But okay, as time goes, I think Node1's inactive will decrease
and then rotation will happen even with zone-based ones.
> > But, hmm, this change may be good for softlimit and your work.
>
> Yes, I noticed those paths showing up in a profile with my patches.
> Lots of memcgs on a multi-node machine will trigger it too. But it's
> secondary, my primary reasoning was: this does not make sense at all.
>
your words always sound too strong to me ;) please be soft.
> > I'll ack when you add performance numbers in changelog.
>
> It's not exactly a performance optimization but I'll happily run some
> workloads. Do you have suggestions what to test for? I.e. where
> would you expect regressions?
>
Some comparison about amount of swap-out before/after change will be good.
Hm. If I do...
- set up x86-64 NUMA box. (fake numa is ok.)
- create memcg with 500M limit.
- running kernel make with make -j 6(or more)
see time of make and amount of swap-out.
Thanks,
-Kame
* Re: [patch] memcg: skip scanning active lists based on individual size
2011-09-01 6:31 ` KAMEZAWA Hiroyuki
@ 2011-09-05 18:25 ` Johannes Weiner
2011-09-06 9:33 ` KAMEZAWA Hiroyuki
0 siblings, 1 reply; 12+ messages in thread
From: Johannes Weiner @ 2011-09-05 18:25 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: Minchan Kim, Andrew Morton, Rik van Riel, KOSAKI Motohiro,
Daisuke Nishimura, Balbir Singh, linux-mm, linux-kernel
On Thu, Sep 01, 2011 at 03:31:48PM +0900, KAMEZAWA Hiroyuki wrote:
> On Thu, 1 Sep 2011 08:15:40 +0200
> Johannes Weiner <jweiner@redhat.com> wrote:
>
> > On Thu, Sep 01, 2011 at 09:09:31AM +0900, KAMEZAWA Hiroyuki wrote:
> > > On Wed, 31 Aug 2011 19:13:34 +0900
> > > Minchan Kim <minchan.kim@gmail.com> wrote:
> > >
> > > > On Wed, Aug 31, 2011 at 6:08 PM, Johannes Weiner <jweiner@redhat.com> wrote:
> > > > > Reclaim decides to skip scanning an active list when the corresponding
> > > > > inactive list is above a certain size in comparison to leave the
> > > > > assumed working set alone while there are still enough reclaim
> > > > > candidates around.
> > > > >
> > > > > The memcg implementation of comparing those lists instead reports
> > > > > whether the whole memcg is low on the requested type of inactive
> > > > > pages, considering all nodes and zones.
> > > > >
> > > > > This can lead to an oversized active list not being scanned because of
> > > > > the state of the other lists in the memcg, as well as an active list
> > > > > being scanned while its corresponding inactive list has enough pages.
> > > > >
> > > > > Not only is this wrong, it's also a scalability hazard, because the
> > > > > global memory state over all nodes and zones has to be gathered for
> > > > > each memcg and zone scanned.
> > > > >
> > > > > Make these calculations purely based on the size of the two LRU lists
> > > > > that are actually affected by the outcome of the decision.
> > > > >
> > > > > Signed-off-by: Johannes Weiner <jweiner@redhat.com>
> > > > > Cc: Rik van Riel <riel@redhat.com>
> > > > > Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
> > > > > Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> > > > > Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
> > > > > Cc: Balbir Singh <bsingharora@gmail.com>
> > > >
> > > > Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
> > > >
> > > > I can't understand why memcg is designed for considering all nodes and zones.
> > > > Is it a mistake or on purpose?
> > >
> > > It's on purpose. memcg just takes care of the amount of pages.
> >
> > This mechanism isn't about memcg at all, it's an aging decision at a
> > much lower level. Can you tell me how the old implementation is
> > supposed to work?
> >
> The old implementation was supposed to make vmscan see only the memcg and
> ignore zones. memcg doesn't take care of any zones. Then, it uses
> global numbers rather than zones.
>
> Assume a system with 2 nodes and the whole memcg's inactive/active ratio
> is unbalanced.
>
> Node        0      1
> Active      800M   30M
> Inactive    100M   200M
>
> If we judge 'unbalance' based on zones, Node1's Active will not rotate
> even if it's not accessed for a while.
> If we judge unbalance based on total stat, Both of Node0 and Node 1
> will be rotated.
But why should we deactivate on Node 1? We have good reasons not to
on the global level, why should memcgs silently behave differently?
I mostly don't understand it on a semantic level. vmscan needs to
know whether a certain inactive LRU list has enough reclaim candidates
to skip scanning its corresponding active list. The global state is
not useful to find out if a single inactive list has enough pages.
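To put it in code, the question vmscan asks is, per LRU pair (this is a
simplified sketch of the check, not the literal functions in the patch):

	/* does this zone's inactive list still have enough candidates? */
	inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
						BIT(LRU_INACTIVE_ANON));
	active   = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
						BIT(LRU_ACTIVE_ANON));
	low      = inactive * inactive_ratio < active;

The old code answered it with mem_cgroup_nr_lru_pages() totals summed over
every node and zone in the memcg, which says nothing about the one list
pair we are about to scan.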
> Hmm, old one doesn't work as I expected?
>
> But okay, as time goes, I think Node1's inactive will decrease
> and then rotation will happen even with zone-based ones.
Yes, that's how the mechanism is intended to work: with a constant
influx of used-once pages, we don't want to touch the active list.
But when the workload changes and inactive pages get either activated
or all reclaimed, the ratio changes and eventually we fall back to
deactivating pages again.
That's reclaim behaviour that has been around for a while and it
shouldn't make a difference if your workload is running in
root_mem_cgroup or another memcg.
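For reference, here is how it ends up wired into vmscan, so the memcg path
answers exactly the same per-zone question as the global one (a sketch
reassembled from the mm/vmscan.c hunk in the patch, not a verbatim copy):

	static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
	{
		int low;

		if (scanning_global_lru(sc))
			low = inactive_anon_is_low_global(zone);
		else
			/* now sized against this zone's lists only */
			low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup, zone);
		return low;
	}

inactive_file_is_low() is changed the same way.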
> > > But, hmm, this change may be good for softlimit and your work.
> >
> > Yes, I noticed those paths showing up in a profile with my patches.
> > Lots of memcgs on a multi-node machine will trigger it too. But it's
> > secondary, my primary reasoning was: this does not make sense at all.
>
> your words always sound too strong to me ;) please be soft.
Sorry, I'll try to be less harsh. Please don't take it personally :)
What I meant was that the computational overhead was not the primary
reason for this patch. Although a reduction there is very welcome,
it's that deciding to skip the list based on the list size seems more
correct than deciding based on the overall state of the memcg, which
can only by accident show the same proportion of inactive/active.
It's a correctness fix for existing code, not an optimization or
preparation for future changes.
> > > I'll ack when you add performance numbers in changelog.
> >
> > It's not exactly a performance optimization but I'll happily run some
> > workloads. Do you have suggestions what to test for? I.e. where
> > would you expect regressions?
> >
> Some comparison about amount of swap-out before/after change will be good.
>
> Hm. If I do...
> - set up x86-64 NUMA box. (fake numa is ok.)
> - create memcg with 500M limit.
> - running kernel make with make -j 6(or more)
>
> see time of make and amount of swap-out.
4G ram, 500M swap on SSD, numa=fake=16, 10 runs of make -j11 in 500M
memcg, standard deviation in parens:
            seconds         pswpin               pswpout
vanilla:    175.359(0.106)  6906.900(1779.135)    8913.200(1917.369)
patched:    176.144(0.243)  8581.500(1833.432)   10872.400(2124.104)
* Re: [patch] memcg: skip scanning active lists based on individual size
2011-09-05 18:25 ` Johannes Weiner
@ 2011-09-06 9:33 ` KAMEZAWA Hiroyuki
2011-09-06 10:43 ` Johannes Weiner
0 siblings, 1 reply; 12+ messages in thread
From: KAMEZAWA Hiroyuki @ 2011-09-06 9:33 UTC (permalink / raw)
To: Johannes Weiner
Cc: Minchan Kim, Andrew Morton, Rik van Riel, KOSAKI Motohiro,
Daisuke Nishimura, Balbir Singh, linux-mm, linux-kernel
On Mon, 5 Sep 2011 20:25:14 +0200
Johannes Weiner <jweiner@redhat.com> wrote:
> On Thu, Sep 01, 2011 at 03:31:48PM +0900, KAMEZAWA Hiroyuki wrote:
> > On Thu, 1 Sep 2011 08:15:40 +0200
> > Johannes Weiner <jweiner@redhat.com> wrote:
> > The old implementation was supposed to make vmscan see only the memcg and
> > ignore zones. memcg doesn't take care of any zones. Then, it uses
> > global numbers rather than zones.
> >
> > Assume a system with 2 nodes and the whole memcg's inactive/active ratio
> > is unbalanced.
> >
> > Node        0      1
> > Active      800M   30M
> > Inactive    100M   200M
> >
> > If we judge 'unbalance' based on zones, Node1's Active will not rotate
> > even if it's not accessed for a while.
> > If we judge unbalance based on total stat, Both of Node0 and Node 1
> > will be rotated.
>
> But why should we deactivate on Node 1? We have good reasons not to
> on the global level, why should memcgs silently behave differently?
>
One reason was I thought that memcg should behave as if it had one LRU list,
which is not divided by zones, and I wanted to ignore zones as much
as possible. Second reason was that I don't want to increase swap-out
caused by memcg limit.
> I mostly don't understand it on a semantic level. vmscan needs to
> know whether a certain inactive LRU list has enough reclaim candidates
> to skip scanning its corresponding active list. The global state is
> not useful to find out if a single inactive list has enough pages.
>
Ok, I agree to this. I should add other logic to do what I want.
In my series,
- passing nodemask
- avoid overscan
- calculating node weight
These will allow me to see what I want.
> > Hmm, old one doesn't work as I expected?
> >
> > But okay, as time goes, I think Node1's inactive will decrease
> > and then rotation will happen even with zone-based ones.
>
> Yes, that's how the mechanism is intended to work: with a constant
> influx of used-once pages, we don't want to touch the active list.
> But when the workload changes and inactive pages get either activated
> or all reclaimed, the ratio changes and eventually we fall back to
> deactivating pages again.
>
> That's reclaim behaviour that has been around for a while and it
> shouldn't make a difference if your workload is running in
> root_mem_cgroup or another memcg.
>
ok.
> > > > But, hmm, this change may be good for softlimit and your work.
> > >
> > > Yes, I noticed those paths showing up in a profile with my patches.
> > > Lots of memcgs on a multi-node machine will trigger it too. But it's
> > > secondary, my primary reasoning was: this does not make sense at all.
> >
> > your words always sound too strong to me ;) please be soft.
>
> Sorry, I'll try to be less harsh. Please don't take it personally :)
>
> What I meant was that the computational overhead was not the primary
> reason for this patch. Although a reduction there is very welcome,
> it's that deciding to skip the list based on the list size seems more
> correct than deciding based on the overall state of the memcg, which
> can only by accident show the same proportion of inactive/active.
>
> It's a correctness fix for existing code, not an optimization or
> preparation for future changes.
>
ok.
> > > > I'll ack when you add performance numbers in changelog.
> > >
> > > It's not exactly a performance optimization but I'll happily run some
> > > workloads. Do you have suggestions what to test for? I.e. where
> > > would you expect regressions?
> > >
> > Some comparison about amount of swap-out before/after change will be good.
> >
> > Hm. If I do...
> > - set up x86-64 NUMA box. (fake numa is ok.)
> > - create memcg with 500M limit.
> > - running kernel make with make -j 6(or more)
> >
> > see time of make and amount of swap-out.
>
> 4G ram, 500M swap on SSD, numa=fake=16, 10 runs of make -j11 in 500M
> memcg, standard deviation in parens:
>
>             seconds         pswpin               pswpout
> vanilla:    175.359(0.106)  6906.900(1779.135)    8913.200(1917.369)
> patched:    176.144(0.243)  8581.500(1833.432)   10872.400(2124.104)
>
Hmm. swapin/out seems increased. But hmm...stddev is large.
Is this expected ? reason ?
Anyway, I don't want to disturb you more. Thanks.
-Kame
* Re: [patch] memcg: skip scanning active lists based on individual size
2011-09-06 9:33 ` KAMEZAWA Hiroyuki
@ 2011-09-06 10:43 ` Johannes Weiner
2011-09-06 10:52 ` KAMEZAWA Hiroyuki
0 siblings, 1 reply; 12+ messages in thread
From: Johannes Weiner @ 2011-09-06 10:43 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: Minchan Kim, Andrew Morton, Rik van Riel, KOSAKI Motohiro,
Daisuke Nishimura, Balbir Singh, linux-mm, linux-kernel
On Tue, Sep 06, 2011 at 06:33:58PM +0900, KAMEZAWA Hiroyuki wrote:
> On Mon, 5 Sep 2011 20:25:14 +0200
> Johannes Weiner <jweiner@redhat.com> wrote:
>
> > On Thu, Sep 01, 2011 at 03:31:48PM +0900, KAMEZAWA Hiroyuki wrote:
> > > On Thu, 1 Sep 2011 08:15:40 +0200
> > > Johannes Weiner <jweiner@redhat.com> wrote:
> > > The old implementation was supposed to make vmscan see only the memcg and
> > > ignore zones. memcg doesn't take care of any zones. Then, it uses
> > > global numbers rather than zones.
> > >
> > > Assume a system with 2 nodes and the whole memcg's inactive/active ratio
> > > is unbalanced.
> > >
> > > Node        0      1
> > > Active      800M   30M
> > > Inactive    100M   200M
> > >
> > > If we judge 'unbalance' based on zones, Node1's Active will not rotate
> > > even if it's not accessed for a while.
> > > If we judge unbalance based on total stat, Both of Node0 and Node 1
> > > will be rotated.
> >
> > But why should we deactivate on Node 1? We have good reasons not to
> > on the global level, why should memcgs silently behave differently?
>
> > One reason was I thought that memcg should behave as if it had one LRU list,
> > which is not divided by zones, and I wanted to ignore zones as much
> as possible. Second reason was that I don't want to increase swap-out
> caused by memcg limit.
You can think of it like this: if every active list is only balanced
when its inactive counterpart is too small, then the memcg-wide
proportion of inactive vs. active pages is as desired, too. So even
after my change, the 'one big LRU' has the right inactive/active ratio.
On the other hand, the old way could allow for the memcg-level to have
the right proportion and still thrash one workload that is bound to
another node even though its inactive > active, but is overshadowed by
inactive < active workloads on other nodes.
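Your example numbers from earlier in the thread actually illustrate this
nicely. A quick userspace sketch of the comparison (the inactive_ratio of
3 is a made-up value for a pair of that size, just for illustration, and
the list sizes are in MB):

	#include <stdio.h>

	/* 1 means the inactive list is too small, so scan the active list */
	static int inactive_is_low(unsigned long inactive, unsigned long active,
				   unsigned long ratio)
	{
		return inactive * ratio < active;
	}

	int main(void)
	{
		unsigned long ratio = 3;	/* assumed, for illustration */

		/* per-zone, as after this patch */
		printf("node 0: %d\n", inactive_is_low(100, 800, ratio)); /* 1 */
		printf("node 1: %d\n", inactive_is_low(200,  30, ratio)); /* 0 */

		/* memcg-wide aggregate, as before this patch */
		printf("total:  %d\n", inactive_is_low(300, 830, ratio)); /* 0 */
		return 0;
	}

With the per-zone check, Node 0's oversized active list gets scanned and
Node 1's working set is left alone.  With the aggregate check, neither
active list would be scanned, even though Node 0 has only 100M of reclaim
candidates sitting next to 800M of active pages.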
> > I mostly don't understand it on a semantic level. vmscan needs to
> > know whether a certain inactive LRU list has enough reclaim candidates
> > to skip scanning its corresponding active list. The global state is
> > not useful to find out if a single inactive list has enough pages.
>
> Ok, I agree to this. I should add other logic to do what I want.
> In my series,
> - passing nodemask
> - avoid overscan
> - calculating node weight
> These will allow me to see what I want.
What /do/ you want? :) I just don't see your concern. I mean, yes, no
increased swapout, but in what concrete scenario could you suspect
swapping to increase because of this change?
> > > > > I'll ack when you add performance numbers in changelog.
> > > >
> > > > It's not exactly a performance optimization but I'll happily run some
> > > > workloads. Do you have suggestions what to test for? I.e. where
> > > > would you expect regressions?
> > > >
> > > Some comparison about amount of swap-out before/after change will be good.
> > >
> > > Hm. If I do...
> > > - set up x86-64 NUMA box. (fake numa is ok.)
> > > - create memcg with 500M limit.
> > > - running kernel make with make -j 6(or more)
> > >
> > > see time of make and amount of swap-out.
> >
> > 4G ram, 500M swap on SSD, numa=fake=16, 10 runs of make -j11 in 500M
> > memcg, standard deviation in parens:
> >
> >             seconds         pswpin               pswpout
> > vanilla:    175.359(0.106)  6906.900(1779.135)    8913.200(1917.369)
> > patched:    176.144(0.243)  8581.500(1833.432)   10872.400(2124.104)
>
> Hmm. swapin/out seems increased. But hmm...stddev is large.
> Is this expected ? reason ?
It's kind of expected because there is only a small number of parallel
jobs that have bursty memory usage, so the slightest timing variations
can make the difference between an episode of heavy thrashing and the
tasks having their bursts at different times and getting along fine.
So we are basically looking at test results that are clustered around
not one, but several different mean values. The arithmetic mean is
not really meaningful for these samples.
> Anyway, I don't want to disturb you more. Thanks.
I am happy to test if my changes introduce regressions, I don't want
that, obviously. But do you have a theory behind your concern that
swapping could increase? Just saying, this test request seemed a bit
random because I don't see where my change would affect this
particular workload.
* Re: [patch] memcg: skip scanning active lists based on individual size
2011-09-06 10:43 ` Johannes Weiner
@ 2011-09-06 10:52 ` KAMEZAWA Hiroyuki
0 siblings, 0 replies; 12+ messages in thread
From: KAMEZAWA Hiroyuki @ 2011-09-06 10:52 UTC (permalink / raw)
To: Johannes Weiner
Cc: Minchan Kim, Andrew Morton, Rik van Riel, KOSAKI Motohiro,
Daisuke Nishimura, Balbir Singh, linux-mm, linux-kernel
On Tue, 6 Sep 2011 12:43:47 +0200
Johannes Weiner <jweiner@redhat.com> wrote:
> On Tue, Sep 06, 2011 at 06:33:58PM +0900, KAMEZAWA Hiroyuki wrote:
> > On Mon, 5 Sep 2011 20:25:14 +0200
> > Johannes Weiner <jweiner@redhat.com> wrote:
> >
> > > On Thu, Sep 01, 2011 at 03:31:48PM +0900, KAMEZAWA Hiroyuki wrote:
> > > > On Thu, 1 Sep 2011 08:15:40 +0200
> > > > Johannes Weiner <jweiner@redhat.com> wrote:
> > > > The old implementation was supposed to make vmscan see only the memcg and
> > > > ignore zones. memcg doesn't take care of any zones. Then, it uses
> > > > global numbers rather than zones.
> > > >
> > > > Assume a system with 2 nodes and the whole memcg's inactive/active ratio
> > > > is unbalanced.
> > > >
> > > > Node        0      1
> > > > Active      800M   30M
> > > > Inactive    100M   200M
> > > >
> > > > If we judge 'unbalance' based on zones, Node1's Active will not rotate
> > > > even if it's not accessed for a while.
> > > > If we judge unbalance based on total stat, Both of Node0 and Node 1
> > > > will be rotated.
> > >
> > > But why should we deactivate on Node 1? We have good reasons not to
> > > on the global level, why should memcgs silently behave differently?
> >
> > One reason was I thought that memcg should behave as if it had one LRU list,
> > which is not divided by zones, and I wanted to ignore zones as much
> > as possible. Second reason was that I don't want to increase swap-out
> > caused by memcg limit.
>
> You can think of it like this: if every active list is only balanced
> when its inactive counterpart is too small, then the memcg-wide
> proportion of inactive vs. active pages is as desired, too. So even
> after my change, the 'one big LRU' has the right inactive/active ratio.
>
> On the other hand, the old way could allow for the memcg-level to have
> the right proportion and still thrash one workload that is bound to
> another node even though its inactive > active, but is overshadowed by
> inactive < active workloads on other nodes.
>
ok.
> > > I mostly don't understand it on a semantic level. vmscan needs to
> > > know whether a certain inactive LRU list has enough reclaim candidates
> > > to skip scanning its corresponding active list. The global state is
> > > not useful to find out if a single inactive list has enough pages.
> >
> > Ok, I agree to this. I should add other logic to do what I want.
> > In my series,
> > - passing nodemask
> > - avoid overscan
> > - calculating node weight
> > These will allow me to see what I want.
>
> What /do/ you want? :) I just don't see your concern. I mean, yes, no
> increased swapout, but in what concrete scenario could you suspect
> swapping to increase because of this change?
>
Ah, sorry. Maybe I mixed other concerns into this talk. This change
itself isn't related to my scenario.
Please forget it. I'm a bit tired these days...
> > > > > > I'll ack when you add performance numbers in changelog.
> > > > >
> > > > > It's not exactly a performance optimization but I'll happily run some
> > > > > workloads. Do you have suggestions what to test for? I.e. where
> > > > > would you expect regressions?
> > > > >
> > > > Some comparison about amount of swap-out before/after change will be good.
> > > >
> > > > Hm. If I do...
> > > > - set up x86-64 NUMA box. (fake numa is ok.)
> > > > - create memcg with 500M limit.
> > > > - running kernel make with make -j 6(or more)
> > > >
> > > > see time of make and amount of swap-out.
> > >
> > > 4G ram, 500M swap on SSD, numa=fake=16, 10 runs of make -j11 in 500M
> > > memcg, standard deviation in parens:
> > >
> > >             seconds         pswpin               pswpout
> > > vanilla:    175.359(0.106)  6906.900(1779.135)    8913.200(1917.369)
> > > patched:    176.144(0.243)  8581.500(1833.432)   10872.400(2124.104)
> >
> > Hmm. swapin/out seems increased. But hmm...stddev is large.
> > Is this expected ? reason ?
>
> It's kind of expected because there is only a small number of parallel
> jobs that have bursty memory usage, so the slightest timing variations
> can make the difference between an episode of heavy thrashing and the
> tasks having their bursts at different times and getting along fine.
>
> So we are basically looking at test results that are clustered around
> not one, but several different mean values. The arithmetic mean is
> not really meaningful for these samples.
>
ok.
> > Anyway, I don't want to disturb you more. Thanks.
>
> I am happy to test if my changes introduce regressions, I don't want
> that, obviously. But do you have a theory behind your concern that
> swapping could increase? Just saying, this test request seemed a bit
> random because I don't see where my change would affect this
> particular workload.
>
>
the patch is in the mm queue and this may be too late, but...
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Thanks,
-Kame
* Re: [patch] memcg: skip scanning active lists based on individual size
2011-08-31 9:08 [patch] memcg: skip scanning active lists based on individual size Johannes Weiner
2011-08-31 10:13 ` Minchan Kim
@ 2011-08-31 17:19 ` Ying Han
2011-08-31 18:27 ` Rik van Riel
2 siblings, 0 replies; 12+ messages in thread
From: Ying Han @ 2011-08-31 17:19 UTC (permalink / raw)
To: Johannes Weiner
Cc: Andrew Morton, Rik van Riel, KOSAKI Motohiro, KAMEZAWA Hiroyuki,
Daisuke Nishimura, Balbir Singh, linux-mm, linux-kernel
On Wed, Aug 31, 2011 at 2:08 AM, Johannes Weiner <jweiner@redhat.com> wrote:
> Reclaim decides to skip scanning an active list when the corresponding
> inactive list is above a certain size in comparison to leave the
> assumed working set alone while there are still enough reclaim
> candidates around.
>
> The memcg implementation of comparing those lists instead reports
> whether the whole memcg is low on the requested type of inactive
> pages, considering all nodes and zones.
>
> This can lead to an oversized active list not being scanned because of
> the state of the other lists in the memcg, as well as an active list
> being scanned while its corresponding inactive list has enough pages.
>
> Not only is this wrong, it's also a scalability hazard, because the
> global memory state over all nodes and zones has to be gathered for
> each memcg and zone scanned.
>
> Make these calculations purely based on the size of the two LRU lists
> that are actually affected by the outcome of the decision.
>
> Signed-off-by: Johannes Weiner <jweiner@redhat.com>
> Cc: Rik van Riel <riel@redhat.com>
> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
> Cc: Balbir Singh <bsingharora@gmail.com>
> ---
> include/linux/memcontrol.h | 10 +++++---
> mm/memcontrol.c | 51 ++++++++++++++-----------------------------
> mm/vmscan.c | 4 +-
> 3 files changed, 25 insertions(+), 40 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 343bd76..cbb45ce 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -105,8 +105,10 @@ extern void mem_cgroup_end_migration(struct mem_cgroup *mem,
> /*
> * For memory reclaim.
> */
> -int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg);
> -int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg);
> +int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg,
> + struct zone *zone);
> +int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg,
> + struct zone *zone);
> int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
> unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg,
> int nid, int zid, unsigned int lrumask);
> @@ -292,13 +294,13 @@ static inline bool mem_cgroup_disabled(void)
> }
>
> static inline int
> -mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
> +mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone)
> {
> return 1;
> }
>
> static inline int
> -mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
> +mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone)
> {
> return 1;
> }
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 3508777..d63dfb2 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1101,15 +1101,19 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
> return ret;
> }
>
> -static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
> +int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone)
> {
> - unsigned long active;
> + unsigned long inactive_ratio;
> + int nid = zone_to_nid(zone);
> + int zid = zone_idx(zone);
> unsigned long inactive;
> + unsigned long active;
> unsigned long gb;
> - unsigned long inactive_ratio;
>
> - inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));
> - active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON));
> + inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
> + BIT(LRU_INACTIVE_ANON));
> + active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
> + BIT(LRU_ACTIVE_ANON));
>
> gb = (inactive + active) >> (30 - PAGE_SHIFT);
> if (gb)
> @@ -1117,39 +1121,20 @@ static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_
> else
> inactive_ratio = 1;
>
> - if (present_pages) {
> - present_pages[0] = inactive;
> - present_pages[1] = active;
> - }
> -
> - return inactive_ratio;
> + return inactive * inactive_ratio < active;
> }
>
> -int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
> -{
> - unsigned long active;
> - unsigned long inactive;
> - unsigned long present_pages[2];
> - unsigned long inactive_ratio;
> -
> - inactive_ratio = calc_inactive_ratio(memcg, present_pages);
> -
> - inactive = present_pages[0];
> - active = present_pages[1];
> -
> - if (inactive * inactive_ratio < active)
> - return 1;
> -
> - return 0;
> -}
> -
> -int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
> +int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone)
> {
> unsigned long active;
> unsigned long inactive;
> + int zid = zone_idx(zone);
> + int nid = zone_to_nid(zone);
>
> - inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));
> - active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE));
> + inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
> + BIT(LRU_INACTIVE_FILE));
> + active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
> + BIT(LRU_ACTIVE_FILE));
>
> return (active > inactive);
> }
> @@ -4188,8 +4173,6 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
> }
>
> #ifdef CONFIG_DEBUG_VM
> - cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
> -
> {
> int nid, zid;
> struct mem_cgroup_per_zone *mz;
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 6588746..a023778 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1699,7 +1699,7 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
> if (scanning_global_lru(sc))
> low = inactive_anon_is_low_global(zone);
> else
> - low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup);
> + low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup, zone);
> return low;
> }
> #else
> @@ -1742,7 +1742,7 @@ static int inactive_file_is_low(struct zone *zone, struct scan_control *sc)
> if (scanning_global_lru(sc))
> low = inactive_file_is_low_global(zone);
> else
> - low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup);
> + low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup, zone);
> return low;
> }
>
> --
> 1.7.6
>
Reviewed-by: Ying Han <yinghan@google.com>
--Ying
* Re: [patch] memcg: skip scanning active lists based on individual size
2011-08-31 9:08 [patch] memcg: skip scanning active lists based on individual size Johannes Weiner
2011-08-31 10:13 ` Minchan Kim
2011-08-31 17:19 ` Ying Han
@ 2011-08-31 18:27 ` Rik van Riel
2 siblings, 0 replies; 12+ messages in thread
From: Rik van Riel @ 2011-08-31 18:27 UTC (permalink / raw)
To: Johannes Weiner
Cc: Andrew Morton, KOSAKI Motohiro, KAMEZAWA Hiroyuki,
Daisuke Nishimura, Balbir Singh, linux-mm, linux-kernel
On 08/31/2011 05:08 AM, Johannes Weiner wrote:
> Reclaim decides to skip scanning an active list when the corresponding
> inactive list is above a certain size in comparison to leave the
> assumed working set alone while there are still enough reclaim
> candidates around.
>
> The memcg implementation of comparing those lists instead reports
> whether the whole memcg is low on the requested type of inactive
> pages, considering all nodes and zones.
>
> This can lead to an oversized active list not being scanned because of
> the state of the other lists in the memcg, as well as an active list
> being scanned while its corresponding inactive list has enough pages.
>
> Not only is this wrong, it's also a scalability hazard, because the
> global memory state over all nodes and zones has to be gathered for
> each memcg and zone scanned.
>
> Make these calculations purely based on the size of the two LRU lists
> that are actually affected by the outcome of the decision.
>
> Signed-off-by: Johannes Weiner<jweiner@redhat.com>
> Cc: Rik van Riel<riel@redhat.com>
> Cc: KOSAKI Motohiro<kosaki.motohiro@jp.fujitsu.com>
> Cc: KAMEZAWA Hiroyuki<kamezawa.hiroyu@jp.fujitsu.com>
> Cc: Daisuke Nishimura<nishimura@mxp.nes.nec.co.jp>
> Cc: Balbir Singh<bsingharora@gmail.com>
Reviewed-by: Rik van Riel <riel@redhat.com>