* Fw: [PATCH] memcg: add reclaim statistics accounting
@ 2011-04-28 3:16 KAMEZAWA Hiroyuki
2011-04-28 3:43 ` Ying Han
0 siblings, 1 reply; 11+ messages in thread
From: KAMEZAWA Hiroyuki @ 2011-04-28 3:16 UTC (permalink / raw)
To: linux-mm@kvack.org
sorry, I had wrong TO:...
Begin forwarded message:
Date: Thu, 28 Apr 2011 12:02:34 +0900
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
To: linux-mm@vger.kernel.org
Cc: "linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>, "nishimura@mxp.nes.nec.co.jp" <nishimura@mxp.nes.nec.co.jp>, "balbir@linux.vnet.ibm.com" <balbir@linux.vnet.ibm.com>, Ying Han <yinghan@google.com>, "akpm@linux-foundation.org" <akpm@linux-foundation.org>
Subject: [PATCH] memcg: add reclaim statistics accounting
Now, memory cgroup provides poor reclaim statistics per memcg. This
patch adds statistics for direct/soft reclaim as the number of
pages scans, the number of page freed by reclaim, the nanoseconds of
latency at reclaim.
It's good to add statistics before we modify memcg/global reclaim, largely.
This patch refactors current soft limit status and add an unified update logic.
For example, After #cat 195Mfile > /dev/null under 100M limit.
# cat /cgroup/memory/A/memory.stat
....
limit_freed 24592
soft_steal 0
limit_scan 43974
soft_scan 0
limit_latency 133837417
nearly 96M caches are freed. scanned twice. used 133ms.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
Documentation/cgroups/memory.txt | 13 ++++++--
include/linux/memcontrol.h | 1
include/linux/swap.h | 10 ++----
mm/memcontrol.c | 63 ++++++++++++++++++++++++---------------
mm/vmscan.c | 25 +++++++++++++--
5 files changed, 77 insertions(+), 35 deletions(-)
Index: memcg/include/linux/memcontrol.h
===================================================================
--- memcg.orig/include/linux/memcontrol.h
+++ memcg/include/linux/memcontrol.h
@@ -106,6 +106,7 @@ extern void mem_cgroup_end_migration(str
/*
* For memory reclaim.
*/
+enum { RECLAIM_SCAN, RECLAIM_FREE, RECLAIM_LATENCY, NR_RECLAIM_INFO};
int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg);
int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg);
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
Index: memcg/mm/memcontrol.c
===================================================================
--- memcg.orig/mm/memcontrol.c
+++ memcg/mm/memcontrol.c
@@ -96,10 +96,6 @@ enum mem_cgroup_events_index {
MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */
MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */
MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */
- MEM_CGROUP_EVENTS_SOFT_STEAL, /* # of pages reclaimed from */
- /* soft reclaim */
- MEM_CGROUP_EVENTS_SOFT_SCAN, /* # of pages scanned from */
- /* soft reclaim */
MEM_CGROUP_EVENTS_NSTATS,
};
/*
@@ -206,6 +202,9 @@ struct mem_cgroup_eventfd_list {
static void mem_cgroup_threshold(struct mem_cgroup *mem);
static void mem_cgroup_oom_notify(struct mem_cgroup *mem);
+/* memory reclaim contexts */
+enum { MEM_LIMIT, MEM_SOFT, NR_MEM_CONTEXTS};
+
/*
* The memory controller data structure. The memory controller controls both
* page cache and RSS per cgroup. We would eventually like to provide
@@ -242,6 +241,7 @@ struct mem_cgroup {
nodemask_t scan_nodes;
unsigned long next_scan_node_update;
#endif
+ atomic_long_t reclaim_info[NR_MEM_CONTEXTS][NR_RECLAIM_INFO];
/*
* Should the accounting and control be hierarchical, per subtree?
*/
@@ -645,16 +645,6 @@ static void mem_cgroup_charge_statistics
preempt_enable();
}
-static void mem_cgroup_soft_steal(struct mem_cgroup *mem, int val)
-{
- this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_SOFT_STEAL], val);
-}
-
-static void mem_cgroup_soft_scan(struct mem_cgroup *mem, int val)
-{
- this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_SOFT_SCAN], val);
-}
-
static unsigned long
mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx)
{
@@ -679,6 +669,15 @@ static unsigned long mem_cgroup_get_loca
return total;
}
+void mem_cgroup_update_reclaim_info(struct mem_cgroup *mem, int context,
+ unsigned long *stats)
+{
+ int i;
+ for (i = 0; i < NR_RECLAIM_INFO; i++)
+ atomic_long_add(stats[i], &mem->reclaim_info[context][i]);
+}
+
+
static bool __memcg_event_check(struct mem_cgroup *mem, int target)
{
unsigned long val, next;
@@ -1560,6 +1559,8 @@ int mem_cgroup_select_victim_node(struct
}
#endif
+
+
/*
* Scan the hierarchy if needed to reclaim memory. We remember the last child
* we reclaimed from, so that we don't end up penalizing one child extensively
@@ -1585,7 +1586,8 @@ static int mem_cgroup_hierarchical_recla
bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
unsigned long excess;
- unsigned long nr_scanned;
+ unsigned long stats[NR_RECLAIM_INFO];
+ int context = (check_soft)? MEM_SOFT : MEM_LIMIT;
excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
@@ -1631,13 +1633,12 @@ static int mem_cgroup_hierarchical_recla
if (check_soft) {
ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
noswap, get_swappiness(victim), zone,
- &nr_scanned);
- *total_scanned += nr_scanned;
- mem_cgroup_soft_steal(victim, ret);
- mem_cgroup_soft_scan(victim, nr_scanned);
+ stats);
+ *total_scanned += stats[RECLAIM_SCAN];
} else
ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
- noswap, get_swappiness(victim));
+ noswap, get_swappiness(victim),stats);
+ mem_cgroup_update_reclaim_info(victim, context, stats);
css_put(&victim->css);
/*
* At shrinking usage, we can't check we should stop here or
@@ -3661,7 +3662,7 @@ try_to_free:
goto out;
}
progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
- false, get_swappiness(mem));
+ false, get_swappiness(mem), NULL);
if (!progress) {
nr_retries--;
/* maybe some writeback is necessary */
@@ -3929,8 +3930,12 @@ enum {
MCS_SWAP,
MCS_PGFAULT,
MCS_PGMAJFAULT,
+ MCS_LIMIT_FREED,
MCS_SOFT_STEAL,
+ MCS_LIMIT_SCAN,
MCS_SOFT_SCAN,
+ MCS_LIMIT_LATENCY,
+ MCS_SOFT_LATENCY,
MCS_INACTIVE_ANON,
MCS_ACTIVE_ANON,
MCS_INACTIVE_FILE,
@@ -3955,8 +3960,12 @@ struct {
{"swap", "total_swap"},
{"pgfault", "total_pgfault"},
{"pgmajfault", "total_pgmajfault"},
+ {"limit_freed", "total_limit_freed"},
{"soft_steal", "total_soft_steal"},
+ {"limit_scan", "total_limit_scan"},
{"soft_scan", "total_soft_scan"},
+ {"limit_latency", "total_limit_latency"},
+ {"soft_latency", "total_soft_latency"},
{"inactive_anon", "total_inactive_anon"},
{"active_anon", "total_active_anon"},
{"inactive_file", "total_inactive_file"},
@@ -3985,10 +3994,18 @@ mem_cgroup_get_local_stat(struct mem_cgr
val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
s->stat[MCS_SWAP] += val * PAGE_SIZE;
}
- val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_SOFT_STEAL);
+ val = atomic_long_read(&mem->reclaim_info[MEM_LIMIT][RECLAIM_FREE]);
+ s->stat[MCS_LIMIT_FREED] += val;
+ val = atomic_long_read(&mem->reclaim_info[MEM_SOFT][RECLAIM_FREE]);
s->stat[MCS_SOFT_STEAL] += val;
- val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_SOFT_SCAN);
+ val = atomic_long_read(&mem->reclaim_info[MEM_LIMIT][RECLAIM_SCAN]);
+ s->stat[MCS_LIMIT_SCAN] += val;
+ val = atomic_long_read(&mem->reclaim_info[MEM_SOFT][RECLAIM_SCAN]);
s->stat[MCS_SOFT_SCAN] += val;
+ val = atomic_long_read(&mem->reclaim_info[MEM_LIMIT][RECLAIM_LATENCY]);
+ s->stat[MCS_LIMIT_LATENCY] += val;
+ val = atomic_long_read(&mem->reclaim_info[MEM_SOFT][RECLAIM_LATENCY]);
+ s->stat[MCS_SOFT_LATENCY] += val;
val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGFAULT);
s->stat[MCS_PGFAULT] += val;
val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGMAJFAULT);
Index: memcg/mm/vmscan.c
===================================================================
--- memcg.orig/mm/vmscan.c
+++ memcg/mm/vmscan.c
@@ -2156,7 +2156,7 @@ unsigned long mem_cgroup_shrink_node_zon
gfp_t gfp_mask, bool noswap,
unsigned int swappiness,
struct zone *zone,
- unsigned long *nr_scanned)
+ unsigned long *stats)
{
struct scan_control sc = {
.nr_scanned = 0,
@@ -2168,6 +2168,9 @@ unsigned long mem_cgroup_shrink_node_zon
.order = 0,
.mem_cgroup = mem,
};
+ u64 start, end;
+
+ start = sched_clock();
sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -2175,7 +2178,6 @@ unsigned long mem_cgroup_shrink_node_zon
trace_mm_vmscan_memcg_softlimit_reclaim_begin(0,
sc.may_writepage,
sc.gfp_mask);
-
/*
* NOTE: Although we can get the priority field, using it
* here is not a good idea, since it limits the pages we can scan.
@@ -2185,20 +2187,26 @@ unsigned long mem_cgroup_shrink_node_zon
*/
shrink_zone(0, zone, &sc);
+ stats[RECLAIM_SCAN] = sc.nr_scanned;
+ stats[RECLAIM_FREE] = sc.nr_reclaimed;
+ end = sched_clock();
+ stats[RECLAIM_LATENCY] = end - start;
+
trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
- *nr_scanned = sc.nr_scanned;
return sc.nr_reclaimed;
}
unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
gfp_t gfp_mask,
bool noswap,
- unsigned int swappiness)
+ unsigned int swappiness,
+ unsigned long *stats)
{
struct zonelist *zonelist;
unsigned long nr_reclaimed;
int nid;
+ u64 end, start;
struct scan_control sc = {
.may_writepage = !laptop_mode,
.may_unmap = 1,
@@ -2209,6 +2217,8 @@ unsigned long try_to_free_mem_cgroup_pag
.mem_cgroup = mem_cont,
.nodemask = NULL, /* we don't care the placement */
};
+
+ start = sched_clock();
/*
* Unlike direct reclaim via alloc_pages(), memcg's reclaim
* don't take care of from where we get pages . So, the node where
@@ -2226,6 +2236,13 @@ unsigned long try_to_free_mem_cgroup_pag
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+ if (stats) {
+ stats[RECLAIM_SCAN] = sc.nr_scanned;
+ stats[RECLAIM_FREE] = sc.nr_reclaimed;
+ end = sched_clock();
+ stats[RECLAIM_LATENCY] = end - start;
+ }
+
trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
return nr_reclaimed;
Index: memcg/Documentation/cgroups/memory.txt
===================================================================
--- memcg.orig/Documentation/cgroups/memory.txt
+++ memcg/Documentation/cgroups/memory.txt
@@ -387,8 +387,13 @@ pgpgout - # of pages paged out (equival
swap - # of bytes of swap usage
pgfault - # of page faults.
pgmajfault - # of major page faults.
-soft_steal - # of pages reclaimed from global hierarchical reclaim
-soft_scan - # of pages scanned from global hierarchical reclaim
+limit_freed - # of pages reclaimed by hitting limit.
+soft_steal - # of pages reclaimed by kernel with hints of soft limit
+limit_scan - # of pages scanned by hitting limit.
+soft_scan - # of pages scanned by kernel with hints of soft limit
+limit_latency - # of nanosecs elapsed at reclaiming by hitting limit
+soft_latency - # of nanosecs elapsed at reclaiming by kernel with hints of
+ soft limit.
inactive_anon - # of bytes of anonymous memory and swap cache memory on
LRU list.
active_anon - # of bytes of anonymous and swap cache memory on active
@@ -412,8 +417,12 @@ total_pgpgout - sum of all children's "
total_swap - sum of all children's "swap"
total_pgfault - sum of all children's "pgfault"
total_pgmajfault - sum of all children's "pgmajfault"
+total_limit_freed - sum of all children's "limit_freed"
total_soft_steal - sum of all children's "soft_steal"
+total_limit_scan - sum of all children's "limit_scan"
total_soft_scan - sum of all children's "soft_scan"
+total_limit_latency - sum of all children's "limit_latency"
+total_soft_latency - sum of all children's "soft_latency"
total_inactive_anon - sum of all children's "inactive_anon"
total_active_anon - sum of all children's "active_anon"
total_inactive_file - sum of all children's "inactive_file"
Index: memcg/include/linux/swap.h
===================================================================
--- memcg.orig/include/linux/swap.h
+++ memcg/include/linux/swap.h
@@ -252,13 +252,11 @@ static inline void lru_cache_add_file(st
extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
gfp_t gfp_mask, nodemask_t *mask);
extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
- gfp_t gfp_mask, bool noswap,
- unsigned int swappiness);
+ gfp_t gfp_mask, bool noswap, unsigned int swappiness,
+ unsigned long *stats);
extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
- gfp_t gfp_mask, bool noswap,
- unsigned int swappiness,
- struct zone *zone,
- unsigned long *nr_scanned);
+ gfp_t gfp_mask, bool noswap, unsigned int swappiness,
+ struct zone *zone, unsigned long *stats);
extern int __isolate_lru_page(struct page *page, int mode, int file);
extern unsigned long shrink_all_memory(unsigned long nr_pages);
extern int vm_swappiness;
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: Fw: [PATCH] memcg: add reclaim statistics accounting
2011-04-28 3:16 Fw: [PATCH] memcg: add reclaim statistics accounting KAMEZAWA Hiroyuki
@ 2011-04-28 3:43 ` Ying Han
2011-04-28 3:57 ` KAMEZAWA Hiroyuki
2011-04-28 9:01 ` KAMEZAWA Hiroyuki
0 siblings, 2 replies; 11+ messages in thread
From: Ying Han @ 2011-04-28 3:43 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki; +Cc: linux-mm@kvack.org
On Wed, Apr 27, 2011 at 8:16 PM, KAMEZAWA Hiroyuki
<kamezawa.hiroyu@jp.fujitsu.com> wrote:
> sorry, I had wrong TO:...
>
> Begin forwarded message:
>
> Date: Thu, 28 Apr 2011 12:02:34 +0900
> From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> To: linux-mm@vger.kernel.org
> Cc: "linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>, "nishimura@mxp.nes.nec.co.jp" <nishimura@mxp.nes.nec.co.jp>, "balbir@linux.vnet.ibm.com" <balbir@linux.vnet.ibm.com>, Ying Han <yinghan@google.com>, "akpm@linux-foundation.org" <akpm@linux-foundation.org>
> Subject: [PATCH] memcg: add reclaim statistics accounting
>
>
>
> Now, memory cgroup provides poor reclaim statistics per memcg. This
> patch adds statistics for direct/soft reclaim as the number of
> pages scans, the number of page freed by reclaim, the nanoseconds of
> latency at reclaim.
>
> It's good to add statistics before we modify memcg/global reclaim, largely.
> This patch refactors current soft limit status and add an unified update logic.
>
> For example, After #cat 195Mfile > /dev/null under 100M limit.
> # cat /cgroup/memory/A/memory.stat
> ....
> limit_freed 24592
why not "limit_steal" ?
> soft_steal 0
> limit_scan 43974
> soft_scan 0
> limit_latency 133837417
>
> nearly 96M caches are freed. scanned twice. used 133ms.
Does it make sense to split up the soft_steal/scan for bg reclaim and
direct reclaim? The same for the limit_steal/scan. I am now testing
the patch to add the soft_limit reclaim on global ttfp, and i already
have the patch to add the following:
kswapd_soft_steal 0
kswapd_soft_scan 0
direct_soft_steal 0
direct_soft_scan 0
kswapd_steal 0
pg_pgsteal 0
kswapd_pgscan 0
pg_scan 0
It is more clear to me to have finer granularity of the stats. Let me
know if that works or not. I probably can post it this week.
--Ying
>
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> ---
> Documentation/cgroups/memory.txt | 13 ++++++--
> include/linux/memcontrol.h | 1
> include/linux/swap.h | 10 ++----
> mm/memcontrol.c | 63 ++++++++++++++++++++++++---------------
> mm/vmscan.c | 25 +++++++++++++--
> 5 files changed, 77 insertions(+), 35 deletions(-)
>
> Index: memcg/include/linux/memcontrol.h
> ===================================================================
> --- memcg.orig/include/linux/memcontrol.h
> +++ memcg/include/linux/memcontrol.h
> @@ -106,6 +106,7 @@ extern void mem_cgroup_end_migration(str
> /*
> * For memory reclaim.
> */
> +enum { RECLAIM_SCAN, RECLAIM_FREE, RECLAIM_LATENCY, NR_RECLAIM_INFO};
> int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg);
> int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg);
> int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
> Index: memcg/mm/memcontrol.c
> ===================================================================
> --- memcg.orig/mm/memcontrol.c
> +++ memcg/mm/memcontrol.c
> @@ -96,10 +96,6 @@ enum mem_cgroup_events_index {
> MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */
> MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */
> MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */
> - MEM_CGROUP_EVENTS_SOFT_STEAL, /* # of pages reclaimed from */
> - /* soft reclaim */
> - MEM_CGROUP_EVENTS_SOFT_SCAN, /* # of pages scanned from */
> - /* soft reclaim */
> MEM_CGROUP_EVENTS_NSTATS,
> };
> /*
> @@ -206,6 +202,9 @@ struct mem_cgroup_eventfd_list {
> static void mem_cgroup_threshold(struct mem_cgroup *mem);
> static void mem_cgroup_oom_notify(struct mem_cgroup *mem);
>
> +/* memory reclaim contexts */
> +enum { MEM_LIMIT, MEM_SOFT, NR_MEM_CONTEXTS};
> +
> /*
> * The memory controller data structure. The memory controller controls both
> * page cache and RSS per cgroup. We would eventually like to provide
> @@ -242,6 +241,7 @@ struct mem_cgroup {
> nodemask_t scan_nodes;
> unsigned long next_scan_node_update;
> #endif
> + atomic_long_t reclaim_info[NR_MEM_CONTEXTS][NR_RECLAIM_INFO];
> /*
> * Should the accounting and control be hierarchical, per subtree?
> */
> @@ -645,16 +645,6 @@ static void mem_cgroup_charge_statistics
> preempt_enable();
> }
>
> -static void mem_cgroup_soft_steal(struct mem_cgroup *mem, int val)
> -{
> - this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_SOFT_STEAL], val);
> -}
> -
> -static void mem_cgroup_soft_scan(struct mem_cgroup *mem, int val)
> -{
> - this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_SOFT_SCAN], val);
> -}
> -
> static unsigned long
> mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx)
> {
> @@ -679,6 +669,15 @@ static unsigned long mem_cgroup_get_loca
> return total;
> }
>
> +void mem_cgroup_update_reclaim_info(struct mem_cgroup *mem, int context,
> + unsigned long *stats)
> +{
> + int i;
> + for (i = 0; i < NR_RECLAIM_INFO; i++)
> + atomic_long_add(stats[i], &mem->reclaim_info[context][i]);
> +}
> +
> +
> static bool __memcg_event_check(struct mem_cgroup *mem, int target)
> {
> unsigned long val, next;
> @@ -1560,6 +1559,8 @@ int mem_cgroup_select_victim_node(struct
> }
> #endif
>
> +
> +
> /*
> * Scan the hierarchy if needed to reclaim memory. We remember the last child
> * we reclaimed from, so that we don't end up penalizing one child extensively
> @@ -1585,7 +1586,8 @@ static int mem_cgroup_hierarchical_recla
> bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
> bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
> unsigned long excess;
> - unsigned long nr_scanned;
> + unsigned long stats[NR_RECLAIM_INFO];
> + int context = (check_soft)? MEM_SOFT : MEM_LIMIT;
>
> excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
>
> @@ -1631,13 +1633,12 @@ static int mem_cgroup_hierarchical_recla
> if (check_soft) {
> ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
> noswap, get_swappiness(victim), zone,
> - &nr_scanned);
> - *total_scanned += nr_scanned;
> - mem_cgroup_soft_steal(victim, ret);
> - mem_cgroup_soft_scan(victim, nr_scanned);
> + stats);
> + *total_scanned += stats[RECLAIM_SCAN];
> } else
> ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
> - noswap, get_swappiness(victim));
> + noswap, get_swappiness(victim),stats);
> + mem_cgroup_update_reclaim_info(victim, context, stats);
> css_put(&victim->css);
> /*
> * At shrinking usage, we can't check we should stop here or
> @@ -3661,7 +3662,7 @@ try_to_free:
> goto out;
> }
> progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
> - false, get_swappiness(mem));
> + false, get_swappiness(mem), NULL);
> if (!progress) {
> nr_retries--;
> /* maybe some writeback is necessary */
> @@ -3929,8 +3930,12 @@ enum {
> MCS_SWAP,
> MCS_PGFAULT,
> MCS_PGMAJFAULT,
> + MCS_LIMIT_FREED,
> MCS_SOFT_STEAL,
> + MCS_LIMIT_SCAN,
> MCS_SOFT_SCAN,
> + MCS_LIMIT_LATENCY,
> + MCS_SOFT_LATENCY,
> MCS_INACTIVE_ANON,
> MCS_ACTIVE_ANON,
> MCS_INACTIVE_FILE,
> @@ -3955,8 +3960,12 @@ struct {
> {"swap", "total_swap"},
> {"pgfault", "total_pgfault"},
> {"pgmajfault", "total_pgmajfault"},
> + {"limit_freed", "total_limit_freed"},
> {"soft_steal", "total_soft_steal"},
> + {"limit_scan", "total_limit_scan"},
> {"soft_scan", "total_soft_scan"},
> + {"limit_latency", "total_limit_latency"},
> + {"soft_latency", "total_soft_latency"},
> {"inactive_anon", "total_inactive_anon"},
> {"active_anon", "total_active_anon"},
> {"inactive_file", "total_inactive_file"},
> @@ -3985,10 +3994,18 @@ mem_cgroup_get_local_stat(struct mem_cgr
> val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
> s->stat[MCS_SWAP] += val * PAGE_SIZE;
> }
> - val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_SOFT_STEAL);
> + val = atomic_long_read(&mem->reclaim_info[MEM_LIMIT][RECLAIM_FREE]);
> + s->stat[MCS_LIMIT_FREED] += val;
> + val = atomic_long_read(&mem->reclaim_info[MEM_SOFT][RECLAIM_FREE]);
> s->stat[MCS_SOFT_STEAL] += val;
> - val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_SOFT_SCAN);
> + val = atomic_long_read(&mem->reclaim_info[MEM_LIMIT][RECLAIM_SCAN]);
> + s->stat[MCS_LIMIT_SCAN] += val;
> + val = atomic_long_read(&mem->reclaim_info[MEM_SOFT][RECLAIM_SCAN]);
> s->stat[MCS_SOFT_SCAN] += val;
> + val = atomic_long_read(&mem->reclaim_info[MEM_LIMIT][RECLAIM_LATENCY]);
> + s->stat[MCS_LIMIT_LATENCY] += val;
> + val = atomic_long_read(&mem->reclaim_info[MEM_SOFT][RECLAIM_LATENCY]);
> + s->stat[MCS_SOFT_LATENCY] += val;
> val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGFAULT);
> s->stat[MCS_PGFAULT] += val;
> val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGMAJFAULT);
> Index: memcg/mm/vmscan.c
> ===================================================================
> --- memcg.orig/mm/vmscan.c
> +++ memcg/mm/vmscan.c
> @@ -2156,7 +2156,7 @@ unsigned long mem_cgroup_shrink_node_zon
> gfp_t gfp_mask, bool noswap,
> unsigned int swappiness,
> struct zone *zone,
> - unsigned long *nr_scanned)
> + unsigned long *stats)
> {
> struct scan_control sc = {
> .nr_scanned = 0,
> @@ -2168,6 +2168,9 @@ unsigned long mem_cgroup_shrink_node_zon
> .order = 0,
> .mem_cgroup = mem,
> };
> + u64 start, end;
> +
> + start = sched_clock();
>
> sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
> (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
> @@ -2175,7 +2178,6 @@ unsigned long mem_cgroup_shrink_node_zon
> trace_mm_vmscan_memcg_softlimit_reclaim_begin(0,
> sc.may_writepage,
> sc.gfp_mask);
> -
> /*
> * NOTE: Although we can get the priority field, using it
> * here is not a good idea, since it limits the pages we can scan.
> @@ -2185,20 +2187,26 @@ unsigned long mem_cgroup_shrink_node_zon
> */
> shrink_zone(0, zone, &sc);
>
> + stats[RECLAIM_SCAN] = sc.nr_scanned;
> + stats[RECLAIM_FREE] = sc.nr_reclaimed;
> + end = sched_clock();
> + stats[RECLAIM_LATENCY] = end - start;
> +
> trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
>
> - *nr_scanned = sc.nr_scanned;
> return sc.nr_reclaimed;
> }
>
> unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
> gfp_t gfp_mask,
> bool noswap,
> - unsigned int swappiness)
> + unsigned int swappiness,
> + unsigned long *stats)
> {
> struct zonelist *zonelist;
> unsigned long nr_reclaimed;
> int nid;
> + u64 end, start;
> struct scan_control sc = {
> .may_writepage = !laptop_mode,
> .may_unmap = 1,
> @@ -2209,6 +2217,8 @@ unsigned long try_to_free_mem_cgroup_pag
> .mem_cgroup = mem_cont,
> .nodemask = NULL, /* we don't care the placement */
> };
> +
> + start = sched_clock();
> /*
> * Unlike direct reclaim via alloc_pages(), memcg's reclaim
> * don't take care of from where we get pages . So, the node where
> @@ -2226,6 +2236,13 @@ unsigned long try_to_free_mem_cgroup_pag
>
> nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
>
> + if (stats) {
> + stats[RECLAIM_SCAN] = sc.nr_scanned;
> + stats[RECLAIM_FREE] = sc.nr_reclaimed;
> + end = sched_clock();
> + stats[RECLAIM_LATENCY] = end - start;
> + }
> +
> trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
>
> return nr_reclaimed;
> Index: memcg/Documentation/cgroups/memory.txt
> ===================================================================
> --- memcg.orig/Documentation/cgroups/memory.txt
> +++ memcg/Documentation/cgroups/memory.txt
> @@ -387,8 +387,13 @@ pgpgout - # of pages paged out (equival
> swap - # of bytes of swap usage
> pgfault - # of page faults.
> pgmajfault - # of major page faults.
> -soft_steal - # of pages reclaimed from global hierarchical reclaim
> -soft_scan - # of pages scanned from global hierarchical reclaim
> +limit_freed - # of pages reclaimed by hitting limit.
> +soft_steal - # of pages reclaimed by kernel with hints of soft limit
> +limit_scan - # of pages scanned by hitting limit.
> +soft_scan - # of pages scanned by kernel with hints of soft limit
> +limit_latency - # of nanosecs elapsed at reclaiming by hitting limit
> +soft_latency - # of nanosecs elapsed at reclaiming by kernel with hints of
> + soft limit.
> inactive_anon - # of bytes of anonymous memory and swap cache memory on
> LRU list.
> active_anon - # of bytes of anonymous and swap cache memory on active
> @@ -412,8 +417,12 @@ total_pgpgout - sum of all children's "
> total_swap - sum of all children's "swap"
> total_pgfault - sum of all children's "pgfault"
> total_pgmajfault - sum of all children's "pgmajfault"
> +total_limit_freed - sum of all children's "limit_freed"
> total_soft_steal - sum of all children's "soft_steal"
> +total_limit_scan - sum of all children's "limit_scan"
> total_soft_scan - sum of all children's "soft_scan"
> +total_limit_latency - sum of all children's "limit_latency"
> +total_soft_latency - sum of all children's "soft_latency"
> total_inactive_anon - sum of all children's "inactive_anon"
> total_active_anon - sum of all children's "active_anon"
> total_inactive_file - sum of all children's "inactive_file"
> Index: memcg/include/linux/swap.h
> ===================================================================
> --- memcg.orig/include/linux/swap.h
> +++ memcg/include/linux/swap.h
> @@ -252,13 +252,11 @@ static inline void lru_cache_add_file(st
> extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
> gfp_t gfp_mask, nodemask_t *mask);
> extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
> - gfp_t gfp_mask, bool noswap,
> - unsigned int swappiness);
> + gfp_t gfp_mask, bool noswap, unsigned int swappiness,
> + unsigned long *stats);
> extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
> - gfp_t gfp_mask, bool noswap,
> - unsigned int swappiness,
> - struct zone *zone,
> - unsigned long *nr_scanned);
> + gfp_t gfp_mask, bool noswap, unsigned int swappiness,
> + struct zone *zone, unsigned long *stats);
> extern int __isolate_lru_page(struct page *page, int mode, int file);
> extern unsigned long shrink_all_memory(unsigned long nr_pages);
> extern int vm_swappiness;
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org. For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
>
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: Fw: [PATCH] memcg: add reclaim statistics accounting
2011-04-28 3:43 ` Ying Han
@ 2011-04-28 3:57 ` KAMEZAWA Hiroyuki
2011-04-28 4:24 ` Ying Han
2011-04-28 9:01 ` KAMEZAWA Hiroyuki
1 sibling, 1 reply; 11+ messages in thread
From: KAMEZAWA Hiroyuki @ 2011-04-28 3:57 UTC (permalink / raw)
To: Ying Han; +Cc: linux-mm@kvack.org
On Wed, 27 Apr 2011 20:43:58 -0700
Ying Han <yinghan@google.com> wrote:
> On Wed, Apr 27, 2011 at 8:16 PM, KAMEZAWA Hiroyuki
> <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > sorry, I had wrong TO:...
> >
> > Begin forwarded message:
> >
> > Date: Thu, 28 Apr 2011 12:02:34 +0900
> > From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> > To: linux-mm@vger.kernel.org
> > Cc: "linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>, "nishimura@mxp.nes.nec.co.jp" <nishimura@mxp.nes.nec.co.jp>, "balbir@linux.vnet.ibm.com" <balbir@linux.vnet.ibm.com>, Ying Han <yinghan@google.com>, "akpm@linux-foundation.org" <akpm@linux-foundation.org>
> > Subject: [PATCH] memcg: add reclaim statistics accounting
> >
> >
> >
> > Now, memory cgroup provides poor reclaim statistics per memcg. This
> > patch adds statistics for direct/soft reclaim as the number of
> > pages scans, the number of page freed by reclaim, the nanoseconds of
> > latency at reclaim.
> >
> > It's good to add statistics before we modify memcg/global reclaim, largely.
> > This patch refactors current soft limit status and add an unified update logic.
> >
> > For example, After #cat 195Mfile > /dev/null under 100M limit.
> > A A A A # cat /cgroup/memory/A/memory.stat
> > A A A A ....
> > A A A A limit_freed 24592
>
> why not "limit_steal" ?
>
It's not "stealed". Freed by itself.
pages reclaimed by soft-limit is stealed because of global memory pressure.
I don't like the name "steal" but I can't change it because of API breakage.
> > A A A A soft_steal 0
> > A A A A limit_scan 43974
> > A A A A soft_scan 0
> > A A A A limit_latency 133837417
> >
> > nearly 96M caches are freed. scanned twice. used 133ms.
>
> Does it make sense to split up the soft_steal/scan for bg reclaim and
> direct reclaim?
Please clarify what you're talking about before asking. Maybe you want to say
"I'm now working for supporting softlimit in direct reclaim path. So, does
it make sense to account direct/kswapd works in statistics ?"
I think bg/direct reclaim is not required to be splitted.
> The same for the limit_steal/scan.
limit has only direct reclaim, now. And this is independent from any
soft limit works.
> I am now testing
> the patch to add the soft_limit reclaim on global ttfp, and i already
> have the patch to add the following:
>
> kswapd_soft_steal 0
> kswapd_soft_scan 0
please don't change the name of _used_ statisitcs.
> direct_soft_steal 0
> direct_soft_scan 0
Maybe these are new ones added by your work. But should be merged to
soft_steal/soft_scan.
> kswapd_steal 0
> pg_pgsteal 0
> kswapd_pgscan 0
> pg_scan 0
>
Maybe this indicates reclaimed-by-other-tasks-than-this-memcg. Right ?
Maybe good for checking isolation of memcg, hmm, can these be accounted
in scalable way ?
BTW, my office will be closed for a week because of holidays. So, I'll not make
responce tomorrow. please CC kamezawa.hiroyuki@gmail.com if you need.
I may read e-mails.
Thanks,
-Kame
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: Fw: [PATCH] memcg: add reclaim statistics accounting
2011-04-28 3:57 ` KAMEZAWA Hiroyuki
@ 2011-04-28 4:24 ` Ying Han
2011-04-28 4:27 ` KAMEZAWA Hiroyuki
0 siblings, 1 reply; 11+ messages in thread
From: Ying Han @ 2011-04-28 4:24 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki; +Cc: linux-mm@kvack.org
On Wed, Apr 27, 2011 at 8:57 PM, KAMEZAWA Hiroyuki
<kamezawa.hiroyu@jp.fujitsu.com> wrote:
> On Wed, 27 Apr 2011 20:43:58 -0700
> Ying Han <yinghan@google.com> wrote:
>
>> On Wed, Apr 27, 2011 at 8:16 PM, KAMEZAWA Hiroyuki
>> <kamezawa.hiroyu@jp.fujitsu.com> wrote:
>> > sorry, I had wrong TO:...
>> >
>> > Begin forwarded message:
>> >
>> > Date: Thu, 28 Apr 2011 12:02:34 +0900
>> > From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
>> > To: linux-mm@vger.kernel.org
>> > Cc: "linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>, "nishimura@mxp.nes.nec.co.jp" <nishimura@mxp.nes.nec.co.jp>, "balbir@linux.vnet.ibm.com" <balbir@linux.vnet.ibm.com>, Ying Han <yinghan@google.com>, "akpm@linux-foundation.org" <akpm@linux-foundation.org>
>> > Subject: [PATCH] memcg: add reclaim statistics accounting
>> >
>> >
>> >
>> > Now, memory cgroup provides poor reclaim statistics per memcg. This
>> > patch adds statistics for direct/soft reclaim as the number of
>> > pages scans, the number of page freed by reclaim, the nanoseconds of
>> > latency at reclaim.
>> >
>> > It's good to add statistics before we modify memcg/global reclaim, largely.
>> > This patch refactors current soft limit status and add an unified update logic.
>> >
>> > For example, After #cat 195Mfile > /dev/null under 100M limit.
>> > # cat /cgroup/memory/A/memory.stat
>> > ....
>> > limit_freed 24592
>>
>> why not "limit_steal" ?
>>
>
> It's not "stealed". Freed by itself.
> pages reclaimed by soft-limit is stealed because of global memory pressure.
> I don't like the name "steal" but I can't change it because of API breakage.
>
>
>> > soft_steal 0
>> > limit_scan 43974
>> > soft_scan 0
>> > limit_latency 133837417
>> >
>> > nearly 96M caches are freed. scanned twice. used 133ms.
>>
>> Does it make sense to split up the soft_steal/scan for bg reclaim and
>> direct reclaim?
>
> Please clarify what you're talking about before asking. Maybe you want to say
> "I'm now working for supporting softlimit in direct reclaim path. So, does
> it make sense to account direct/kswapd works in statistics ?"
>
> I think bg/direct reclaim is not required to be splitted.
Ok, thanks for the clarification. The patch i am working now to be
more specific is to add the
soft_limit hierarchical reclaim on the global direct reclaim.
I am adding similar stats to monitor the soft_steal, but i split-off
the soft_steal from global direct reclaim and
global background reclaim. I am wondering isn't that give us more
visibility of the reclaim path?
>
>> The same for the limit_steal/scan.
>
> limit has only direct reclaim, now. And this is independent from any
> soft limit works.
agree.
>
>> I am now testing
>> the patch to add the soft_limit reclaim on global ttfp, and i already
>> have the patch to add the following:
>>
>> kswapd_soft_steal 0
>> kswapd_soft_scan 0
>
> please don't change the name of _used_ statistics.
good point. thanks
>
>
>> direct_soft_steal 0
>> direct_soft_scan 0
>
> Maybe these are new ones added by your work. But should be merged to
> soft_steal/soft_scan.
the same question above, why we don't want to have better visibility
of where we triggered
the soft_limit reclaim and how much has been done on behalf of each.
>
>> kswapd_steal 0
>> pg_pgsteal 0
>> kswapd_pgscan 0
>> pg_scan 0
>>
>
> Maybe this indicates reclaimed-by-other-tasks-than-this-memcg. Right ?
> Maybe good for checking isolation of memcg, hmm, can these be accounted
> in scalable way ?
you can ignore those four stats. They are part of the per-memcg-kswapd
patchset, and i guess you might
have similar patch for that purpose.
>
> BTW, my office will be closed for a week because of holidays. So, I'll not make
> response tomorrow. please CC kamezawa.hiroyuki@gmail.com if you need.
> I may read e-mails.
Thanks for the heads up ~
--Ying
>
> Thanks,
> -Kame
>
>
>
>
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: Fw: [PATCH] memcg: add reclaim statistics accounting
2011-04-28 4:24 ` Ying Han
@ 2011-04-28 4:27 ` KAMEZAWA Hiroyuki
2011-04-28 4:40 ` Ying Han
0 siblings, 1 reply; 11+ messages in thread
From: KAMEZAWA Hiroyuki @ 2011-04-28 4:27 UTC (permalink / raw)
To: Ying Han; +Cc: linux-mm@kvack.org
On Wed, 27 Apr 2011 21:24:30 -0700
Ying Han <yinghan@google.com> wrote:
> On Wed, Apr 27, 2011 at 8:57 PM, KAMEZAWA Hiroyuki
> <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > On Wed, 27 Apr 2011 20:43:58 -0700
> > Ying Han <yinghan@google.com> wrote:
> >
>
> >> Does it make sense to split up the soft_steal/scan for bg reclaim and
> >> direct reclaim?
> >
> > Please clarify what you're talking about before asking. Maybe you want to say
> > "I'm now working for supporting softlimit in direct reclaim path. So, does
> > it make sense to account direct/kswapd works in statistics ?"
> >
> > I think bg/direct reclaim is not required to be splitted.
>
> Ok, thanks for the clarification. The patch i am working now to be
> more specific is to add the
> soft_limit hierarchical reclaim on the global direct reclaim.
>
> I am adding similar stats to monitor the soft_steal, but i split-off
> the soft_steal from global direct reclaim and
> global background reclaim. I am wondering isn't that give us more
> visibility of the reclaim path?
>
Hmm, if kswapd and direct-reclaim uses the same logic, I don't care which
steals memory. But I'm not sure about your implementation before seeing patch.
So, please let me postpone answering this. But, considering again,
/proc/vmstat has
==
pgscan_kswapd_dma 0
pgscan_kswapd_dma32 0
pgscan_kswapd_normal 0
pgscan_kswapd_movable 0
pgscan_direct_dma 0
pgscan_direct_dma32 0
pgscan_direct_normal 0
pgscan_direct_movable 0
==
maybe it's ok to have split stats.
BTW, if I add more statistics, I'll add per-node statistics.
Hmm, memory.node_stat is required ?
> >
> >
> >> direct_soft_steal 0
> >> direct_soft_scan 0
> >
> > Maybe these are new ones added by your work. But should be merged to
> > soft_steal/soft_scan.
> the same question above, why we don't want to have better visibility
> of where we triggered
> the soft_limit reclaim and how much has been done on behalf of each.
>
Maybe I answered this.
> >
> >> kswapd_steal 0
> >> pg_pgsteal 0
> >> kswapd_pgscan 0
> >> pg_scan 0
> >>
> >
> > Maybe this indicates reclaimed-by-other-tasks-than-this-memcg. Right ?
> > Maybe good for checking isolation of memcg, hmm, can these be accounted
> > in scalable way ?
>
> you can ignore those four stats. They are part of the per-memcg-kswapd
> patchset, and i guess you might
> have similar patch for that purpose.
>
Ah, I named them as wmark_scan/wmark_steal for avoiding confusion.
Thanks,
-Kame
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: Fw: [PATCH] memcg: add reclaim statistics accounting
2011-04-28 4:27 ` KAMEZAWA Hiroyuki
@ 2011-04-28 4:40 ` Ying Han
2011-04-28 7:02 ` KAMEZAWA Hiroyuki
0 siblings, 1 reply; 11+ messages in thread
From: Ying Han @ 2011-04-28 4:40 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki; +Cc: linux-mm@kvack.org
On Wed, Apr 27, 2011 at 9:27 PM, KAMEZAWA Hiroyuki
<kamezawa.hiroyu@jp.fujitsu.com> wrote:
> On Wed, 27 Apr 2011 21:24:30 -0700
> Ying Han <yinghan@google.com> wrote:
>
>> On Wed, Apr 27, 2011 at 8:57 PM, KAMEZAWA Hiroyuki
>> <kamezawa.hiroyu@jp.fujitsu.com> wrote:
>> > On Wed, 27 Apr 2011 20:43:58 -0700
>> > Ying Han <yinghan@google.com> wrote:
>> >
>>
>> >> Does it make sense to split up the soft_steal/scan for bg reclaim and
>> >> direct reclaim?
>> >
>> > Please clarify what you're talking about before asking. Maybe you want to say
>> > "I'm now working for supporting softlimit in direct reclaim path. So, does
>> > it make sense to account direct/kswapd works in statistics ?"
>> >
>> > I think bg/direct reclaim is not required to be splitted.
>>
>> Ok, thanks for the clarification. The patch i am working now to be
>> more specific is to add the
>> soft_limit hierarchical reclaim on the global direct reclaim.
>>
>> I am adding similar stats to monitor the soft_steal, but i split-off
>> the soft_steal from global direct reclaim and
>> global background reclaim. I am wondering isn't that give us more
>> visibility of the reclaim path?
>>
>
> Hmm, if kswapd and direct-reclaim uses the same logic, I don't care which
> steals memory. But I'm not sure about your implementation before seeing patch.
> So, please let me postpone answering this. But, considering again,
> /proc/vmstat has
> ==
> pgscan_kswapd_dma 0
> pgscan_kswapd_dma32 0
> pgscan_kswapd_normal 0
> pgscan_kswapd_movable 0
> pgscan_direct_dma 0
> pgscan_direct_dma32 0
> pgscan_direct_normal 0
> pgscan_direct_movable 0
> ==
>
> maybe it's ok to have split stats.
>
>
> BTW, if I add more statistics, I'll add per-node statistics.
> Hmm, memory.node_stat is required ?
Yes and this will be useful. One of the stats I would like add now is
the number of pages allocated on behalf of the memcg per numa node.
This is a piece of useful information to evaluate the numa locality
correlated to the application
performance.
I was wondering where to add the stats and memory.stat seems not to be
the best fit. If we have memory.node_stat, that would be a good place
for those kind of info?
--Ying
>
>
>> >
>> >
>> >> direct_soft_steal 0
>> >> direct_soft_scan 0
>> >
>> > Maybe these are new ones added by your work. But should be merged to
>> > soft_steal/soft_scan.
>> the same question above, why we don't want to have better visibility
>> of where we triggered
>> the soft_limit reclaim and how much has been done on behalf of each.
>>
> Maybe I answered this.
>
>
>
>> >
>> >> kswapd_steal 0
>> >> pg_pgsteal 0
>> >> kswapd_pgscan 0
>> >> pg_scan 0
>> >>
>> >
>> > Maybe this indicates reclaimed-by-other-tasks-than-this-memcg. Right ?
>> > Maybe good for checking isolation of memcg, hmm, can these be accounted
>> > in scalable way ?
>>
>> you can ignore those four stats. They are part of the per-memcg-kswapd
>> patchset, and i guess you might
>> have similar patch for that purpose.
>>
> Ah, I named them as wmark_scan/wmark_steal for avoiding confusion.
>
>
> Thanks,
> -Kame
>
>
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: Fw: [PATCH] memcg: add reclaim statistics accounting
2011-04-28 4:40 ` Ying Han
@ 2011-04-28 7:02 ` KAMEZAWA Hiroyuki
0 siblings, 0 replies; 11+ messages in thread
From: KAMEZAWA Hiroyuki @ 2011-04-28 7:02 UTC (permalink / raw)
To: Ying Han; +Cc: linux-mm@kvack.org
On Wed, 27 Apr 2011 21:40:06 -0700
Ying Han <yinghan@google.com> wrote:
> On Wed, Apr 27, 2011 at 9:27 PM, KAMEZAWA Hiroyuki
> <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > On Wed, 27 Apr 2011 21:24:30 -0700
> > Ying Han <yinghan@google.com> wrote:
> > BTW, if I add more statistics, I'll add per-node statistics.
> > Hmm, memory.node_stat is required ?
>
> Yes and this will be useful. One of the stats I would like add now is
> the number of pages allocated on behalf of the memcg per numa node.
> This is a piece of useful information to evaluate the numa locality
> correlated to the application
> performance.
>
> I was wondering where to add the stats and memory.stat seems not to be
> the best fit. If we have memory.node_stat, that would be a good place
> for those kind of info?
>
Maybe it's better to add memory.node_stat ....memory.stat seems a bit long ;)
I'd like to consider to add a tool to grab information easily under somewhere
...as cgroup-top.
Thanks,
-Kame
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: Fw: [PATCH] memcg: add reclaim statistics accounting
2011-04-28 3:43 ` Ying Han
2011-04-28 3:57 ` KAMEZAWA Hiroyuki
@ 2011-04-28 9:01 ` KAMEZAWA Hiroyuki
2011-04-28 12:36 ` Johannes Weiner
1 sibling, 1 reply; 11+ messages in thread
From: KAMEZAWA Hiroyuki @ 2011-04-28 9:01 UTC (permalink / raw)
To: Ying Han; +Cc: linux-mm@kvack.org
On Wed, 27 Apr 2011 20:43:58 -0700
Ying Han <yinghan@google.com> wrote:
> On Wed, Apr 27, 2011 at 8:16 PM, KAMEZAWA Hiroyuki
> <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > sorry, I had wrong TO:...
> >
> > Begin forwarded message:
> >
> > Date: Thu, 28 Apr 2011 12:02:34 +0900
> > From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> > To: linux-mm@vger.kernel.org
> > Cc: "linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>, "nishimura@mxp.nes.nec.co.jp" <nishimura@mxp.nes.nec.co.jp>, "balbir@linux.vnet.ibm.com" <balbir@linux.vnet.ibm.com>, Ying Han <yinghan@google.com>, "akpm@linux-foundation.org" <akpm@linux-foundation.org>
> > Subject: [PATCH] memcg: add reclaim statistics accounting
> >
> >
> >
> > Now, memory cgroup provides poor reclaim statistics per memcg. This
> > patch adds statistics for direct/soft reclaim as the number of
> > pages scans, the number of page freed by reclaim, the nanoseconds of
> > latency at reclaim.
> >
> > It's good to add statistics before we modify memcg/global reclaim, largely.
> > This patch refactors current soft limit status and add an unified update logic.
> >
> > For example, After #cat 195Mfile > /dev/null under 100M limit.
> > # cat /cgroup/memory/A/memory.stat
> > ....
> > limit_freed 24592
>
> why not "limit_steal" ?
>
> > soft_steal 0
> > limit_scan 43974
> > soft_scan 0
> > limit_latency 133837417
> >
> > nearly 96M caches are freed. scanned twice. used 133ms.
>
> Does it make sense to split up the soft_steal/scan for bg reclaim and
> direct reclaim? The same for the limit_steal/scan. I am now testing
> the patch to add the soft_limit reclaim on global ttfp, and i already
> have the patch to add the following:
>
> kswapd_soft_steal 0
> kswapd_soft_scan 0
> direct_soft_steal 0
> direct_soft_scan 0
> kswapd_steal 0
> pg_pgsteal 0
> kswapd_pgscan 0
> pg_scan 0
>
I'll not post updated version until the end of holidays but my latest plan is
adding
limit_direct_free - # of pages freed by limit in foreground (not stealed, you freed by yourself's limit)
soft_kswapd_steal - # of pages stealed by kswapd based on soft limit
limit_direct_scan - # of pages scanned by limit in foreground
soft_kswapd_scan - # of pages scanned by kswapd based on soft limit
And then, you can add
soft_direct_steal - # of pages stealed by foreground reclaim based on soft limit
soft_direct_scan - # of pages scanned by foreground reclaim based on soft limit
And
kern_direct_steal - # of pages stealed by foreground reclaim at memory shortage.
kern_direct_scan - # of pages scanned by foreground reclaim at memory shortage.
kern_direct_steal - # of pages stealed by kswapd at memory shortage
kern_direct_scan - # of pages scanned by kswapd at memory shortage
(Above kern_xxx number includes soft_xxx in it. ) These will show influence by
other cgroups.
And
wmark_bg_free - # of pages freed by watermark in background(not kswapd)
wmark_bg_scan - # of pages scanned by watermark in background(not kswapd)
Hmm ? too many stats ;)
And making current soft_steal/soft_scan planned to be obsolete...
Thanks,
-Kame
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: Fw: [PATCH] memcg: add reclaim statistics accounting
2011-04-28 9:01 ` KAMEZAWA Hiroyuki
@ 2011-04-28 12:36 ` Johannes Weiner
2011-04-28 17:46 ` Ying Han
0 siblings, 1 reply; 11+ messages in thread
From: Johannes Weiner @ 2011-04-28 12:36 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki; +Cc: Ying Han, linux-mm@kvack.org
On Thu, Apr 28, 2011 at 06:01:39PM +0900, KAMEZAWA Hiroyuki wrote:
> On Wed, 27 Apr 2011 20:43:58 -0700
> Ying Han <yinghan@google.com> wrote:
>
> > On Wed, Apr 27, 2011 at 8:16 PM, KAMEZAWA Hiroyuki
> > <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > > sorry, I had wrong TO:...
> > >
> > > Begin forwarded message:
> > >
> > > Date: Thu, 28 Apr 2011 12:02:34 +0900
> > > From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> > > To: linux-mm@vger.kernel.org
> > > Cc: "linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>, "nishimura@mxp.nes.nec.co.jp" <nishimura@mxp.nes.nec.co.jp>, "balbir@linux.vnet.ibm.com" <balbir@linux.vnet.ibm.com>, Ying Han <yinghan@google.com>, "akpm@linux-foundation.org" <akpm@linux-foundation.org>
> > > Subject: [PATCH] memcg: add reclaim statistics accounting
> > >
> > >
> > >
> > > Now, memory cgroup provides poor reclaim statistics per memcg. This
> > > patch adds statistics for direct/soft reclaim as the number of
> > > pages scans, the number of page freed by reclaim, the nanoseconds of
> > > latency at reclaim.
> > >
> > > It's good to add statistics before we modify memcg/global reclaim, largely.
> > > This patch refactors current soft limit status and add an unified update logic.
> > >
> > > For example, After #cat 195Mfile > /dev/null under 100M limit.
> > > # cat /cgroup/memory/A/memory.stat
> > > ....
> > > limit_freed 24592
> >
> > why not "limit_steal" ?
> >
> > > soft_steal 0
> > > limit_scan 43974
> > > soft_scan 0
> > > limit_latency 133837417
> > >
> > > nearly 96M caches are freed. scanned twice. used 133ms.
> >
> > Does it make sense to split up the soft_steal/scan for bg reclaim and
> > direct reclaim? The same for the limit_steal/scan. I am now testing
> > the patch to add the soft_limit reclaim on global ttfp, and i already
> > have the patch to add the following:
> >
> > kswapd_soft_steal 0
> > kswapd_soft_scan 0
> > direct_soft_steal 0
> > direct_soft_scan 0
> > kswapd_steal 0
> > pg_pgsteal 0
> > kswapd_pgscan 0
> > pg_scan 0
> >
>
> I'll not post updated version until the end of holidays but my latest plan is
> adding
>
>
> limit_direct_free - # of pages freed by limit in foreground (not stealed, you freed by yourself's limit)
> soft_kswapd_steal - # of pages stealed by kswapd based on soft limit
> limit_direct_scan - # of pages scanned by limit in foreground
> soft_kswapd_scan - # of pages scanned by kswapd based on soft limit
>
> And then, you can add
>
> soft_direct_steal - # of pages stealed by foreground reclaim based on soft limit
> soft_direct_scan - # of pages scanned by foreground reclaim based on soft limit
>
> And
>
> kern_direct_steal - # of pages stealed by foreground reclaim at memory shortage.
> kern_direct_scan - # of pages scanned by foreground reclaim at memory shortage.
> kern_direct_steal - # of pages stealed by kswapd at memory shortage
> kern_direct_scan - # of pages scanned by kswapd at memory shortage
>
> (Above kern_xxx number includes soft_xxx in it. ) These will show influence by
> other cgroups.
>
> And
>
> wmark_bg_free - # of pages freed by watermark in background(not kswapd)
> wmark_bg_scan - # of pages scanned by watermark in background(not kswapd)
>
> Hmm ? too many stats ;)
Indeed, and you have not even taken hierarchical reclaim into account.
What I propose is the separation of reclaim that happens within a
memcg due to an internal memcg condition, and reclaim that happens
within a memcg due to outside conditions - either the hierarchy or
global memory pressure. Something like the following, maybe?
1. Limit-triggered direct reclaim
The memory cgroup hits its limit and the task does direct reclaim from
its own memcg. We probably want statistics for this separately from
background reclaim to see how successful background reclaim is, the
same reason we have this separation in the global vmstat as well.
pgscan_direct_limit
pgfree_direct_limit
2. Limit-triggered background reclaim
This is the watermark-based asynchroneous reclaim that is currently in
discussion. It's triggered by the memcg breaching its watermark,
which is relative to its hard-limit. I named it kswapd because I
still think kswapd should do this job, but it is all open for
discussion, obviously. Treat it as meaning 'background' or
'asynchroneous'.
pgscan_kswapd_limit
pgfree_kswapd_limit
3. Hierarchy-triggered direct reclaim
A condition outside the memcg leads to a task directly reclaiming from
this memcg. This could be global memory pressure for example, but
also a parent cgroup hitting its limit. It's probably helpful to
assume global memory pressure meaning that the root cgroup hit its
limit, conceptually. We don't have that yet, but this could be the
direct softlimit reclaim Ying mentioned above.
pgscan_direct_hierarchy
pgsteal_direct_hierarchy
4. Hierarchy-triggered background reclaim
An outside condition leads to kswapd reclaiming from this memcg, like
kswapd doing softlimit pushback due to global memory pressure.
pgscan_kswapd_hierarchy
pgsteal_kswapd_hierarchy
---
With these stats in place, you can see how much pressure there is on
your memcg hierarchy. This includes machine utilization and if you
overcommitted too much on a global level if there is a lot of reclaim
activity indicated in the hierarchical stats.
With the limit-based stats, you can see the amount of internal
pressure of memcgs, which shows you if you overcommitted on a local
level.
And for both cases, you can also see the effectiveness of background
reclaim by comparing the direct and the kswapd stats.
> And making current soft_steal/soft_scan planned to be obsolete...
It's in -mm, but not merged upstream.
Regardless of my proposol for any stats above, I want to ask everybody
involved that we do not add any more ABI and exports of random
internals of the memcg reclaim process at this point.
We have a lot of plans and ideas still in flux for memcg reclaim, I
think it's about the worst point in time to commit ourselves to
certain behaviour, knobs, and statistics regarding this code.
Hannes
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: Fw: [PATCH] memcg: add reclaim statistics accounting
2011-04-28 12:36 ` Johannes Weiner
@ 2011-04-28 17:46 ` Ying Han
2011-04-29 6:26 ` Johannes Weiner
0 siblings, 1 reply; 11+ messages in thread
From: Ying Han @ 2011-04-28 17:46 UTC (permalink / raw)
To: Johannes Weiner; +Cc: KAMEZAWA Hiroyuki, linux-mm@kvack.org
On Thu, Apr 28, 2011 at 5:36 AM, Johannes Weiner <hannes@cmpxchg.org> wrote:
> On Thu, Apr 28, 2011 at 06:01:39PM +0900, KAMEZAWA Hiroyuki wrote:
>> On Wed, 27 Apr 2011 20:43:58 -0700
>> Ying Han <yinghan@google.com> wrote:
>>
>> > On Wed, Apr 27, 2011 at 8:16 PM, KAMEZAWA Hiroyuki
>> > <kamezawa.hiroyu@jp.fujitsu.com> wrote:
>> > > sorry, I had wrong TO:...
>> > >
>> > > Begin forwarded message:
>> > >
>> > > Date: Thu, 28 Apr 2011 12:02:34 +0900
>> > > From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
>> > > To: linux-mm@vger.kernel.org
>> > > Cc: "linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>, "nishimura@mxp.nes.nec.co.jp" <nishimura@mxp.nes.nec.co.jp>, "balbir@linux.vnet.ibm.com" <balbir@linux.vnet.ibm.com>, Ying Han <yinghan@google.com>, "akpm@linux-foundation.org" <akpm@linux-foundation.org>
>> > > Subject: [PATCH] memcg: add reclaim statistics accounting
>> > >
>> > >
>> > >
>> > > Now, memory cgroup provides poor reclaim statistics per memcg. This
>> > > patch adds statistics for direct/soft reclaim as the number of
>> > > pages scans, the number of page freed by reclaim, the nanoseconds of
>> > > latency at reclaim.
>> > >
>> > > It's good to add statistics before we modify memcg/global reclaim, largely.
>> > > This patch refactors current soft limit status and add an unified update logic.
>> > >
>> > > For example, After #cat 195Mfile > /dev/null under 100M limit.
>> > > # cat /cgroup/memory/A/memory.stat
>> > > ....
>> > > limit_freed 24592
>> >
>> > why not "limit_steal" ?
>> >
>> > > soft_steal 0
>> > > limit_scan 43974
>> > > soft_scan 0
>> > > limit_latency 133837417
>> > >
>> > > nearly 96M caches are freed. scanned twice. used 133ms.
>> >
>> > Does it make sense to split up the soft_steal/scan for bg reclaim and
>> > direct reclaim? The same for the limit_steal/scan. I am now testing
>> > the patch to add the soft_limit reclaim on global ttfp, and i already
>> > have the patch to add the following:
>> >
>> > kswapd_soft_steal 0
>> > kswapd_soft_scan 0
>> > direct_soft_steal 0
>> > direct_soft_scan 0
>> > kswapd_steal 0
>> > pg_pgsteal 0
>> > kswapd_pgscan 0
>> > pg_scan 0
>> >
>>
>> I'll not post updated version until the end of holidays but my latest plan is
>> adding
>>
>>
>> limit_direct_free - # of pages freed by limit in foreground (not stealed, you freed by yourself's limit)
>> soft_kswapd_steal - # of pages stealed by kswapd based on soft limit
>> limit_direct_scan - # of pages scanned by limit in foreground
>> soft_kswapd_scan - # of pages scanned by kswapd based on soft limit
>>
>> And then, you can add
>>
>> soft_direct_steal - # of pages stealed by foreground reclaim based on soft limit
>> soft_direct_scan - # of pages scanned by foreground reclaim based on soft limit
>>
>> And
>>
>> kern_direct_steal - # of pages stealed by foreground reclaim at memory shortage.
>> kern_direct_scan - # of pages scanned by foreground reclaim at memory shortage.
>> kern_direct_steal - # of pages stealed by kswapd at memory shortage
>> kern_direct_scan - # of pages scanned by kswapd at memory shortage
>>
>> (Above kern_xxx number includes soft_xxx in it. ) These will show influence by
>> other cgroups.
>>
>> And
>>
>> wmark_bg_free - # of pages freed by watermark in background(not kswapd)
>> wmark_bg_scan - # of pages scanned by watermark in background(not kswapd)
>>
>> Hmm ? too many stats ;)
>
> Indeed, and you have not even taken hierarchical reclaim into account.
> What I propose is the separation of reclaim that happens within a
> memcg due to an internal memcg condition, and reclaim that happens
> within a memcg due to outside conditions - either the hierarchy or
> global memory pressure. Something like the following, maybe?
>
> 1. Limit-triggered direct reclaim
>
> The memory cgroup hits its limit and the task does direct reclaim from
> its own memcg. We probably want statistics for this separately from
> background reclaim to see how successful background reclaim is, the
> same reason we have this separation in the global vmstat as well.
>
> pgscan_direct_limit
> pgfree_direct_limit
Ack.
>
> 2. Limit-triggered background reclaim
>
> This is the watermark-based asynchroneous reclaim that is currently in
> discussion. It's triggered by the memcg breaching its watermark,
> which is relative to its hard-limit. I named it kswapd because I
> still think kswapd should do this job, but it is all open for
> discussion, obviously. Treat it as meaning 'background' or
> 'asynchroneous'.
>
> pgscan_kswapd_limit
> pgfree_kswapd_limit
Ack.
To clarify, the 1 and 2 only count the reclaim which is due to the
pressure from the memcg itself.
> 3. Hierarchy-triggered direct reclaim
>
> A condition outside the memcg leads to a task directly reclaiming from
> this memcg. This could be global memory pressure for example, but
> also a parent cgroup hitting its limit. It's probably helpful to
> assume global memory pressure meaning that the root cgroup hit its
> limit, conceptually. We don't have that yet, but this could be the
> direct softlimit reclaim Ying mentioned above.
>
> pgscan_direct_hierarchy
> pgsteal_direct_hierarchy
For this one, it could be global direct reclaim doing softlimit
pushback or hierarchical reclaim
due to the parent hit its hardlimit. It would be nice if we can
separate them up?
>
> 4. Hierarchy-triggered background reclaim
>
> An outside condition leads to kswapd reclaiming from this memcg, like
> kswapd doing softlimit pushback due to global memory pressure.
>
> pgscan_kswapd_hierarchy
> pgsteal_kswapd_hierarchy
Ack, and this should be only per-node kswapd doing softlimit pushback.
> ---
>
> With these stats in place, you can see how much pressure there is on
> your memcg hierarchy. This includes machine utilization and if you
> overcommitted too much on a global level if there is a lot of reclaim
> activity indicated in the hierarchical stats.
>
> With the limit-based stats, you can see the amount of internal
> pressure of memcgs, which shows you if you overcommitted on a local
> level.
>
> And for both cases, you can also see the effectiveness of background
> reclaim by comparing the direct and the kswapd stats.
>
>> And making current soft_steal/soft_scan planned to be obsolete...
>
> It's in -mm, but not merged upstream.
Yes, and this is part of the effort of adding softlimit pushback in
global kswapd path.
--Ying
>
> Regardless of my proposal for any stats above, I want to ask everybody
> involved that we do not add any more ABI and exports of random
> internals of the memcg reclaim process at this point.
>
> We have a lot of plans and ideas still in flux for memcg reclaim, I
> think it's about the worst point in time to commit ourselves to
> certain behaviour, knobs, and statistics regarding this code.
>
> Hannes
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org. For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
>
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: Fw: [PATCH] memcg: add reclaim statistics accounting
2011-04-28 17:46 ` Ying Han
@ 2011-04-29 6:26 ` Johannes Weiner
0 siblings, 0 replies; 11+ messages in thread
From: Johannes Weiner @ 2011-04-29 6:26 UTC (permalink / raw)
To: Ying Han; +Cc: KAMEZAWA Hiroyuki, linux-mm@kvack.org
On Thu, Apr 28, 2011 at 10:46:07AM -0700, Ying Han wrote:
> On Thu, Apr 28, 2011 at 5:36 AM, Johannes Weiner <hannes@cmpxchg.org> wrote:
> > 1. Limit-triggered direct reclaim
> >
> > The memory cgroup hits its limit and the task does direct reclaim from
> > its own memcg. We probably want statistics for this separately from
> > background reclaim to see how successful background reclaim is, the
> > same reason we have this separation in the global vmstat as well.
> >
> > pgscan_direct_limit
> > pgfree_direct_limit
>
> Ack.
> >
> > 2. Limit-triggered background reclaim
> >
> > This is the watermark-based asynchronous reclaim that is currently in
> > discussion. It's triggered by the memcg breaching its watermark,
> > which is relative to its hard-limit. I named it kswapd because I
> > still think kswapd should do this job, but it is all open for
> > discussion, obviously. Treat it as meaning 'background' or
> > 'asynchronous'.
> >
> > pgscan_kswapd_limit
> > pgfree_kswapd_limit
> Ack.
>
> To clarify, the 1 and 2 only count the reclaim which is due to the
> pressure from the memcg itself.
Yes, limit-triggered implies that. If you have reclaim going on in a
memcg that is unrelated to the limit, the pressure must be external.
> > 3. Hierarchy-triggered direct reclaim
> >
> > A condition outside the memcg leads to a task directly reclaiming from
> > this memcg. This could be global memory pressure for example, but
> > also a parent cgroup hitting its limit. It's probably helpful to
> > assume global memory pressure meaning that the root cgroup hit its
> > limit, conceptually. We don't have that yet, but this could be the
> > direct softlimit reclaim Ying mentioned above.
> >
> > pgscan_direct_hierarchy
> > pgsteal_direct_hierarchy
>
> For this one, it could be global direct reclaim doing softlimit
> pushback or hierarchical reclaim
> due to the parent hit its hardlimit. It would be nice if we can
> separate them up?
Short answer: you are able to differentiate between the two by looking
at the memcg. If the parent is the root cgroup, you know it's direct
softlimit reclaim.
Long-answer:
In the paragraph of 3., I suggested that they are conceptually the
same. If you observe hierarchical pressure on a memcg, you know that
one of the ancestors is in trouble and go up the chain to find which
one has internal pressure. If the troubled ancestor turns out to be
the root cgroup, you know that it's a physical memory shortage, as
its only limit is physical memory.
It can all be described with the memcg-native concept of hierarchy and
the specialness of the root cgroup.
Hannes
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 11+ messages in thread
end of thread, other threads:[~2011-04-29 6:26 UTC | newest]
Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2011-04-28 3:16 Fw: [PATCH] memcg: add reclaim statistics accounting KAMEZAWA Hiroyuki
2011-04-28 3:43 ` Ying Han
2011-04-28 3:57 ` KAMEZAWA Hiroyuki
2011-04-28 4:24 ` Ying Han
2011-04-28 4:27 ` KAMEZAWA Hiroyuki
2011-04-28 4:40 ` Ying Han
2011-04-28 7:02 ` KAMEZAWA Hiroyuki
2011-04-28 9:01 ` KAMEZAWA Hiroyuki
2011-04-28 12:36 ` Johannes Weiner
2011-04-28 17:46 ` Ying Han
2011-04-29 6:26 ` Johannes Weiner
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).