* [PATCH V2] memcg: add mlock statistic in memory.stat
@ 2012-04-18 18:21 Ying Han
2012-04-18 23:33 ` Andrew Morton
0 siblings, 1 reply; 13+ messages in thread
From: Ying Han @ 2012-04-18 18:21 UTC (permalink / raw)
To: Michal Hocko, Johannes Weiner, Mel Gorman, KAMEZAWA Hiroyuki,
Rik van Riel, Hillf Danton, Hugh Dickins, Dan Magenheimer,
Andrew Morton
Cc: linux-mm
We already have the nr_mlock stat system-wide in both meminfo and vmstat; this
patch adds an mlock field to the per-memcg memory stats. The stat enhances the
metrics exported by memcg, since the unevictable lru includes more than just
mlock()'d pages (SHM_LOCK'd pages, for example).
Why do we need to count mlock'd pages when they are unevictable and we cannot
do much with them anyway?
That is true, but the mlock stat I am proposing helps system admins and kernel
developers understand the system workload. The same information would also be
useful in the OOM log. Many times in the past we have needed to read the mlock
stat from the per-container meminfo for different reasons. After all, we do
have the ability to read mlock from meminfo, and this patch fills in the same
information for memcg.
v1..v2:
1. rebased on top of 3.4-rc2; the code is based on the following commit,
which went into 3.4-rc1:
commit 89c06bd52fb9ffceddf84f7309d2e8c9f1666216
memcg: use new logic for page stat accounting
Tested:
$ cat /dev/cgroup/memory/memory.use_hierarchy
1
$ mkdir /dev/cgroup/memory/A
$ mkdir /dev/cgroup/memory/A/B
$ echo 1g >/dev/cgroup/memory/A/memory.limit_in_bytes
$ echo 1g >/dev/cgroup/memory/B/memory.limit_in_bytes
1. Run memtoy in B and mlock 512m file pages:
memtoy>file /export/hda3/file_512m private
memtoy>map file_512m 0 512m
memtoy>lock file_512m
memtoy: mlock of file_512m [131072 pages] took 5.296secs.
$ cat /dev/cgroup/memory/A/B/memory.stat
mlock 536870912
unevictable 536870912
..
total_mlock 536870912
total_unevictable 536870912
$ cat /dev/cgroup/memory/A/memory.stat
mlock 0
unevictable 0
..
total_mlock 536870912
total_unevictable 536870912
Signed-off-by: Ying Han <yinghan@google.com>
---
Documentation/cgroups/memory.txt | 2 ++
include/linux/memcontrol.h | 1 +
mm/internal.h | 18 ++++++++++++++++++
mm/memcontrol.c | 16 ++++++++++++++++
mm/mlock.c | 15 +++++++++++++++
mm/page_alloc.c | 16 ++++++++++++----
6 files changed, 64 insertions(+), 4 deletions(-)
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 4c95c00..13d9913 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -410,6 +410,7 @@ memory.stat file includes following statistics
cache - # of bytes of page cache memory.
rss - # of bytes of anonymous and swap cache memory.
mapped_file - # of bytes of mapped file (includes tmpfs/shmem)
+mlock - # of bytes of mlocked memory.
pgpgin - # of charging events to the memory cgroup. The charging
event happens each time a page is accounted as either mapped
anon page(RSS) or cache page(Page Cache) to the cgroup.
@@ -434,6 +435,7 @@ hierarchical_memsw_limit - # of bytes of memory+swap limit with regard to
total_cache - sum of all children's "cache"
total_rss - sum of all children's "rss"
total_mapped_file - sum of all children's "cache"
+total_mlock - sum of all children's "mlock"
total_pgpgin - sum of all children's "pgpgin"
total_pgpgout - sum of all children's "pgpgout"
total_swap - sum of all children's "swap"
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index f94efd2..112b573 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -30,6 +30,7 @@ struct mm_struct;
/* Stats that can be updated by kernel. */
enum mem_cgroup_page_stat_item {
MEMCG_NR_FILE_MAPPED, /* # of pages charged as file rss */
+ MEMCG_NR_MLOCK, /* # of pages charged as mlock */
};
struct mem_cgroup_reclaim_cookie {
diff --git a/mm/internal.h b/mm/internal.h
index 2189af4..96684b5 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -12,6 +12,7 @@
#define __MM_INTERNAL_H
#include <linux/mm.h>
+#include <linux/memcontrol.h>
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
@@ -133,15 +134,22 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
*/
static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page)
{
+ bool locked;
+ unsigned long flags;
+
VM_BUG_ON(PageLRU(page));
if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
return 0;
+ mem_cgroup_begin_update_page_stat(page, &locked, &flags);
if (!TestSetPageMlocked(page)) {
inc_zone_page_state(page, NR_MLOCK);
+ mem_cgroup_inc_page_stat(page, MEMCG_NR_MLOCK);
count_vm_event(UNEVICTABLE_PGMLOCKED);
}
+ mem_cgroup_end_update_page_stat(page, &locked, &flags);
+
return 1;
}
@@ -163,8 +171,13 @@ extern void munlock_vma_page(struct page *page);
extern void __clear_page_mlock(struct page *page);
static inline void clear_page_mlock(struct page *page)
{
+ bool locked;
+ unsigned long flags;
+
+ mem_cgroup_begin_update_page_stat(page, &locked, &flags);
if (unlikely(TestClearPageMlocked(page)))
__clear_page_mlock(page);
+ mem_cgroup_end_update_page_stat(page, &locked, &flags);
}
/*
@@ -173,6 +186,11 @@ static inline void clear_page_mlock(struct page *page)
*/
static inline void mlock_migrate_page(struct page *newpage, struct page *page)
{
+ /*
+ * Here we are supposed to update the page memcg's mlock stat and the
+ * newpage memcg's mlock stat. Since the page and newpage are always
+ * charged to the same memcg, there is no need.
+ */
if (TestClearPageMlocked(page)) {
unsigned long flags;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7d698df..61cdaeb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -87,6 +87,7 @@ enum mem_cgroup_stat_index {
MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
+ MEM_CGROUP_STAT_MLOCK, /* # of pages charged as mlock()ed */
MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
MEM_CGROUP_STAT_NSTATS,
@@ -1975,6 +1976,9 @@ void mem_cgroup_update_page_stat(struct page *page,
case MEMCG_NR_FILE_MAPPED:
idx = MEM_CGROUP_STAT_FILE_MAPPED;
break;
+ case MEMCG_NR_MLOCK:
+ idx = MEM_CGROUP_STAT_MLOCK;
+ break;
default:
BUG();
}
@@ -2627,6 +2631,14 @@ static int mem_cgroup_move_account(struct page *page,
__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
preempt_enable();
}
+
+ if (PageMlocked(page)) {
+ /* Update mlocked data for mem_cgroup */
+ preempt_disable();
+ __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_MLOCK]);
+ __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_MLOCK]);
+ preempt_enable();
+ }
mem_cgroup_charge_statistics(from, anon, -nr_pages);
if (uncharge)
/* This is not "cancel", but cancel_charge does all we need. */
@@ -4047,6 +4059,7 @@ enum {
MCS_CACHE,
MCS_RSS,
MCS_FILE_MAPPED,
+ MCS_MLOCK,
MCS_PGPGIN,
MCS_PGPGOUT,
MCS_SWAP,
@@ -4071,6 +4084,7 @@ struct {
{"cache", "total_cache"},
{"rss", "total_rss"},
{"mapped_file", "total_mapped_file"},
+ {"mlock", "total_mlock"},
{"pgpgin", "total_pgpgin"},
{"pgpgout", "total_pgpgout"},
{"swap", "total_swap"},
@@ -4096,6 +4110,8 @@ mem_cgroup_get_local_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s)
s->stat[MCS_RSS] += val * PAGE_SIZE;
val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
+ val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_MLOCK);
+ s->stat[MCS_MLOCK] += val * PAGE_SIZE;
val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGIN);
s->stat[MCS_PGPGIN] += val;
val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGOUT);
diff --git a/mm/mlock.c b/mm/mlock.c
index ef726e8..cef0201 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -50,6 +50,8 @@ EXPORT_SYMBOL(can_do_mlock);
/*
* LRU accounting for clear_page_mlock()
+ * Make sure the caller calls mem_cgroup_begin[end]_update_page_stat,
+ * otherwise there will be a race between "move" and "page stat accounting".
*/
void __clear_page_mlock(struct page *page)
{
@@ -60,6 +62,7 @@ void __clear_page_mlock(struct page *page)
}
dec_zone_page_state(page, NR_MLOCK);
+ mem_cgroup_dec_page_stat(page, MEMCG_NR_MLOCK);
count_vm_event(UNEVICTABLE_PGCLEARED);
if (!isolate_lru_page(page)) {
putback_lru_page(page);
@@ -78,14 +81,20 @@ void __clear_page_mlock(struct page *page)
*/
void mlock_vma_page(struct page *page)
{
+ bool locked;
+ unsigned long flags;
+
BUG_ON(!PageLocked(page));
+ mem_cgroup_begin_update_page_stat(page, &locked, &flags);
if (!TestSetPageMlocked(page)) {
inc_zone_page_state(page, NR_MLOCK);
+ mem_cgroup_inc_page_stat(page, MEMCG_NR_MLOCK);
count_vm_event(UNEVICTABLE_PGMLOCKED);
if (!isolate_lru_page(page))
putback_lru_page(page);
}
+ mem_cgroup_end_update_page_stat(page, &locked, &flags);
}
/**
@@ -105,10 +114,15 @@ void mlock_vma_page(struct page *page)
*/
void munlock_vma_page(struct page *page)
{
+ bool locked;
+ unsigned long flags;
+
BUG_ON(!PageLocked(page));
+ mem_cgroup_begin_update_page_stat(page, &locked, &flags);
if (TestClearPageMlocked(page)) {
dec_zone_page_state(page, NR_MLOCK);
+ mem_cgroup_dec_page_stat(page, MEMCG_NR_MLOCK);
if (!isolate_lru_page(page)) {
int ret = SWAP_AGAIN;
@@ -141,6 +155,7 @@ void munlock_vma_page(struct page *page)
count_vm_event(UNEVICTABLE_PGMUNLOCKED);
}
}
+ mem_cgroup_end_update_page_stat(page, &locked, &flags);
}
/**
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a712fb9..c7da329 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -596,10 +596,14 @@ out:
* free_page_mlock() -- clean up attempts to free and mlocked() page.
* Page should not be on lru, so no need to fix that up.
* free_pages_check() will verify...
+ *
+ * Make sure the caller calls mem_cgroup_begin[end]_update_page_stat,
+ * otherwise there will be a race between "move" and "page stat accounting".
*/
static inline void free_page_mlock(struct page *page)
{
__dec_zone_page_state(page, NR_MLOCK);
+ mem_cgroup_dec_page_stat(page, MEMCG_NR_MLOCK);
__count_vm_event(UNEVICTABLE_MLOCKFREED);
}
@@ -716,17 +720,19 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
static void __free_pages_ok(struct page *page, unsigned int order)
{
unsigned long flags;
- int wasMlocked = __TestClearPageMlocked(page);
+ bool locked;
if (!free_pages_prepare(page, order))
return;
local_irq_save(flags);
- if (unlikely(wasMlocked))
+ mem_cgroup_begin_update_page_stat(page, &locked, &flags);
+ if (unlikely(__TestClearPageMlocked(page)))
free_page_mlock(page);
__count_vm_events(PGFREE, 1 << order);
free_one_page(page_zone(page), page, order,
get_pageblock_migratetype(page));
+ mem_cgroup_end_update_page_stat(page, &locked, &flags);
local_irq_restore(flags);
}
@@ -1250,7 +1256,7 @@ void free_hot_cold_page(struct page *page, int cold)
struct per_cpu_pages *pcp;
unsigned long flags;
int migratetype;
- int wasMlocked = __TestClearPageMlocked(page);
+ bool locked;
if (!free_pages_prepare(page, 0))
return;
@@ -1258,10 +1264,12 @@ void free_hot_cold_page(struct page *page, int cold)
migratetype = get_pageblock_migratetype(page);
set_page_private(page, migratetype);
local_irq_save(flags);
- if (unlikely(wasMlocked))
+ mem_cgroup_begin_update_page_stat(page, &locked, &flags);
+ if (unlikely(__TestClearPageMlocked(page)))
free_page_mlock(page);
__count_vm_event(PGFREE);
+ mem_cgroup_end_update_page_stat(page, &locked, &flags);
/*
* We only track unmovable, reclaimable and movable on pcp lists.
* Free ISOLATE pages back to the allocator because they are being
--
1.7.7.3
* Re: [PATCH V2] memcg: add mlock statistic in memory.stat
2012-04-18 18:21 [PATCH V2] memcg: add mlock statistic in memory.stat Ying Han
@ 2012-04-18 23:33 ` Andrew Morton
2012-04-19 0:59 ` KAMEZAWA Hiroyuki
2012-04-19 22:30 ` Ying Han
0 siblings, 2 replies; 13+ messages in thread
From: Andrew Morton @ 2012-04-18 23:33 UTC (permalink / raw)
To: Ying Han
Cc: Michal Hocko, Johannes Weiner, Mel Gorman, KAMEZAWA Hiroyuki,
Rik van Riel, Hillf Danton, Hugh Dickins, Dan Magenheimer,
linux-mm
On Wed, 18 Apr 2012 11:21:55 -0700
Ying Han <yinghan@google.com> wrote:
> We have the nr_mlock stat both in meminfo as well as vmstat system wide, this
> patch adds the mlock field into per-memcg memory stat. The stat itself enhances
> the metrics exported by memcg since the unevictable lru includes more than
> mlock()'d page like SHM_LOCK'd.
>
> Why we need to count mlock'd pages while they are unevictable and we can not
> do much on them anyway?
>
> This is true. The mlock stat I am proposing is more helpful for system admin
> and kernel developer to understand the system workload. The same information
> should be helpful to add into OOM log as well. Many times in the past that we
> need to read the mlock stat from the per-container meminfo for different
> reason. Afterall, we do have the ability to read the mlock from meminfo and
> this patch fills the info in memcg.
>
>
> ...
>
> static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page)
> {
> + bool locked;
> + unsigned long flags;
> +
> VM_BUG_ON(PageLRU(page));
>
> if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
> return 0;
>
> + mem_cgroup_begin_update_page_stat(page, &locked, &flags);
> if (!TestSetPageMlocked(page)) {
> inc_zone_page_state(page, NR_MLOCK);
> + mem_cgroup_inc_page_stat(page, MEMCG_NR_MLOCK);
> count_vm_event(UNEVICTABLE_PGMLOCKED);
> }
> + mem_cgroup_end_update_page_stat(page, &locked, &flags);
> +
> return 1;
> }
Unrelated to this patch: is_mlocked_vma() is misnamed. A function with
that name should be a bool-returning test which has no side-effects.
>
> ...
>
> static void __free_pages_ok(struct page *page, unsigned int order)
> {
> unsigned long flags;
> - int wasMlocked = __TestClearPageMlocked(page);
> + bool locked;
>
> if (!free_pages_prepare(page, order))
> return;
>
> local_irq_save(flags);
> - if (unlikely(wasMlocked))
> + mem_cgroup_begin_update_page_stat(page, &locked, &flags);
hm, what's going on here. The page now has a zero refcount and is to
be returned to the buddy. But mem_cgroup_begin_update_page_stat()
assumes that the page still belongs to a memcg. I'd have thought that
any page_cgroup backreferences would have been torn down by now?
> + if (unlikely(__TestClearPageMlocked(page)))
> free_page_mlock(page);
And if the page _is_ still accessible via cgroup lookup, the use of the
nonatomic RMW is dangerous.
> __count_vm_events(PGFREE, 1 << order);
> free_one_page(page_zone(page), page, order,
> get_pageblock_migratetype(page));
> + mem_cgroup_end_update_page_stat(page, &locked, &flags);
> local_irq_restore(flags);
> }
>
> @@ -1250,7 +1256,7 @@ void free_hot_cold_page(struct page *page, int cold)
The same comments apply in free_hot_cold_page().
> struct per_cpu_pages *pcp;
> unsigned long flags;
> int migratetype;
> - int wasMlocked = __TestClearPageMlocked(page);
> + bool locked;
>
> if (!free_pages_prepare(page, 0))
> return;
>
> ...
>
* Re: [PATCH V2] memcg: add mlock statistic in memory.stat
2012-04-18 23:33 ` Andrew Morton
@ 2012-04-19 0:59 ` KAMEZAWA Hiroyuki
2012-04-19 13:12 ` Johannes Weiner
2012-04-19 22:43 ` Ying Han
2012-04-19 22:30 ` Ying Han
1 sibling, 2 replies; 13+ messages in thread
From: KAMEZAWA Hiroyuki @ 2012-04-19 0:59 UTC (permalink / raw)
To: Andrew Morton
Cc: Ying Han, Michal Hocko, Johannes Weiner, Mel Gorman, Rik van Riel,
Hillf Danton, Hugh Dickins, Dan Magenheimer, linux-mm
(2012/04/19 8:33), Andrew Morton wrote:
> On Wed, 18 Apr 2012 11:21:55 -0700
> Ying Han <yinghan@google.com> wrote:
>
>> We have the nr_mlock stat both in meminfo as well as vmstat system wide, this
>> patch adds the mlock field into per-memcg memory stat. The stat itself enhances
>> the metrics exported by memcg since the unevictable lru includes more than
>> mlock()'d page like SHM_LOCK'd.
>>
>> Why we need to count mlock'd pages while they are unevictable and we can not
>> do much on them anyway?
>>
>> This is true. The mlock stat I am proposing is more helpful for system admin
>> and kernel developer to understand the system workload. The same information
>> should be helpful to add into OOM log as well. Many times in the past that we
>> need to read the mlock stat from the per-container meminfo for different
>> reason. Afterall, we do have the ability to read the mlock from meminfo and
>> this patch fills the info in memcg.
>>
>>
>> ...
>>
>> static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page)
>> {
>> + bool locked;
>> + unsigned long flags;
>> +
>> VM_BUG_ON(PageLRU(page));
>>
>> if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
>> return 0;
>>
>> + mem_cgroup_begin_update_page_stat(page, &locked, &flags);
>> if (!TestSetPageMlocked(page)) {
>> inc_zone_page_state(page, NR_MLOCK);
>> + mem_cgroup_inc_page_stat(page, MEMCG_NR_MLOCK);
>> count_vm_event(UNEVICTABLE_PGMLOCKED);
>> }
>> + mem_cgroup_end_update_page_stat(page, &locked, &flags);
>> +
>> return 1;
>> }
>
> Unrelated to this patch: is_mlocked_vma() is misnamed. A function with
> that name should be a bool-returning test which has no side-effects.
>
>>
>> ...
>>
>> static void __free_pages_ok(struct page *page, unsigned int order)
>> {
>> unsigned long flags;
>> - int wasMlocked = __TestClearPageMlocked(page);
>> + bool locked;
>>
>> if (!free_pages_prepare(page, order))
>> return;
>>
>> local_irq_save(flags);
>> - if (unlikely(wasMlocked))
>> + mem_cgroup_begin_update_page_stat(page, &locked, &flags);
>
> hm, what's going on here. The page now has a zero refcount and is to
> be returned to the buddy. But mem_cgroup_begin_update_page_stat()
> assumes that the page still belongs to a memcg. I'd have thought that
> any page_cgroup backreferences would have been torn down by now?
>
>> + if (unlikely(__TestClearPageMlocked(page)))
>> free_page_mlock(page);
>
Ah, this is a problem. Now, we have the following code.
==
> struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
> enum lru_list lru)
> {
> struct mem_cgroup_per_zone *mz;
> struct mem_cgroup *memcg;
> struct page_cgroup *pc;
>
> if (mem_cgroup_disabled())
> return &zone->lruvec;
>
> pc = lookup_page_cgroup(page);
> memcg = pc->mem_cgroup;
>
> /*
> * Surreptitiously switch any uncharged page to root:
> * an uncharged page off lru does nothing to secure
> * its former mem_cgroup from sudden removal.
> *
> * Our caller holds lru_lock, and PageCgroupUsed is updated
> * under page_cgroup lock: between them, they make all uses
> * of pc->mem_cgroup safe.
> */
> if (!PageCgroupUsed(pc) && memcg != root_mem_cgroup)
> pc->mem_cgroup = memcg = root_mem_cgroup;
==
Then, accessing pc->mem_cgroup without checking the PCG_USED bit is dangerous.
It may trigger a #GP because of sudden removal of the memcg, or, because of the
above code, mis-accounting will happen... pc->mem_cgroup may already have been
overwritten.
My proposal is to call TestClearPageMlocked(page) via mem_cgroup_uncharge(),
like this:
==
mem_cgroup_charge_statistics(memcg, anon, -nr_pages);
/*
* Pages reach here when it's fully unmapped or dropped from file cache.
* we are under lock_page_cgroup() and have no race with memcg activities.
*/
if (unlikely(PageMlocked(page))) {
if (TestClearPageMlocked())
decrement counter.
}
ClearPageCgroupUsed(pc);
==
But please check performance impact...
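Filling in the pseudocode above, a minimal sketch of what the uncharge-path
hook could look like (the exact placement inside the uncharge path, and folding
in the zone/vm-event accounting, are assumptions for illustration, not part of
the actual proposal; MEM_CGROUP_STAT_MLOCK is the index added by the patch):
==
	mem_cgroup_charge_statistics(memcg, anon, -nr_pages);

	/*
	 * Sketch: we are still under lock_page_cgroup() here, so
	 * pc->mem_cgroup is stable and cannot race with "move".  Clearing
	 * the flag here means the existing free_page_mlock() hook would no
	 * longer fire for this page, so the zone stat and vm event are
	 * folded in as well.
	 */
	if (unlikely(PageMlocked(page)) && TestClearPageMlocked(page)) {
		dec_zone_page_state(page, NR_MLOCK);
		count_vm_event(UNEVICTABLE_MLOCKFREED);
		this_cpu_dec(memcg->stat->count[MEM_CGROUP_STAT_MLOCK]);
	}

	ClearPageCgroupUsed(pc);
==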
Thanks,
-Kame
* Re: [PATCH V2] memcg: add mlock statistic in memory.stat
2012-04-19 0:59 ` KAMEZAWA Hiroyuki
@ 2012-04-19 13:12 ` Johannes Weiner
2012-04-19 22:46 ` Ying Han
2012-04-20 0:37 ` KAMEZAWA Hiroyuki
2012-04-19 22:43 ` Ying Han
1 sibling, 2 replies; 13+ messages in thread
From: Johannes Weiner @ 2012-04-19 13:12 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: Andrew Morton, Ying Han, Michal Hocko, Mel Gorman, Rik van Riel,
Hillf Danton, Hugh Dickins, Dan Magenheimer, linux-mm
On Thu, Apr 19, 2012 at 09:59:20AM +0900, KAMEZAWA Hiroyuki wrote:
> (2012/04/19 8:33), Andrew Morton wrote:
>
> > On Wed, 18 Apr 2012 11:21:55 -0700
> > Ying Han <yinghan@google.com> wrote:
> >> static void __free_pages_ok(struct page *page, unsigned int order)
> >> {
> >> unsigned long flags;
> >> - int wasMlocked = __TestClearPageMlocked(page);
> >> + bool locked;
> >>
> >> if (!free_pages_prepare(page, order))
> >> return;
> >>
> >> local_irq_save(flags);
> >> - if (unlikely(wasMlocked))
> >> + mem_cgroup_begin_update_page_stat(page, &locked, &flags);
> >
> > hm, what's going on here. The page now has a zero refcount and is to
> > be returned to the buddy. But mem_cgroup_begin_update_page_stat()
> > assumes that the page still belongs to a memcg. I'd have thought that
> > any page_cgroup backreferences would have been torn down by now?
> >
> >> + if (unlikely(__TestClearPageMlocked(page)))
> >> free_page_mlock(page);
> >
>
>
> Ah, this is problem. Now, we have following code.
> ==
>
> > struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
> > enum lru_list lru)
> > {
> > struct mem_cgroup_per_zone *mz;
> > struct mem_cgroup *memcg;
> > struct page_cgroup *pc;
> >
> > if (mem_cgroup_disabled())
> > return &zone->lruvec;
> >
> > pc = lookup_page_cgroup(page);
> > memcg = pc->mem_cgroup;
> >
> > /*
> > * Surreptitiously switch any uncharged page to root:
> > * an uncharged page off lru does nothing to secure
> > * its former mem_cgroup from sudden removal.
> > *
> > * Our caller holds lru_lock, and PageCgroupUsed is updated
> > * under page_cgroup lock: between them, they make all uses
> > * of pc->mem_cgroup safe.
> > */
> > if (!PageCgroupUsed(pc) && memcg != root_mem_cgroup)
> > pc->mem_cgroup = memcg = root_mem_cgroup;
>
> ==
>
> Then, accessing pc->mem_cgroup without checking PCG_USED bit is dangerous.
> It may trigger #GP because of suddern removal of memcg or because of above
> code, mis-accounting will happen... pc->mem_cgroup may be overwritten already.
>
> Proposal from me is calling TestClearPageMlocked(page) via mem_cgroup_uncharge().
>
> Like this.
> ==
> mem_cgroup_charge_statistics(memcg, anon, -nr_pages);
>
> /*
> * Pages reach here when it's fully unmapped or dropped from file cache.
> * we are under lock_page_cgroup() and have no race with memcg activities.
> */
> if (unlikely(PageMlocked(page))) {
> if (TestClearPageMlocked())
> decrement counter.
> }
>
> ClearPageCgroupUsed(pc);
> ==
> But please check performance impact...
This makes the lifetime rules of mlocked anon really weird.
Plus this code runs for ALL uncharges, the unlikely() and preliminary
flag testing don't make it okay. It's bad that we have this in the
allocator, but at least it would be good to hook into that branch and
not add another one.
pc->mem_cgroup stays intact after the uncharge. Could we make the
memcg removal path wait on the mlock counter to drop to zero instead
and otherwise keep Ying's version?
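One way to read that suggestion, strictly as a sketch (the helper name, the use
of mem_cgroup_read_stat(), and hooking it from the removal/pre_destroy path are
all assumptions; whether waiting here can stall on pages parked in pagevecs is
exactly the question raised below):
==
/* hypothetical helper called from the memcg removal path */
static void mem_cgroup_wait_for_mlock(struct mem_cgroup *memcg)
{
	/* drain per-cpu pagevecs so any stranded mlocked pages get freed */
	lru_add_drain_all();

	/* wait for the per-memcg mlock counter to drop to zero */
	while (mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_MLOCK) > 0)
		schedule_timeout_interruptible(HZ / 10);
}
==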
* Re: [PATCH V2] memcg: add mlock statistic in memory.stat
2012-04-19 13:12 ` Johannes Weiner
@ 2012-04-19 22:46 ` Ying Han
2012-04-19 23:04 ` Johannes Weiner
2012-04-20 0:37 ` KAMEZAWA Hiroyuki
1 sibling, 1 reply; 13+ messages in thread
From: Ying Han @ 2012-04-19 22:46 UTC (permalink / raw)
To: Johannes Weiner
Cc: KAMEZAWA Hiroyuki, Andrew Morton, Michal Hocko, Mel Gorman,
Rik van Riel, Hillf Danton, Hugh Dickins, Dan Magenheimer,
linux-mm
On Thu, Apr 19, 2012 at 6:12 AM, Johannes Weiner <hannes@cmpxchg.org> wrote:
> On Thu, Apr 19, 2012 at 09:59:20AM +0900, KAMEZAWA Hiroyuki wrote:
>> (2012/04/19 8:33), Andrew Morton wrote:
>>
>> > On Wed, 18 Apr 2012 11:21:55 -0700
>> > Ying Han <yinghan@google.com> wrote:
>> >> static void __free_pages_ok(struct page *page, unsigned int order)
>> >> {
>> >> unsigned long flags;
>> >> - int wasMlocked = __TestClearPageMlocked(page);
>> >> + bool locked;
>> >>
>> >> if (!free_pages_prepare(page, order))
>> >> return;
>> >>
>> >> local_irq_save(flags);
>> >> - if (unlikely(wasMlocked))
>> >> + mem_cgroup_begin_update_page_stat(page, &locked, &flags);
>> >
>> > hm, what's going on here. The page now has a zero refcount and is to
>> > be returned to the buddy. But mem_cgroup_begin_update_page_stat()
>> > assumes that the page still belongs to a memcg. I'd have thought that
>> > any page_cgroup backreferences would have been torn down by now?
>> >
>> >> + if (unlikely(__TestClearPageMlocked(page)))
>> >> free_page_mlock(page);
>> >
>>
>>
>> Ah, this is problem. Now, we have following code.
>> ==
>>
>> > struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
>> > enum lru_list lru)
>> > {
>> > struct mem_cgroup_per_zone *mz;
>> > struct mem_cgroup *memcg;
>> > struct page_cgroup *pc;
>> >
>> > if (mem_cgroup_disabled())
>> > return &zone->lruvec;
>> >
>> > pc = lookup_page_cgroup(page);
>> > memcg = pc->mem_cgroup;
>> >
>> > /*
>> > * Surreptitiously switch any uncharged page to root:
>> > * an uncharged page off lru does nothing to secure
>> > * its former mem_cgroup from sudden removal.
>> > *
>> > * Our caller holds lru_lock, and PageCgroupUsed is updated
>> > * under page_cgroup lock: between them, they make all uses
>> > * of pc->mem_cgroup safe.
>> > */
>> > if (!PageCgroupUsed(pc) && memcg != root_mem_cgroup)
>> > pc->mem_cgroup = memcg = root_mem_cgroup;
>>
>> ==
>>
>> Then, accessing pc->mem_cgroup without checking PCG_USED bit is dangerous.
>> It may trigger #GP because of suddern removal of memcg or because of above
>> code, mis-accounting will happen... pc->mem_cgroup may be overwritten already.
>>
>> Proposal from me is calling TestClearPageMlocked(page) via mem_cgroup_uncharge().
>>
>> Like this.
>> ==
>> mem_cgroup_charge_statistics(memcg, anon, -nr_pages);
>>
>> /*
>> * Pages reach here when it's fully unmapped or dropped from file cache.
>> * we are under lock_page_cgroup() and have no race with memcg activities.
>> */
>> if (unlikely(PageMlocked(page))) {
>> if (TestClearPageMlocked())
>> decrement counter.
>> }
>>
>> ClearPageCgroupUsed(pc);
>> ==
>> But please check performance impact...
>
> This makes the lifetime rules of mlocked anon really weird.
>
> Plus this code runs for ALL uncharges, the unlikely() and preliminary
> flag testing don't make it okay. It's bad that we have this in the
> allocator, but at least it would be good to hook into that branch and
> not add another one.
Johannes,
Can you give more details on your last sentence above? :)
>
> pc->mem_cgroup stays intact after the uncharge. Could we make the
> memcg removal path wait on the mlock counter to drop to zero instead
> and otherwise keep Ying's version?
Will it delay memcg pre_destroy? I am wondering: if we have pages sitting in
an mmu_gather or a pagevec, they won't be freed until we flush?
--Ying
* Re: [PATCH V2] memcg: add mlock statistic in memory.stat
2012-04-19 22:46 ` Ying Han
@ 2012-04-19 23:04 ` Johannes Weiner
0 siblings, 0 replies; 13+ messages in thread
From: Johannes Weiner @ 2012-04-19 23:04 UTC (permalink / raw)
To: Ying Han
Cc: KAMEZAWA Hiroyuki, Andrew Morton, Michal Hocko, Mel Gorman,
Rik van Riel, Hillf Danton, Hugh Dickins, Dan Magenheimer,
linux-mm
On Thu, Apr 19, 2012 at 03:46:08PM -0700, Ying Han wrote:
> On Thu, Apr 19, 2012 at 6:12 AM, Johannes Weiner <hannes@cmpxchg.org> wrote:
> > On Thu, Apr 19, 2012 at 09:59:20AM +0900, KAMEZAWA Hiroyuki wrote:
> >> (2012/04/19 8:33), Andrew Morton wrote:
> >>
> >> > On Wed, 18 Apr 2012 11:21:55 -0700
> >> > Ying Han <yinghan@google.com> wrote:
> >> >> static void __free_pages_ok(struct page *page, unsigned int order)
> >> >> {
> >> >> unsigned long flags;
> >> >> - int wasMlocked = __TestClearPageMlocked(page);
> >> >> + bool locked;
> >> >>
> >> >> if (!free_pages_prepare(page, order))
> >> >> return;
> >> >>
> >> >> local_irq_save(flags);
> >> >> - if (unlikely(wasMlocked))
> >> >> + mem_cgroup_begin_update_page_stat(page, &locked, &flags);
> >> >
> >> > hm, what's going on here. The page now has a zero refcount and is to
> >> > be returned to the buddy. But mem_cgroup_begin_update_page_stat()
> >> > assumes that the page still belongs to a memcg. I'd have thought that
> >> > any page_cgroup backreferences would have been torn down by now?
> >> >
> >> >> + if (unlikely(__TestClearPageMlocked(page)))
> >> >> free_page_mlock(page);
> >> >
> >>
> >>
> >> Ah, this is problem. Now, we have following code.
> >> ==
> >>
> >> > struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
> >> > enum lru_list lru)
> >> > {
> >> > struct mem_cgroup_per_zone *mz;
> >> > struct mem_cgroup *memcg;
> >> > struct page_cgroup *pc;
> >> >
> >> > if (mem_cgroup_disabled())
> >> > return &zone->lruvec;
> >> >
> >> > pc = lookup_page_cgroup(page);
> >> > memcg = pc->mem_cgroup;
> >> >
> >> > /*
> >> > * Surreptitiously switch any uncharged page to root:
> >> > * an uncharged page off lru does nothing to secure
> >> > * its former mem_cgroup from sudden removal.
> >> > *
> >> > * Our caller holds lru_lock, and PageCgroupUsed is updated
> >> > * under page_cgroup lock: between them, they make all uses
> >> > * of pc->mem_cgroup safe.
> >> > */
> >> > if (!PageCgroupUsed(pc) && memcg != root_mem_cgroup)
> >> > pc->mem_cgroup = memcg = root_mem_cgroup;
> >>
> >> ==
> >>
> >> Then, accessing pc->mem_cgroup without checking PCG_USED bit is dangerous.
> >> It may trigger #GP because of suddern removal of memcg or because of above
> >> code, mis-accounting will happen... pc->mem_cgroup may be overwritten already.
> >>
> >> Proposal from me is calling TestClearPageMlocked(page) via mem_cgroup_uncharge().
> >>
> >> Like this.
> >> ==
> >> mem_cgroup_charge_statistics(memcg, anon, -nr_pages);
> >>
> >> /*
> >> * Pages reach here when it's fully unmapped or dropped from file cache.
> >> * we are under lock_page_cgroup() and have no race with memcg activities.
> >> */
> >> if (unlikely(PageMlocked(page))) {
> >> if (TestClearPageMlocked())
> >> decrement counter.
> >> }
> >>
> >> ClearPageCgroupUsed(pc);
> >> ==
> >> But please check performance impact...
> >
> > This makes the lifetime rules of mlocked anon really weird.
> >
> > Plus this code runs for ALL uncharges, the unlikely() and preliminary
> > flag testing don't make it okay. It's bad that we have this in the
> > allocator, but at least it would be good to hook into that branch and
> > not add another one.
>
> Johannes,
> Can you give a more details of your last sentence above? :)
It's a fast path for all pages at the end of their lifetime. Mlocked
anon pages that reach here are a tiny fraction of them [it's
just those pages that race with reclaim and lazy-mlock while being
unmapped], so I think we should do our very best to not add any checks
for them here, not even with a lot of mitigation. It just seems badly
misplaced.
On the other hand, we already HAVE a branch to deal with them, in the
page allocator. We can, and should, hook into that instead.
> > pc->mem_cgroup stays intact after the uncharge. Could we make the
> > memcg removal path wait on the mlock counter to drop to zero instead
> > and otherwise keep Ying's version?
>
> Will it delay the memcg predestroy ? I am wondering if we have page in
> mmu gather or pagevec, and they won't be freed until we flush?
Can we flush them from the waiting site?
* Re: [PATCH V2] memcg: add mlock statistic in memory.stat
2012-04-19 13:12 ` Johannes Weiner
2012-04-19 22:46 ` Ying Han
@ 2012-04-20 0:37 ` KAMEZAWA Hiroyuki
2012-04-20 5:57 ` Ying Han
1 sibling, 1 reply; 13+ messages in thread
From: KAMEZAWA Hiroyuki @ 2012-04-20 0:37 UTC (permalink / raw)
To: Johannes Weiner
Cc: Andrew Morton, Ying Han, Michal Hocko, Mel Gorman, Rik van Riel,
Hillf Danton, Hugh Dickins, Dan Magenheimer, linux-mm
(2012/04/19 22:12), Johannes Weiner wrote:
> On Thu, Apr 19, 2012 at 09:59:20AM +0900, KAMEZAWA Hiroyuki wrote:
>> (2012/04/19 8:33), Andrew Morton wrote:
>>
>>> On Wed, 18 Apr 2012 11:21:55 -0700
>>> Ying Han <yinghan@google.com> wrote:
>>>> static void __free_pages_ok(struct page *page, unsigned int order)
>>>> {
>>>> unsigned long flags;
>>>> - int wasMlocked = __TestClearPageMlocked(page);
>>>> + bool locked;
>>>>
>>>> if (!free_pages_prepare(page, order))
>>>> return;
>>>>
>>>> local_irq_save(flags);
>>>> - if (unlikely(wasMlocked))
>>>> + mem_cgroup_begin_update_page_stat(page, &locked, &flags);
>>>
>>> hm, what's going on here. The page now has a zero refcount and is to
>>> be returned to the buddy. But mem_cgroup_begin_update_page_stat()
>>> assumes that the page still belongs to a memcg. I'd have thought that
>>> any page_cgroup backreferences would have been torn down by now?
>>>
>>>> + if (unlikely(__TestClearPageMlocked(page)))
>>>> free_page_mlock(page);
>>>
>>
>>
>> Ah, this is problem. Now, we have following code.
>> ==
>>
>>> struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
>>> enum lru_list lru)
>>> {
>>> struct mem_cgroup_per_zone *mz;
>>> struct mem_cgroup *memcg;
>>> struct page_cgroup *pc;
>>>
>>> if (mem_cgroup_disabled())
>>> return &zone->lruvec;
>>>
>>> pc = lookup_page_cgroup(page);
>>> memcg = pc->mem_cgroup;
>>>
>>> /*
>>> * Surreptitiously switch any uncharged page to root:
>>> * an uncharged page off lru does nothing to secure
>>> * its former mem_cgroup from sudden removal.
>>> *
>>> * Our caller holds lru_lock, and PageCgroupUsed is updated
>>> * under page_cgroup lock: between them, they make all uses
>>> * of pc->mem_cgroup safe.
>>> */
>>> if (!PageCgroupUsed(pc) && memcg != root_mem_cgroup)
>>> pc->mem_cgroup = memcg = root_mem_cgroup;
>>
>> ==
>>
>> Then, accessing pc->mem_cgroup without checking PCG_USED bit is dangerous.
>> It may trigger #GP because of suddern removal of memcg or because of above
>> code, mis-accounting will happen... pc->mem_cgroup may be overwritten already.
>>
>> Proposal from me is calling TestClearPageMlocked(page) via mem_cgroup_uncharge().
>>
>> Like this.
>> ==
>> mem_cgroup_charge_statistics(memcg, anon, -nr_pages);
>>
>> /*
>> * Pages reach here when it's fully unmapped or dropped from file cache.
>> * we are under lock_page_cgroup() and have no race with memcg activities.
>> */
>> if (unlikely(PageMlocked(page))) {
>> if (TestClearPageMlocked())
>> decrement counter.
>> }
>>
>> ClearPageCgroupUsed(pc);
>> ==
>> But please check performance impact...
>
> This makes the lifetime rules of mlocked anon really weird.
>
yes.
> Plus this code runs for ALL uncharges, the unlikely() and preliminary
> flag testing don't make it okay. It's bad that we have this in the
> allocator, but at least it would be good to hook into that branch and
> not add another one.
>
> pc->mem_cgroup stays intact after the uncharge. Could we make the
> memcg removal path wait on the mlock counter to drop to zero instead
> and otherwise keep Ying's version?
>
Handle the problem in the ->destroy() path? Hmm, that will work against
use-after-free. But the accounting problem that may be caused by
mem_cgroup_lru_add_list(), which overwrites pc->mem_cgroup, cannot be handled.
But hm, is this too slow ?...
==
mem_cgroup_uncharge_common()
{
....
if (PageSwapCache(page) || PageMlocked(page))
return NULL;
}
page_alloc.c::
static inline void free_page_mlock(struct page *page)
{
__dec_zone_page_state(page, NR_MLOCK);
__count_vm_event(UNEVICTABLE_MLOCKFREED);
mem_cgroup_uncharge_page(page);
}
==
BTW, from reading the code briefly... why do we have hooks in free_page()?
It seems do_munmap() and exit_mmap() call munlock_vma_pages_all(), so all
vmas which have VM_LOCKED are checked before freeing. vmscan never frees
mlocked pages, I think.
Is there any other path that frees mlocked pages without munlock?
I feel that freeing mlocked pages is a cause of problems.
Thanks,
-Kame
* Re: [PATCH V2] memcg: add mlock statistic in memory.stat
2012-04-20 0:37 ` KAMEZAWA Hiroyuki
@ 2012-04-20 5:57 ` Ying Han
2012-04-20 6:16 ` KAMEZAWA Hiroyuki
0 siblings, 1 reply; 13+ messages in thread
From: Ying Han @ 2012-04-20 5:57 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: Johannes Weiner, Andrew Morton, Michal Hocko, Mel Gorman,
Rik van Riel, Hillf Danton, Hugh Dickins, Dan Magenheimer,
linux-mm
On Thu, Apr 19, 2012 at 5:37 PM, KAMEZAWA Hiroyuki
<kamezawa.hiroyu@jp.fujitsu.com> wrote:
> (2012/04/19 22:12), Johannes Weiner wrote:
>
>> On Thu, Apr 19, 2012 at 09:59:20AM +0900, KAMEZAWA Hiroyuki wrote:
>>> (2012/04/19 8:33), Andrew Morton wrote:
>>>
>>>> On Wed, 18 Apr 2012 11:21:55 -0700
>>>> Ying Han <yinghan@google.com> wrote:
>>>>> static void __free_pages_ok(struct page *page, unsigned int order)
>>>>> {
>>>>> unsigned long flags;
>>>>> - int wasMlocked = __TestClearPageMlocked(page);
>>>>> + bool locked;
>>>>>
>>>>> if (!free_pages_prepare(page, order))
>>>>> return;
>>>>>
>>>>> local_irq_save(flags);
>>>>> - if (unlikely(wasMlocked))
>>>>> + mem_cgroup_begin_update_page_stat(page, &locked, &flags);
>>>>
>>>> hm, what's going on here. The page now has a zero refcount and is to
>>>> be returned to the buddy. But mem_cgroup_begin_update_page_stat()
>>>> assumes that the page still belongs to a memcg. I'd have thought that
>>>> any page_cgroup backreferences would have been torn down by now?
>>>>
>>>>> + if (unlikely(__TestClearPageMlocked(page)))
>>>>> free_page_mlock(page);
>>>>
>>>
>>>
>>> Ah, this is problem. Now, we have following code.
>>> ==
>>>
>>>> struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
>>>> enum lru_list lru)
>>>> {
>>>> struct mem_cgroup_per_zone *mz;
>>>> struct mem_cgroup *memcg;
>>>> struct page_cgroup *pc;
>>>>
>>>> if (mem_cgroup_disabled())
>>>> return &zone->lruvec;
>>>>
>>>> pc = lookup_page_cgroup(page);
>>>> memcg = pc->mem_cgroup;
>>>>
>>>> /*
>>>> * Surreptitiously switch any uncharged page to root:
>>>> * an uncharged page off lru does nothing to secure
>>>> * its former mem_cgroup from sudden removal.
>>>> *
>>>> * Our caller holds lru_lock, and PageCgroupUsed is updated
>>>> * under page_cgroup lock: between them, they make all uses
>>>> * of pc->mem_cgroup safe.
>>>> */
>>>> if (!PageCgroupUsed(pc) && memcg != root_mem_cgroup)
>>>> pc->mem_cgroup = memcg = root_mem_cgroup;
>>>
>>> ==
>>>
>>> Then, accessing pc->mem_cgroup without checking PCG_USED bit is dangerous.
>>> It may trigger #GP because of suddern removal of memcg or because of above
>>> code, mis-accounting will happen... pc->mem_cgroup may be overwritten already.
>>>
>>> Proposal from me is calling TestClearPageMlocked(page) via mem_cgroup_uncharge().
>>>
>>> Like this.
>>> ==
>>> mem_cgroup_charge_statistics(memcg, anon, -nr_pages);
>>>
>>> /*
>>> * Pages reach here when it's fully unmapped or dropped from file cache.
>>> * we are under lock_page_cgroup() and have no race with memcg activities.
>>> */
>>> if (unlikely(PageMlocked(page))) {
>>> if (TestClearPageMlocked())
>>> decrement counter.
>>> }
>>>
>>> ClearPageCgroupUsed(pc);
>>> ==
>>> But please check performance impact...
>>
>> This makes the lifetime rules of mlocked anon really weird.
>>
>
> yes.
>
>> Plus this code runs for ALL uncharges, the unlikely() and preliminary
>> flag testing don't make it okay. It's bad that we have this in the
>> allocator, but at least it would be good to hook into that branch and
>> not add another one.
>>
>> pc->mem_cgroup stays intact after the uncharge. Could we make the
>> memcg removal path wait on the mlock counter to drop to zero instead
>> and otherwise keep Ying's version?
>>
>
>
> handling problem in ->destroy() path ? Hmm, it will work against use-after-free.
> But accounting problem which may be caused by mem_cgroup_lru_add_list() cannot
> be handled, which overwrites pc->mem_cgroup.
Kame, can you clarify that? What does mem_cgroup_lru_add_list() have to do
with this problem?
>
> But hm, is this too slow ?...
> ==
> mem_cgroup_uncharge_common()
> {
> ....
> if (PageSwapCache(page) || PageMlocked(page))
> return NULL;
> }
>
> page_alloc.c::
>
> static inline void free_page_mlock(struct page *page)
> {
>
> __dec_zone_page_state(page, NR_MLOCK);
> __count_vm_event(UNEVICTABLE_MLOCKFREED);
>
> mem_cgroup_uncharge_page(page);
> }
> ==
>
> BTW, at reading code briefly....why we have hooks in free_page() ?
>
> It seems do_munmap() and exit_mmap() calls munlock_vma_pages_all().
> So, it seems all vmas which has VM_MLOCKED are checked before freeing.
> vmscan never frees mlocked pages, I think.
>
> Any other path to free mlocked pages without munlock ?
I found the commit which introduced the hook in the freeing path; however, I
couldn't get more detail on why it was introduced from the commit description:
commit 985737cf2ea096ea946aed82c7484d40defc71a8
Author: Lee Schermerhorn <lee.schermerhorn@hp.com>
Date: Sat Oct 18 20:26:53 2008 -0700
mlock: count attempts to free mlocked page
Allow free of mlock()ed pages. This shouldn't happen, but during
developement, it occasionally did.
This patch allows us to survive that condition, while keeping the
statistics and events correct for debug.
> I feel freeing Mlocked page is a cause of problems.
--Ying
>
>
> Thanks,
> -Kame
* Re: [PATCH V2] memcg: add mlock statistic in memory.stat
2012-04-20 5:57 ` Ying Han
@ 2012-04-20 6:16 ` KAMEZAWA Hiroyuki
2012-04-20 6:39 ` Ying Han
0 siblings, 1 reply; 13+ messages in thread
From: KAMEZAWA Hiroyuki @ 2012-04-20 6:16 UTC (permalink / raw)
To: Ying Han
Cc: Johannes Weiner, Andrew Morton, Michal Hocko, Mel Gorman,
Rik van Riel, Hillf Danton, Hugh Dickins, Dan Magenheimer,
linux-mm
(2012/04/20 14:57), Ying Han wrote:
> On Thu, Apr 19, 2012 at 5:37 PM, KAMEZAWA Hiroyuki
> <kamezawa.hiroyu@jp.fujitsu.com> wrote:
>> (2012/04/19 22:12), Johannes Weiner wrote:
>>> Plus this code runs for ALL uncharges, the unlikely() and preliminary
>>> flag testing don't make it okay. It's bad that we have this in the
>>> allocator, but at least it would be good to hook into that branch and
>>> not add another one.
>>>
>>> pc->mem_cgroup stays intact after the uncharge. Could we make the
>>> memcg removal path wait on the mlock counter to drop to zero instead
>>> and otherwise keep Ying's version?
>>>
>>
>>
>> handling problem in ->destroy() path ? Hmm, it will work against use-after-free.
>
>> But accounting problem which may be caused by mem_cgroup_lru_add_list() cannot
>> be handled, which overwrites pc->mem_cgroup.
>
> Kame, can you clarify that? What the mem_cgroup_lru_add_list() has
> anything to do w/ this problem?
>
It overwrites pc->mem_cgroup. Then, assume a task in cgroup "A".
1. page is charged: pc->mem_cgroup = A + Used bit.
2. page is set Mlocked: A's mlock-counter += 1
3. page is uncharged: - Used bit.
4. page is added to lru: pc->mem_cgroup = root
5. page is freed: root's mlock-counter -= 1
Then A's mlock-counter is +1 and root's mlock-counter is -1, IF free_pages()
really handles mlocked pages...
>>
>> But hm, is this too slow ?...
>> ==
>> mem_cgroup_uncharge_common()
>> {
>> ....
>> if (PageSwapCache(page) || PageMlocked(page))
>> return NULL;
>> }
>>
>> page_alloc.c::
>>
>> static inline void free_page_mlock(struct page *page)
>> {
>>
>> __dec_zone_page_state(page, NR_MLOCK);
>> __count_vm_event(UNEVICTABLE_MLOCKFREED);
>>
>> mem_cgroup_uncharge_page(page);
>> }
>> ==
>>
>> BTW, at reading code briefly....why we have hooks in free_page() ?
>>
>> It seems do_munmap() and exit_mmap() calls munlock_vma_pages_all().
>> So, it seems all vmas which has VM_MLOCKED are checked before freeing.
>> vmscan never frees mlocked pages, I think.
>>
>> Any other path to free mlocked pages without munlock ?
>
> I found this commit which introduced the hook in the freeing path,
> however I couldn't get more details why it was introduced from the
> commit description
>
> commit 985737cf2ea096ea946aed82c7484d40defc71a8
> Author: Lee Schermerhorn <lee.schermerhorn@hp.com>
> Date: Sat Oct 18 20:26:53 2008 -0700
>
> mlock: count attempts to free mlocked page
>
> Allow free of mlock()ed pages. This shouldn't happen, but during
> developement, it occasionally did.
>
> This patch allows us to survive that condition, while keeping the
> statistics and events correct for debug.
>
>> I feel freeing Mlocked page is a cause of problems.
>
Sigh...."This shouldn't happen"!!!!!
How about adding a warning to the free_page() path and removing your current hook?
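For illustration, a minimal sketch of what that could look like, reusing the
existing free_page_mlock() shown earlier in this thread (the WARN_ON_ONCE()
placement is an assumption, not an actual proposal from the thread):
==
/* page_alloc.c: keep only the global accounting, flag the unexpected case */
static inline void free_page_mlock(struct page *page)
{
	/* "This shouldn't happen" -- see commit 985737cf2ea0 quoted above */
	WARN_ON_ONCE(1);
	__dec_zone_page_state(page, NR_MLOCK);
	__count_vm_event(UNEVICTABLE_MLOCKFREED);
}
==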
Thanks,
-Kame
* Re: [PATCH V2] memcg: add mlock statistic in memory.stat
2012-04-20 6:16 ` KAMEZAWA Hiroyuki
@ 2012-04-20 6:39 ` Ying Han
2012-04-20 6:52 ` KAMEZAWA Hiroyuki
0 siblings, 1 reply; 13+ messages in thread
From: Ying Han @ 2012-04-20 6:39 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: Johannes Weiner, Andrew Morton, Michal Hocko, Mel Gorman,
Rik van Riel, Hillf Danton, Hugh Dickins, Dan Magenheimer,
linux-mm
On Thu, Apr 19, 2012 at 11:16 PM, KAMEZAWA Hiroyuki
<kamezawa.hiroyu@jp.fujitsu.com> wrote:
> (2012/04/20 14:57), Ying Han wrote:
>
>> On Thu, Apr 19, 2012 at 5:37 PM, KAMEZAWA Hiroyuki
>> <kamezawa.hiroyu@jp.fujitsu.com> wrote:
>>> (2012/04/19 22:12), Johannes Weiner wrote:
>>>> Plus this code runs for ALL uncharges, the unlikely() and preliminary
>>>> flag testing don't make it okay. It's bad that we have this in the
>>>> allocator, but at least it would be good to hook into that branch and
>>>> not add another one.
>>>>
>>>> pc->mem_cgroup stays intact after the uncharge. Could we make the
>>>> memcg removal path wait on the mlock counter to drop to zero instead
>>>> and otherwise keep Ying's version?
>>>>
>>>
>>>
>>> handling problem in ->destroy() path ? Hmm, it will work against use-after-free.
>>
>>> But accounting problem which may be caused by mem_cgroup_lru_add_list() cannot
>>> be handled, which overwrites pc->mem_cgroup.
>>
>> Kame, can you clarify that? What the mem_cgroup_lru_add_list() has
>> anything to do w/ this problem?
>>
>
>
> It overwrites pc->mem_cgroup. Then, Assume a task in cgroup "A".
>
> 1. page is charged. pc->mem_cgroup = A + Used bit.
> 2. page is set Mlocked. A's mlock-counter += 1
> 3. page is uncharged - Used bit.
> 4. page is added to lru pc->mem_cgroup = root
> 5. page is freed root's mlock-coutner -=1,
>
> Then, A's mlock-counter +1, root's mlock-counter -1 IF free_pages()
> really handle mlocked pages...
Hmm, now the question is whether the TestClearPageMlock() should only
happen between step 2 and step 3. If so, the mlock stat will be
updated correctly.
>
>
>
>>>
>>> But hm, is this too slow ?...
>>> ==
>>> mem_cgroup_uncharge_common()
>>> {
>>> ....
>>> if (PageSwapCache(page) || PageMlocked(page))
>>> return NULL;
>>> }
>>>
>>> page_alloc.c::
>>>
>>> static inline void free_page_mlock(struct page *page)
>>> {
>>>
>>> __dec_zone_page_state(page, NR_MLOCK);
>>> __count_vm_event(UNEVICTABLE_MLOCKFREED);
>>>
>>> mem_cgroup_uncharge_page(page);
>>> }
>>> ==
>>>
>>> BTW, at reading code briefly....why we have hooks in free_page() ?
>>>
>>> It seems do_munmap() and exit_mmap() calls munlock_vma_pages_all().
>>> So, it seems all vmas which has VM_MLOCKED are checked before freeing.
>>> vmscan never frees mlocked pages, I think.
>>>
>>> Any other path to free mlocked pages without munlock ?
>>
>> I found this commit which introduced the hook in the freeing path,
>> however I couldn't get more details why it was introduced from the
>> commit description
>>
>> commit 985737cf2ea096ea946aed82c7484d40defc71a8
>> Author: Lee Schermerhorn <lee.schermerhorn@hp.com>
>> Date: Sat Oct 18 20:26:53 2008 -0700
>>
>> mlock: count attempts to free mlocked page
>>
>> Allow free of mlock()ed pages. This shouldn't happen, but during
>> developement, it occasionally did.
>>
>> This patch allows us to survive that condition, while keeping the
>> statistics and events correct for debug.
>>
>>> I feel freeing Mlocked page is a cause of problems.
>>
>
>
> Sigh...."This shouldn't happen"!!!!!
>
> How about adding warning to free_page() path and remove your current hook ?
That does make things a lot simpler... I will wait a bit in case someone
remembers a counter-example.
--Ying
> Thanks,
> -Kame
* Re: [PATCH V2] memcg: add mlock statistic in memory.stat
2012-04-20 6:39 ` Ying Han
@ 2012-04-20 6:52 ` KAMEZAWA Hiroyuki
0 siblings, 0 replies; 13+ messages in thread
From: KAMEZAWA Hiroyuki @ 2012-04-20 6:52 UTC (permalink / raw)
To: Ying Han
Cc: Johannes Weiner, Andrew Morton, Michal Hocko, Mel Gorman,
Rik van Riel, Hillf Danton, Hugh Dickins, Dan Magenheimer,
linux-mm
(2012/04/20 15:39), Ying Han wrote:
> On Thu, Apr 19, 2012 at 11:16 PM, KAMEZAWA Hiroyuki
> <kamezawa.hiroyu@jp.fujitsu.com> wrote:
>> (2012/04/20 14:57), Ying Han wrote:
>>
>>> On Thu, Apr 19, 2012 at 5:37 PM, KAMEZAWA Hiroyuki
>>> <kamezawa.hiroyu@jp.fujitsu.com> wrote:
>>>> (2012/04/19 22:12), Johannes Weiner wrote:
>>>>> Plus this code runs for ALL uncharges, the unlikely() and preliminary
>>>>> flag testing don't make it okay. It's bad that we have this in the
>>>>> allocator, but at least it would be good to hook into that branch and
>>>>> not add another one.
>>>>>
>>>>> pc->mem_cgroup stays intact after the uncharge. Could we make the
>>>>> memcg removal path wait on the mlock counter to drop to zero instead
>>>>> and otherwise keep Ying's version?
>>>>>
>>>>
>>>>
>>>> handling problem in ->destroy() path ? Hmm, it will work against use-after-free.
>>>
>>>> But accounting problem which may be caused by mem_cgroup_lru_add_list() cannot
>>>> be handled, which overwrites pc->mem_cgroup.
>>>
>>> Kame, can you clarify that? What the mem_cgroup_lru_add_list() has
>>> anything to do w/ this problem?
>>>
>>
>>
>> It overwrites pc->mem_cgroup. Then, Assume a task in cgroup "A".
>>
>> 1. page is charged. pc->mem_cgroup = A + Used bit.
>> 2. page is set Mlocked. A's mlock-counter += 1
>> 3. page is uncharged - Used bit.
>> 4. page is added to lru pc->mem_cgroup = root
>> 5. page is freed root's mlock-coutner -=1,
>>
>> Then, A's mlock-counter +1, root's mlock-counter -1 IF free_pages()
>> really handle mlocked pages...
>
> Hmm, now the question is whether the TestClearPageMlock() should only
> happen between step 2 and step 3. If so, the mlock stat will be
> updated correctly.
>
Yes, I think it's true. TestClearPageMlock() should happen
between 2 and 3, I believe.
>> Sigh...."This shouldn't happen"!!!!!
>>
>> How about adding warning to free_page() path and remove your current hook ?
>
> That does make thing a lot simpler.. I will wait a bit in case someone
> remember a counter example?
>
Sure. Thank you for your work.
Thanks,
-Kame
* Re: [PATCH V2] memcg: add mlock statistic in memory.stat
2012-04-19 0:59 ` KAMEZAWA Hiroyuki
2012-04-19 13:12 ` Johannes Weiner
@ 2012-04-19 22:43 ` Ying Han
1 sibling, 0 replies; 13+ messages in thread
From: Ying Han @ 2012-04-19 22:43 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: Andrew Morton, Michal Hocko, Johannes Weiner, Mel Gorman,
Rik van Riel, Hillf Danton, Hugh Dickins, Dan Magenheimer,
linux-mm
On Wed, Apr 18, 2012 at 5:59 PM, KAMEZAWA Hiroyuki
<kamezawa.hiroyu@jp.fujitsu.com> wrote:
> (2012/04/19 8:33), Andrew Morton wrote:
>
>> On Wed, 18 Apr 2012 11:21:55 -0700
>> Ying Han <yinghan@google.com> wrote:
>>
>>> We have the nr_mlock stat both in meminfo as well as vmstat system wide, this
>>> patch adds the mlock field into per-memcg memory stat. The stat itself enhances
>>> the metrics exported by memcg since the unevictable lru includes more than
>>> mlock()'d page like SHM_LOCK'd.
>>>
>>> Why we need to count mlock'd pages while they are unevictable and we can not
>>> do much on them anyway?
>>>
>>> This is true. The mlock stat I am proposing is more helpful for system admin
>>> and kernel developer to understand the system workload. The same information
>>> should be helpful to add into OOM log as well. Many times in the past that we
>>> need to read the mlock stat from the per-container meminfo for different
>>> reason. Afterall, we do have the ability to read the mlock from meminfo and
>>> this patch fills the info in memcg.
>>>
>>>
>>> ...
>>>
>>> static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page)
>>> {
>>> + bool locked;
>>> + unsigned long flags;
>>> +
>>> VM_BUG_ON(PageLRU(page));
>>>
>>> if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
>>> return 0;
>>>
>>> + mem_cgroup_begin_update_page_stat(page, &locked, &flags);
>>> if (!TestSetPageMlocked(page)) {
>>> inc_zone_page_state(page, NR_MLOCK);
>>> + mem_cgroup_inc_page_stat(page, MEMCG_NR_MLOCK);
>>> count_vm_event(UNEVICTABLE_PGMLOCKED);
>>> }
>>> + mem_cgroup_end_update_page_stat(page, &locked, &flags);
>>> +
>>> return 1;
>>> }
>>
>> Unrelated to this patch: is_mlocked_vma() is misnamed. A function with
>> that name should be a bool-returning test which has no side-effects.
>>
>>>
>>> ...
>>>
>>> static void __free_pages_ok(struct page *page, unsigned int order)
>>> {
>>> unsigned long flags;
>>> - int wasMlocked = __TestClearPageMlocked(page);
>>> + bool locked;
>>>
>>> if (!free_pages_prepare(page, order))
>>> return;
>>>
>>> local_irq_save(flags);
>>> - if (unlikely(wasMlocked))
>>> + mem_cgroup_begin_update_page_stat(page, &locked, &flags);
>>
>> hm, what's going on here. The page now has a zero refcount and is to
>> be returned to the buddy. But mem_cgroup_begin_update_page_stat()
>> assumes that the page still belongs to a memcg. I'd have thought that
>> any page_cgroup backreferences would have been torn down by now?
>>
>>> + if (unlikely(__TestClearPageMlocked(page)))
>>> free_page_mlock(page);
>>
>
>
> Ah, this is problem. Now, we have following code.
> ==
>
>> struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
>> enum lru_list lru)
>> {
>> struct mem_cgroup_per_zone *mz;
>> struct mem_cgroup *memcg;
>> struct page_cgroup *pc;
>>
>> if (mem_cgroup_disabled())
>> return &zone->lruvec;
>>
>> pc = lookup_page_cgroup(page);
>> memcg = pc->mem_cgroup;
>>
>> /*
>> * Surreptitiously switch any uncharged page to root:
>> * an uncharged page off lru does nothing to secure
>> * its former mem_cgroup from sudden removal.
>> *
>> * Our caller holds lru_lock, and PageCgroupUsed is updated
>> * under page_cgroup lock: between them, they make all uses
>> * of pc->mem_cgroup safe.
>> */
>> if (!PageCgroupUsed(pc) && memcg != root_mem_cgroup)
>> pc->mem_cgroup = memcg = root_mem_cgroup;
>
> ==
>
> Then, accessing pc->mem_cgroup without checking the PCG_USED bit is dangerous.
> It may trigger a #GP because of sudden removal of the memcg, or, because of the
> above code, mis-accounting may happen... pc->mem_cgroup may be overwritten already.
>
> My proposal is to call TestClearPageMlocked(page) from mem_cgroup_uncharge().
>
> Like this.
> ==
> mem_cgroup_charge_statistics(memcg, anon, -nr_pages);
>
> /*
> * Pages reach here when it's fully unmapped or dropped from file cache.
> * we are under lock_page_cgroup() and have no race with memcg activities.
> */
> if (unlikely(PageMlocked(page))) {
> if (TestClearPageMlocked(page))
> decrement counter.
> }
Hmm, so we save the call to mem_cgroup_begin/end_update_page_stat()
here. Are you suggesting moving the call to free_page_mlock() here?
> ClearPageCgroupUsed(pc);
> ==
> But please check performance impact...
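A rough sketch of that uncharge-path accounting, for concreteness (not a
posted patch): the call sits where the snippet above puts it, i.e. under
lock_page_cgroup() and before ClearPageCgroupUsed(), so pc->mem_cgroup is
still stable. MEMCG_NR_MLOCK is the stat item added by this patch; folding
the NR_MLOCK fixup in here, instead of leaving it to free_page_mlock(), is
an assumption.

	static void mem_cgroup_uncharge_mlock(struct page *page)
	{
		/* Page is fully unmapped or dropped from cache at this point. */
		if (unlikely(PageMlocked(page)) && TestClearPageMlocked(page)) {
			dec_zone_page_state(page, NR_MLOCK);
			mem_cgroup_dec_page_stat(page, MEMCG_NR_MLOCK);
		}
	}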
Yes, I will run some performance measurements on that.
--Ying
>
> Thanks,
> -Kame
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH V2] memcg: add mlock statistic in memory.stat
2012-04-18 23:33 ` Andrew Morton
2012-04-19 0:59 ` KAMEZAWA Hiroyuki
@ 2012-04-19 22:30 ` Ying Han
1 sibling, 0 replies; 13+ messages in thread
From: Ying Han @ 2012-04-19 22:30 UTC (permalink / raw)
To: Andrew Morton
Cc: Michal Hocko, Johannes Weiner, Mel Gorman, KAMEZAWA Hiroyuki,
Rik van Riel, Hillf Danton, Hugh Dickins, Dan Magenheimer,
linux-mm
On Wed, Apr 18, 2012 at 4:33 PM, Andrew Morton
<akpm@linux-foundation.org> wrote:
> On Wed, 18 Apr 2012 11:21:55 -0700
> Ying Han <yinghan@google.com> wrote:
>
>> We have the nr_mlock stat both in meminfo as well as vmstat system wide, this
>> patch adds the mlock field into per-memcg memory stat. The stat itself enhances
>> the metrics exported by memcg since the unevictable lru includes more than
>> mlock()'d page like SHM_LOCK'd.
>>
>> Why we need to count mlock'd pages while they are unevictable and we can not
>> do much on them anyway?
>>
>> This is true. The mlock stat I am proposing is more helpful for system admin
>> and kernel developer to understand the system workload. The same information
>> should be helpful to add into OOM log as well. Many times in the past that we
>> need to read the mlock stat from the per-container meminfo for different
>> reason. Afterall, we do have the ability to read the mlock from meminfo and
>> this patch fills the info in memcg.
>>
>>
>> ...
>>
>> static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page)
>> {
>> + bool locked;
>> + unsigned long flags;
>> +
>> VM_BUG_ON(PageLRU(page));
>>
>> if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
>> return 0;
>>
>> + mem_cgroup_begin_update_page_stat(page, &locked, &flags);
>> if (!TestSetPageMlocked(page)) {
>> inc_zone_page_state(page, NR_MLOCK);
>> + mem_cgroup_inc_page_stat(page, MEMCG_NR_MLOCK);
>> count_vm_event(UNEVICTABLE_PGMLOCKED);
>> }
>> + mem_cgroup_end_update_page_stat(page, &locked, &flags);
>> +
>> return 1;
>> }
>
> Unrelated to this patch: is_mlocked_vma() is misnamed. A function with
> that name should be a bool-returning test which has no side-effects.
That is true. Maybe a separate patch to fix that up :)
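A sketch of what that separate patch could do: split the check into a
side-effect-free predicate plus an accounting helper. Names here are
illustrative only; the bodies are lifted from the quoted hunk.

	/* Pure test, no side effects, as the name promises. */
	static inline bool vma_is_mlocked(struct vm_area_struct *vma)
	{
		return (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED;
	}

	/* Accounting for a new page faulted into an mlocked VMA, done by the caller. */
	static inline void mlock_new_page(struct page *page)
	{
		VM_BUG_ON(PageLRU(page));
		if (!TestSetPageMlocked(page)) {
			inc_zone_page_state(page, NR_MLOCK);
			count_vm_event(UNEVICTABLE_PGMLOCKED);
		}
	}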
>
>>
>> ...
>>
>> static void __free_pages_ok(struct page *page, unsigned int order)
>> {
>> unsigned long flags;
>> - int wasMlocked = __TestClearPageMlocked(page);
>> + bool locked;
>>
>> if (!free_pages_prepare(page, order))
>> return;
>>
>> local_irq_save(flags);
>> - if (unlikely(wasMlocked))
>> + mem_cgroup_begin_update_page_stat(page, &locked, &flags);
>
> hm, what's going on here. The page now has a zero refcount and is to
> be returned to the buddy. But mem_cgroup_begin_update_page_stat()
> assumes that the page still belongs to a memcg. I'd have thought that
> any page_cgroup backreferences would have been torn down by now?
True, I missed that in the first place. This will trigger a GPF easily
if the memcg is destroyed after the charge drops to 0.
The problem is the time window between mem_cgroup_uncharge_page() and
free_hot_cold_page(), where the latter calls
__TestClearPageMlocked(page).
I am wondering whether we can move the __TestClearPageMlocked(page)
earlier, before mem_cgroup_uncharge_page(). Is there a particular
reason why the Mlocked bit has to be cleared at the last moment?
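To make that window concrete, a simplified sketch of the sequence (anon
case; the wrapper function is purely illustrative, the callees are the
3.4-era paths discussed above):

	static void example_last_unmap_then_free(struct page *page)
	{
		/* Step 1: last unmap, the memcg charge is dropped here
		 * (in the real code via page_remove_rmap()). */
		mem_cgroup_uncharge_page(page);
		/* From here on the memcg can be rmdir'ed and freed. */

		/* Step 2: the last put_page() reaches free_hot_cold_page(),
		 * which still consults PG_mlocked. A memcg hook at this
		 * point dereferences a possibly stale pc->mem_cgroup. */
		if (__TestClearPageMlocked(page))
			free_page_mlock(page);
	}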
--Ying
>
>> + if (unlikely(__TestClearPageMlocked(page)))
>> free_page_mlock(page);
>
> And if the page _is_ still accessible via cgroup lookup, the use of the
> nonatomic RMW is dangerous.
>
>> __count_vm_events(PGFREE, 1 << order);
>> free_one_page(page_zone(page), page, order,
>> get_pageblock_migratetype(page));
>> + mem_cgroup_end_update_page_stat(page, &locked, &flags);
>> local_irq_restore(flags);
>> }
>>
>> @@ -1250,7 +1256,7 @@ void free_hot_cold_page(struct page *page, int cold)
>
> The same comments apply in free_hot_cold_page().
>
>> struct per_cpu_pages *pcp;
>> unsigned long flags;
>> int migratetype;
>> - int wasMlocked = __TestClearPageMlocked(page);
>> + bool locked;
>>
>> if (!free_pages_prepare(page, 0))
>> return;
>>
>> ...
>>
>
^ permalink raw reply [flat|nested] 13+ messages in thread
end of thread, other threads:[~2012-04-20 6:54 UTC | newest]
Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-04-18 18:21 [PATCH V2] memcg: add mlock statistic in memory.stat Ying Han
2012-04-18 23:33 ` Andrew Morton
2012-04-19 0:59 ` KAMEZAWA Hiroyuki
2012-04-19 13:12 ` Johannes Weiner
2012-04-19 22:46 ` Ying Han
2012-04-19 23:04 ` Johannes Weiner
2012-04-20 0:37 ` KAMEZAWA Hiroyuki
2012-04-20 5:57 ` Ying Han
2012-04-20 6:16 ` KAMEZAWA Hiroyuki
2012-04-20 6:39 ` Ying Han
2012-04-20 6:52 ` KAMEZAWA Hiroyuki
2012-04-19 22:43 ` Ying Han
2012-04-19 22:30 ` Ying Han