* [PATCH -mmotm 1/5] memcg: disable irq at page cgroup lock
2010-03-14 23:26 [PATCH -mmotm 0/5] memcg: per cgroup dirty limit (v7) Andrea Righi
@ 2010-03-14 23:26 ` Andrea Righi
2010-03-15 0:06 ` KAMEZAWA Hiroyuki
` (2 more replies)
2010-03-14 23:26 ` [PATCH -mmotm 2/5] memcg: dirty memory documentation Andrea Righi
` (6 subsequent siblings)
7 siblings, 3 replies; 71+ messages in thread
From: Andrea Righi @ 2010-03-14 23:26 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki, Daisuke Nishimura, Balbir Singh
Cc: Vivek Goyal, Peter Zijlstra, Trond Myklebust, Suleiman Souhlal,
Greg Thelen, Kirill A. Shutemov, Andrew Morton, containers,
linux-kernel, linux-mm
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Now, file-mapped is maintaiend. But more generic update function
will be needed for dirty page accounting.
For accountig page status, we have to guarantee lock_page_cgroup()
will be never called under tree_lock held.
To guarantee that, we use trylock at updating status.
By this, we do fuzzy accounting, but in almost all case, it's correct.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
include/linux/memcontrol.h | 7 +++-
include/linux/page_cgroup.h | 15 +++++++
mm/memcontrol.c | 94 +++++++++++++++++++++++++++++++++----------
mm/rmap.c | 4 +-
4 files changed, 95 insertions(+), 25 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 44301c6..88d3f9e 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -124,7 +124,12 @@ static inline bool mem_cgroup_disabled(void)
return false;
}
-void mem_cgroup_update_file_mapped(struct page *page, int val);
+enum mem_cgroup_page_stat_item {
+ MEMCG_NR_FILE_MAPPED,
+ MEMCG_NR_FILE_NSTAT,
+};
+
+void mem_cgroup_update_stat(struct page *page, int idx, bool charge);
unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
gfp_t gfp_mask, int nid,
int zid);
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index 30b0813..bf9a913 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -39,6 +39,8 @@ enum {
PCG_CACHE, /* charged as cache */
PCG_USED, /* this object is in use. */
PCG_ACCT_LRU, /* page has been accounted for */
+ /* for cache-status accounting */
+ PCG_FILE_MAPPED,
};
#define TESTPCGFLAG(uname, lname) \
@@ -57,6 +59,10 @@ static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \
static inline int TestClearPageCgroup##uname(struct page_cgroup *pc) \
{ return test_and_clear_bit(PCG_##lname, &pc->flags); }
+/* Page/File stat flag mask */
+#define PCG_PageStatMask ((1 << PCG_FILE_MAPPED))
+
+
TESTPCGFLAG(Locked, LOCK)
/* Cache flag is set only once (at allocation) */
@@ -73,6 +79,10 @@ CLEARPCGFLAG(AcctLRU, ACCT_LRU)
TESTPCGFLAG(AcctLRU, ACCT_LRU)
TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU)
+TESTPCGFLAG(FileMapped, FILE_MAPPED)
+SETPCGFLAG(FileMapped, FILE_MAPPED)
+CLEARPCGFLAG(FileMapped, FILE_MAPPED)
+
static inline int page_cgroup_nid(struct page_cgroup *pc)
{
return page_to_nid(pc->page);
@@ -93,6 +103,11 @@ static inline void unlock_page_cgroup(struct page_cgroup *pc)
bit_spin_unlock(PCG_LOCK, &pc->flags);
}
+static inline int trylock_page_cgroup(struct page_cgroup *pc)
+{
+ return bit_spin_trylock(PCG_LOCK, &pc->flags);
+}
+
#else /* CONFIG_CGROUP_MEM_RES_CTLR */
struct page_cgroup;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7fab84e..b7c23ea 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1348,30 +1348,83 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
* Currently used to update mapped file statistics, but the routine can be
* generalized to update other statistics as well.
*/
-void mem_cgroup_update_file_mapped(struct page *page, int val)
+void __mem_cgroup_update_stat(struct page_cgroup *pc, int idx, bool charge)
{
struct mem_cgroup *mem;
- struct page_cgroup *pc;
+ int val;
- pc = lookup_page_cgroup(page);
- if (unlikely(!pc))
- return;
-
- lock_page_cgroup(pc);
mem = pc->mem_cgroup;
- if (!mem)
- goto done;
+ if (!mem || !PageCgroupUsed(pc))
+ return;
- if (!PageCgroupUsed(pc))
- goto done;
+ if (charge)
+ val = 1;
+ else
+ val = -1;
+ switch (idx) {
+ case MEMCG_NR_FILE_MAPPED:
+ if (charge) {
+ if (!PageCgroupFileMapped(pc))
+ SetPageCgroupFileMapped(pc);
+ else
+ val = 0;
+ } else {
+ if (PageCgroupFileMapped(pc))
+ ClearPageCgroupFileMapped(pc);
+ else
+ val = 0;
+ }
+ idx = MEM_CGROUP_STAT_FILE_MAPPED;
+ break;
+ default:
+ BUG();
+ break;
+ }
/*
* Preemption is already disabled. We can use __this_cpu_xxx
*/
- __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val);
+ __this_cpu_add(mem->stat->count[idx], val);
+}
-done:
- unlock_page_cgroup(pc);
+void mem_cgroup_update_stat(struct page *page, int idx, bool charge)
+{
+ struct page_cgroup *pc;
+
+ pc = lookup_page_cgroup(page);
+ if (!pc || mem_cgroup_is_root(pc->mem_cgroup))
+ return;
+
+ if (trylock_page_cgroup(pc)) {
+ __mem_cgroup_update_stat(pc, idx, charge);
+ unlock_page_cgroup(pc);
+ }
+ return;
+}
+
+static void mem_cgroup_migrate_stat(struct page_cgroup *pc,
+ struct mem_cgroup *from, struct mem_cgroup *to)
+{
+ if (PageCgroupFileMapped(pc)) {
+ __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+ if (!mem_cgroup_is_root(to)) {
+ __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+ } else {
+ ClearPageCgroupFileMapped(pc);
+ }
+ }
+}
+
+static void
+__mem_cgroup_stat_fixup(struct page_cgroup *pc, struct mem_cgroup *mem)
+{
+ if (mem_cgroup_is_root(mem))
+ return;
+ /* We'are in uncharge() and lock_page_cgroup */
+ if (PageCgroupFileMapped(pc)) {
+ __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+ ClearPageCgroupFileMapped(pc);
+ }
}
/*
@@ -1810,13 +1863,7 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
VM_BUG_ON(pc->mem_cgroup != from);
page = pc->page;
- if (page_mapped(page) && !PageAnon(page)) {
- /* Update mapped_file data for mem_cgroup */
- preempt_disable();
- __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
- __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
- preempt_enable();
- }
+ mem_cgroup_migrate_stat(pc, from, to);
mem_cgroup_charge_statistics(from, pc, false);
if (uncharge)
/* This is not "cancel", but cancel_charge does all we need. */
@@ -2208,6 +2255,9 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
__do_uncharge(mem, ctype);
if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
mem_cgroup_swap_statistics(mem, true);
+ if (unlikely(PCG_PageStatMask & pc->flags))
+ __mem_cgroup_stat_fixup(pc, mem);
+
mem_cgroup_charge_statistics(mem, pc, false);
ClearPageCgroupUsed(pc);
diff --git a/mm/rmap.c b/mm/rmap.c
index fcd593c..b5b2daf 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -829,7 +829,7 @@ void page_add_file_rmap(struct page *page)
{
if (atomic_inc_and_test(&page->_mapcount)) {
__inc_zone_page_state(page, NR_FILE_MAPPED);
- mem_cgroup_update_file_mapped(page, 1);
+ mem_cgroup_update_stat(page, MEMCG_NR_FILE_MAPPED, true);
}
}
@@ -861,7 +861,7 @@ void page_remove_rmap(struct page *page)
__dec_zone_page_state(page, NR_ANON_PAGES);
} else {
__dec_zone_page_state(page, NR_FILE_MAPPED);
- mem_cgroup_update_file_mapped(page, -1);
+ mem_cgroup_update_stat(page, MEMCG_NR_FILE_MAPPED, false);
}
/*
* It would be tidy to reset the PageAnon mapping here,
--
1.6.3.3
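As a usage illustration of where this generic helper is headed, a minimal
sketch of a dirty-page hook (hypothetical: MEMCG_NR_FILE_DIRTY and the
function below are placeholders for what later patches in this series add;
only mem_cgroup_update_stat() comes from this patch):

void account_page_dirtied_memcg(struct page *page, struct address_space *mapping)
{
        if (mapping_cap_account_dirty(mapping)) {
                __inc_zone_page_state(page, NR_FILE_DIRTY);
                /* placeholder stat item, not defined by this patch */
                mem_cgroup_update_stat(page, MEMCG_NR_FILE_DIRTY, true);
        }
}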
* Re: [PATCH -mmotm 1/5] memcg: disable irq at page cgroup lock
2010-03-14 23:26 ` [PATCH -mmotm 1/5] memcg: disable irq at page cgroup lock Andrea Righi
@ 2010-03-15 0:06 ` KAMEZAWA Hiroyuki
2010-03-15 10:00 ` Andrea Righi
2010-03-17 7:04 ` Balbir Singh
2010-03-17 11:58 ` Balbir Singh
2 siblings, 1 reply; 71+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-03-15 0:06 UTC (permalink / raw)
To: Andrea Righi
Cc: Daisuke Nishimura, Balbir Singh, Vivek Goyal, Peter Zijlstra,
Trond Myklebust, Suleiman Souhlal, Greg Thelen,
Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
linux-mm
On Mon, 15 Mar 2010 00:26:38 +0100
Andrea Righi <arighi@develer.com> wrote:
> From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
>
> Now, file-mapped is maintaiend. But more generic update function
> will be needed for dirty page accounting.
>
> For accountig page status, we have to guarantee lock_page_cgroup()
> will be never called under tree_lock held.
> To guarantee that, we use trylock at updating status.
> By this, we do fuzzy accounting, but in almost all case, it's correct.
>
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Bad patch title...."use trylock for safe accounting in some contexts" ?
Thanks,
-Kame
> ---
> include/linux/memcontrol.h | 7 +++-
> include/linux/page_cgroup.h | 15 +++++++
> mm/memcontrol.c | 94 +++++++++++++++++++++++++++++++++----------
> mm/rmap.c | 4 +-
> 4 files changed, 95 insertions(+), 25 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 44301c6..88d3f9e 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -124,7 +124,12 @@ static inline bool mem_cgroup_disabled(void)
> return false;
> }
>
> -void mem_cgroup_update_file_mapped(struct page *page, int val);
> +enum mem_cgroup_page_stat_item {
> + MEMCG_NR_FILE_MAPPED,
> + MEMCG_NR_FILE_NSTAT,
> +};
> +
> +void mem_cgroup_update_stat(struct page *page, int idx, bool charge);
> unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
> gfp_t gfp_mask, int nid,
> int zid);
> diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
> index 30b0813..bf9a913 100644
> --- a/include/linux/page_cgroup.h
> +++ b/include/linux/page_cgroup.h
> @@ -39,6 +39,8 @@ enum {
> PCG_CACHE, /* charged as cache */
> PCG_USED, /* this object is in use. */
> PCG_ACCT_LRU, /* page has been accounted for */
> + /* for cache-status accounting */
> + PCG_FILE_MAPPED,
> };
>
> #define TESTPCGFLAG(uname, lname) \
> @@ -57,6 +59,10 @@ static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \
> static inline int TestClearPageCgroup##uname(struct page_cgroup *pc) \
> { return test_and_clear_bit(PCG_##lname, &pc->flags); }
>
> +/* Page/File stat flag mask */
> +#define PCG_PageStatMask ((1 << PCG_FILE_MAPPED))
> +
> +
> TESTPCGFLAG(Locked, LOCK)
>
> /* Cache flag is set only once (at allocation) */
> @@ -73,6 +79,10 @@ CLEARPCGFLAG(AcctLRU, ACCT_LRU)
> TESTPCGFLAG(AcctLRU, ACCT_LRU)
> TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU)
>
> +TESTPCGFLAG(FileMapped, FILE_MAPPED)
> +SETPCGFLAG(FileMapped, FILE_MAPPED)
> +CLEARPCGFLAG(FileMapped, FILE_MAPPED)
> +
> static inline int page_cgroup_nid(struct page_cgroup *pc)
> {
> return page_to_nid(pc->page);
> @@ -93,6 +103,11 @@ static inline void unlock_page_cgroup(struct page_cgroup *pc)
> bit_spin_unlock(PCG_LOCK, &pc->flags);
> }
>
> +static inline int trylock_page_cgroup(struct page_cgroup *pc)
> +{
> + return bit_spin_trylock(PCG_LOCK, &pc->flags);
> +}
> +
> #else /* CONFIG_CGROUP_MEM_RES_CTLR */
> struct page_cgroup;
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 7fab84e..b7c23ea 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1348,30 +1348,83 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
> * Currently used to update mapped file statistics, but the routine can be
> * generalized to update other statistics as well.
> */
> -void mem_cgroup_update_file_mapped(struct page *page, int val)
> +void __mem_cgroup_update_stat(struct page_cgroup *pc, int idx, bool charge)
> {
> struct mem_cgroup *mem;
> - struct page_cgroup *pc;
> + int val;
>
> - pc = lookup_page_cgroup(page);
> - if (unlikely(!pc))
> - return;
> -
> - lock_page_cgroup(pc);
> mem = pc->mem_cgroup;
> - if (!mem)
> - goto done;
> + if (!mem || !PageCgroupUsed(pc))
> + return;
>
> - if (!PageCgroupUsed(pc))
> - goto done;
> + if (charge)
> + val = 1;
> + else
> + val = -1;
>
> + switch (idx) {
> + case MEMCG_NR_FILE_MAPPED:
> + if (charge) {
> + if (!PageCgroupFileMapped(pc))
> + SetPageCgroupFileMapped(pc);
> + else
> + val = 0;
> + } else {
> + if (PageCgroupFileMapped(pc))
> + ClearPageCgroupFileMapped(pc);
> + else
> + val = 0;
> + }
> + idx = MEM_CGROUP_STAT_FILE_MAPPED;
> + break;
> + default:
> + BUG();
> + break;
> + }
> /*
> * Preemption is already disabled. We can use __this_cpu_xxx
> */
> - __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val);
> + __this_cpu_add(mem->stat->count[idx], val);
> +}
>
> -done:
> - unlock_page_cgroup(pc);
> +void mem_cgroup_update_stat(struct page *page, int idx, bool charge)
> +{
> + struct page_cgroup *pc;
> +
> + pc = lookup_page_cgroup(page);
> + if (!pc || mem_cgroup_is_root(pc->mem_cgroup))
> + return;
> +
> + if (trylock_page_cgroup(pc)) {
> + __mem_cgroup_update_stat(pc, idx, charge);
> + unlock_page_cgroup(pc);
> + }
> + return;
> +}
> +
> +static void mem_cgroup_migrate_stat(struct page_cgroup *pc,
> + struct mem_cgroup *from, struct mem_cgroup *to)
> +{
> + if (PageCgroupFileMapped(pc)) {
> + __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> + if (!mem_cgroup_is_root(to)) {
> + __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> + } else {
> + ClearPageCgroupFileMapped(pc);
> + }
> + }
> +}
> +
> +static void
> +__mem_cgroup_stat_fixup(struct page_cgroup *pc, struct mem_cgroup *mem)
> +{
> + if (mem_cgroup_is_root(mem))
> + return;
> + /* We'are in uncharge() and lock_page_cgroup */
> + if (PageCgroupFileMapped(pc)) {
> + __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> + ClearPageCgroupFileMapped(pc);
> + }
> }
>
> /*
> @@ -1810,13 +1863,7 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
> VM_BUG_ON(pc->mem_cgroup != from);
>
> page = pc->page;
> - if (page_mapped(page) && !PageAnon(page)) {
> - /* Update mapped_file data for mem_cgroup */
> - preempt_disable();
> - __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> - __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> - preempt_enable();
> - }
> + mem_cgroup_migrate_stat(pc, from, to);
> mem_cgroup_charge_statistics(from, pc, false);
> if (uncharge)
> /* This is not "cancel", but cancel_charge does all we need. */
> @@ -2208,6 +2255,9 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
> __do_uncharge(mem, ctype);
> if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
> mem_cgroup_swap_statistics(mem, true);
> + if (unlikely(PCG_PageStatMask & pc->flags))
> + __mem_cgroup_stat_fixup(pc, mem);
> +
> mem_cgroup_charge_statistics(mem, pc, false);
>
> ClearPageCgroupUsed(pc);
> diff --git a/mm/rmap.c b/mm/rmap.c
> index fcd593c..b5b2daf 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -829,7 +829,7 @@ void page_add_file_rmap(struct page *page)
> {
> if (atomic_inc_and_test(&page->_mapcount)) {
> __inc_zone_page_state(page, NR_FILE_MAPPED);
> - mem_cgroup_update_file_mapped(page, 1);
> + mem_cgroup_update_stat(page, MEMCG_NR_FILE_MAPPED, true);
> }
> }
>
> @@ -861,7 +861,7 @@ void page_remove_rmap(struct page *page)
> __dec_zone_page_state(page, NR_ANON_PAGES);
> } else {
> __dec_zone_page_state(page, NR_FILE_MAPPED);
> - mem_cgroup_update_file_mapped(page, -1);
> + mem_cgroup_update_stat(page, MEMCG_NR_FILE_MAPPED, false);
> }
> /*
> * It would be tidy to reset the PageAnon mapping here,
> --
> 1.6.3.3
>
>
* Re: [PATCH -mmotm 1/5] memcg: disable irq at page cgroup lock
2010-03-15 0:06 ` KAMEZAWA Hiroyuki
@ 2010-03-15 10:00 ` Andrea Righi
0 siblings, 0 replies; 71+ messages in thread
From: Andrea Righi @ 2010-03-15 10:00 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: Daisuke Nishimura, Balbir Singh, Vivek Goyal, Peter Zijlstra,
Trond Myklebust, Suleiman Souhlal, Greg Thelen,
Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
linux-mm
On Mon, Mar 15, 2010 at 09:06:38AM +0900, KAMEZAWA Hiroyuki wrote:
> On Mon, 15 Mar 2010 00:26:38 +0100
> Andrea Righi <arighi@develer.com> wrote:
>
> > From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> >
> > Now, file-mapped is maintaiend. But more generic update function
> > will be needed for dirty page accounting.
> >
> > For accountig page status, we have to guarantee lock_page_cgroup()
> > will be never called under tree_lock held.
> > To guarantee that, we use trylock at updating status.
> > By this, we do fuzzy accounting, but in almost all case, it's correct.
> >
> > Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
>
> Bad patch title...."use trylock for safe accounting in some contexts" ?
OK, sounds better. I just copied & pasted the email subject, but the title
was probably related to the old lock_page_cgroup()+irq_disable patch.
Thanks,
-Andrea
* Re: [PATCH -mmotm 1/5] memcg: disable irq at page cgroup lock
2010-03-14 23:26 ` [PATCH -mmotm 1/5] memcg: disable irq at page cgroup lock Andrea Righi
2010-03-15 0:06 ` KAMEZAWA Hiroyuki
@ 2010-03-17 7:04 ` Balbir Singh
2010-03-17 11:58 ` Balbir Singh
2 siblings, 0 replies; 71+ messages in thread
From: Balbir Singh @ 2010-03-17 7:04 UTC (permalink / raw)
To: Andrea Righi
Cc: KAMEZAWA Hiroyuki, Daisuke Nishimura, Vivek Goyal, Peter Zijlstra,
Trond Myklebust, Suleiman Souhlal, Greg Thelen,
Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
linux-mm
* Andrea Righi <arighi@develer.com> [2010-03-15 00:26:38]:
> From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
>
> Now, file-mapped is maintaiend. But more generic update function
^^^^^^^^^^ (typo)
> will be needed for dirty page accounting.
>
> For accountig page status, we have to guarantee lock_page_cgroup()
> will be never called under tree_lock held.
> To guarantee that, we use trylock at updating status.
> By this, we do fuzzy accounting, but in almost all case, it's correct.
>
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> ---
> include/linux/memcontrol.h | 7 +++-
> include/linux/page_cgroup.h | 15 +++++++
> mm/memcontrol.c | 94 +++++++++++++++++++++++++++++++++----------
> mm/rmap.c | 4 +-
> 4 files changed, 95 insertions(+), 25 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 44301c6..88d3f9e 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -124,7 +124,12 @@ static inline bool mem_cgroup_disabled(void)
> return false;
> }
>
> -void mem_cgroup_update_file_mapped(struct page *page, int val);
> +enum mem_cgroup_page_stat_item {
> + MEMCG_NR_FILE_MAPPED,
> + MEMCG_NR_FILE_NSTAT,
> +};
> +
> +void mem_cgroup_update_stat(struct page *page, int idx, bool charge);
> unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
> gfp_t gfp_mask, int nid,
> int zid);
> diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
> index 30b0813..bf9a913 100644
> --- a/include/linux/page_cgroup.h
> +++ b/include/linux/page_cgroup.h
> @@ -39,6 +39,8 @@ enum {
> PCG_CACHE, /* charged as cache */
> PCG_USED, /* this object is in use. */
> PCG_ACCT_LRU, /* page has been accounted for */
> + /* for cache-status accounting */
> + PCG_FILE_MAPPED,
> };
>
> #define TESTPCGFLAG(uname, lname) \
> @@ -57,6 +59,10 @@ static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \
> static inline int TestClearPageCgroup##uname(struct page_cgroup *pc) \
> { return test_and_clear_bit(PCG_##lname, &pc->flags); }
>
> +/* Page/File stat flag mask */
> +#define PCG_PageStatMask ((1 << PCG_FILE_MAPPED))
> +
> +
Extra newlines.
> TESTPCGFLAG(Locked, LOCK)
>
> /* Cache flag is set only once (at allocation) */
> @@ -73,6 +79,10 @@ CLEARPCGFLAG(AcctLRU, ACCT_LRU)
> TESTPCGFLAG(AcctLRU, ACCT_LRU)
> TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU)
>
> +TESTPCGFLAG(FileMapped, FILE_MAPPED)
> +SETPCGFLAG(FileMapped, FILE_MAPPED)
> +CLEARPCGFLAG(FileMapped, FILE_MAPPED)
> +
> static inline int page_cgroup_nid(struct page_cgroup *pc)
> {
> return page_to_nid(pc->page);
> @@ -93,6 +103,11 @@ static inline void unlock_page_cgroup(struct page_cgroup *pc)
> bit_spin_unlock(PCG_LOCK, &pc->flags);
> }
>
> +static inline int trylock_page_cgroup(struct page_cgroup *pc)
> +{
> + return bit_spin_trylock(PCG_LOCK, &pc->flags);
> +}
> +
> #else /* CONFIG_CGROUP_MEM_RES_CTLR */
> struct page_cgroup;
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 7fab84e..b7c23ea 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1348,30 +1348,83 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
> * Currently used to update mapped file statistics, but the routine can be
> * generalized to update other statistics as well.
> */
> -void mem_cgroup_update_file_mapped(struct page *page, int val)
> +void __mem_cgroup_update_stat(struct page_cgroup *pc, int idx, bool charge)
> {
> struct mem_cgroup *mem;
> - struct page_cgroup *pc;
> + int val;
>
> - pc = lookup_page_cgroup(page);
> - if (unlikely(!pc))
> - return;
> -
> - lock_page_cgroup(pc);
> mem = pc->mem_cgroup;
> - if (!mem)
> - goto done;
> + if (!mem || !PageCgroupUsed(pc))
> + return;
>
> - if (!PageCgroupUsed(pc))
> - goto done;
> + if (charge)
> + val = 1;
> + else
> + val = -1;
>
> + switch (idx) {
> + case MEMCG_NR_FILE_MAPPED:
> + if (charge) {
> + if (!PageCgroupFileMapped(pc))
> + SetPageCgroupFileMapped(pc);
> + else
> + val = 0;
> + } else {
> + if (PageCgroupFileMapped(pc))
> + ClearPageCgroupFileMapped(pc);
> + else
> + val = 0;
> + }
> + idx = MEM_CGROUP_STAT_FILE_MAPPED;
> + break;
> + default:
> + BUG();
> + break;
> + }
> /*
> * Preemption is already disabled. We can use __this_cpu_xxx
> */
> - __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val);
> + __this_cpu_add(mem->stat->count[idx], val);
> +}
>
> -done:
> - unlock_page_cgroup(pc);
> +void mem_cgroup_update_stat(struct page *page, int idx, bool charge)
> +{
> + struct page_cgroup *pc;
> +
> + pc = lookup_page_cgroup(page);
> + if (!pc || mem_cgroup_is_root(pc->mem_cgroup))
> + return;
> +
> + if (trylock_page_cgroup(pc)) {
> + __mem_cgroup_update_stat(pc, idx, charge);
> + unlock_page_cgroup(pc);
> + }
What happens if the trylock fails? Do we lose the stat update?
A comment with the details would help.
> + return;
> +}
> +
> +static void mem_cgroup_migrate_stat(struct page_cgroup *pc,
> + struct mem_cgroup *from, struct mem_cgroup *to)
> +{
> + if (PageCgroupFileMapped(pc)) {
> + __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> + if (!mem_cgroup_is_root(to)) {
> + __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> + } else {
> + ClearPageCgroupFileMapped(pc);
> + }
> + }
> +}
> +
> +static void
> +__mem_cgroup_stat_fixup(struct page_cgroup *pc, struct mem_cgroup *mem)
> +{
> + if (mem_cgroup_is_root(mem))
> + return;
> + /* We'are in uncharge() and lock_page_cgroup */
> + if (PageCgroupFileMapped(pc)) {
> + __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> + ClearPageCgroupFileMapped(pc);
> + }
> }
>
> /*
> @@ -1810,13 +1863,7 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
> VM_BUG_ON(pc->mem_cgroup != from);
>
> page = pc->page;
> - if (page_mapped(page) && !PageAnon(page)) {
> - /* Update mapped_file data for mem_cgroup */
> - preempt_disable();
> - __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> - __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> - preempt_enable();
> - }
> + mem_cgroup_migrate_stat(pc, from, to);
> mem_cgroup_charge_statistics(from, pc, false);
> if (uncharge)
> /* This is not "cancel", but cancel_charge does all we need. */
> @@ -2208,6 +2255,9 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
> __do_uncharge(mem, ctype);
> if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
> mem_cgroup_swap_statistics(mem, true);
> + if (unlikely(PCG_PageStatMask & pc->flags))
> + __mem_cgroup_stat_fixup(pc, mem);
> +
> mem_cgroup_charge_statistics(mem, pc, false);
>
> ClearPageCgroupUsed(pc);
> diff --git a/mm/rmap.c b/mm/rmap.c
> index fcd593c..b5b2daf 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -829,7 +829,7 @@ void page_add_file_rmap(struct page *page)
> {
> if (atomic_inc_and_test(&page->_mapcount)) {
> __inc_zone_page_state(page, NR_FILE_MAPPED);
> - mem_cgroup_update_file_mapped(page, 1);
> + mem_cgroup_update_stat(page, MEMCG_NR_FILE_MAPPED, true);
> }
> }
>
> @@ -861,7 +861,7 @@ void page_remove_rmap(struct page *page)
> __dec_zone_page_state(page, NR_ANON_PAGES);
> } else {
> __dec_zone_page_state(page, NR_FILE_MAPPED);
> - mem_cgroup_update_file_mapped(page, -1);
> + mem_cgroup_update_stat(page, MEMCG_NR_FILE_MAPPED, false);
> }
> /*
> * It would be tidy to reset the PageAnon mapping here,
> --
> 1.6.3.3
>
--
Three Cheers,
Balbir
* Re: [PATCH -mmotm 1/5] memcg: disable irq at page cgroup lock
2010-03-14 23:26 ` [PATCH -mmotm 1/5] memcg: disable irq at page cgroup lock Andrea Righi
2010-03-15 0:06 ` KAMEZAWA Hiroyuki
2010-03-17 7:04 ` Balbir Singh
@ 2010-03-17 11:58 ` Balbir Singh
2010-03-17 23:54 ` KAMEZAWA Hiroyuki
2 siblings, 1 reply; 71+ messages in thread
From: Balbir Singh @ 2010-03-17 11:58 UTC (permalink / raw)
To: Andrea Righi
Cc: KAMEZAWA Hiroyuki, Daisuke Nishimura, Vivek Goyal, Peter Zijlstra,
Trond Myklebust, Suleiman Souhlal, Greg Thelen,
Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
linux-mm
* Andrea Righi <arighi@develer.com> [2010-03-15 00:26:38]:
> From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
>
> Now, file-mapped is maintaiend. But more generic update function
> will be needed for dirty page accounting.
>
> For accountig page status, we have to guarantee lock_page_cgroup()
> will be never called under tree_lock held.
> To guarantee that, we use trylock at updating status.
> By this, we do fuzzy accounting, but in almost all case, it's correct.
>
I don't like this at all: "correct in almost all cases" is not acceptable
for statistics, since decisions will be made based on them and having them
incorrect is really bad. Could we do a form of deferred statistics and
fix this?
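A minimal sketch of one possible deferred form, written as a fragment against
mm/memcontrol.c of this series (hypothetical: the names and the workqueue
approach are illustrative only, and ordering against later direct updates is
not handled here):

/* replay a missed update from process context, where blocking is fine */
struct memcg_deferred_update {
        struct work_struct work;
        struct page *page;
        int idx;
        bool charge;
};

static void memcg_deferred_update_fn(struct work_struct *work)
{
        struct memcg_deferred_update *du =
                container_of(work, struct memcg_deferred_update, work);
        struct page_cgroup *pc = lookup_page_cgroup(du->page);

        if (pc) {
                lock_page_cgroup(pc);   /* unconditional lock is safe here */
                __mem_cgroup_update_stat(pc, du->idx, du->charge);
                unlock_page_cgroup(pc);
        }
        put_page(du->page);
        kfree(du);
}

/* e.g. called from mem_cgroup_update_stat() when trylock_page_cgroup() fails */
static void memcg_defer_update(struct page *page, int idx, bool charge)
{
        struct memcg_deferred_update *du = kmalloc(sizeof(*du), GFP_ATOMIC);

        if (!du)
                return;         /* accept the accounting miss */
        get_page(page);         /* keep the page alive until the worker runs */
        du->page = page;
        du->idx = idx;
        du->charge = charge;
        INIT_WORK(&du->work, memcg_deferred_update_fn);
        schedule_work(&du->work);
}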
--
Three Cheers,
Balbir
* Re: [PATCH -mmotm 1/5] memcg: disable irq at page cgroup lock
2010-03-17 11:58 ` Balbir Singh
@ 2010-03-17 23:54 ` KAMEZAWA Hiroyuki
2010-03-18 0:45 ` KAMEZAWA Hiroyuki
2010-03-18 4:19 ` Balbir Singh
0 siblings, 2 replies; 71+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-03-17 23:54 UTC (permalink / raw)
To: balbir
Cc: Andrea Righi, Daisuke Nishimura, Vivek Goyal, Peter Zijlstra,
Trond Myklebust, Suleiman Souhlal, Greg Thelen,
Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
linux-mm
On Wed, 17 Mar 2010 17:28:55 +0530
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> * Andrea Righi <arighi@develer.com> [2010-03-15 00:26:38]:
>
> > From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> >
> > Now, file-mapped is maintaiend. But more generic update function
> > will be needed for dirty page accounting.
> >
> > For accountig page status, we have to guarantee lock_page_cgroup()
> > will be never called under tree_lock held.
> > To guarantee that, we use trylock at updating status.
> > By this, we do fuzzy accounting, but in almost all case, it's correct.
> >
>
> I don't like this at all, but in almost all cases is not acceptable
> for statistics, since decisions will be made on them and having them
> incorrect is really bad. Could we do a form of deferred statistics and
> fix this.
>
Please show an implementation which has no performance regression.
For me, I don't need file_mapped accounting at all. If we can remove that,
we can add a simple migration lock.
file_mapped is a feature you added; please improve it.
Thanks,
-Kame
* Re: [PATCH -mmotm 1/5] memcg: disable irq at page cgroup lock
2010-03-17 23:54 ` KAMEZAWA Hiroyuki
@ 2010-03-18 0:45 ` KAMEZAWA Hiroyuki
2010-03-18 2:16 ` Daisuke Nishimura
2010-03-18 4:19 ` Balbir Singh
1 sibling, 1 reply; 71+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-03-18 0:45 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: balbir, Andrea Righi, Daisuke Nishimura, Vivek Goyal,
Peter Zijlstra, Trond Myklebust, Suleiman Souhlal, Greg Thelen,
Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
linux-mm
On Thu, 18 Mar 2010 08:54:11 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> On Wed, 17 Mar 2010 17:28:55 +0530
> Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>
> > * Andrea Righi <arighi@develer.com> [2010-03-15 00:26:38]:
> >
> > > From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> > >
> > > Now, file-mapped is maintaiend. But more generic update function
> > > will be needed for dirty page accounting.
> > >
> > > For accountig page status, we have to guarantee lock_page_cgroup()
> > > will be never called under tree_lock held.
> > > To guarantee that, we use trylock at updating status.
> > > By this, we do fuzzy accounting, but in almost all case, it's correct.
> > >
> >
> > I don't like this at all, but in almost all cases is not acceptable
> > for statistics, since decisions will be made on them and having them
> > incorrect is really bad. Could we do a form of deferred statistics and
> > fix this.
> >
>
> plz show your implementation which has no performance regresssion.
> For me, I don't neee file_mapped accounting, at all. If we can remove that,
> we can add simple migration lock.
> file_mapped is a feattue you added. please improve it.
>
BTW, I should explain how accurate this accounting is in this patch itself.

Now, lock_page_cgroup/unlock_page_cgroup happens at
- charge/uncharge/migrate/move accounting

So the lock contention (trylock failure) can only occur in conflict with
- charge, uncharge, migrate, move accounting

About dirty accounting: charge/uncharge/migrate operate synchronously with
the radix-tree (holding tree_lock etc.), so there is no account leak there.
Move accounting is the only source of inaccuracy, but I don't think task-move
is critical... moreover, we don't move any file pages at task-move now.
(But Nishimura-san has a plan to do so.)
So contention will happen only in conflict with force_empty.

About FILE_MAPPED accounting, it's not synchronous with radix-tree operations,
so an accounting miss could happen at charge/uncharge/migrate/account-move. But:

charge ........ we don't account a page as FILE_MAPPED before it's charged.
uncharge ...... the usual operation in truncation is unmap -> remove-from-radix-tree,
                so it's sequential in almost all cases. The race exists when,
                say, there are two threads A and B, A truncating a file while
                B maps/unmaps it. This is a very unusual conflict.
migrate ....... we do try_to_unmap before migrating pages, so FILE_MAPPED is
                handled properly.
move account .. we don't move the accounting of mapped files yet.

So this trylock contention happens only in conflict with force_empty and
truncate. The main source of contention is force_empty, but that is called
when removing a memcg, and accounting for such a memcg is not important.
That is why I say "this accounting is okay."

To be more accurate, we may need another "migration lock". But to get better
performance for the root cgroup, we would have to call mem_cgroup_is_root()
before taking the lock, and there would be another complicated race.
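To make that last point concrete, a rough sketch of such a "migration lock"
(hypothetical, not part of this patch; charge/uncharge serialization details
are ignored and a per-memcg lock would also be possible):

static DEFINE_RWLOCK(memcg_migration_lock);     /* hypothetical */

void mem_cgroup_update_stat(struct page *page, int idx, bool charge)
{
        struct page_cgroup *pc = lookup_page_cgroup(page);

        if (unlikely(!pc))
                return;
        read_lock(&memcg_migration_lock);
        /*
         * Checking mem_cgroup_is_root() here, under the lock, is safe but
         * makes even root-cgroup updates pay for the lock; checking it
         * before read_lock() is cheap but racy, which is the complication
         * mentioned above.
         */
        if (pc->mem_cgroup && !mem_cgroup_is_root(pc->mem_cgroup))
                __mem_cgroup_update_stat(pc, idx, charge);
        read_unlock(&memcg_migration_lock);
}

/*
 * Movers (move_account/force_empty) would wrap their update with
 * write_lock(&memcg_migration_lock) ... write_unlock().
 */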
Bye,
-Kame
* Re: [PATCH -mmotm 1/5] memcg: disable irq at page cgroup lock
2010-03-18 0:45 ` KAMEZAWA Hiroyuki
@ 2010-03-18 2:16 ` Daisuke Nishimura
2010-03-18 2:58 ` KAMEZAWA Hiroyuki
2010-03-18 5:12 ` Balbir Singh
0 siblings, 2 replies; 71+ messages in thread
From: Daisuke Nishimura @ 2010-03-18 2:16 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: balbir, Andrea Righi, Vivek Goyal, Peter Zijlstra,
Trond Myklebust, Suleiman Souhlal, Greg Thelen,
Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
linux-mm, Daisuke Nishimura
On Thu, 18 Mar 2010 09:45:19 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> On Thu, 18 Mar 2010 08:54:11 +0900
> KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
>
> > On Wed, 17 Mar 2010 17:28:55 +0530
> > Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> >
> > > * Andrea Righi <arighi@develer.com> [2010-03-15 00:26:38]:
> > >
> > > > From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> > > >
> > > > Now, file-mapped is maintaiend. But more generic update function
> > > > will be needed for dirty page accounting.
> > > >
> > > > For accountig page status, we have to guarantee lock_page_cgroup()
> > > > will be never called under tree_lock held.
> > > > To guarantee that, we use trylock at updating status.
> > > > By this, we do fuzzy accounting, but in almost all case, it's correct.
> > > >
> > >
> > > I don't like this at all, but in almost all cases is not acceptable
> > > for statistics, since decisions will be made on them and having them
> > > incorrect is really bad. Could we do a form of deferred statistics and
> > > fix this.
> > >
> >
> > plz show your implementation which has no performance regresssion.
> > For me, I don't neee file_mapped accounting, at all. If we can remove that,
> > we can add simple migration lock.
> > file_mapped is a feattue you added. please improve it.
> >
>
> BTW, I should explain how acculate this accounting is in this patch itself.
>
> Now, lock_page_cgroup/unlock_page_cgroup happens when
> - charge/uncharge/migrate/move accounting
>
> Then, the lock contention (trylock failure) seems to occur in conflict
> with
> - charge, uncharge, migarate. move accounting
>
> About dirty accounting, charge/uncharge/migarate are operation in synchronous
> manner with radix-tree (holding treelock etc). Then no account leak.
> move accounting is only source for inacculacy...but I don't think this move-task
> is ciritial....moreover, we don't move any file pages at task-move, now.
> (But Nishimura-san has a plan to do so.)
> So, contention will happen only at confliction with force_empty.
>
> About FILE_MAPPED accounting, it's not synchronous with radix-tree operaton.
> Then, accounting-miss seems to happen when charge/uncharge/migrate/account move.
> But
> charge .... we don't account a page as FILE_MAPPED before it's charged.
> uncharge .. usual operation in turncation is unmap->remove-from-radix-tree.
> Then, it's sequential in almost all case. The race exists when...
> Assume there are 2 threads A and B. A truncate a file, B map/unmap that.
> This is very unusal confliction.
> migrate.... we do try_to_unmap before migrating pages. Then, FILE_MAPPED
> is properly handled.
> move account .... we don't have move-account-mapped-file, yet.
>
FILE_MAPPED is updated under the pte lock. OTOH, move accounting is also done
under the pte lock. The page cgroup lock is taken under the pte lock in both
cases, so move accounting is not such a problem for FILE_MAPPED.
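As a rough sketch of that nesting (abbreviated call chains, not literal code):

/*
 * FILE_MAPPED update (rmap)              move accounting (task move)
 * -------------------------              ---------------------------
 * pte lock held                          pte lock held
 *   page_add/remove_file_rmap()            mem_cgroup_move_account()
 *     mem_cgroup_update_stat()               lock_page_cgroup()
 *       trylock_page_cgroup()                ... move the counters ...
 *
 * Both sides take the page cgroup lock only while holding the pte lock,
 * so a FILE_MAPPED update cannot race with move accounting for the same
 * pte; what remains is contention with truncate and force_empty.
 */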
Thanks,
Daisuke Nishimura.
> Then, this trylock contention happens at contention with force_empty and truncate.
>
>
> Then, main issue for contention is force_empty. But it's called for removing memcg,
> accounting for such memcg is not important.
> Then, I say "this accounting is Okay."
>
> To do more accurate, we may need another "migration lock". But to get better
> performance for root cgroup, we have to call mem_cgroup_is_root() before
> taking lock and there will be another complicated race.
>
> Bye,
> -Kame
>
* Re: [PATCH -mmotm 1/5] memcg: disable irq at page cgroup lock
2010-03-18 2:16 ` Daisuke Nishimura
@ 2010-03-18 2:58 ` KAMEZAWA Hiroyuki
2010-03-18 5:12 ` Balbir Singh
1 sibling, 0 replies; 71+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-03-18 2:58 UTC (permalink / raw)
To: Daisuke Nishimura
Cc: balbir, Andrea Righi, Vivek Goyal, Peter Zijlstra,
Trond Myklebust, Suleiman Souhlal, Greg Thelen,
Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
linux-mm
On Thu, 18 Mar 2010 11:16:53 +0900
Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> On Thu, 18 Mar 2010 09:45:19 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > On Thu, 18 Mar 2010 08:54:11 +0900
> > KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> >
> > > On Wed, 17 Mar 2010 17:28:55 +0530
> > > Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> > >
> > > > * Andrea Righi <arighi@develer.com> [2010-03-15 00:26:38]:
> > > >
> > > > > From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> > > > >
> > > > > Now, file-mapped is maintaiend. But more generic update function
> > > > > will be needed for dirty page accounting.
> > > > >
> > > > > For accountig page status, we have to guarantee lock_page_cgroup()
> > > > > will be never called under tree_lock held.
> > > > > To guarantee that, we use trylock at updating status.
> > > > > By this, we do fuzzy accounting, but in almost all case, it's correct.
> > > > >
> > > >
> > > > I don't like this at all, but in almost all cases is not acceptable
> > > > for statistics, since decisions will be made on them and having them
> > > > incorrect is really bad. Could we do a form of deferred statistics and
> > > > fix this.
> > > >
> > >
> > > plz show your implementation which has no performance regresssion.
> > > For me, I don't neee file_mapped accounting, at all. If we can remove that,
> > > we can add simple migration lock.
> > > file_mapped is a feattue you added. please improve it.
> > >
> >
> > BTW, I should explain how acculate this accounting is in this patch itself.
> >
> > Now, lock_page_cgroup/unlock_page_cgroup happens when
> > - charge/uncharge/migrate/move accounting
> >
> > Then, the lock contention (trylock failure) seems to occur in conflict
> > with
> > - charge, uncharge, migarate. move accounting
> >
> > About dirty accounting, charge/uncharge/migarate are operation in synchronous
> > manner with radix-tree (holding treelock etc). Then no account leak.
> > move accounting is only source for inacculacy...but I don't think this move-task
> > is ciritial....moreover, we don't move any file pages at task-move, now.
> > (But Nishimura-san has a plan to do so.)
> > So, contention will happen only at confliction with force_empty.
> >
> > About FILE_MAPPED accounting, it's not synchronous with radix-tree operaton.
> > Then, accounting-miss seems to happen when charge/uncharge/migrate/account move.
> > But
> > charge .... we don't account a page as FILE_MAPPED before it's charged.
> > uncharge .. usual operation in turncation is unmap->remove-from-radix-tree.
> > Then, it's sequential in almost all case. The race exists when...
> > Assume there are 2 threads A and B. A truncate a file, B map/unmap that.
> > This is very unusal confliction.
> > migrate.... we do try_to_unmap before migrating pages. Then, FILE_MAPPED
> > is properly handled.
> > move account .... we don't have move-account-mapped-file, yet.
> >
> FILE_MAPPED is updated under pte lock. OTOH, move account is also done under
> pte lock. page cgroup lock is held under pte lock in both cases, so move account
> is not so problem as for FILE_MAPPED.
>
Hmm, thank you. Then the only racy cases are truncate and force_empty.
Thanks,
-Kame
* Re: [PATCH -mmotm 1/5] memcg: disable irq at page cgroup lock
2010-03-18 2:16 ` Daisuke Nishimura
2010-03-18 2:58 ` KAMEZAWA Hiroyuki
@ 2010-03-18 5:12 ` Balbir Singh
1 sibling, 0 replies; 71+ messages in thread
From: Balbir Singh @ 2010-03-18 5:12 UTC (permalink / raw)
To: Daisuke Nishimura
Cc: KAMEZAWA Hiroyuki, Andrea Righi, Vivek Goyal, Peter Zijlstra,
Trond Myklebust, Suleiman Souhlal, Greg Thelen,
Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
linux-mm
* nishimura@mxp.nes.nec.co.jp <nishimura@mxp.nes.nec.co.jp> [2010-03-18 11:16:53]:
> On Thu, 18 Mar 2010 09:45:19 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > On Thu, 18 Mar 2010 08:54:11 +0900
> > KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> >
> > > On Wed, 17 Mar 2010 17:28:55 +0530
> > > Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> > >
> > > > * Andrea Righi <arighi@develer.com> [2010-03-15 00:26:38]:
> > > >
> > > > > From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> > > > >
> > > > > Now, file-mapped is maintaiend. But more generic update function
> > > > > will be needed for dirty page accounting.
> > > > >
> > > > > For accountig page status, we have to guarantee lock_page_cgroup()
> > > > > will be never called under tree_lock held.
> > > > > To guarantee that, we use trylock at updating status.
> > > > > By this, we do fuzzy accounting, but in almost all case, it's correct.
> > > > >
> > > >
> > > > I don't like this at all, but in almost all cases is not acceptable
> > > > for statistics, since decisions will be made on them and having them
> > > > incorrect is really bad. Could we do a form of deferred statistics and
> > > > fix this.
> > > >
> > >
> > > plz show your implementation which has no performance regresssion.
> > > For me, I don't neee file_mapped accounting, at all. If we can remove that,
> > > we can add simple migration lock.
> > > file_mapped is a feattue you added. please improve it.
> > >
> >
> > BTW, I should explain how acculate this accounting is in this patch itself.
> >
> > Now, lock_page_cgroup/unlock_page_cgroup happens when
> > - charge/uncharge/migrate/move accounting
> >
> > Then, the lock contention (trylock failure) seems to occur in conflict
> > with
> > - charge, uncharge, migarate. move accounting
> >
> > About dirty accounting, charge/uncharge/migarate are operation in synchronous
> > manner with radix-tree (holding treelock etc). Then no account leak.
> > move accounting is only source for inacculacy...but I don't think this move-task
> > is ciritial....moreover, we don't move any file pages at task-move, now.
> > (But Nishimura-san has a plan to do so.)
> > So, contention will happen only at confliction with force_empty.
> >
> > About FILE_MAPPED accounting, it's not synchronous with radix-tree operaton.
> > Then, accounting-miss seems to happen when charge/uncharge/migrate/account move.
> > But
> > charge .... we don't account a page as FILE_MAPPED before it's charged.
> > uncharge .. usual operation in turncation is unmap->remove-from-radix-tree.
> > Then, it's sequential in almost all case. The race exists when...
> > Assume there are 2 threads A and B. A truncate a file, B map/unmap that.
> > This is very unusal confliction.
> > migrate.... we do try_to_unmap before migrating pages. Then, FILE_MAPPED
> > is properly handled.
> > move account .... we don't have move-account-mapped-file, yet.
> >
> FILE_MAPPED is updated under pte lock. OTOH, move account is also done under
> pte lock. page cgroup lock is held under pte lock in both cases, so move account
> is not so problem as for FILE_MAPPED.
>
True
>
> > Then, this trylock contention happens at contention with force_empty and truncate.
> >
> >
> > Then, main issue for contention is force_empty. But it's called for removing memcg,
> > accounting for such memcg is not important.
> > Then, I say "this accounting is Okay."
> >
> > To do more accurate, we may need another "migration lock". But to get better
> > performance for root cgroup, we have to call mem_cgroup_is_root() before
> > taking lock and there will be another complicated race.
Agreed, we need to find a simpler way of doing this without affecting the
accuracy of accounting, maybe two accounting routines for the two code paths.
I have not thought through this yet.
--
Three Cheers,
Balbir
* Re: [PATCH -mmotm 1/5] memcg: disable irq at page cgroup lock
2010-03-17 23:54 ` KAMEZAWA Hiroyuki
2010-03-18 0:45 ` KAMEZAWA Hiroyuki
@ 2010-03-18 4:19 ` Balbir Singh
2010-03-18 4:21 ` KAMEZAWA Hiroyuki
2010-03-18 4:35 ` KAMEZAWA Hiroyuki
1 sibling, 2 replies; 71+ messages in thread
From: Balbir Singh @ 2010-03-18 4:19 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: Andrea Righi, Daisuke Nishimura, Vivek Goyal, Peter Zijlstra,
Trond Myklebust, Suleiman Souhlal, Greg Thelen,
Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
linux-mm
* KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> [2010-03-18 08:54:11]:
> On Wed, 17 Mar 2010 17:28:55 +0530
> Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>
> > * Andrea Righi <arighi@develer.com> [2010-03-15 00:26:38]:
> >
> > > From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> > >
> > > Now, file-mapped is maintaiend. But more generic update function
> > > will be needed for dirty page accounting.
> > >
> > > For accountig page status, we have to guarantee lock_page_cgroup()
> > > will be never called under tree_lock held.
> > > To guarantee that, we use trylock at updating status.
> > > By this, we do fuzzy accounting, but in almost all case, it's correct.
> > >
> >
> > I don't like this at all, but in almost all cases is not acceptable
> > for statistics, since decisions will be made on them and having them
> > incorrect is really bad. Could we do a form of deferred statistics and
> > fix this.
> >
>
> plz show your implementation which has no performance regresssion.
> For me, I don't neee file_mapped accounting, at all. If we can remove that,
> we can add simple migration lock.
That doesn't matter; whether you need it or not, I think the larger user base
matters. Unmapped and mapped page cache accounting is critical and I use it
almost daily.
> file_mapped is a feattue you added. please improve it.
>
I will, but please don't break it silently
--
Three Cheers,
Balbir
* Re: [PATCH -mmotm 1/5] memcg: disable irq at page cgroup lock
2010-03-18 4:19 ` Balbir Singh
@ 2010-03-18 4:21 ` KAMEZAWA Hiroyuki
2010-03-18 6:25 ` Balbir Singh
2010-03-18 4:35 ` KAMEZAWA Hiroyuki
1 sibling, 1 reply; 71+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-03-18 4:21 UTC (permalink / raw)
To: balbir
Cc: Andrea Righi, Daisuke Nishimura, Vivek Goyal, Peter Zijlstra,
Trond Myklebust, Suleiman Souhlal, Greg Thelen,
Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
linux-mm
On Thu, 18 Mar 2010 09:49:44 +0530
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> * KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> [2010-03-18 08:54:11]:
>
> > On Wed, 17 Mar 2010 17:28:55 +0530
> > Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> >
> > > * Andrea Righi <arighi@develer.com> [2010-03-15 00:26:38]:
> > >
> > > > From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> > > >
> > > > Now, file-mapped is maintaiend. But more generic update function
> > > > will be needed for dirty page accounting.
> > > >
> > > > For accountig page status, we have to guarantee lock_page_cgroup()
> > > > will be never called under tree_lock held.
> > > > To guarantee that, we use trylock at updating status.
> > > > By this, we do fuzzy accounting, but in almost all case, it's correct.
> > > >
> > >
> > > I don't like this at all, but in almost all cases is not acceptable
> > > for statistics, since decisions will be made on them and having them
> > > incorrect is really bad. Could we do a form of deferred statistics and
> > > fix this.
> > >
> >
> > plz show your implementation which has no performance regresssion.
> > For me, I don't neee file_mapped accounting, at all. If we can remove that,
> > we can add simple migration lock.
>
> That doesn't matter, if you need it, I think the larger user base
> matters. Unmapped and mapped page cache is critical and I use it
> almost daily.
>
> > file_mapped is a feattue you added. please improve it.
> >
>
> I will, but please don't break it silently
>
I never do it silently. All e-mails are open.
Thanks,
-Kame
* Re: [PATCH -mmotm 1/5] memcg: disable irq at page cgroup lock
2010-03-18 4:21 ` KAMEZAWA Hiroyuki
@ 2010-03-18 6:25 ` Balbir Singh
0 siblings, 0 replies; 71+ messages in thread
From: Balbir Singh @ 2010-03-18 6:25 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: Andrea Righi, Daisuke Nishimura, Vivek Goyal, Peter Zijlstra,
Trond Myklebust, Suleiman Souhlal, Greg Thelen,
Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
linux-mm
* KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> [2010-03-18 13:21:14]:
> On Thu, 18 Mar 2010 09:49:44 +0530
> Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>
> > * KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> [2010-03-18 08:54:11]:
> >
> > > On Wed, 17 Mar 2010 17:28:55 +0530
> > > Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> > >
> > > > * Andrea Righi <arighi@develer.com> [2010-03-15 00:26:38]:
> > > >
> > > > > From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> > > > >
> > > > > Now, file-mapped is maintaiend. But more generic update function
> > > > > will be needed for dirty page accounting.
> > > > >
> > > > > For accountig page status, we have to guarantee lock_page_cgroup()
> > > > > will be never called under tree_lock held.
> > > > > To guarantee that, we use trylock at updating status.
> > > > > By this, we do fuzzy accounting, but in almost all case, it's correct.
> > > > >
> > > >
> > > > I don't like this at all, but in almost all cases is not acceptable
> > > > for statistics, since decisions will be made on them and having them
> > > > incorrect is really bad. Could we do a form of deferred statistics and
> > > > fix this.
> > > >
> > >
> > > plz show your implementation which has no performance regresssion.
> > > For me, I don't neee file_mapped accounting, at all. If we can remove that,
> > > we can add simple migration lock.
> >
> > That doesn't matter, if you need it, I think the larger user base
> > matters. Unmapped and mapped page cache is critical and I use it
> > almost daily.
> >
> > > file_mapped is a feattue you added. please improve it.
> > >
> >
> > I will, but please don't break it silently
> >
> I never do silently. All e-mails are open.
>
No, I meant it from the Documentation update perspective. The end user
will be confused when looking at stats without proper documentation.
--
Three Cheers,
Balbir
* Re: [PATCH -mmotm 1/5] memcg: disable irq at page cgroup lock
2010-03-18 4:19 ` Balbir Singh
2010-03-18 4:21 ` KAMEZAWA Hiroyuki
@ 2010-03-18 4:35 ` KAMEZAWA Hiroyuki
2010-03-18 16:28 ` Balbir Singh
1 sibling, 1 reply; 71+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-03-18 4:35 UTC (permalink / raw)
To: balbir
Cc: Andrea Righi, Daisuke Nishimura, Vivek Goyal, Peter Zijlstra,
Trond Myklebust, Suleiman Souhlal, Greg Thelen,
Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
linux-mm
On Thu, 18 Mar 2010 09:49:44 +0530
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> * KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> [2010-03-18 08:54:11]:
>
> > On Wed, 17 Mar 2010 17:28:55 +0530
> > Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> >
> > > * Andrea Righi <arighi@develer.com> [2010-03-15 00:26:38]:
> > >
> > > > From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> > > >
> > > > Now, file-mapped is maintaiend. But more generic update function
> > > > will be needed for dirty page accounting.
> > > >
> > > > For accountig page status, we have to guarantee lock_page_cgroup()
> > > > will be never called under tree_lock held.
> > > > To guarantee that, we use trylock at updating status.
> > > > By this, we do fuzzy accounting, but in almost all case, it's correct.
> > > >
> > >
> > > I don't like this at all, but in almost all cases is not acceptable
> > > for statistics, since decisions will be made on them and having them
> > > incorrect is really bad. Could we do a form of deferred statistics and
> > > fix this.
> > >
> >
> > plz show your implementation which has no performance regresssion.
> > For me, I don't neee file_mapped accounting, at all. If we can remove that,
> > we can add simple migration lock.
>
> That doesn't matter, if you need it, I think the larger user base
> matters. Unmapped and mapped page cache is critical and I use it
> almost daily.
>
> > file_mapped is a feattue you added. please improve it.
> >
>
> I will, but please don't break it silently
>
Andrea, could you go the following way?
- don't touch the FILE_MAPPED stuff.
- add new functions for the other dirty accounting stuff as in this series
  (using trylock is ok).
Then, no problem. It's ok to add mem_cgroup_update_stat() independent of
mem_cgroup_update_file_mapped(). It may look messy, but that's not your
fault. But please explain why a new function is added in the patch
description.
I'm sorry for wasting your time.
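For reference, a minimal sketch of that split, as a fragment against
mm/memcontrol.c (the FILE_MAPPED path is essentially the code this patch
removes; the second function's name is a placeholder):

/* FILE_MAPPED: keep today's always-locked update path untouched. */
void mem_cgroup_update_file_mapped(struct page *page, int val)
{
        struct mem_cgroup *mem;
        struct page_cgroup *pc = lookup_page_cgroup(page);

        if (unlikely(!pc))
                return;
        lock_page_cgroup(pc);
        mem = pc->mem_cgroup;
        if (mem && PageCgroupUsed(pc))
                __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val);
        unlock_page_cgroup(pc);
}

/* New helper for the dirty/writeback stats only: trylock, fuzzy on contention. */
void mem_cgroup_update_page_stat(struct page *page, int idx, bool charge)
{
        struct page_cgroup *pc = lookup_page_cgroup(page);

        if (!pc || mem_cgroup_is_root(pc->mem_cgroup))
                return;
        if (trylock_page_cgroup(pc)) {
                __mem_cgroup_update_stat(pc, idx, charge);
                unlock_page_cgroup(pc);
        }
}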
Thanks,
-Kame
* Re: [PATCH -mmotm 1/5] memcg: disable irq at page cgroup lock
2010-03-18 4:35 ` KAMEZAWA Hiroyuki
@ 2010-03-18 16:28 ` Balbir Singh
2010-03-19 1:23 ` KAMEZAWA Hiroyuki
0 siblings, 1 reply; 71+ messages in thread
From: Balbir Singh @ 2010-03-18 16:28 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: Andrea Righi, Daisuke Nishimura, Vivek Goyal, Peter Zijlstra,
Trond Myklebust, Suleiman Souhlal, Greg Thelen,
Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
linux-mm
* KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> [2010-03-18 13:35:27]:
> On Thu, 18 Mar 2010 09:49:44 +0530
> Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>
> > * KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> [2010-03-18 08:54:11]:
> >
> > > On Wed, 17 Mar 2010 17:28:55 +0530
> > > Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> > >
> > > > * Andrea Righi <arighi@develer.com> [2010-03-15 00:26:38]:
> > > >
> > > > > From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> > > > >
> > > > > Now, file-mapped is maintained. But a more generic update function
> > > > > will be needed for dirty page accounting.
> > > > >
> > > > > For accounting page status, we have to guarantee lock_page_cgroup()
> > > > > will never be called under tree_lock held.
> > > > > To guarantee that, we use trylock when updating status.
> > > > > By this, we do fuzzy accounting, but in almost all cases it's correct.
> > > > >
> > > >
> > > > I don't like this at all; "in almost all cases" is not acceptable
> > > > for statistics, since decisions will be made on them and having them
> > > > incorrect is really bad. Could we do a form of deferred statistics and
> > > > fix this?
> > > >
> > >
> > > Please show your implementation which has no performance regression.
> > > For me, I don't need file_mapped accounting at all. If we can remove that,
> > > we can add a simple migration lock.
> >
> > That doesn't matter; whether you need it or not, I think the larger user
> > base matters. Unmapped and mapped page cache accounting is critical and I
> > use it almost daily.
> >
> > > file_mapped is a feature you added. Please improve it.
> > >
> >
> > I will, but please don't break it silently
> >
> Andrea, could you go in the following way?
>
> - don't touch FILE_MAPPED stuff.
> - add new functions for other dirty accounting stuff as in this series.
> (using trylock is ok.)
>
> Then, no problem. It's OK to add mem_cgroup_update_stat() independent of
> mem_cgroup_update_file_mapped(). The result may look messy, but it's not your
> fault. But please write "why add a new function" in the patch description.
>
> I'm sorry for wasting your time.
Do we need to go down this route? We could check the stat and do the
correct thing. In case of FILE_MAPPED, always grab page_cgroup_lock
and for others potentially look at trylock. It is OK for different
stats to be protected via different locks.
/me takes a look at the code again.
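A minimal, untested sketch of what that per-stat lock choice could look like,
reusing the helpers from patch 1/5 (the function name and the idx test below
are only placeholders, not code from the series):

static void mem_cgroup_update_stat_mixed(struct page *page, int idx, bool charge)
{
	struct page_cgroup *pc = lookup_page_cgroup(page);

	if (!pc || mem_cgroup_is_root(pc->mem_cgroup))
		return;

	if (idx == MEMCG_NR_FILE_MAPPED) {
		/* FILE_MAPPED updaters never run under tree_lock: spinning is safe. */
		lock_page_cgroup(pc);
		__mem_cgroup_update_stat(pc, idx, charge);
		unlock_page_cgroup(pc);
	} else if (trylock_page_cgroup(pc)) {
		/* Dirty/writeback updaters may nest under tree_lock: trylock only. */
		__mem_cgroup_update_stat(pc, idx, charge);
		unlock_page_cgroup(pc);
	}
}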
--
Three Cheers,
Balbir
* Re: [PATCH -mmotm 1/5] memcg: disable irq at page cgroup lock
2010-03-18 16:28 ` Balbir Singh
@ 2010-03-19 1:23 ` KAMEZAWA Hiroyuki
2010-03-19 2:40 ` Balbir Singh
0 siblings, 1 reply; 71+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-03-19 1:23 UTC (permalink / raw)
To: balbir
Cc: Andrea Righi, Daisuke Nishimura, Vivek Goyal, Peter Zijlstra,
Trond Myklebust, Suleiman Souhlal, Greg Thelen,
Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
linux-mm
On Thu, 18 Mar 2010 21:58:55 +0530
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> * KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> [2010-03-18 13:35:27]:
> > Then, no problem. It's OK to add mem_cgroup_update_stat() independent of
> > mem_cgroup_update_file_mapped(). The result may look messy, but it's not your
> > fault. But please write "why add a new function" in the patch description.
> >
> > I'm sorry for wasting your time.
>
> Do we need to go down this route? We could check the stat and do the
> correct thing. In case of FILE_MAPPED, always grab page_cgroup_lock
> and for others potentially look at trylock. It is OK for different
> stats to be protected via different locks.
>
I _don't_ want to see a mixture of spinlock and trylock in a function.
Thanks,
-Kame
* Re: [PATCH -mmotm 1/5] memcg: disable irq at page cgroup lock
2010-03-19 1:23 ` KAMEZAWA Hiroyuki
@ 2010-03-19 2:40 ` Balbir Singh
2010-03-19 3:00 ` KAMEZAWA Hiroyuki
0 siblings, 1 reply; 71+ messages in thread
From: Balbir Singh @ 2010-03-19 2:40 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: Andrea Righi, Daisuke Nishimura, Vivek Goyal, Peter Zijlstra,
Trond Myklebust, Suleiman Souhlal, Greg Thelen,
Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
linux-mm
* KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> [2010-03-19 10:23:32]:
> On Thu, 18 Mar 2010 21:58:55 +0530
> Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>
> > * KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> [2010-03-18 13:35:27]:
>
> > > Then, no problem. It's OK to add mem_cgroup_update_stat() independent of
> > > mem_cgroup_update_file_mapped(). The result may look messy, but it's not your
> > > fault. But please write "why add a new function" in the patch description.
> > >
> > > I'm sorry for wasting your time.
> >
> > Do we need to go down this route? We could check the stat and do the
> > correct thing. In case of FILE_MAPPED, always grab page_cgroup_lock
> > and for others potentially look at trylock. It is OK for different
> > stats to be protected via different locks.
> >
>
> I _don't_ want to see a mixture of spinlock and trylock in a function.
>
A well-documented, well-written function can help. The other option is of
course to solve this correctly by introducing different locking around
the statistics. Are you suggesting the latter?
--
Three Cheers,
Balbir
* Re: [PATCH -mmotm 1/5] memcg: disable irq at page cgroup lock
2010-03-19 2:40 ` Balbir Singh
@ 2010-03-19 3:00 ` KAMEZAWA Hiroyuki
[not found] ` <xr93hbnepmj6.fsf@ninji.mtv.corp.google.com>
0 siblings, 1 reply; 71+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-03-19 3:00 UTC (permalink / raw)
To: balbir
Cc: Andrea Righi, Daisuke Nishimura, Vivek Goyal, Peter Zijlstra,
Trond Myklebust, Suleiman Souhlal, Greg Thelen,
Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
linux-mm
On Fri, 19 Mar 2010 08:10:39 +0530
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> * KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> [2010-03-19 10:23:32]:
>
> > On Thu, 18 Mar 2010 21:58:55 +0530
> > Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> >
> > > * KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> [2010-03-18 13:35:27]:
> >
> > > > Then, no problem. It's OK to add mem_cgroup_update_stat() independent of
> > > > mem_cgroup_update_file_mapped(). The result may look messy, but it's not your
> > > > fault. But please write "why add a new function" in the patch description.
> > > >
> > > > I'm sorry for wasting your time.
> > >
> > > Do we need to go down this route? We could check the stat and do the
> > > correct thing. In case of FILE_MAPPED, always grab page_cgroup_lock
> > > and for others potentially look at trylock. It is OK for different
> > > stats to be protected via different locks.
> > >
> >
> > I _don't_ want to see a mixture of spinlock and trylock in a function.
> >
>
> A well-documented, well-written function can help. The other option is of
> course to solve this correctly by introducing different locking around
> the statistics. Are you suggesting the latter?
>
No. As I wrote:
- don't modify the code around FILE_MAPPED in this series.
- add new functions for the new statistics (roughly as sketched below).
Then,
- think about cleaning up later, after we confirm all things work as expected.
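In code, that split amounts to roughly the following (a rough sketch; the new
entry point name is a placeholder, and its body mirrors what patch 4/5 later
does in mem_cgroup_update_page_stat()):

/* Existing FILE_MAPPED path stays as-is and keeps lock_page_cgroup(). */
void mem_cgroup_update_file_mapped(struct page *page, int val);

/* Separate entry point for the new dirty/writeback/NFS counters. */
void mem_cgroup_update_dirty_stat(struct page *page, int idx, bool charge)
{
	struct page_cgroup *pc = lookup_page_cgroup(page);

	if (!pc || mem_cgroup_is_root(pc->mem_cgroup))
		return;

	if (trylock_page_cgroup(pc)) {
		__mem_cgroup_update_page_stat(pc, idx, charge);
		unlock_page_cgroup(pc);
	}
	/* else: skip the update -- the fuzzy accounting described above. */
}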
Thanks,
-Kame
* [PATCH -mmotm 2/5] memcg: dirty memory documentation
2010-03-14 23:26 [PATCH -mmotm 0/5] memcg: per cgroup dirty limit (v7) Andrea Righi
2010-03-14 23:26 ` [PATCH -mmotm 1/5] memcg: disable irq at page cgroup lock Andrea Righi
@ 2010-03-14 23:26 ` Andrea Righi
2010-03-16 7:41 ` Daisuke Nishimura
2010-03-14 23:26 ` [PATCH -mmotm 3/5] page_cgroup: introduce file cache flags Andrea Righi
` (5 subsequent siblings)
7 siblings, 1 reply; 71+ messages in thread
From: Andrea Righi @ 2010-03-14 23:26 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki, Daisuke Nishimura, Balbir Singh
Cc: Vivek Goyal, Peter Zijlstra, Trond Myklebust, Suleiman Souhlal,
Greg Thelen, Kirill A. Shutemov, Andrew Morton, containers,
linux-kernel, linux-mm, Andrea Righi
Document cgroup dirty memory interfaces and statistics.
Signed-off-by: Andrea Righi <arighi@develer.com>
---
Documentation/cgroups/memory.txt | 36 ++++++++++++++++++++++++++++++++++++
1 files changed, 36 insertions(+), 0 deletions(-)
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 49f86f3..38ca499 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -310,6 +310,11 @@ cache - # of bytes of page cache memory.
rss - # of bytes of anonymous and swap cache memory.
pgpgin - # of pages paged in (equivalent to # of charging events).
pgpgout - # of pages paged out (equivalent to # of uncharging events).
+filedirty - # of pages that are waiting to get written back to the disk.
+writeback - # of pages that are actively being written back to the disk.
+writeback_tmp - # of pages used by FUSE for temporary writeback buffers.
+nfs - # of NFS pages sent to the server, but not yet committed to
+ the actual storage.
active_anon - # of bytes of anonymous and swap cache memory on active
lru list.
inactive_anon - # of bytes of anonymous memory and swap cache memory on
@@ -345,6 +350,37 @@ Note:
- a cgroup which uses hierarchy and it has child cgroup.
- a cgroup which uses hierarchy and not the root of hierarchy.
+5.4 dirty memory
+
+ Control the maximum amount of dirty pages a cgroup can have at any given time.
+
+ Limiting dirty memory is like fixing the max amount of dirty (hard to
+ reclaim) page cache used by any cgroup. So, in case of multiple cgroup writers,
+ they will not be able to consume more than their designated share of dirty
+ pages and will be forced to perform write-out if they cross that limit.
+
+ The interface is equivalent to the procfs interface: /proc/sys/vm/dirty_*.
+ It is possible to configure a limit to trigger both a direct writeback or a
+ background writeback performed by per-bdi flusher threads.
+
+ Per-cgroup dirty limits can be set using the following files in the cgroupfs:
+
+ - memory.dirty_ratio: contains, as a percentage of cgroup memory, the
+ amount of dirty memory at which a process which is generating disk writes
+ inside the cgroup will start itself writing out dirty data.
+
+ - memory.dirty_bytes: the amount of dirty memory of the cgroup (expressed in
+ bytes) at which a process generating disk writes will start itself writing
+ out dirty data.
+
+ - memory.dirty_background_ratio: contains, as a percentage of the cgroup
+ memory, the amount of dirty memory at which background writeback kernel
+ threads will start writing out dirty data.
+
+ - memory.dirty_background_bytes: the amount of dirty memory of the cgroup (in
+ bytes) at which background writeback kernel threads will start writing out
+ dirty data.
+
6. Hierarchy support
--
1.6.3.3
* Re: [PATCH -mmotm 2/5] memcg: dirty memory documentation
2010-03-14 23:26 ` [PATCH -mmotm 2/5] memcg: dirty memory documentation Andrea Righi
@ 2010-03-16 7:41 ` Daisuke Nishimura
2010-03-17 17:48 ` Greg Thelen
2010-03-17 22:43 ` Andrea Righi
0 siblings, 2 replies; 71+ messages in thread
From: Daisuke Nishimura @ 2010-03-16 7:41 UTC (permalink / raw)
To: Andrea Righi
Cc: KAMEZAWA Hiroyuki, Balbir Singh, Vivek Goyal, Peter Zijlstra,
Trond Myklebust, Suleiman Souhlal, Greg Thelen,
Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
linux-mm, Daisuke Nishimura
On Mon, 15 Mar 2010 00:26:39 +0100, Andrea Righi <arighi@develer.com> wrote:
> Document cgroup dirty memory interfaces and statistics.
>
> Signed-off-by: Andrea Righi <arighi@develer.com>
> ---
> Documentation/cgroups/memory.txt | 36 ++++++++++++++++++++++++++++++++++++
> 1 files changed, 36 insertions(+), 0 deletions(-)
>
> diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
> index 49f86f3..38ca499 100644
> --- a/Documentation/cgroups/memory.txt
> +++ b/Documentation/cgroups/memory.txt
> @@ -310,6 +310,11 @@ cache - # of bytes of page cache memory.
> rss - # of bytes of anonymous and swap cache memory.
> pgpgin - # of pages paged in (equivalent to # of charging events).
> pgpgout - # of pages paged out (equivalent to # of uncharging events).
> +filedirty - # of pages that are waiting to get written back to the disk.
> +writeback - # of pages that are actively being written back to the disk.
> +writeback_tmp - # of pages used by FUSE for temporary writeback buffers.
> +nfs - # of NFS pages sent to the server, but not yet committed to
> + the actual storage.
> active_anon - # of bytes of anonymous and swap cache memory on active
> lru list.
> inactive_anon - # of bytes of anonymous memory and swap cache memory on
> @@ -345,6 +350,37 @@ Note:
> - a cgroup which uses hierarchy and it has child cgroup.
> - a cgroup which uses hierarchy and not the root of hierarchy.
>
> +5.4 dirty memory
> +
> + Control the maximum amount of dirty pages a cgroup can have at any given time.
> +
> + Limiting dirty memory is like fixing the max amount of dirty (hard to
> + reclaim) page cache used by any cgroup. So, in case of multiple cgroup writers,
> + they will not be able to consume more than their designated share of dirty
> + pages and will be forced to perform write-out if they cross that limit.
> +
> + The interface is equivalent to the procfs interface: /proc/sys/vm/dirty_*.
> + It is possible to configure a limit to trigger both a direct writeback or a
> + background writeback performed by per-bdi flusher threads.
> +
> + Per-cgroup dirty limits can be set using the following files in the cgroupfs:
> +
> + - memory.dirty_ratio: contains, as a percentage of cgroup memory, the
> + amount of dirty memory at which a process which is generating disk writes
> + inside the cgroup will start itself writing out dirty data.
> +
> + - memory.dirty_bytes: the amount of dirty memory of the cgroup (expressed in
> + bytes) at which a process generating disk writes will start itself writing
> + out dirty data.
> +
> + - memory.dirty_background_ratio: contains, as a percentage of the cgroup
> + memory, the amount of dirty memory at which background writeback kernel
> + threads will start writing out dirty data.
> +
> + - memory.dirty_background_bytes: the amount of dirty memory of the cgroup (in
> + bytes) at which background writeback kernel threads will start writing out
> + dirty data.
> +
>
It would be better to note what these files mean for the root cgroup.
We cannot write any value to them, IOW, we cannot control the dirty limit of the root cgroup.
And they show the same value as the global ones (strictly speaking, that's not true
because the global values can change. We need a hook in mem_cgroup_dirty_read()?).
Thanks,
Daisuke Nishimura.
> 6. Hierarchy support
>
> --
> 1.6.3.3
>
* Re: [PATCH -mmotm 2/5] memcg: dirty memory documentation
2010-03-16 7:41 ` Daisuke Nishimura
@ 2010-03-17 17:48 ` Greg Thelen
2010-03-17 19:02 ` Balbir Singh
2010-03-17 22:43 ` Andrea Righi
1 sibling, 1 reply; 71+ messages in thread
From: Greg Thelen @ 2010-03-17 17:48 UTC (permalink / raw)
To: Daisuke Nishimura
Cc: Andrea Righi, KAMEZAWA Hiroyuki, Balbir Singh, Vivek Goyal,
Peter Zijlstra, Trond Myklebust, Suleiman Souhlal,
Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
linux-mm
On Mon, Mar 15, 2010 at 11:41 PM, Daisuke Nishimura
<nishimura@mxp.nes.nec.co.jp> wrote:
> On Mon, 15 Mar 2010 00:26:39 +0100, Andrea Righi <arighi@develer.com> wrote:
>> Document cgroup dirty memory interfaces and statistics.
>>
>> Signed-off-by: Andrea Righi <arighi@develer.com>
>> ---
>> Documentation/cgroups/memory.txt | 36 ++++++++++++++++++++++++++++++++++++
>> 1 files changed, 36 insertions(+), 0 deletions(-)
>>
>> diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
>> index 49f86f3..38ca499 100644
>> --- a/Documentation/cgroups/memory.txt
>> +++ b/Documentation/cgroups/memory.txt
>> @@ -310,6 +310,11 @@ cache - # of bytes of page cache memory.
>> rss - # of bytes of anonymous and swap cache memory.
>> pgpgin - # of pages paged in (equivalent to # of charging events).
>> pgpgout - # of pages paged out (equivalent to # of uncharging events).
>> +filedirty - # of pages that are waiting to get written back to the disk.
>> +writeback - # of pages that are actively being written back to the disk.
>> +writeback_tmp - # of pages used by FUSE for temporary writeback buffers.
>> +nfs - # of NFS pages sent to the server, but not yet committed to
>> + the actual storage.
Should these new memory.stat counters (filedirty, etc.) report byte
counts rather than page counts? I am thinking that byte counters
would make reporting more obvious, especially if heterogeneous page
sizes are in use. Byte counters would also agree with /proc/meminfo.
Within the kernel we could still maintain page counts. The only
change would be to the reporting routine, mem_cgroup_get_local_stat(),
which would scale the page counts by PAGE_SIZE as it does for
cache, rss, etc.
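For example, the new counters could be scaled in mem_cgroup_get_local_stat()
the same way cache and rss already are (an untested sketch against the stat
names used in patch 4/5):

	/* Report bytes instead of pages for the new file-cache counters. */
	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_DIRTY);
	s->stat[MCS_FILE_DIRTY] += val * PAGE_SIZE;
	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_WRITEBACK);
	s->stat[MCS_WRITEBACK] += val * PAGE_SIZE;
	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_WRITEBACK_TEMP);
	s->stat[MCS_WRITEBACK_TEMP] += val * PAGE_SIZE;
	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_UNSTABLE_NFS);
	s->stat[MCS_UNSTABLE_NFS] += val * PAGE_SIZE;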
--
Greg
* Re: [PATCH -mmotm 2/5] memcg: dirty memory documentation
2010-03-17 17:48 ` Greg Thelen
@ 2010-03-17 19:02 ` Balbir Singh
0 siblings, 0 replies; 71+ messages in thread
From: Balbir Singh @ 2010-03-17 19:02 UTC (permalink / raw)
To: Greg Thelen
Cc: Daisuke Nishimura, Andrea Righi, KAMEZAWA Hiroyuki, Vivek Goyal,
Peter Zijlstra, Trond Myklebust, Suleiman Souhlal,
Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
linux-mm
* Greg Thelen <gthelen@google.com> [2010-03-17 09:48:18]:
> On Mon, Mar 15, 2010 at 11:41 PM, Daisuke Nishimura
> <nishimura@mxp.nes.nec.co.jp> wrote:
> > On Mon, 15 Mar 2010 00:26:39 +0100, Andrea Righi <arighi@develer.com> wrote:
> >> Document cgroup dirty memory interfaces and statistics.
> >>
> >> Signed-off-by: Andrea Righi <arighi@develer.com>
> >> ---
> >> Documentation/cgroups/memory.txt | 36 ++++++++++++++++++++++++++++++++++++
> >> 1 files changed, 36 insertions(+), 0 deletions(-)
> >>
> >> diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
> >> index 49f86f3..38ca499 100644
> >> --- a/Documentation/cgroups/memory.txt
> >> +++ b/Documentation/cgroups/memory.txt
> >> @@ -310,6 +310,11 @@ cache - # of bytes of page cache memory.
> >> rss - # of bytes of anonymous and swap cache memory.
> >> pgpgin - # of pages paged in (equivalent to # of charging events).
> >> pgpgout - # of pages paged out (equivalent to # of uncharging events).
> >> +filedirty - # of pages that are waiting to get written back to the disk.
> >> +writeback - # of pages that are actively being written back to the disk.
> >> +writeback_tmp - # of pages used by FUSE for temporary writeback buffers.
> >> +nfs - # of NFS pages sent to the server, but not yet committed to
> >> + the actual storage.
>
> Should these new memory.stat counters (filedirty, etc) report byte
> counts rather than page counts? I am thinking that byte counters
> would make reporting more obvious depending on how heterogeneous page
> sizes are used. Byte counters would also agree with /proc/meminfo.
> Within the kernel we could still maintain page counts. The only
> change would be to the reporting routine, mem_cgroup_get_local_stat(),
> which would scale the page counts by PAGE_SIZE as it does for for
> cache,rss,etc.
>
I agree, byte counts would be better than page counts. pgpgin and
pgpgout are special cases where the number of pages matters and the size
does not, due to the nature of the operation.
--
Three Cheers,
Balbir
* Re: [PATCH -mmotm 2/5] memcg: dirty memory documentation
2010-03-16 7:41 ` Daisuke Nishimura
2010-03-17 17:48 ` Greg Thelen
@ 2010-03-17 22:43 ` Andrea Righi
1 sibling, 0 replies; 71+ messages in thread
From: Andrea Righi @ 2010-03-17 22:43 UTC (permalink / raw)
To: Daisuke Nishimura
Cc: KAMEZAWA Hiroyuki, Balbir Singh, Vivek Goyal, Peter Zijlstra,
Trond Myklebust, Suleiman Souhlal, Greg Thelen,
Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
linux-mm
On Tue, Mar 16, 2010 at 04:41:21PM +0900, Daisuke Nishimura wrote:
> On Mon, 15 Mar 2010 00:26:39 +0100, Andrea Righi <arighi@develer.com> wrote:
> > Document cgroup dirty memory interfaces and statistics.
> >
> > Signed-off-by: Andrea Righi <arighi@develer.com>
> > ---
> > Documentation/cgroups/memory.txt | 36 ++++++++++++++++++++++++++++++++++++
> > 1 files changed, 36 insertions(+), 0 deletions(-)
> >
> > diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
> > index 49f86f3..38ca499 100644
> > --- a/Documentation/cgroups/memory.txt
> > +++ b/Documentation/cgroups/memory.txt
> > @@ -310,6 +310,11 @@ cache - # of bytes of page cache memory.
> > rss - # of bytes of anonymous and swap cache memory.
> > pgpgin - # of pages paged in (equivalent to # of charging events).
> > pgpgout - # of pages paged out (equivalent to # of uncharging events).
> > +filedirty - # of pages that are waiting to get written back to the disk.
> > +writeback - # of pages that are actively being written back to the disk.
> > +writeback_tmp - # of pages used by FUSE for temporary writeback buffers.
> > +nfs - # of NFS pages sent to the server, but not yet committed to
> > + the actual storage.
> > active_anon - # of bytes of anonymous and swap cache memory on active
> > lru list.
> > inactive_anon - # of bytes of anonymous memory and swap cache memory on
> > @@ -345,6 +350,37 @@ Note:
> > - a cgroup which uses hierarchy and it has child cgroup.
> > - a cgroup which uses hierarchy and not the root of hierarchy.
> >
> > +5.4 dirty memory
> > +
> > + Control the maximum amount of dirty pages a cgroup can have at any given time.
> > +
> > + Limiting dirty memory is like fixing the max amount of dirty (hard to
> > + reclaim) page cache used by any cgroup. So, in case of multiple cgroup writers,
> > + they will not be able to consume more than their designated share of dirty
> > + pages and will be forced to perform write-out if they cross that limit.
> > +
> > + The interface is equivalent to the procfs interface: /proc/sys/vm/dirty_*.
> > + It is possible to configure a limit to trigger both a direct writeback or a
> > + background writeback performed by per-bdi flusher threads.
> > +
> > + Per-cgroup dirty limits can be set using the following files in the cgroupfs:
> > +
> > + - memory.dirty_ratio: contains, as a percentage of cgroup memory, the
> > + amount of dirty memory at which a process which is generating disk writes
> > + inside the cgroup will start itself writing out dirty data.
> > +
> > + - memory.dirty_bytes: the amount of dirty memory of the cgroup (expressed in
> > + bytes) at which a process generating disk writes will start itself writing
> > + out dirty data.
> > +
> > + - memory.dirty_background_ratio: contains, as a percentage of the cgroup
> > + memory, the amount of dirty memory at which background writeback kernel
> > + threads will start writing out dirty data.
> > +
> > + - memory.dirty_background_bytes: the amount of dirty memory of the cgroup (in
> > + bytes) at which background writeback kernel threads will start writing out
> > + dirty data.
> > +
> >
> It would be better to note what these files mean for the root cgroup.
> We cannot write any value to them, IOW, we cannot control the dirty limit of the root cgroup.
OK.
> And they show the same value as the global ones (strictly speaking, that's not true
> because the global values can change. We need a hook in mem_cgroup_dirty_read()?).
OK, we can just return the system-wide values if mem_cgroup_is_root() in
mem_cgroup_dirty_read(). I will change this in the next version.
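Something like the following, as a rough, untested sketch of that hook on top
of the mem_cgroup_dirty_read() in patch 4/5:

static u64 mem_cgroup_dirty_read(struct cgroup *cgrp, struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
	struct vm_dirty_param param;

	if (mem_cgroup_is_root(memcg))
		/* The root cgroup mirrors the global vm.dirty_* settings. */
		get_global_vm_dirty_param(&param);
	else
		param = memcg->dirty_param;

	switch (cft->private) {
	case MEM_CGROUP_DIRTY_RATIO:
		return param.dirty_ratio;
	case MEM_CGROUP_DIRTY_BYTES:
		return param.dirty_bytes;
	case MEM_CGROUP_DIRTY_BACKGROUND_RATIO:
		return param.dirty_background_ratio;
	case MEM_CGROUP_DIRTY_BACKGROUND_BYTES:
		return param.dirty_background_bytes;
	default:
		BUG();
	}
}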
Thanks,
-Andrea
* [PATCH -mmotm 3/5] page_cgroup: introduce file cache flags
2010-03-14 23:26 [PATCH -mmotm 0/5] memcg: per cgroup dirty limit (v7) Andrea Righi
2010-03-14 23:26 ` [PATCH -mmotm 1/5] memcg: disable irq at page cgroup lock Andrea Righi
2010-03-14 23:26 ` [PATCH -mmotm 2/5] memcg: dirty memory documentation Andrea Righi
@ 2010-03-14 23:26 ` Andrea Righi
2010-03-14 23:26 ` [PATCH -mmotm 4/5] memcg: dirty pages accounting and limiting infrastructure Andrea Righi
` (4 subsequent siblings)
7 siblings, 0 replies; 71+ messages in thread
From: Andrea Righi @ 2010-03-14 23:26 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki, Daisuke Nishimura, Balbir Singh
Cc: Vivek Goyal, Peter Zijlstra, Trond Myklebust, Suleiman Souhlal,
Greg Thelen, Kirill A. Shutemov, Andrew Morton, containers,
linux-kernel, linux-mm, Andrea Righi
Introduce page_cgroup flags to keep track of file cache pages.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrea Righi <arighi@develer.com>
---
include/linux/page_cgroup.h | 22 +++++++++++++++++++++-
1 files changed, 21 insertions(+), 1 deletions(-)
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index bf9a913..65247e4 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -40,7 +40,11 @@ enum {
PCG_USED, /* this object is in use. */
PCG_ACCT_LRU, /* page has been accounted for */
/* for cache-status accounting */
- PCG_FILE_MAPPED,
+ PCG_FILE_MAPPED, /* page is accounted as file rss*/
+ PCG_FILE_DIRTY, /* page is dirty */
+ PCG_FILE_WRITEBACK, /* page is being written back to disk */
+ PCG_FILE_WRITEBACK_TEMP, /* page is used as temporary buffer for FUSE */
+ PCG_FILE_UNSTABLE_NFS, /* NFS page not yet committed to the server */
};
#define TESTPCGFLAG(uname, lname) \
@@ -83,6 +87,22 @@ TESTPCGFLAG(FileMapped, FILE_MAPPED)
SETPCGFLAG(FileMapped, FILE_MAPPED)
CLEARPCGFLAG(FileMapped, FILE_MAPPED)
+TESTPCGFLAG(FileDirty, FILE_DIRTY)
+SETPCGFLAG(FileDirty, FILE_DIRTY)
+CLEARPCGFLAG(FileDirty, FILE_DIRTY)
+
+TESTPCGFLAG(FileWriteback, FILE_WRITEBACK)
+SETPCGFLAG(FileWriteback, FILE_WRITEBACK)
+CLEARPCGFLAG(FileWriteback, FILE_WRITEBACK)
+
+TESTPCGFLAG(FileWritebackTemp, FILE_WRITEBACK_TEMP)
+SETPCGFLAG(FileWritebackTemp, FILE_WRITEBACK_TEMP)
+CLEARPCGFLAG(FileWritebackTemp, FILE_WRITEBACK_TEMP)
+
+TESTPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS)
+SETPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS)
+CLEARPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS)
+
static inline int page_cgroup_nid(struct page_cgroup *pc)
{
return page_to_nid(pc->page);
--
1.6.3.3
* [PATCH -mmotm 4/5] memcg: dirty pages accounting and limiting infrastructure
2010-03-14 23:26 [PATCH -mmotm 0/5] memcg: per cgroup dirty limit (v7) Andrea Righi
` (2 preceding siblings ...)
2010-03-14 23:26 ` [PATCH -mmotm 3/5] page_cgroup: introduce file cache flags Andrea Righi
@ 2010-03-14 23:26 ` Andrea Righi
2010-03-15 2:26 ` KAMEZAWA Hiroyuki
` (2 more replies)
2010-03-14 23:26 ` [PATCH -mmotm 5/5] memcg: dirty pages instrumentation Andrea Righi
` (3 subsequent siblings)
7 siblings, 3 replies; 71+ messages in thread
From: Andrea Righi @ 2010-03-14 23:26 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki, Daisuke Nishimura, Balbir Singh
Cc: Vivek Goyal, Peter Zijlstra, Trond Myklebust, Suleiman Souhlal,
Greg Thelen, Kirill A. Shutemov, Andrew Morton, containers,
linux-kernel, linux-mm, Andrea Righi
Infrastructure to account dirty pages per cgroup and add dirty limit
interfaces in the cgroupfs:
- Direct write-out: memory.dirty_ratio, memory.dirty_bytes
- Background write-out: memory.dirty_background_ratio, memory.dirty_background_bytes
Signed-off-by: Andrea Righi <arighi@develer.com>
---
include/linux/memcontrol.h | 92 ++++++++-
mm/memcontrol.c | 484 +++++++++++++++++++++++++++++++++++++++++---
2 files changed, 540 insertions(+), 36 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 88d3f9e..0602ec9 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -19,12 +19,55 @@
#ifndef _LINUX_MEMCONTROL_H
#define _LINUX_MEMCONTROL_H
+
+#include <linux/writeback.h>
#include <linux/cgroup.h>
+
struct mem_cgroup;
struct page_cgroup;
struct page;
struct mm_struct;
+/* Cgroup memory statistics items exported to the kernel */
+enum mem_cgroup_read_page_stat_item {
+ MEMCG_NR_DIRTYABLE_PAGES,
+ MEMCG_NR_RECLAIM_PAGES,
+ MEMCG_NR_WRITEBACK,
+ MEMCG_NR_DIRTY_WRITEBACK_PAGES,
+};
+
+/* File cache pages accounting */
+enum mem_cgroup_write_page_stat_item {
+ MEMCG_NR_FILE_MAPPED, /* # of pages charged as file rss */
+ MEMCG_NR_FILE_DIRTY, /* # of dirty pages in page cache */
+ MEMCG_NR_FILE_WRITEBACK, /* # of pages under writeback */
+ MEMCG_NR_FILE_WRITEBACK_TEMP, /* # of pages under writeback using
+ temporary buffers */
+ MEMCG_NR_FILE_UNSTABLE_NFS, /* # of NFS unstable pages */
+
+ MEMCG_NR_FILE_NSTAT,
+};
+
+/* Dirty memory parameters */
+struct vm_dirty_param {
+ int dirty_ratio;
+ int dirty_background_ratio;
+ unsigned long dirty_bytes;
+ unsigned long dirty_background_bytes;
+};
+
+/*
+ * TODO: provide a validation check routine. And retry if validation
+ * fails.
+ */
+static inline void get_global_vm_dirty_param(struct vm_dirty_param *param)
+{
+ param->dirty_ratio = vm_dirty_ratio;
+ param->dirty_bytes = vm_dirty_bytes;
+ param->dirty_background_ratio = dirty_background_ratio;
+ param->dirty_background_bytes = dirty_background_bytes;
+}
+
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
/*
* All "charge" functions with gfp_mask should use GFP_KERNEL or
@@ -117,6 +160,25 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
extern int do_swap_account;
#endif
+extern bool mem_cgroup_has_dirty_limit(void);
+extern void get_vm_dirty_param(struct vm_dirty_param *param);
+extern s64 mem_cgroup_page_stat(enum mem_cgroup_read_page_stat_item item);
+
+extern void mem_cgroup_update_page_stat(struct page *page,
+ enum mem_cgroup_write_page_stat_item idx, bool charge);
+
+static inline void mem_cgroup_inc_page_stat(struct page *page,
+ enum mem_cgroup_write_page_stat_item idx)
+{
+ mem_cgroup_update_page_stat(page, idx, true);
+}
+
+static inline void mem_cgroup_dec_page_stat(struct page *page,
+ enum mem_cgroup_write_page_stat_item idx)
+{
+ mem_cgroup_update_page_stat(page, idx, false);
+}
+
static inline bool mem_cgroup_disabled(void)
{
if (mem_cgroup_subsys.disabled)
@@ -124,12 +186,6 @@ static inline bool mem_cgroup_disabled(void)
return false;
}
-enum mem_cgroup_page_stat_item {
- MEMCG_NR_FILE_MAPPED,
- MEMCG_NR_FILE_NSTAT,
-};
-
-void mem_cgroup_update_stat(struct page *page, int idx, bool charge);
unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
gfp_t gfp_mask, int nid,
int zid);
@@ -299,8 +355,18 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
}
-static inline void mem_cgroup_update_file_mapped(struct page *page,
- int val)
+static inline s64 mem_cgroup_page_stat(enum mem_cgroup_read_page_stat_item item)
+{
+ return -ENOSYS;
+}
+
+static inline void mem_cgroup_inc_page_stat(struct page *page,
+ enum mem_cgroup_write_page_stat_item idx)
+{
+}
+
+static inline void mem_cgroup_dec_page_stat(struct page *page,
+ enum mem_cgroup_write_page_stat_item idx)
{
}
@@ -311,6 +377,16 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
return 0;
}
+static inline bool mem_cgroup_has_dirty_limit(void)
+{
+ return false;
+}
+
+static inline void get_vm_dirty_param(struct vm_dirty_param *param)
+{
+ get_global_vm_dirty_param(param);
+}
+
#endif /* CONFIG_CGROUP_MEM_CONT */
#endif /* _LINUX_MEMCONTROL_H */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b7c23ea..91770d0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -80,14 +80,21 @@ enum mem_cgroup_stat_index {
/*
* For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
*/
- MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
+ MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
- MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */
+ /* File cache pages accounting */
+ MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
+ MEM_CGROUP_STAT_FILE_DIRTY, /* # of dirty pages in page cache */
+ MEM_CGROUP_STAT_WRITEBACK, /* # of pages under writeback */
+ MEM_CGROUP_STAT_WRITEBACK_TEMP, /* # of pages under writeback using
+ temporary buffers */
+ MEM_CGROUP_STAT_UNSTABLE_NFS, /* # of NFS unstable pages */
+
MEM_CGROUP_STAT_NSTATS,
};
@@ -95,6 +102,19 @@ struct mem_cgroup_stat_cpu {
s64 count[MEM_CGROUP_STAT_NSTATS];
};
+/* Per cgroup page statistics */
+struct mem_cgroup_page_stat {
+ enum mem_cgroup_read_page_stat_item item;
+ s64 value;
+};
+
+enum {
+ MEM_CGROUP_DIRTY_RATIO,
+ MEM_CGROUP_DIRTY_BYTES,
+ MEM_CGROUP_DIRTY_BACKGROUND_RATIO,
+ MEM_CGROUP_DIRTY_BACKGROUND_BYTES,
+};
+
/*
* per-zone information in memory controller.
*/
@@ -208,6 +228,9 @@ struct mem_cgroup {
unsigned int swappiness;
+ /* control memory cgroup dirty pages */
+ struct vm_dirty_param dirty_param;
+
/* set when res.limit == memsw.limit */
bool memsw_is_minimum;
@@ -1033,6 +1056,189 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
return swappiness;
}
+static bool dirty_param_is_valid(struct vm_dirty_param *param)
+{
+ if (param->dirty_ratio && param->dirty_bytes)
+ return false;
+ if (param->dirty_background_ratio && param->dirty_background_bytes)
+ return false;
+ return true;
+}
+
+static void __mem_cgroup_get_dirty_param(struct vm_dirty_param *param,
+ struct mem_cgroup *mem)
+{
+ param->dirty_ratio = mem->dirty_param.dirty_ratio;
+ param->dirty_bytes = mem->dirty_param.dirty_bytes;
+ param->dirty_background_ratio = mem->dirty_param.dirty_background_ratio;
+ param->dirty_background_bytes = mem->dirty_param.dirty_background_bytes;
+}
+
+/*
+ * get_vm_dirty_param() - get dirty memory parameters of the current memcg
+ * @param: a structure that is filled with the dirty memory settings
+ *
+ * The function fills @param with the current memcg dirty memory settings. If
+ * memory cgroup is disabled or in case of error the structure is filled with
+ * the global dirty memory settings.
+ */
+void get_vm_dirty_param(struct vm_dirty_param *param)
+{
+ struct mem_cgroup *memcg;
+
+ if (mem_cgroup_disabled()) {
+ get_global_vm_dirty_param(param);
+ return;
+ }
+ /*
+ * It's possible that "current" may be moved to other cgroup while we
+ * access cgroup. But precise check is meaningless because the task can
+ * be moved after our access and writeback tends to take long time.
+ * At least, "memcg" will not be freed under rcu_read_lock().
+ */
+ while (1) {
+ rcu_read_lock();
+ memcg = mem_cgroup_from_task(current);
+ if (likely(memcg))
+ __mem_cgroup_get_dirty_param(param, memcg);
+ else
+ get_global_vm_dirty_param(param);
+ rcu_read_unlock();
+ /*
+ * Since global and memcg vm_dirty_param are not protected we
+ * try to speculatively read them and retry if we get
+ * inconsistent values.
+ */
+ if (likely(dirty_param_is_valid(param)))
+ break;
+ }
+}
+
+/*
+ * mem_cgroup_has_dirty_limit() - check if current memcg has local dirty limits
+ *
+ * Return true if the current memory cgroup has local dirty memory settings,
+ * false otherwise.
+ */
+bool mem_cgroup_has_dirty_limit(void)
+{
+ struct mem_cgroup *mem;
+
+ if (mem_cgroup_disabled())
+ return false;
+ mem = mem_cgroup_from_task(current);
+ return mem && !mem_cgroup_is_root(mem);
+}
+
+static inline bool mem_cgroup_can_swap(struct mem_cgroup *memcg)
+{
+ if (!do_swap_account)
+ return nr_swap_pages > 0;
+ return !memcg->memsw_is_minimum &&
+ (res_counter_read_u64(&memcg->memsw, RES_LIMIT) > 0);
+}
+
+static s64 mem_cgroup_get_local_page_stat(struct mem_cgroup *memcg,
+ enum mem_cgroup_read_page_stat_item item)
+{
+ s64 ret;
+
+ switch (item) {
+ case MEMCG_NR_DIRTYABLE_PAGES:
+ ret = mem_cgroup_read_stat(memcg, LRU_ACTIVE_FILE) +
+ mem_cgroup_read_stat(memcg, LRU_INACTIVE_FILE);
+ if (mem_cgroup_can_swap(memcg))
+ ret += mem_cgroup_read_stat(memcg, LRU_ACTIVE_ANON) +
+ mem_cgroup_read_stat(memcg, LRU_INACTIVE_ANON);
+ break;
+ case MEMCG_NR_RECLAIM_PAGES:
+ ret = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_FILE_DIRTY) +
+ mem_cgroup_read_stat(memcg,
+ MEM_CGROUP_STAT_UNSTABLE_NFS);
+ break;
+ case MEMCG_NR_WRITEBACK:
+ ret = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
+ break;
+ case MEMCG_NR_DIRTY_WRITEBACK_PAGES:
+ ret = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK) +
+ mem_cgroup_read_stat(memcg,
+ MEM_CGROUP_STAT_UNSTABLE_NFS);
+ break;
+ default:
+ BUG();
+ break;
+ }
+ return ret;
+}
+
+static int mem_cgroup_page_stat_cb(struct mem_cgroup *mem, void *data)
+{
+ struct mem_cgroup_page_stat *stat = (struct mem_cgroup_page_stat *)data;
+
+ stat->value += mem_cgroup_get_local_page_stat(mem, stat->item);
+ return 0;
+}
+
+static unsigned long long
+memcg_get_hierarchical_free_pages(struct mem_cgroup *mem)
+{
+ struct cgroup *cgroup;
+ unsigned long long min_free, free;
+
+ min_free = res_counter_read_u64(&mem->res, RES_LIMIT) -
+ res_counter_read_u64(&mem->res, RES_USAGE);
+ cgroup = mem->css.cgroup;
+ if (!mem->use_hierarchy)
+ goto out;
+
+ while (cgroup->parent) {
+ cgroup = cgroup->parent;
+ mem = mem_cgroup_from_cont(cgroup);
+ if (!mem->use_hierarchy)
+ break;
+ free = res_counter_read_u64(&mem->res, RES_LIMIT) -
+ res_counter_read_u64(&mem->res, RES_USAGE);
+ min_free = min(min_free, free);
+ }
+out:
+ /* Translate free memory in pages */
+ return min_free >> PAGE_SHIFT;
+}
+
+/*
+ * mem_cgroup_page_stat() - get memory cgroup file cache statistics
+ * @item: memory statistic item exported to the kernel
+ *
+ * Return the accounted statistic value, or a negative value in case of error.
+ */
+s64 mem_cgroup_page_stat(enum mem_cgroup_read_page_stat_item item)
+{
+ struct mem_cgroup_page_stat stat = {};
+ struct mem_cgroup *mem;
+
+ rcu_read_lock();
+ mem = mem_cgroup_from_task(current);
+ if (mem && !mem_cgroup_is_root(mem)) {
+ /*
+ * If we're looking for dirtyable pages we need to evaluate
+ * free pages depending on the limit and usage of the parents
+ * first of all.
+ */
+ if (item == MEMCG_NR_DIRTYABLE_PAGES)
+ stat.value = memcg_get_hierarchical_free_pages(mem);
+ /*
+ * Recursively evaluate page statistics against all cgroup
+ * under hierarchy tree
+ */
+ stat.item = item;
+ mem_cgroup_walk_tree(mem, &stat, mem_cgroup_page_stat_cb);
+ } else
+ stat.value = -EINVAL;
+ rcu_read_unlock();
+
+ return stat.value;
+}
+
static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
{
int *val = data;
@@ -1344,24 +1550,16 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
return true;
}
-/*
- * Currently used to update mapped file statistics, but the routine can be
- * generalized to update other statistics as well.
- */
-void __mem_cgroup_update_stat(struct page_cgroup *pc, int idx, bool charge)
+static void __mem_cgroup_update_page_stat(struct page_cgroup *pc,
+ int idx, bool charge)
{
struct mem_cgroup *mem;
- int val;
+ int val = charge ? 1 : -1;
mem = pc->mem_cgroup;
if (!mem || !PageCgroupUsed(pc))
return;
- if (charge)
- val = 1;
- else
- val = -1;
-
switch (idx) {
case MEMCG_NR_FILE_MAPPED:
if (charge) {
@@ -1377,6 +1575,62 @@ void __mem_cgroup_update_stat(struct page_cgroup *pc, int idx, bool charge)
}
idx = MEM_CGROUP_STAT_FILE_MAPPED;
break;
+ case MEMCG_NR_FILE_DIRTY:
+ if (charge) {
+ if (!PageCgroupFileDirty(pc))
+ SetPageCgroupFileDirty(pc);
+ else
+ val = 0;
+ } else {
+ if (PageCgroupFileDirty(pc))
+ ClearPageCgroupFileDirty(pc);
+ else
+ val = 0;
+ }
+ idx = MEM_CGROUP_STAT_FILE_DIRTY;
+ break;
+ case MEMCG_NR_FILE_WRITEBACK:
+ if (charge) {
+ if (!PageCgroupFileWriteback(pc))
+ SetPageCgroupFileWriteback(pc);
+ else
+ val = 0;
+ } else {
+ if (PageCgroupFileWriteback(pc))
+ ClearPageCgroupFileWriteback(pc);
+ else
+ val = 0;
+ }
+ idx = MEM_CGROUP_STAT_WRITEBACK;
+ break;
+ case MEMCG_NR_FILE_WRITEBACK_TEMP:
+ if (charge) {
+ if (!PageCgroupFileWritebackTemp(pc))
+ SetPageCgroupFileWritebackTemp(pc);
+ else
+ val = 0;
+ } else {
+ if (PageCgroupFileWritebackTemp(pc))
+ ClearPageCgroupFileWritebackTemp(pc);
+ else
+ val = 0;
+ }
+ idx = MEM_CGROUP_STAT_WRITEBACK_TEMP;
+ break;
+ case MEMCG_NR_FILE_UNSTABLE_NFS:
+ if (charge) {
+ if (!PageCgroupFileUnstableNFS(pc))
+ SetPageCgroupFileUnstableNFS(pc);
+ else
+ val = 0;
+ } else {
+ if (PageCgroupFileUnstableNFS(pc))
+ ClearPageCgroupFileUnstableNFS(pc);
+ else
+ val = 0;
+ }
+ idx = MEM_CGROUP_STAT_UNSTABLE_NFS;
+ break;
default:
BUG();
break;
@@ -1384,34 +1638,81 @@ void __mem_cgroup_update_stat(struct page_cgroup *pc, int idx, bool charge)
/*
* Preemption is already disabled. We can use __this_cpu_xxx
*/
- __this_cpu_add(mem->stat->count[idx], val);
+ if (val)
+ __this_cpu_add(mem->stat->count[idx], val);
}
-void mem_cgroup_update_stat(struct page *page, int idx, bool charge)
+/*
+ * mem_cgroup_update_page_stat() - update memcg file cache's accounting
+ * @page: the page involved in a file cache operation.
+ * @idx: the particular file cache statistic.
+ * @charge: true to increment, false to decrement the statistic specified
+ * by @idx.
+ *
+ * Update memory cgroup file cache's accounting.
+ */
+void mem_cgroup_update_page_stat(struct page *page,
+ enum mem_cgroup_write_page_stat_item idx, bool charge)
{
struct page_cgroup *pc;
+ if (mem_cgroup_disabled())
+ return;
pc = lookup_page_cgroup(page);
if (!pc || mem_cgroup_is_root(pc->mem_cgroup))
return;
if (trylock_page_cgroup(pc)) {
- __mem_cgroup_update_stat(pc, idx, charge);
+ __mem_cgroup_update_page_stat(pc, idx, charge);
unlock_page_cgroup(pc);
}
- return;
}
+EXPORT_SYMBOL_GPL(mem_cgroup_update_page_stat);
-static void mem_cgroup_migrate_stat(struct page_cgroup *pc,
+/* Update file cache accounted statistics on task migration. */
+static void mem_cgroup_migrate_file_stat(struct page_cgroup *pc,
struct mem_cgroup *from, struct mem_cgroup *to)
{
if (PageCgroupFileMapped(pc)) {
__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
- if (!mem_cgroup_is_root(to)) {
- __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
- } else {
+ if (!mem_cgroup_is_root(to))
+ __this_cpu_inc(
+ to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+ else
ClearPageCgroupFileMapped(pc);
- }
+ }
+ if (PageCgroupFileDirty(pc)) {
+ __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_DIRTY]);
+ if (!mem_cgroup_is_root(to))
+ __this_cpu_inc(
+ to->stat->count[MEM_CGROUP_STAT_FILE_DIRTY]);
+ else
+ ClearPageCgroupFileDirty(pc);
+ }
+ if (PageCgroupFileWriteback(pc)) {
+ __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_WRITEBACK]);
+ if (!mem_cgroup_is_root(to))
+ __this_cpu_inc(
+ to->stat->count[MEM_CGROUP_STAT_WRITEBACK]);
+ else
+ ClearPageCgroupFileWriteback(pc);
+ }
+ if (PageCgroupFileWritebackTemp(pc)) {
+ __this_cpu_dec(
+ from->stat->count[MEM_CGROUP_STAT_WRITEBACK_TEMP]);
+ if (!mem_cgroup_is_root(to))
+ __this_cpu_inc(to->stat->count[
+ MEM_CGROUP_STAT_WRITEBACK_TEMP]);
+ else
+ ClearPageCgroupFileWritebackTemp(pc);
+ }
+ if (PageCgroupFileUnstableNFS(pc)) {
+ __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_UNSTABLE_NFS]);
+ if (!mem_cgroup_is_root(to))
+ __this_cpu_inc(
+ to->stat->count[MEM_CGROUP_STAT_UNSTABLE_NFS]);
+ else
+ ClearPageCgroupFileUnstableNFS(pc);
}
}
@@ -1425,6 +1726,23 @@ __mem_cgroup_stat_fixup(struct page_cgroup *pc, struct mem_cgroup *mem)
__this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
ClearPageCgroupFileMapped(pc);
}
+ if (PageCgroupFileDirty(pc)) {
+ __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_DIRTY]);
+ ClearPageCgroupFileDirty(pc);
+ }
+ if (PageCgroupFileWriteback(pc)) {
+ __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_WRITEBACK]);
+ ClearPageCgroupFileWriteback(pc);
+ }
+ if (PageCgroupFileWritebackTemp(pc)) {
+ __this_cpu_dec(
+ mem->stat->count[MEM_CGROUP_STAT_WRITEBACK_TEMP]);
+ ClearPageCgroupFileWritebackTemp(pc);
+ }
+ if (PageCgroupFileUnstableNFS(pc)) {
+ __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_UNSTABLE_NFS]);
+ ClearPageCgroupFileUnstableNFS(pc);
+ }
}
/*
@@ -1863,7 +2181,7 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
VM_BUG_ON(pc->mem_cgroup != from);
page = pc->page;
- mem_cgroup_migrate_stat(pc, from, to);
+ mem_cgroup_migrate_file_stat(pc, from, to);
mem_cgroup_charge_statistics(from, pc, false);
if (uncharge)
/* This is not "cancel", but cancel_charge does all we need. */
@@ -3168,10 +3486,14 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
enum {
MCS_CACHE,
MCS_RSS,
- MCS_FILE_MAPPED,
MCS_PGPGIN,
MCS_PGPGOUT,
MCS_SWAP,
+ MCS_FILE_MAPPED,
+ MCS_FILE_DIRTY,
+ MCS_WRITEBACK,
+ MCS_WRITEBACK_TEMP,
+ MCS_UNSTABLE_NFS,
MCS_INACTIVE_ANON,
MCS_ACTIVE_ANON,
MCS_INACTIVE_FILE,
@@ -3190,10 +3512,14 @@ struct {
} memcg_stat_strings[NR_MCS_STAT] = {
{"cache", "total_cache"},
{"rss", "total_rss"},
- {"mapped_file", "total_mapped_file"},
{"pgpgin", "total_pgpgin"},
{"pgpgout", "total_pgpgout"},
{"swap", "total_swap"},
+ {"mapped_file", "total_mapped_file"},
+ {"filedirty", "dirty_pages"},
+ {"writeback", "writeback_pages"},
+ {"writeback_tmp", "writeback_temp_pages"},
+ {"nfs", "nfs_unstable"},
{"inactive_anon", "total_inactive_anon"},
{"active_anon", "total_active_anon"},
{"inactive_file", "total_inactive_file"},
@@ -3212,8 +3538,6 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
s->stat[MCS_CACHE] += val * PAGE_SIZE;
val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
s->stat[MCS_RSS] += val * PAGE_SIZE;
- val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
- s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT);
s->stat[MCS_PGPGIN] += val;
val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT);
@@ -3222,6 +3546,16 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
s->stat[MCS_SWAP] += val * PAGE_SIZE;
}
+ val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
+ s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
+ val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_DIRTY);
+ s->stat[MCS_FILE_DIRTY] += val;
+ val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_WRITEBACK);
+ s->stat[MCS_WRITEBACK] += val;
+ val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_WRITEBACK_TEMP);
+ s->stat[MCS_WRITEBACK_TEMP] += val;
+ val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_UNSTABLE_NFS);
+ s->stat[MCS_UNSTABLE_NFS] += val;
/* per zone stat */
val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
@@ -3583,6 +3917,63 @@ unlock:
return ret;
}
+static u64 mem_cgroup_dirty_read(struct cgroup *cgrp, struct cftype *cft)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+
+ switch (cft->private) {
+ case MEM_CGROUP_DIRTY_RATIO:
+ return memcg->dirty_param.dirty_ratio;
+ case MEM_CGROUP_DIRTY_BYTES:
+ return memcg->dirty_param.dirty_bytes;
+ case MEM_CGROUP_DIRTY_BACKGROUND_RATIO:
+ return memcg->dirty_param.dirty_background_ratio;
+ case MEM_CGROUP_DIRTY_BACKGROUND_BYTES:
+ return memcg->dirty_param.dirty_background_bytes;
+ default:
+ BUG();
+ }
+}
+
+static int
+mem_cgroup_dirty_write(struct cgroup *cgrp, struct cftype *cft, u64 val)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+ int type = cft->private;
+
+ if (cgrp->parent == NULL)
+ return -EINVAL;
+ if ((type == MEM_CGROUP_DIRTY_RATIO ||
+ type == MEM_CGROUP_DIRTY_BACKGROUND_RATIO) && val > 100)
+ return -EINVAL;
+ /*
+ * TODO: provide a validation check routine. And retry if validation
+ * fails.
+ */
+ switch (type) {
+ case MEM_CGROUP_DIRTY_RATIO:
+ memcg->dirty_param.dirty_ratio = val;
+ memcg->dirty_param.dirty_bytes = 0;
+ break;
+ case MEM_CGROUP_DIRTY_BYTES:
+ memcg->dirty_param.dirty_ratio = 0;
+ memcg->dirty_param.dirty_bytes = val;
+ break;
+ case MEM_CGROUP_DIRTY_BACKGROUND_RATIO:
+ memcg->dirty_param.dirty_background_ratio = val;
+ memcg->dirty_param.dirty_background_bytes = 0;
+ break;
+ case MEM_CGROUP_DIRTY_BACKGROUND_BYTES:
+ memcg->dirty_param.dirty_background_ratio = 0;
+ memcg->dirty_param.dirty_background_bytes = val;
+ break;
+ default:
+ BUG();
+ break;
+ }
+ return 0;
+}
+
static struct cftype mem_cgroup_files[] = {
{
.name = "usage_in_bytes",
@@ -3634,6 +4025,30 @@ static struct cftype mem_cgroup_files[] = {
.write_u64 = mem_cgroup_swappiness_write,
},
{
+ .name = "dirty_ratio",
+ .read_u64 = mem_cgroup_dirty_read,
+ .write_u64 = mem_cgroup_dirty_write,
+ .private = MEM_CGROUP_DIRTY_RATIO,
+ },
+ {
+ .name = "dirty_bytes",
+ .read_u64 = mem_cgroup_dirty_read,
+ .write_u64 = mem_cgroup_dirty_write,
+ .private = MEM_CGROUP_DIRTY_BYTES,
+ },
+ {
+ .name = "dirty_background_ratio",
+ .read_u64 = mem_cgroup_dirty_read,
+ .write_u64 = mem_cgroup_dirty_write,
+ .private = MEM_CGROUP_DIRTY_BACKGROUND_RATIO,
+ },
+ {
+ .name = "dirty_background_bytes",
+ .read_u64 = mem_cgroup_dirty_read,
+ .write_u64 = mem_cgroup_dirty_write,
+ .private = MEM_CGROUP_DIRTY_BACKGROUND_BYTES,
+ },
+ {
.name = "move_charge_at_immigrate",
.read_u64 = mem_cgroup_move_charge_read,
.write_u64 = mem_cgroup_move_charge_write,
@@ -3892,8 +4307,21 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
mem->last_scanned_child = 0;
spin_lock_init(&mem->reclaim_param_lock);
- if (parent)
+ if (parent) {
mem->swappiness = get_swappiness(parent);
+ mem->dirty_param = parent->dirty_param;
+ } else {
+ while (1) {
+ get_global_vm_dirty_param(&mem->dirty_param);
+ /*
+ * Since global dirty parameters are not protected we
+ * try to speculatively read them and retry if we get
+ * inconsistent values.
+ */
+ if (likely(dirty_param_is_valid(&mem->dirty_param)))
+ break;
+ }
+ }
atomic_set(&mem->refcnt, 1);
mem->move_charge_at_immigrate = 0;
mutex_init(&mem->thresholds_lock);
--
1.6.3.3
* Re: [PATCH -mmotm 4/5] memcg: dirty pages accounting and limiting infrastructure
2010-03-14 23:26 ` [PATCH -mmotm 4/5] memcg: dirty pages accounting and limiting infrastructure Andrea Righi
@ 2010-03-15 2:26 ` KAMEZAWA Hiroyuki
2010-03-16 2:32 ` Daisuke Nishimura
2010-03-18 6:48 ` Greg Thelen
2 siblings, 0 replies; 71+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-03-15 2:26 UTC (permalink / raw)
To: Andrea Righi
Cc: Daisuke Nishimura, Balbir Singh, Vivek Goyal, Peter Zijlstra,
Trond Myklebust, Suleiman Souhlal, Greg Thelen,
Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
linux-mm
On Mon, 15 Mar 2010 00:26:41 +0100
Andrea Righi <arighi@develer.com> wrote:
> Infrastructure to account dirty pages per cgroup and add dirty limit
> interfaces in the cgroupfs:
>
> - Direct write-out: memory.dirty_ratio, memory.dirty_bytes
>
> - Background write-out: memory.dirty_background_ratio, memory.dirty_background_bytes
>
> Signed-off-by: Andrea Righi <arighi@develer.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
* Re: [PATCH -mmotm 4/5] memcg: dirty pages accounting and limiting infrastructure
2010-03-14 23:26 ` [PATCH -mmotm 4/5] memcg: dirty pages accounting and limiting infrastructure Andrea Righi
2010-03-15 2:26 ` KAMEZAWA Hiroyuki
@ 2010-03-16 2:32 ` Daisuke Nishimura
2010-03-16 14:11 ` Vivek Goyal
2010-03-17 22:52 ` Andrea Righi
2010-03-18 6:48 ` Greg Thelen
2 siblings, 2 replies; 71+ messages in thread
From: Daisuke Nishimura @ 2010-03-16 2:32 UTC (permalink / raw)
To: Andrea Righi
Cc: KAMEZAWA Hiroyuki, Balbir Singh, Vivek Goyal, Peter Zijlstra,
Trond Myklebust, Suleiman Souhlal, Greg Thelen,
Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
linux-mm, Daisuke Nishimura
On Mon, 15 Mar 2010 00:26:41 +0100, Andrea Righi <arighi@develer.com> wrote:
> Infrastructure to account dirty pages per cgroup and add dirty limit
> interfaces in the cgroupfs:
>
> - Direct write-out: memory.dirty_ratio, memory.dirty_bytes
>
> - Background write-out: memory.dirty_background_ratio, memory.dirty_background_bytes
>
> Signed-off-by: Andrea Righi <arighi@develer.com>
> ---
> include/linux/memcontrol.h | 92 ++++++++-
> mm/memcontrol.c | 484 +++++++++++++++++++++++++++++++++++++++++---
> 2 files changed, 540 insertions(+), 36 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 88d3f9e..0602ec9 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -19,12 +19,55 @@
>
> #ifndef _LINUX_MEMCONTROL_H
> #define _LINUX_MEMCONTROL_H
> +
> +#include <linux/writeback.h>
> #include <linux/cgroup.h>
> +
> struct mem_cgroup;
> struct page_cgroup;
> struct page;
> struct mm_struct;
>
> +/* Cgroup memory statistics items exported to the kernel */
> +enum mem_cgroup_read_page_stat_item {
> + MEMCG_NR_DIRTYABLE_PAGES,
> + MEMCG_NR_RECLAIM_PAGES,
> + MEMCG_NR_WRITEBACK,
> + MEMCG_NR_DIRTY_WRITEBACK_PAGES,
> +};
> +
> +/* File cache pages accounting */
> +enum mem_cgroup_write_page_stat_item {
> + MEMCG_NR_FILE_MAPPED, /* # of pages charged as file rss */
> + MEMCG_NR_FILE_DIRTY, /* # of dirty pages in page cache */
> + MEMCG_NR_FILE_WRITEBACK, /* # of pages under writeback */
> + MEMCG_NR_FILE_WRITEBACK_TEMP, /* # of pages under writeback using
> + temporary buffers */
> + MEMCG_NR_FILE_UNSTABLE_NFS, /* # of NFS unstable pages */
> +
> + MEMCG_NR_FILE_NSTAT,
> +};
> +
> +/* Dirty memory parameters */
> +struct vm_dirty_param {
> + int dirty_ratio;
> + int dirty_background_ratio;
> + unsigned long dirty_bytes;
> + unsigned long dirty_background_bytes;
> +};
> +
> +/*
> + * TODO: provide a validation check routine. And retry if validation
> + * fails.
> + */
> +static inline void get_global_vm_dirty_param(struct vm_dirty_param *param)
> +{
> + param->dirty_ratio = vm_dirty_ratio;
> + param->dirty_bytes = vm_dirty_bytes;
> + param->dirty_background_ratio = dirty_background_ratio;
> + param->dirty_background_bytes = dirty_background_bytes;
> +}
> +
> #ifdef CONFIG_CGROUP_MEM_RES_CTLR
> /*
> * All "charge" functions with gfp_mask should use GFP_KERNEL or
> @@ -117,6 +160,25 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
> extern int do_swap_account;
> #endif
>
> +extern bool mem_cgroup_has_dirty_limit(void);
> +extern void get_vm_dirty_param(struct vm_dirty_param *param);
> +extern s64 mem_cgroup_page_stat(enum mem_cgroup_read_page_stat_item item);
> +
> +extern void mem_cgroup_update_page_stat(struct page *page,
> + enum mem_cgroup_write_page_stat_item idx, bool charge);
> +
> +static inline void mem_cgroup_inc_page_stat(struct page *page,
> + enum mem_cgroup_write_page_stat_item idx)
> +{
> + mem_cgroup_update_page_stat(page, idx, true);
> +}
> +
> +static inline void mem_cgroup_dec_page_stat(struct page *page,
> + enum mem_cgroup_write_page_stat_item idx)
> +{
> + mem_cgroup_update_page_stat(page, idx, false);
> +}
> +
> static inline bool mem_cgroup_disabled(void)
> {
> if (mem_cgroup_subsys.disabled)
> @@ -124,12 +186,6 @@ static inline bool mem_cgroup_disabled(void)
> return false;
> }
>
> -enum mem_cgroup_page_stat_item {
> - MEMCG_NR_FILE_MAPPED,
> - MEMCG_NR_FILE_NSTAT,
> -};
> -
> -void mem_cgroup_update_stat(struct page *page, int idx, bool charge);
> unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
> gfp_t gfp_mask, int nid,
> int zid);
> @@ -299,8 +355,18 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
> {
> }
>
> -static inline void mem_cgroup_update_file_mapped(struct page *page,
> - int val)
> +static inline s64 mem_cgroup_page_stat(enum mem_cgroup_read_page_stat_item item)
> +{
> + return -ENOSYS;
> +}
> +
> +static inline void mem_cgroup_inc_page_stat(struct page *page,
> + enum mem_cgroup_write_page_stat_item idx)
> +{
> +}
> +
> +static inline void mem_cgroup_dec_page_stat(struct page *page,
> + enum mem_cgroup_write_page_stat_item idx)
> {
> }
>
> @@ -311,6 +377,16 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
> return 0;
> }
>
> +static inline bool mem_cgroup_has_dirty_limit(void)
> +{
> + return false;
> +}
> +
> +static inline void get_vm_dirty_param(struct vm_dirty_param *param)
> +{
> + get_global_vm_dirty_param(param);
> +}
> +
> #endif /* CONFIG_CGROUP_MEM_CONT */
>
> #endif /* _LINUX_MEMCONTROL_H */
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index b7c23ea..91770d0 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -80,14 +80,21 @@ enum mem_cgroup_stat_index {
> /*
> * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
> */
> - MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
> + MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
> MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
> - MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
> MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
> MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
> MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
> MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */
>
> + /* File cache pages accounting */
> + MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
> + MEM_CGROUP_STAT_FILE_DIRTY, /* # of dirty pages in page cache */
> + MEM_CGROUP_STAT_WRITEBACK, /* # of pages under writeback */
> + MEM_CGROUP_STAT_WRITEBACK_TEMP, /* # of pages under writeback using
> + temporary buffers */
> + MEM_CGROUP_STAT_UNSTABLE_NFS, /* # of NFS unstable pages */
> +
> MEM_CGROUP_STAT_NSTATS,
> };
>
> @@ -95,6 +102,19 @@ struct mem_cgroup_stat_cpu {
> s64 count[MEM_CGROUP_STAT_NSTATS];
> };
>
> +/* Per cgroup page statistics */
> +struct mem_cgroup_page_stat {
> + enum mem_cgroup_read_page_stat_item item;
> + s64 value;
> +};
> +
> +enum {
> + MEM_CGROUP_DIRTY_RATIO,
> + MEM_CGROUP_DIRTY_BYTES,
> + MEM_CGROUP_DIRTY_BACKGROUND_RATIO,
> + MEM_CGROUP_DIRTY_BACKGROUND_BYTES,
> +};
> +
> /*
> * per-zone information in memory controller.
> */
> @@ -208,6 +228,9 @@ struct mem_cgroup {
>
> unsigned int swappiness;
>
> + /* control memory cgroup dirty pages */
> + struct vm_dirty_param dirty_param;
> +
> /* set when res.limit == memsw.limit */
> bool memsw_is_minimum;
>
> @@ -1033,6 +1056,189 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
> return swappiness;
> }
>
> +static bool dirty_param_is_valid(struct vm_dirty_param *param)
> +{
> + if (param->dirty_ratio && param->dirty_bytes)
> + return false;
> + if (param->dirty_background_ratio && param->dirty_background_bytes)
> + return false;
> + return true;
> +}
> +
> +static void __mem_cgroup_get_dirty_param(struct vm_dirty_param *param,
> + struct mem_cgroup *mem)
> +{
> + param->dirty_ratio = mem->dirty_param.dirty_ratio;
> + param->dirty_bytes = mem->dirty_param.dirty_bytes;
> + param->dirty_background_ratio = mem->dirty_param.dirty_background_ratio;
> + param->dirty_background_bytes = mem->dirty_param.dirty_background_bytes;
> +}
> +
> +/*
> + * get_vm_dirty_param() - get dirty memory parameters of the current memcg
> + * @param: a structure that is filled with the dirty memory settings
> + *
> + * The function fills @param with the current memcg dirty memory settings. If
> + * memory cgroup is disabled or in case of error the structure is filled with
> + * the global dirty memory settings.
> + */
> +void get_vm_dirty_param(struct vm_dirty_param *param)
> +{
> + struct mem_cgroup *memcg;
> +
> + if (mem_cgroup_disabled()) {
> + get_global_vm_dirty_param(param);
> + return;
> + }
> + /*
> + * It's possible that "current" may be moved to other cgroup while we
> + * access cgroup. But precise check is meaningless because the task can
> + * be moved after our access and writeback tends to take long time.
> + * At least, "memcg" will not be freed under rcu_read_lock().
> + */
> + while (1) {
> + rcu_read_lock();
> + memcg = mem_cgroup_from_task(current);
> + if (likely(memcg))
> + __mem_cgroup_get_dirty_param(param, memcg);
> + else
> + get_global_vm_dirty_param(param);
> + rcu_read_unlock();
> + /*
> + * Since global and memcg vm_dirty_param are not protected we
> + * try to speculatively read them and retry if we get
> + * inconsistent values.
> + */
> + if (likely(dirty_param_is_valid(param)))
> + break;
> + }
> +}
> +
> +/*
> + * mem_cgroup_has_dirty_limit() - check if current memcg has local dirty limits
> + *
> + * Return true if the current memory cgroup has local dirty memory settings,
> + * false otherwise.
> + */
> +bool mem_cgroup_has_dirty_limit(void)
> +{
> + struct mem_cgroup *mem;
> +
> + if (mem_cgroup_disabled())
> + return false;
> + mem = mem_cgroup_from_task(current);
> + return mem && !mem_cgroup_is_root(mem);
> +}
> +
> +static inline bool mem_cgroup_can_swap(struct mem_cgroup *memcg)
> +{
> + if (!do_swap_account)
> + return nr_swap_pages > 0;
> + return !memcg->memsw_is_minimum &&
> + (res_counter_read_u64(&memcg->memsw, RES_LIMIT) > 0);
> +}
> +
> +static s64 mem_cgroup_get_local_page_stat(struct mem_cgroup *memcg,
> + enum mem_cgroup_read_page_stat_item item)
> +{
> + s64 ret;
> +
> + switch (item) {
> + case MEMCG_NR_DIRTYABLE_PAGES:
> + ret = mem_cgroup_read_stat(memcg, LRU_ACTIVE_FILE) +
> + mem_cgroup_read_stat(memcg, LRU_INACTIVE_FILE);
> + if (mem_cgroup_can_swap(memcg))
> + ret += mem_cgroup_read_stat(memcg, LRU_ACTIVE_ANON) +
> + mem_cgroup_read_stat(memcg, LRU_INACTIVE_ANON);
> + break;
> + case MEMCG_NR_RECLAIM_PAGES:
> + ret = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_FILE_DIRTY) +
> + mem_cgroup_read_stat(memcg,
> + MEM_CGROUP_STAT_UNSTABLE_NFS);
> + break;
> + case MEMCG_NR_WRITEBACK:
> + ret = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
> + break;
> + case MEMCG_NR_DIRTY_WRITEBACK_PAGES:
> + ret = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK) +
> + mem_cgroup_read_stat(memcg,
> + MEM_CGROUP_STAT_UNSTABLE_NFS);
> + break;
> + default:
> + BUG();
> + break;
> + }
> + return ret;
> +}
> +
> +static int mem_cgroup_page_stat_cb(struct mem_cgroup *mem, void *data)
> +{
> + struct mem_cgroup_page_stat *stat = (struct mem_cgroup_page_stat *)data;
> +
> + stat->value += mem_cgroup_get_local_page_stat(mem, stat->item);
> + return 0;
> +}
> +
> +static unsigned long long
> +memcg_get_hierarchical_free_pages(struct mem_cgroup *mem)
> +{
> + struct cgroup *cgroup;
> + unsigned long long min_free, free;
> +
> + min_free = res_counter_read_u64(&mem->res, RES_LIMIT) -
> + res_counter_read_u64(&mem->res, RES_USAGE);
> + cgroup = mem->css.cgroup;
> + if (!mem->use_hierarchy)
> + goto out;
> +
> + while (cgroup->parent) {
> + cgroup = cgroup->parent;
> + mem = mem_cgroup_from_cont(cgroup);
> + if (!mem->use_hierarchy)
> + break;
> + free = res_counter_read_u64(&mem->res, RES_LIMIT) -
> + res_counter_read_u64(&mem->res, RES_USAGE);
> + min_free = min(min_free, free);
> + }
> +out:
> + /* Translate free memory in pages */
> + return min_free >> PAGE_SHIFT;
> +}
> +
> +/*
> + * mem_cgroup_page_stat() - get memory cgroup file cache statistics
> + * @item: memory statistic item exported to the kernel
> + *
> + * Return the accounted statistic value, or a negative value in case of error.
> + */
> +s64 mem_cgroup_page_stat(enum mem_cgroup_read_page_stat_item item)
> +{
> + struct mem_cgroup_page_stat stat = {};
> + struct mem_cgroup *mem;
> +
> + rcu_read_lock();
> + mem = mem_cgroup_from_task(current);
> + if (mem && !mem_cgroup_is_root(mem)) {
> + /*
> + * If we're looking for dirtyable pages we need to evaluate
> + * free pages depending on the limit and usage of the parents
> + * first of all.
> + */
> + if (item == MEMCG_NR_DIRTYABLE_PAGES)
> + stat.value = memcg_get_hierarchical_free_pages(mem);
> + /*
> + * Recursively evaluate page statistics against all cgroup
> + * under hierarchy tree
> + */
> + stat.item = item;
> + mem_cgroup_walk_tree(mem, &stat, mem_cgroup_page_stat_cb);
> + } else
> + stat.value = -EINVAL;
> + rcu_read_unlock();
> +
> + return stat.value;
> +}
> +
hmm, mem_cgroup_page_stat() can return a negative value, but in [5/5] you place a BUG_ON()
that fires when it returns a negative value. What happens if current is moved
to the root cgroup between mem_cgroup_has_dirty_limit() and mem_cgroup_page_stat()?
How about making mem_cgroup_has_dirty_limit() return the target mem_cgroup, and
passing that mem_cgroup to mem_cgroup_page_stat()?
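For illustration, that interface change could look roughly like the sketch below. This is
only a sketch of the idea being proposed here, not code from the posted patches; the
pointer-returning variant of mem_cgroup_has_dirty_limit() and the extra memcg argument
are assumptions.

	/*
	 * Sketch: resolve current's memcg once and hand it to the stat helper,
	 * so both calls see the same cgroup.  The caller would have to hold
	 * rcu_read_lock() across both calls to keep the memcg alive.
	 */
	struct mem_cgroup *mem_cgroup_has_dirty_limit(void)
	{
		struct mem_cgroup *mem;

		if (mem_cgroup_disabled())
			return NULL;
		mem = mem_cgroup_from_task(current);
		if (!mem || mem_cgroup_is_root(mem))
			return NULL;	/* no per-cgroup dirty limit */
		return mem;
	}

	s64 mem_cgroup_page_stat(struct mem_cgroup *mem,
				 enum mem_cgroup_read_page_stat_item item)
	{
		struct mem_cgroup_page_stat stat = {};

		if (item == MEMCG_NR_DIRTYABLE_PAGES)
			stat.value = memcg_get_hierarchical_free_pages(mem);
		stat.item = item;
		mem_cgroup_walk_tree(mem, &stat, mem_cgroup_page_stat_cb);
		return stat.value;
	}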
> static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
> {
> int *val = data;
> @@ -1344,24 +1550,16 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
> return true;
> }
>
> -/*
> - * Currently used to update mapped file statistics, but the routine can be
> - * generalized to update other statistics as well.
> - */
> -void __mem_cgroup_update_stat(struct page_cgroup *pc, int idx, bool charge)
> +static void __mem_cgroup_update_page_stat(struct page_cgroup *pc,
> + int idx, bool charge)
> {
> struct mem_cgroup *mem;
> - int val;
> + int val = charge ? 1 : -1;
>
> mem = pc->mem_cgroup;
> if (!mem || !PageCgroupUsed(pc))
> return;
>
> - if (charge)
> - val = 1;
> - else
> - val = -1;
> -
> switch (idx) {
> case MEMCG_NR_FILE_MAPPED:
> if (charge) {
> @@ -1377,6 +1575,62 @@ void __mem_cgroup_update_stat(struct page_cgroup *pc, int idx, bool charge)
> }
> idx = MEM_CGROUP_STAT_FILE_MAPPED;
> break;
> + case MEMCG_NR_FILE_DIRTY:
> + if (charge) {
> + if (!PageCgroupFileDirty(pc))
> + SetPageCgroupFileDirty(pc);
> + else
> + val = 0;
> + } else {
> + if (PageCgroupFileDirty(pc))
> + ClearPageCgroupFileDirty(pc);
> + else
> + val = 0;
> + }
> + idx = MEM_CGROUP_STAT_FILE_DIRTY;
> + break;
> + case MEMCG_NR_FILE_WRITEBACK:
> + if (charge) {
> + if (!PageCgroupFileWriteback(pc))
> + SetPageCgroupFileWriteback(pc);
> + else
> + val = 0;
> + } else {
> + if (PageCgroupFileWriteback(pc))
> + ClearPageCgroupFileWriteback(pc);
> + else
> + val = 0;
> + }
> + idx = MEM_CGROUP_STAT_WRITEBACK;
> + break;
> + case MEMCG_NR_FILE_WRITEBACK_TEMP:
> + if (charge) {
> + if (!PageCgroupFileWritebackTemp(pc))
> + SetPageCgroupFileWritebackTemp(pc);
> + else
> + val = 0;
> + } else {
> + if (PageCgroupFileWritebackTemp(pc))
> + ClearPageCgroupFileWritebackTemp(pc);
> + else
> + val = 0;
> + }
> + idx = MEM_CGROUP_STAT_WRITEBACK_TEMP;
> + break;
> + case MEMCG_NR_FILE_UNSTABLE_NFS:
> + if (charge) {
> + if (!PageCgroupFileUnstableNFS(pc))
> + SetPageCgroupFileUnstableNFS(pc);
> + else
> + val = 0;
> + } else {
> + if (PageCgroupFileUnstableNFS(pc))
> + ClearPageCgroupFileUnstableNFS(pc);
> + else
> + val = 0;
> + }
> + idx = MEM_CGROUP_STAT_UNSTABLE_NFS;
> + break;
> default:
> BUG();
> break;
> @@ -1384,34 +1638,81 @@ void __mem_cgroup_update_stat(struct page_cgroup *pc, int idx, bool charge)
> /*
> * Preemption is already disabled. We can use __this_cpu_xxx
> */
> - __this_cpu_add(mem->stat->count[idx], val);
> + if (val)
> + __this_cpu_add(mem->stat->count[idx], val);
> }
>
> -void mem_cgroup_update_stat(struct page *page, int idx, bool charge)
> +/*
> + * mem_cgroup_update_page_stat() - update memcg file cache's accounting
> + * @page: the page involved in a file cache operation.
> + * @idx: the particular file cache statistic.
> + * @charge: true to increment, false to decrement the statistic specified
> + * by @idx.
> + *
> + * Update memory cgroup file cache's accounting.
> + */
> +void mem_cgroup_update_page_stat(struct page *page,
> + enum mem_cgroup_write_page_stat_item idx, bool charge)
> {
> struct page_cgroup *pc;
>
> + if (mem_cgroup_disabled())
> + return;
> pc = lookup_page_cgroup(page);
> if (!pc || mem_cgroup_is_root(pc->mem_cgroup))
> return;
>
> if (trylock_page_cgroup(pc)) {
> - __mem_cgroup_update_stat(pc, idx, charge);
> + __mem_cgroup_update_page_stat(pc, idx, charge);
> unlock_page_cgroup(pc);
> }
> - return;
> }
> +EXPORT_SYMBOL_GPL(mem_cgroup_update_page_stat);
>
> -static void mem_cgroup_migrate_stat(struct page_cgroup *pc,
> +/* Update file cache accounted statistics on task migration. */
> +static void mem_cgroup_migrate_file_stat(struct page_cgroup *pc,
> struct mem_cgroup *from, struct mem_cgroup *to)
> {
> if (PageCgroupFileMapped(pc)) {
> __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> - if (!mem_cgroup_is_root(to)) {
> - __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> - } else {
> + if (!mem_cgroup_is_root(to))
> + __this_cpu_inc(
> + to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> + else
> ClearPageCgroupFileMapped(pc);
> - }
> + }
> + if (PageCgroupFileDirty(pc)) {
> + __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_DIRTY]);
> + if (!mem_cgroup_is_root(to))
> + __this_cpu_inc(
> + to->stat->count[MEM_CGROUP_STAT_FILE_DIRTY]);
> + else
> + ClearPageCgroupFileDirty(pc);
> + }
> + if (PageCgroupFileWriteback(pc)) {
> + __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_WRITEBACK]);
> + if (!mem_cgroup_is_root(to))
> + __this_cpu_inc(
> + to->stat->count[MEM_CGROUP_STAT_WRITEBACK]);
> + else
> + ClearPageCgroupFileWriteback(pc);
> + }
> + if (PageCgroupFileWritebackTemp(pc)) {
> + __this_cpu_dec(
> + from->stat->count[MEM_CGROUP_STAT_WRITEBACK_TEMP]);
> + if (!mem_cgroup_is_root(to))
> + __this_cpu_inc(to->stat->count[
> + MEM_CGROUP_STAT_WRITEBACK_TEMP]);
> + else
> + ClearPageCgroupFileWritebackTemp(pc);
> + }
> + if (PageCgroupFileUnstableNFS(pc)) {
> + __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_UNSTABLE_NFS]);
> + if (!mem_cgroup_is_root(to))
> + __this_cpu_inc(
> + to->stat->count[MEM_CGROUP_STAT_UNSTABLE_NFS]);
> + else
> + ClearPageCgroupFileUnstableNFS(pc);
> }
> }
>
> @@ -1425,6 +1726,23 @@ __mem_cgroup_stat_fixup(struct page_cgroup *pc, struct mem_cgroup *mem)
> __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> ClearPageCgroupFileMapped(pc);
> }
> + if (PageCgroupFileDirty(pc)) {
> + __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_DIRTY]);
> + ClearPageCgroupFileDirty(pc);
> + }
> + if (PageCgroupFileWriteback(pc)) {
> + __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_WRITEBACK]);
> + ClearPageCgroupFileWriteback(pc);
> + }
> + if (PageCgroupFileWritebackTemp(pc)) {
> + __this_cpu_dec(
> + mem->stat->count[MEM_CGROUP_STAT_WRITEBACK_TEMP]);
> + ClearPageCgroupFileWritebackTemp(pc);
> + }
> + if (PageCgroupFileUnstableNFS(pc)) {
> + __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_UNSTABLE_NFS]);
> + ClearPageCgroupFileUnstableNFS(pc);
> + }
> }
>
> /*
> @@ -1863,7 +2181,7 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
> VM_BUG_ON(pc->mem_cgroup != from);
>
> page = pc->page;
> - mem_cgroup_migrate_stat(pc, from, to);
> + mem_cgroup_migrate_file_stat(pc, from, to);
> mem_cgroup_charge_statistics(from, pc, false);
> if (uncharge)
> /* This is not "cancel", but cancel_charge does all we need. */
> @@ -3168,10 +3486,14 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
> enum {
> MCS_CACHE,
> MCS_RSS,
> - MCS_FILE_MAPPED,
> MCS_PGPGIN,
> MCS_PGPGOUT,
> MCS_SWAP,
> + MCS_FILE_MAPPED,
> + MCS_FILE_DIRTY,
> + MCS_WRITEBACK,
> + MCS_WRITEBACK_TEMP,
> + MCS_UNSTABLE_NFS,
> MCS_INACTIVE_ANON,
> MCS_ACTIVE_ANON,
> MCS_INACTIVE_FILE,
> @@ -3190,10 +3512,14 @@ struct {
> } memcg_stat_strings[NR_MCS_STAT] = {
> {"cache", "total_cache"},
> {"rss", "total_rss"},
> - {"mapped_file", "total_mapped_file"},
> {"pgpgin", "total_pgpgin"},
> {"pgpgout", "total_pgpgout"},
> {"swap", "total_swap"},
> + {"mapped_file", "total_mapped_file"},
> + {"filedirty", "dirty_pages"},
> + {"writeback", "writeback_pages"},
> + {"writeback_tmp", "writeback_temp_pages"},
> + {"nfs", "nfs_unstable"},
> {"inactive_anon", "total_inactive_anon"},
> {"active_anon", "total_active_anon"},
> {"inactive_file", "total_inactive_file"},
Why not use "total_xxx" for the total_name?
> @@ -3212,8 +3538,6 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
> s->stat[MCS_CACHE] += val * PAGE_SIZE;
> val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
> s->stat[MCS_RSS] += val * PAGE_SIZE;
> - val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
> - s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
> val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT);
> s->stat[MCS_PGPGIN] += val;
> val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT);
> @@ -3222,6 +3546,16 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
> val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
> s->stat[MCS_SWAP] += val * PAGE_SIZE;
> }
> + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
> + s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
> + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_DIRTY);
> + s->stat[MCS_FILE_DIRTY] += val;
> + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_WRITEBACK);
> + s->stat[MCS_WRITEBACK] += val;
> + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_WRITEBACK_TEMP);
> + s->stat[MCS_WRITEBACK_TEMP] += val;
> + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_UNSTABLE_NFS);
> + s->stat[MCS_UNSTABLE_NFS] += val;
>
I don't have a strong objection, but I prefer showing them in bytes.
And can you add to mem_cgroup_stat_show() something like:
for (i = 0; i < NR_MCS_STAT; i++) {
if (i == MCS_SWAP && !do_swap_account)
continue;
+ if (i >= MCS_FILE_STAT_START && i <= MCS_FILE_STAT_END &&
+ mem_cgroup_is_root(mem_cont))
+ continue;
cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
}
so that the file stats are not shown in the root cgroup? They're meaningless values there anyway.
Of course, you'd better mention it in [2/5] too.
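(MCS_FILE_STAT_START/MCS_FILE_STAT_END are not defined in the posted patch; one possible
definition is sketched below. This is an assumption, including the choice of starting the
range at MCS_FILE_MAPPED, which is also skipped for the root cgroup in this series.)

	/* Sketch: range markers around the file-stat entries of the MCS enum */
	enum {
		MCS_CACHE,
		MCS_RSS,
		MCS_PGPGIN,
		MCS_PGPGOUT,
		MCS_SWAP,
		MCS_FILE_MAPPED,
		MCS_FILE_STAT_START = MCS_FILE_MAPPED,
		MCS_FILE_DIRTY,
		MCS_WRITEBACK,
		MCS_WRITEBACK_TEMP,
		MCS_UNSTABLE_NFS,
		MCS_FILE_STAT_END = MCS_UNSTABLE_NFS,
		MCS_INACTIVE_ANON,
		/* ... remaining LRU entries unchanged, ending with NR_MCS_STAT */
	};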
Thanks,
Daisuke Nishimura.
> /* per zone stat */
> val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
> @@ -3583,6 +3917,63 @@ unlock:
> return ret;
> }
>
> +static u64 mem_cgroup_dirty_read(struct cgroup *cgrp, struct cftype *cft)
> +{
> + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
> +
> + switch (cft->private) {
> + case MEM_CGROUP_DIRTY_RATIO:
> + return memcg->dirty_param.dirty_ratio;
> + case MEM_CGROUP_DIRTY_BYTES:
> + return memcg->dirty_param.dirty_bytes;
> + case MEM_CGROUP_DIRTY_BACKGROUND_RATIO:
> + return memcg->dirty_param.dirty_background_ratio;
> + case MEM_CGROUP_DIRTY_BACKGROUND_BYTES:
> + return memcg->dirty_param.dirty_background_bytes;
> + default:
> + BUG();
> + }
> +}
> +
> +static int
> +mem_cgroup_dirty_write(struct cgroup *cgrp, struct cftype *cft, u64 val)
> +{
> + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
> + int type = cft->private;
> +
> + if (cgrp->parent == NULL)
> + return -EINVAL;
> + if ((type == MEM_CGROUP_DIRTY_RATIO ||
> + type == MEM_CGROUP_DIRTY_BACKGROUND_RATIO) && val > 100)
> + return -EINVAL;
> + /*
> + * TODO: provide a validation check routine. And retry if validation
> + * fails.
> + */
> + switch (type) {
> + case MEM_CGROUP_DIRTY_RATIO:
> + memcg->dirty_param.dirty_ratio = val;
> + memcg->dirty_param.dirty_bytes = 0;
> + break;
> + case MEM_CGROUP_DIRTY_BYTES:
> + memcg->dirty_param.dirty_ratio = 0;
> + memcg->dirty_param.dirty_bytes = val;
> + break;
> + case MEM_CGROUP_DIRTY_BACKGROUND_RATIO:
> + memcg->dirty_param.dirty_background_ratio = val;
> + memcg->dirty_param.dirty_background_bytes = 0;
> + break;
> + case MEM_CGROUP_DIRTY_BACKGROUND_BYTES:
> + memcg->dirty_param.dirty_background_ratio = 0;
> + memcg->dirty_param.dirty_background_bytes = val;
> + break;
> + default:
> + BUG();
> + break;
> + }
> + return 0;
> +}
> +
> static struct cftype mem_cgroup_files[] = {
> {
> .name = "usage_in_bytes",
> @@ -3634,6 +4025,30 @@ static struct cftype mem_cgroup_files[] = {
> .write_u64 = mem_cgroup_swappiness_write,
> },
> {
> + .name = "dirty_ratio",
> + .read_u64 = mem_cgroup_dirty_read,
> + .write_u64 = mem_cgroup_dirty_write,
> + .private = MEM_CGROUP_DIRTY_RATIO,
> + },
> + {
> + .name = "dirty_bytes",
> + .read_u64 = mem_cgroup_dirty_read,
> + .write_u64 = mem_cgroup_dirty_write,
> + .private = MEM_CGROUP_DIRTY_BYTES,
> + },
> + {
> + .name = "dirty_background_ratio",
> + .read_u64 = mem_cgroup_dirty_read,
> + .write_u64 = mem_cgroup_dirty_write,
> + .private = MEM_CGROUP_DIRTY_BACKGROUND_RATIO,
> + },
> + {
> + .name = "dirty_background_bytes",
> + .read_u64 = mem_cgroup_dirty_read,
> + .write_u64 = mem_cgroup_dirty_write,
> + .private = MEM_CGROUP_DIRTY_BACKGROUND_BYTES,
> + },
> + {
> .name = "move_charge_at_immigrate",
> .read_u64 = mem_cgroup_move_charge_read,
> .write_u64 = mem_cgroup_move_charge_write,
> @@ -3892,8 +4307,21 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
> mem->last_scanned_child = 0;
> spin_lock_init(&mem->reclaim_param_lock);
>
> - if (parent)
> + if (parent) {
> mem->swappiness = get_swappiness(parent);
> + mem->dirty_param = parent->dirty_param;
> + } else {
> + while (1) {
> + get_global_vm_dirty_param(&mem->dirty_param);
> + /*
> + * Since global dirty parameters are not protected we
> + * try to speculatively read them and retry if we get
> + * inconsistent values.
> + */
> + if (likely(dirty_param_is_valid(&mem->dirty_param)))
> + break;
> + }
> + }
> atomic_set(&mem->refcnt, 1);
> mem->move_charge_at_immigrate = 0;
> mutex_init(&mem->thresholds_lock);
> --
> 1.6.3.3
>
* Re: [PATCH -mmotm 4/5] memcg: dirty pages accounting and limiting infrastructure
2010-03-16 2:32 ` Daisuke Nishimura
@ 2010-03-16 14:11 ` Vivek Goyal
2010-03-16 15:09 ` Daisuke Nishimura
2010-03-17 22:37 ` Andrea Righi
2010-03-17 22:52 ` Andrea Righi
1 sibling, 2 replies; 71+ messages in thread
From: Vivek Goyal @ 2010-03-16 14:11 UTC (permalink / raw)
To: Daisuke Nishimura
Cc: Andrea Righi, KAMEZAWA Hiroyuki, Balbir Singh, Peter Zijlstra,
Trond Myklebust, Suleiman Souhlal, Greg Thelen,
Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
linux-mm
On Tue, Mar 16, 2010 at 11:32:38AM +0900, Daisuke Nishimura wrote:
[..]
> > + * mem_cgroup_page_stat() - get memory cgroup file cache statistics
> > + * @item: memory statistic item exported to the kernel
> > + *
> > + * Return the accounted statistic value, or a negative value in case of error.
> > + */
> > +s64 mem_cgroup_page_stat(enum mem_cgroup_read_page_stat_item item)
> > +{
> > + struct mem_cgroup_page_stat stat = {};
> > + struct mem_cgroup *mem;
> > +
> > + rcu_read_lock();
> > + mem = mem_cgroup_from_task(current);
> > + if (mem && !mem_cgroup_is_root(mem)) {
> > + /*
> > + * If we're looking for dirtyable pages we need to evaluate
> > + * free pages depending on the limit and usage of the parents
> > + * first of all.
> > + */
> > + if (item == MEMCG_NR_DIRTYABLE_PAGES)
> > + stat.value = memcg_get_hierarchical_free_pages(mem);
> > + /*
> > + * Recursively evaluate page statistics against all cgroup
> > + * under hierarchy tree
> > + */
> > + stat.item = item;
> > + mem_cgroup_walk_tree(mem, &stat, mem_cgroup_page_stat_cb);
> > + } else
> > + stat.value = -EINVAL;
> > + rcu_read_unlock();
> > +
> > + return stat.value;
> > +}
> > +
> hmm, mem_cgroup_page_stat() can return negative value, but you place BUG_ON()
> in [5/5] to check it returns negative value. What happens if the current is moved
> to root between mem_cgroup_has_dirty_limit() and mem_cgroup_page_stat() ?
> How about making mem_cgroup_has_dirty_limit() return the target mem_cgroup, and
> passing the mem_cgroup to mem_cgroup_page_stat() ?
>
Hmm, if mem_cgroup_has_dirty_limit() returns a pointer to the memcg, then one
will have to use rcu_read_lock() around it, and that will look ugly.
Why don't we simply look at the return value and, if it is negative,
fall back to using the global stats and get rid of the BUG_ON()?
Or, modify mem_cgroup_page_stat() to return the global stats if it can't
determine the per-cgroup stat for some reason (mem == NULL, root cgroup, etc.).
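For illustration, the second option could be sketched as below. This is only a rough
sketch of the idea, not posted code; which global counters count as the "equivalent"
of each item is my assumption, and MEMCG_NR_DIRTYABLE_PAGES is left out because the
global dirtyable-memory computation lives in mm/page-writeback.c.

	/* Sketch: fall back to the global counters inside mem_cgroup_page_stat() */
	s64 mem_cgroup_page_stat(enum mem_cgroup_read_page_stat_item item)
	{
		struct mem_cgroup_page_stat stat = {};
		struct mem_cgroup *mem;

		rcu_read_lock();
		mem = mem_cgroup_from_task(current);
		if (mem && !mem_cgroup_is_root(mem)) {
			if (item == MEMCG_NR_DIRTYABLE_PAGES)
				stat.value = memcg_get_hierarchical_free_pages(mem);
			stat.item = item;
			mem_cgroup_walk_tree(mem, &stat, mem_cgroup_page_stat_cb);
		} else {
			/* no usable memcg: report the equivalent global statistic */
			switch (item) {
			case MEMCG_NR_RECLAIM_PAGES:
				stat.value = global_page_state(NR_FILE_DIRTY) +
					     global_page_state(NR_UNSTABLE_NFS);
				break;
			case MEMCG_NR_WRITEBACK:
				stat.value = global_page_state(NR_WRITEBACK);
				break;
			case MEMCG_NR_DIRTY_WRITEBACK_PAGES:
				stat.value = global_page_state(NR_UNSTABLE_NFS) +
					     global_page_state(NR_WRITEBACK);
				break;
			default:
				stat.value = -EINVAL;
				break;
			}
		}
		rcu_read_unlock();
		return stat.value;
	}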
Vivek
* Re: [PATCH -mmotm 4/5] memcg: dirty pages accounting and limiting infrastructure
2010-03-16 14:11 ` Vivek Goyal
@ 2010-03-16 15:09 ` Daisuke Nishimura
2010-03-17 22:37 ` Andrea Righi
1 sibling, 0 replies; 71+ messages in thread
From: Daisuke Nishimura @ 2010-03-16 15:09 UTC (permalink / raw)
To: Vivek Goyal
Cc: Andrea Righi, KAMEZAWA Hiroyuki, Balbir Singh, Peter Zijlstra,
Trond Myklebust, Suleiman Souhlal, Greg Thelen,
Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
linux-mm, Daisuke Nishimura
On Tue, 16 Mar 2010 10:11:50 -0400
Vivek Goyal <vgoyal@redhat.com> wrote:
> On Tue, Mar 16, 2010 at 11:32:38AM +0900, Daisuke Nishimura wrote:
>
> [..]
> > > + * mem_cgroup_page_stat() - get memory cgroup file cache statistics
> > > + * @item: memory statistic item exported to the kernel
> > > + *
> > > + * Return the accounted statistic value, or a negative value in case of error.
> > > + */
> > > +s64 mem_cgroup_page_stat(enum mem_cgroup_read_page_stat_item item)
> > > +{
> > > + struct mem_cgroup_page_stat stat = {};
> > > + struct mem_cgroup *mem;
> > > +
> > > + rcu_read_lock();
> > > + mem = mem_cgroup_from_task(current);
> > > + if (mem && !mem_cgroup_is_root(mem)) {
> > > + /*
> > > + * If we're looking for dirtyable pages we need to evaluate
> > > + * free pages depending on the limit and usage of the parents
> > > + * first of all.
> > > + */
> > > + if (item == MEMCG_NR_DIRTYABLE_PAGES)
> > > + stat.value = memcg_get_hierarchical_free_pages(mem);
> > > + /*
> > > + * Recursively evaluate page statistics against all cgroup
> > > + * under hierarchy tree
> > > + */
> > > + stat.item = item;
> > > + mem_cgroup_walk_tree(mem, &stat, mem_cgroup_page_stat_cb);
> > > + } else
> > > + stat.value = -EINVAL;
> > > + rcu_read_unlock();
> > > +
> > > + return stat.value;
> > > +}
> > > +
> > hmm, mem_cgroup_page_stat() can return negative value, but you place BUG_ON()
> > in [5/5] to check it returns negative value. What happens if the current is moved
> > to root between mem_cgroup_has_dirty_limit() and mem_cgroup_page_stat() ?
> > How about making mem_cgroup_has_dirty_limit() return the target mem_cgroup, and
> > passing the mem_cgroup to mem_cgroup_page_stat() ?
> >
>
> Hmm, if mem_cgroup_has_dirty_limit() retrun pointer to memcg, then one
> shall have to use rcu_read_lock() and that will look ugly.
>
agreed.
> Why don't we simply look at the return value and if it is negative, we
> fall back to using global stats and get rid of BUG_ON()?
>
> Or, modify mem_cgroup_page_stat() to return global stats if it can't
> determine per cgroup stat for some reason. (mem=NULL or root cgroup etc).
>
I don't have any objection as long as we don't hit the BUG_ON().
Thanks,
Daisuke Nishimura.
> Vivek
>
* Re: [PATCH -mmotm 4/5] memcg: dirty pages accounting and limiting infrastructure
2010-03-16 14:11 ` Vivek Goyal
2010-03-16 15:09 ` Daisuke Nishimura
@ 2010-03-17 22:37 ` Andrea Righi
1 sibling, 0 replies; 71+ messages in thread
From: Andrea Righi @ 2010-03-17 22:37 UTC (permalink / raw)
To: Vivek Goyal
Cc: Daisuke Nishimura, KAMEZAWA Hiroyuki, Balbir Singh,
Peter Zijlstra, Trond Myklebust, Suleiman Souhlal, Greg Thelen,
Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
linux-mm
On Tue, Mar 16, 2010 at 10:11:50AM -0400, Vivek Goyal wrote:
> On Tue, Mar 16, 2010 at 11:32:38AM +0900, Daisuke Nishimura wrote:
>
> [..]
> > > + * mem_cgroup_page_stat() - get memory cgroup file cache statistics
> > > + * @item: memory statistic item exported to the kernel
> > > + *
> > > + * Return the accounted statistic value, or a negative value in case of error.
> > > + */
> > > +s64 mem_cgroup_page_stat(enum mem_cgroup_read_page_stat_item item)
> > > +{
> > > + struct mem_cgroup_page_stat stat = {};
> > > + struct mem_cgroup *mem;
> > > +
> > > + rcu_read_lock();
> > > + mem = mem_cgroup_from_task(current);
> > > + if (mem && !mem_cgroup_is_root(mem)) {
> > > + /*
> > > + * If we're looking for dirtyable pages we need to evaluate
> > > + * free pages depending on the limit and usage of the parents
> > > + * first of all.
> > > + */
> > > + if (item == MEMCG_NR_DIRTYABLE_PAGES)
> > > + stat.value = memcg_get_hierarchical_free_pages(mem);
> > > + /*
> > > + * Recursively evaluate page statistics against all cgroup
> > > + * under hierarchy tree
> > > + */
> > > + stat.item = item;
> > > + mem_cgroup_walk_tree(mem, &stat, mem_cgroup_page_stat_cb);
> > > + } else
> > > + stat.value = -EINVAL;
> > > + rcu_read_unlock();
> > > +
> > > + return stat.value;
> > > +}
> > > +
> > hmm, mem_cgroup_page_stat() can return negative value, but you place BUG_ON()
> > in [5/5] to check it returns negative value. What happens if the current is moved
> > to root between mem_cgroup_has_dirty_limit() and mem_cgroup_page_stat() ?
> > How about making mem_cgroup_has_dirty_limit() return the target mem_cgroup, and
> > passing the mem_cgroup to mem_cgroup_page_stat() ?
> >
>
> Hmm, if mem_cgroup_has_dirty_limit() retrun pointer to memcg, then one
> shall have to use rcu_read_lock() and that will look ugly.
>
> Why don't we simply look at the return value and if it is negative, we
> fall back to using global stats and get rid of BUG_ON()?
I vote for this one. IMHO the caller of mem_cgroup_page_stat() should
fall back to the equivalent global stats. This keeps things separated and
leaves only the memcg-specific code in mm/memcontrol.c.
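As an illustration of this caller-side fallback, one of the helpers introduced in [5/5]
could drop its BUG_ON() roughly like this (a sketch only, not posted code):

	/* Sketch: mm/page-writeback.c helper falling back to the global stats */
	static long get_reclaimable_pages(void)
	{
		s64 ret;

		if (!mem_cgroup_has_dirty_limit())
			return global_page_state(NR_FILE_DIRTY) +
				global_page_state(NR_UNSTABLE_NFS);
		ret = mem_cgroup_page_stat(MEMCG_NR_RECLAIM_PAGES);
		if (ret < 0)
			/* e.g. current was moved to the root cgroup in between */
			return global_page_state(NR_FILE_DIRTY) +
				global_page_state(NR_UNSTABLE_NFS);
		return ret;
	}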
>
> Or, modify mem_cgroup_page_stat() to return global stats if it can't
> determine per cgroup stat for some reason. (mem=NULL or root cgroup etc).
>
> Vivek
Thanks,
-Andrea
* Re: [PATCH -mmotm 4/5] memcg: dirty pages accounting and limiting infrastructure
2010-03-16 2:32 ` Daisuke Nishimura
2010-03-16 14:11 ` Vivek Goyal
@ 2010-03-17 22:52 ` Andrea Righi
1 sibling, 0 replies; 71+ messages in thread
From: Andrea Righi @ 2010-03-17 22:52 UTC (permalink / raw)
To: Daisuke Nishimura
Cc: KAMEZAWA Hiroyuki, Balbir Singh, Vivek Goyal, Peter Zijlstra,
Trond Myklebust, Suleiman Souhlal, Greg Thelen,
Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
linux-mm
On Tue, Mar 16, 2010 at 11:32:38AM +0900, Daisuke Nishimura wrote:
[snip]
> > @@ -3190,10 +3512,14 @@ struct {
> > } memcg_stat_strings[NR_MCS_STAT] = {
> > {"cache", "total_cache"},
> > {"rss", "total_rss"},
> > - {"mapped_file", "total_mapped_file"},
> > {"pgpgin", "total_pgpgin"},
> > {"pgpgout", "total_pgpgout"},
> > {"swap", "total_swap"},
> > + {"mapped_file", "total_mapped_file"},
> > + {"filedirty", "dirty_pages"},
> > + {"writeback", "writeback_pages"},
> > + {"writeback_tmp", "writeback_temp_pages"},
> > + {"nfs", "nfs_unstable"},
> > {"inactive_anon", "total_inactive_anon"},
> > {"active_anon", "total_active_anon"},
> > {"inactive_file", "total_inactive_file"},
> Why not using "total_xxx" for total_name ?
Agreed. It would definitely be clearer. Balbir, KAME-san, what do you
think?
>
> > @@ -3212,8 +3538,6 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
> > s->stat[MCS_CACHE] += val * PAGE_SIZE;
> > val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
> > s->stat[MCS_RSS] += val * PAGE_SIZE;
> > - val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
> > - s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
> > val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT);
> > s->stat[MCS_PGPGIN] += val;
> > val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT);
> > @@ -3222,6 +3546,16 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
> > val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
> > s->stat[MCS_SWAP] += val * PAGE_SIZE;
> > }
> > + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
> > + s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
> > + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_DIRTY);
> > + s->stat[MCS_FILE_DIRTY] += val;
> > + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_WRITEBACK);
> > + s->stat[MCS_WRITEBACK] += val;
> > + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_WRITEBACK_TEMP);
> > + s->stat[MCS_WRITEBACK_TEMP] += val;
> > + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_UNSTABLE_NFS);
> > + s->stat[MCS_UNSTABLE_NFS] += val;
> >
> I don't have a strong objection, but I prefer showing them in bytes.
> And can you add to mem_cgroup_stat_show() something like:
>
> for (i = 0; i < NR_MCS_STAT; i++) {
> if (i == MCS_SWAP && !do_swap_account)
> continue;
> + if (i >= MCS_FILE_STAT_STAR && i <= MCS_FILE_STAT_END &&
> + mem_cgroup_is_root(mem_cont))
> + continue;
> cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
> }
I like this. And I also prefer showing these values in bytes.
>
> not to show file stat in root cgroup ? It's meaningless value anyway.
> Of course, you'd better mention it in [2/5] too.
OK.
Thanks,
-Andrea
* Re: [PATCH -mmotm 4/5] memcg: dirty pages accounting and limiting infrastructure
2010-03-14 23:26 ` [PATCH -mmotm 4/5] memcg: dirty pages accounting and limiting infrastructure Andrea Righi
2010-03-15 2:26 ` KAMEZAWA Hiroyuki
2010-03-16 2:32 ` Daisuke Nishimura
@ 2010-03-18 6:48 ` Greg Thelen
2 siblings, 0 replies; 71+ messages in thread
From: Greg Thelen @ 2010-03-18 6:48 UTC (permalink / raw)
To: Andrea Righi, KAMEZAWA Hiroyuki, Daisuke Nishimura, Balbir Singh
Cc: Vivek Goyal, Peter Zijlstra, Trond Myklebust, Suleiman Souhlal,
Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
linux-mm, Greg Thelen
I have a proposed change to "[PATCH -mmotm 4/5] memcg: dirty pages accounting
and limiting infrastructure" v6. The change is small and I am presenting it
as a git patch (below) to be applied after 4/5 v6 has been applied.
The change is fairly simple. An alternative would be to reject my
patch (below) and enhance get_vm_dirty_param() to loop for consistency in all
cases.
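(For reference, that rejected alternative -- looping for consistency in all cases -- would
look roughly like the sketch below. This is an illustration of the idea only, not code
that was posted.)

	/* Sketch: retry until a consistent snapshot is read, in every case */
	void get_vm_dirty_param(struct vm_dirty_param *param)
	{
		struct mem_cgroup *memcg;

		do {
			if (mem_cgroup_disabled()) {
				get_global_vm_dirty_param(param);
				continue;	/* still re-check consistency below */
			}
			rcu_read_lock();
			memcg = mem_cgroup_from_task(current);
			if (likely(memcg))
				__mem_cgroup_get_dirty_param(param, memcg);
			else
				get_global_vm_dirty_param(param);
			rcu_read_unlock();
		} while (unlikely(!dirty_param_is_valid(param)));
	}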
---patch snip here, rest of email is git patch of 4/5 v6 ---
Removed unneeded looping from get_vm_dirty_param(). The only caller of
get_vm_dirty_param() gracefully handles inconsistent values, so there is no
need for get_vm_dirty_param() to loop to ensure consistency. The previous
looping was inconsistent because it did not loop for the case where memory
cgroups were disabled.
Signed-off-by: Greg Thelen <gthelen@google.com>
---
mm/memcontrol.c | 28 ++++++++++++----------------
1 files changed, 12 insertions(+), 16 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4d00c0f..990a907 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1081,6 +1081,10 @@ static void __mem_cgroup_get_dirty_param(struct vm_dirty_param *param,
* The function fills @param with the current memcg dirty memory settings. If
* memory cgroup is disabled or in case of error the structure is filled with
* the global dirty memory settings.
+ *
+ * Because global and memcg vm_dirty_param are not protected, inconsistent
+ * values may be returned. If consistent values are required, then the caller
+ * should call this routine until dirty_param_is_valid() returns true.
*/
void get_vm_dirty_param(struct vm_dirty_param *param)
{
@@ -1090,28 +1094,20 @@ void get_vm_dirty_param(struct vm_dirty_param *param)
get_global_vm_dirty_param(param);
return;
}
+
/*
* It's possible that "current" may be moved to other cgroup while we
* access cgroup. But precise check is meaningless because the task can
* be moved after our access and writeback tends to take long time.
* At least, "memcg" will not be freed under rcu_read_lock().
*/
- while (1) {
- rcu_read_lock();
- memcg = mem_cgroup_from_task(current);
- if (likely(memcg))
- __mem_cgroup_get_dirty_param(param, memcg);
- else
- get_global_vm_dirty_param(param);
- rcu_read_unlock();
- /*
- * Since global and memcg vm_dirty_param are not protected we
- * try to speculatively read them and retry if we get
- * inconsistent values.
- */
- if (likely(dirty_param_is_valid(param)))
- break;
- }
+ rcu_read_lock();
+ memcg = mem_cgroup_from_task(current);
+ if (likely(memcg))
+ __mem_cgroup_get_dirty_param(param, memcg);
+ else
+ get_global_vm_dirty_param(param);
+ rcu_read_unlock();
}
/*
--
1.7.0.1
* [PATCH -mmotm 5/5] memcg: dirty pages instrumentation
2010-03-14 23:26 [PATCH -mmotm 0/5] memcg: per cgroup dirty limit (v7) Andrea Righi
` (3 preceding siblings ...)
2010-03-14 23:26 ` [PATCH -mmotm 4/5] memcg: dirty pages accounting and limiting infrastructure Andrea Righi
@ 2010-03-14 23:26 ` Andrea Righi
2010-03-15 2:31 ` KAMEZAWA Hiroyuki
2010-03-15 2:36 ` [PATCH -mmotm 0/5] memcg: per cgroup dirty limit (v7) KAMEZAWA Hiroyuki
` (2 subsequent siblings)
7 siblings, 1 reply; 71+ messages in thread
From: Andrea Righi @ 2010-03-14 23:26 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki, Daisuke Nishimura, Balbir Singh
Cc: Vivek Goyal, Peter Zijlstra, Trond Myklebust, Suleiman Souhlal,
Greg Thelen, Kirill A. Shutemov, Andrew Morton, containers,
linux-kernel, linux-mm, Andrea Righi
Apply the cgroup dirty pages accounting and limiting infrastructure to
the appropriate kernel functions.
[ NOTE: for now do not account WritebackTmp pages (FUSE) and NILFS2
bounce pages. This depends on also charging bounce pages per cgroup. ]
As a bonus, make determine_dirtyable_memory() static again: this
function isn't used anymore outside page writeback.
Signed-off-by: Andrea Righi <arighi@develer.com>
---
fs/nfs/write.c | 4 +
include/linux/writeback.h | 2 -
mm/filemap.c | 1 +
mm/page-writeback.c | 215 ++++++++++++++++++++++++++++-----------------
mm/rmap.c | 4 +-
mm/truncate.c | 1 +
6 files changed, 141 insertions(+), 86 deletions(-)
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 53ff70e..3e8b9f8 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -440,6 +440,7 @@ nfs_mark_request_commit(struct nfs_page *req)
NFS_PAGE_TAG_COMMIT);
nfsi->ncommit++;
spin_unlock(&inode->i_lock);
+ mem_cgroup_inc_page_stat(req->wb_page, MEMCG_NR_FILE_UNSTABLE_NFS);
inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
@@ -451,6 +452,7 @@ nfs_clear_request_commit(struct nfs_page *req)
struct page *page = req->wb_page;
if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) {
+ mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_UNSTABLE_NFS);
dec_zone_page_state(page, NR_UNSTABLE_NFS);
dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE);
return 1;
@@ -1277,6 +1279,8 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how)
req = nfs_list_entry(head->next);
nfs_list_remove_request(req);
nfs_mark_request_commit(req);
+ mem_cgroup_dec_page_stat(req->wb_page,
+ MEMCG_NR_FILE_UNSTABLE_NFS);
dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
BDI_RECLAIMABLE);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index dd9512d..39e4cb2 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -117,8 +117,6 @@ extern int vm_highmem_is_dirtyable;
extern int block_dump;
extern int laptop_mode;
-extern unsigned long determine_dirtyable_memory(void);
-
extern int dirty_background_ratio_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos);
diff --git a/mm/filemap.c b/mm/filemap.c
index 62cbac0..bd833fe 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -135,6 +135,7 @@ void __remove_from_page_cache(struct page *page)
* having removed the page entirely.
*/
if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
+ mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_DIRTY);
dec_zone_page_state(page, NR_FILE_DIRTY);
dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index ab84693..fcac9b4 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -131,6 +131,111 @@ static struct prop_descriptor vm_completions;
static struct prop_descriptor vm_dirties;
/*
+ * Work out the current dirty-memory clamping and background writeout
+ * thresholds.
+ *
+ * The main aim here is to lower them aggressively if there is a lot of mapped
+ * memory around. To avoid stressing page reclaim with lots of unreclaimable
+ * pages. It is better to clamp down on writers than to start swapping, and
+ * performing lots of scanning.
+ *
+ * We only allow 1/2 of the currently-unmapped memory to be dirtied.
+ *
+ * We don't permit the clamping level to fall below 5% - that is getting rather
+ * excessive.
+ *
+ * We make sure that the background writeout level is below the adjusted
+ * clamping level.
+ */
+
+static unsigned long highmem_dirtyable_memory(unsigned long total)
+{
+#ifdef CONFIG_HIGHMEM
+ int node;
+ unsigned long x = 0;
+
+ for_each_node_state(node, N_HIGH_MEMORY) {
+ struct zone *z =
+ &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
+
+ x += zone_page_state(z, NR_FREE_PAGES) +
+ zone_reclaimable_pages(z);
+ }
+ /*
+ * Make sure that the number of highmem pages is never larger
+ * than the number of the total dirtyable memory. This can only
+ * occur in very strange VM situations but we want to make sure
+ * that this does not occur.
+ */
+ return min(x, total);
+#else
+ return 0;
+#endif
+}
+
+static unsigned long get_global_dirtyable_memory(void)
+{
+ unsigned long memory;
+
+ memory = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
+ if (!vm_highmem_is_dirtyable)
+ memory -= highmem_dirtyable_memory(memory);
+ return memory + 1;
+}
+
+static unsigned long get_dirtyable_memory(void)
+{
+ unsigned long memory;
+ s64 memcg_memory;
+
+ memory = get_global_dirtyable_memory();
+ if (!mem_cgroup_has_dirty_limit())
+ return memory;
+ memcg_memory = mem_cgroup_page_stat(MEMCG_NR_DIRTYABLE_PAGES);
+ BUG_ON(memcg_memory < 0);
+
+ return min((unsigned long)memcg_memory, memory);
+}
+
+static long get_reclaimable_pages(void)
+{
+ s64 ret;
+
+ if (!mem_cgroup_has_dirty_limit())
+ return global_page_state(NR_FILE_DIRTY) +
+ global_page_state(NR_UNSTABLE_NFS);
+ ret = mem_cgroup_page_stat(MEMCG_NR_RECLAIM_PAGES);
+ BUG_ON(ret < 0);
+
+ return ret;
+}
+
+static long get_writeback_pages(void)
+{
+ s64 ret;
+
+ if (!mem_cgroup_has_dirty_limit())
+ return global_page_state(NR_WRITEBACK);
+ ret = mem_cgroup_page_stat(MEMCG_NR_WRITEBACK);
+ BUG_ON(ret < 0);
+
+ return ret;
+}
+
+static unsigned long get_dirty_writeback_pages(void)
+{
+ s64 ret;
+
+ if (!mem_cgroup_has_dirty_limit())
+ return global_page_state(NR_UNSTABLE_NFS) +
+ global_page_state(NR_WRITEBACK);
+ ret = mem_cgroup_page_stat(MEMCG_NR_DIRTY_WRITEBACK_PAGES);
+ BUG_ON(ret < 0);
+
+ return ret;
+}
+
+/*
* couple the period to the dirty_ratio:
*
* period/2 ~ roundup_pow_of_two(dirty limit)
@@ -142,7 +247,7 @@ static int calc_period_shift(void)
if (vm_dirty_bytes)
dirty_total = vm_dirty_bytes / PAGE_SIZE;
else
- dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
+ dirty_total = (vm_dirty_ratio * get_global_dirtyable_memory()) /
100;
return 2 + ilog2(dirty_total - 1);
}
@@ -355,92 +460,34 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
}
EXPORT_SYMBOL(bdi_set_max_ratio);
-/*
- * Work out the current dirty-memory clamping and background writeout
- * thresholds.
- *
- * The main aim here is to lower them aggressively if there is a lot of mapped
- * memory around. To avoid stressing page reclaim with lots of unreclaimable
- * pages. It is better to clamp down on writers than to start swapping, and
- * performing lots of scanning.
- *
- * We only allow 1/2 of the currently-unmapped memory to be dirtied.
- *
- * We don't permit the clamping level to fall below 5% - that is getting rather
- * excessive.
- *
- * We make sure that the background writeout level is below the adjusted
- * clamping level.
- */
-
-static unsigned long highmem_dirtyable_memory(unsigned long total)
-{
-#ifdef CONFIG_HIGHMEM
- int node;
- unsigned long x = 0;
-
- for_each_node_state(node, N_HIGH_MEMORY) {
- struct zone *z =
- &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
-
- x += zone_page_state(z, NR_FREE_PAGES) +
- zone_reclaimable_pages(z);
- }
- /*
- * Make sure that the number of highmem pages is never larger
- * than the number of the total dirtyable memory. This can only
- * occur in very strange VM situations but we want to make sure
- * that this does not occur.
- */
- return min(x, total);
-#else
- return 0;
-#endif
-}
-
-/**
- * determine_dirtyable_memory - amount of memory that may be used
- *
- * Returns the numebr of pages that can currently be freed and used
- * by the kernel for direct mappings.
- */
-unsigned long determine_dirtyable_memory(void)
-{
- unsigned long x;
-
- x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
-
- if (!vm_highmem_is_dirtyable)
- x -= highmem_dirtyable_memory(x);
-
- return x + 1; /* Ensure that we never return 0 */
-}
-
void
get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
unsigned long *pbdi_dirty, struct backing_dev_info *bdi)
{
- unsigned long background;
- unsigned long dirty;
- unsigned long available_memory = determine_dirtyable_memory();
+ unsigned long dirty, background;
+ unsigned long available_memory = get_dirtyable_memory();
struct task_struct *tsk;
+ struct vm_dirty_param dirty_param;
- if (vm_dirty_bytes)
- dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
+ get_vm_dirty_param(&dirty_param);
+
+ if (dirty_param.dirty_bytes)
+ dirty = DIV_ROUND_UP(dirty_param.dirty_bytes, PAGE_SIZE);
else {
int dirty_ratio;
- dirty_ratio = vm_dirty_ratio;
+ dirty_ratio = dirty_param.dirty_ratio;
if (dirty_ratio < 5)
dirty_ratio = 5;
dirty = (dirty_ratio * available_memory) / 100;
}
- if (dirty_background_bytes)
- background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
+ if (dirty_param.dirty_background_bytes)
+ background = DIV_ROUND_UP(dirty_param.dirty_background_bytes,
+ PAGE_SIZE);
else
- background = (dirty_background_ratio * available_memory) / 100;
-
+ background = (dirty_param.dirty_background_ratio *
+ available_memory) / 100;
if (background >= dirty)
background = dirty / 2;
tsk = current;
@@ -505,9 +552,8 @@ static void balance_dirty_pages(struct address_space *mapping,
get_dirty_limits(&background_thresh, &dirty_thresh,
&bdi_thresh, bdi);
- nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS);
- nr_writeback = global_page_state(NR_WRITEBACK);
+ nr_reclaimable = get_reclaimable_pages();
+ nr_writeback = get_writeback_pages();
bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
@@ -593,10 +639,9 @@ static void balance_dirty_pages(struct address_space *mapping,
* In normal mode, we start background writeout at the lower
* background_thresh, to keep the amount of dirty memory low.
*/
+ nr_reclaimable = get_reclaimable_pages();
if ((laptop_mode && pages_written) ||
- (!laptop_mode && ((global_page_state(NR_FILE_DIRTY)
- + global_page_state(NR_UNSTABLE_NFS))
- > background_thresh)))
+ (!laptop_mode && (nr_reclaimable > background_thresh)))
bdi_start_writeback(bdi, NULL, 0);
}
@@ -660,6 +705,8 @@ void throttle_vm_writeout(gfp_t gfp_mask)
unsigned long dirty_thresh;
for ( ; ; ) {
+ unsigned long dirty;
+
get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
/*
@@ -668,10 +715,10 @@ void throttle_vm_writeout(gfp_t gfp_mask)
*/
dirty_thresh += dirty_thresh / 10; /* wheeee... */
- if (global_page_state(NR_UNSTABLE_NFS) +
- global_page_state(NR_WRITEBACK) <= dirty_thresh)
- break;
- congestion_wait(BLK_RW_ASYNC, HZ/10);
+ dirty = get_dirty_writeback_pages();
+ if (dirty <= dirty_thresh)
+ break;
+ congestion_wait(BLK_RW_ASYNC, HZ/10);
/*
* The caller might hold locks which can prevent IO completion
@@ -1078,6 +1125,7 @@ int __set_page_dirty_no_writeback(struct page *page)
void account_page_dirtied(struct page *page, struct address_space *mapping)
{
if (mapping_cap_account_dirty(mapping)) {
+ mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_DIRTY);
__inc_zone_page_state(page, NR_FILE_DIRTY);
__inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
task_dirty_inc(current);
@@ -1279,6 +1327,7 @@ int clear_page_dirty_for_io(struct page *page)
* for more comments.
*/
if (TestClearPageDirty(page)) {
+ mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_DIRTY);
dec_zone_page_state(page, NR_FILE_DIRTY);
dec_bdi_stat(mapping->backing_dev_info,
BDI_RECLAIMABLE);
@@ -1310,6 +1359,7 @@ int test_clear_page_writeback(struct page *page)
__bdi_writeout_inc(bdi);
}
}
+ mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_WRITEBACK);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
} else {
ret = TestClearPageWriteback(page);
@@ -1341,6 +1391,7 @@ int test_set_page_writeback(struct page *page)
radix_tree_tag_clear(&mapping->page_tree,
page_index(page),
PAGECACHE_TAG_DIRTY);
+ mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_WRITEBACK);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
} else {
ret = TestSetPageWriteback(page);
diff --git a/mm/rmap.c b/mm/rmap.c
index b5b2daf..916a660 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -828,8 +828,8 @@ void page_add_new_anon_rmap(struct page *page,
void page_add_file_rmap(struct page *page)
{
if (atomic_inc_and_test(&page->_mapcount)) {
+ mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED);
__inc_zone_page_state(page, NR_FILE_MAPPED);
- mem_cgroup_update_stat(page, MEMCG_NR_FILE_MAPPED, true);
}
}
@@ -860,8 +860,8 @@ void page_remove_rmap(struct page *page)
mem_cgroup_uncharge_page(page);
__dec_zone_page_state(page, NR_ANON_PAGES);
} else {
+ mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED);
__dec_zone_page_state(page, NR_FILE_MAPPED);
- mem_cgroup_update_stat(page, MEMCG_NR_FILE_MAPPED, false);
}
/*
* It would be tidy to reset the PageAnon mapping here,
diff --git a/mm/truncate.c b/mm/truncate.c
index e87e372..83366da 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -73,6 +73,7 @@ void cancel_dirty_page(struct page *page, unsigned int account_size)
if (TestClearPageDirty(page)) {
struct address_space *mapping = page->mapping;
if (mapping && mapping_cap_account_dirty(mapping)) {
+ mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_DIRTY);
dec_zone_page_state(page, NR_FILE_DIRTY);
dec_bdi_stat(mapping->backing_dev_info,
BDI_RECLAIMABLE);
--
1.6.3.3
* Re: [PATCH -mmotm 5/5] memcg: dirty pages instrumentation
2010-03-14 23:26 ` [PATCH -mmotm 5/5] memcg: dirty pages instrumentation Andrea Righi
@ 2010-03-15 2:31 ` KAMEZAWA Hiroyuki
0 siblings, 0 replies; 71+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-03-15 2:31 UTC (permalink / raw)
To: Andrea Righi
Cc: Daisuke Nishimura, Balbir Singh, Vivek Goyal, Peter Zijlstra,
Trond Myklebust, Suleiman Souhlal, Greg Thelen,
Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
linux-mm
On Mon, 15 Mar 2010 00:26:42 +0100
Andrea Righi <arighi@develer.com> wrote:
> Apply the cgroup dirty pages accounting and limiting infrastructure to
> the opportune kernel functions.
>
> [ NOTE: for now do not account WritebackTmp pages (FUSE) and NILFS2
> bounce pages. This depends on charging also bounce pages per cgroup. ]
>
> As a bonus, make determine_dirtyable_memory() static again: this
> function isn't used anymore outside page writeback.
>
> Signed-off-by: Andrea Righi <arighi@develer.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
A nitpick.
> @@ -660,6 +705,8 @@ void throttle_vm_writeout(gfp_t gfp_mask)
> unsigned long dirty_thresh;
>
> for ( ; ; ) {
> + unsigned long dirty;
> +
> get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
>
> /*
> @@ -668,10 +715,10 @@ void throttle_vm_writeout(gfp_t gfp_mask)
> */
> dirty_thresh += dirty_thresh / 10; /* wheeee... */
>
> - if (global_page_state(NR_UNSTABLE_NFS) +
> - global_page_state(NR_WRITEBACK) <= dirty_thresh)
> - break;
> - congestion_wait(BLK_RW_ASYNC, HZ/10);
> + dirty = get_dirty_writeback_pages();
> + if (dirty <= dirty_thresh)
> + break;
> + congestion_wait(BLK_RW_ASYNC, HZ/10);
>
> /*
> * The caller might hold locks which can prevent IO completion
"dirty" seems not to be necessary.
if (get_dirty_writeback_pages() < dirty_thresh) ?
Thanks,
-Kame
* Re: [PATCH -mmotm 0/5] memcg: per cgroup dirty limit (v7)
2010-03-14 23:26 [PATCH -mmotm 0/5] memcg: per cgroup dirty limit (v7) Andrea Righi
` (4 preceding siblings ...)
2010-03-14 23:26 ` [PATCH -mmotm 5/5] memcg: dirty pages instrumentation Andrea Righi
@ 2010-03-15 2:36 ` KAMEZAWA Hiroyuki
2010-03-15 10:02 ` Andrea Righi
2010-03-15 17:12 ` Vivek Goyal
2010-03-17 6:44 ` Balbir Singh
7 siblings, 1 reply; 71+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-03-15 2:36 UTC (permalink / raw)
To: Andrea Righi
Cc: Daisuke Nishimura, Balbir Singh, Vivek Goyal, Peter Zijlstra,
Trond Myklebust, Suleiman Souhlal, Greg Thelen,
Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
linux-mm
On Mon, 15 Mar 2010 00:26:37 +0100
Andrea Righi <arighi@develer.com> wrote:
> Control the maximum amount of dirty pages a cgroup can have at any given time.
>
> Per cgroup dirty limit is like fixing the max amount of dirty (hard to reclaim)
> page cache used by any cgroup. So, in case of multiple cgroup writers, they
> will not be able to consume more than their designated share of dirty pages and
> will be forced to perform write-out if they cross that limit.
>
> The overall design is the following:
>
> - account dirty pages per cgroup
> - limit the number of dirty pages via memory.dirty_ratio / memory.dirty_bytes
> and memory.dirty_background_ratio / memory.dirty_background_bytes in
> cgroupfs
> - start to write-out (background or actively) when the cgroup limits are
> exceeded
>
> This feature is supposed to be strictly connected to any underlying IO
> controller implementation, so we can stop increasing dirty pages in VM layer
> and enforce a write-out before any cgroup will consume the global amount of
> dirty pages defined by the /proc/sys/vm/dirty_ratio|dirty_bytes and
> /proc/sys/vm/dirty_background_ratio|dirty_background_bytes limits.
>
> Changelog (v6 -> v7)
> ~~~~~~~~~~~~~~~~~~~~~~
> * introduce trylock_page_cgroup() to guarantee that lock_page_cgroup()
> is never called under tree_lock (no strict accounting, but better overall
> performance)
> * do not account file cache statistics for the root cgroup (zero
> overhead for the root cgroup)
> * fix: evaluate cgroup free pages as the minimum free pages of all
> its parents
>
> Results
> ~~~~~~~
> The testcase is a kernel build (2.6.33 x86_64_defconfig) on an Intel Core 2 @
> 1.2GHz:
>
> <before>
> - root cgroup: 11m51.983s
> - child cgroup: 11m56.596s
>
> <after>
> - root cgroup: 11m51.742s
> - child cgroup: 12m5.016s
>
> In the previous version of this patchset, using the "complex" locking scheme
> with the _locked and _unlocked version of mem_cgroup_update_page_stat(), the
> child cgroup required 11m57.896s and 12m9.920s with lock_page_cgroup()+irq_disabled.
>
> With this version there's no overhead for the root cgroup (the small difference
> is in the error range). I expected to see less overhead for the child cgroup; I'll
> do more testing and try to figure out what's happening.
>
Okay, thanks. This seems like a good result. Optimization for children can be done under
the -mm tree, I think. (If there is no NACK, this seems ready for testing in -mm.)
> In the meanwhile, it would be great if someone could perform some tests on a larger
> system... unfortunately at the moment I don't have a big system available for
> this kind of tests...
>
I hope so, too.
Thanks,
-Kame
* Re: [PATCH -mmotm 0/5] memcg: per cgroup dirty limit (v7)
2010-03-15 2:36 ` [PATCH -mmotm 0/5] memcg: per cgroup dirty limit (v7) KAMEZAWA Hiroyuki
@ 2010-03-15 10:02 ` Andrea Righi
0 siblings, 0 replies; 71+ messages in thread
From: Andrea Righi @ 2010-03-15 10:02 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: Daisuke Nishimura, Balbir Singh, Vivek Goyal, Peter Zijlstra,
Trond Myklebust, Suleiman Souhlal, Greg Thelen,
Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
linux-mm
On Mon, Mar 15, 2010 at 11:36:12AM +0900, KAMEZAWA Hiroyuki wrote:
> On Mon, 15 Mar 2010 00:26:37 +0100
> Andrea Righi <arighi@develer.com> wrote:
>
> > Control the maximum amount of dirty pages a cgroup can have at any given time.
> >
> > Per cgroup dirty limit is like fixing the max amount of dirty (hard to reclaim)
> > page cache used by any cgroup. So, in case of multiple cgroup writers, they
> > will not be able to consume more than their designated share of dirty pages and
> > will be forced to perform write-out if they cross that limit.
> >
> > The overall design is the following:
> >
> > - account dirty pages per cgroup
> > - limit the number of dirty pages via memory.dirty_ratio / memory.dirty_bytes
> > and memory.dirty_background_ratio / memory.dirty_background_bytes in
> > cgroupfs
> > - start to write-out (background or actively) when the cgroup limits are
> > exceeded
> >
> > This feature is supposed to be strictly connected to any underlying IO
> > controller implementation, so we can stop increasing dirty pages in VM layer
> > and enforce a write-out before any cgroup will consume the global amount of
> > dirty pages defined by the /proc/sys/vm/dirty_ratio|dirty_bytes and
> > /proc/sys/vm/dirty_background_ratio|dirty_background_bytes limits.
> >
> > Changelog (v6 -> v7)
> > ~~~~~~~~~~~~~~~~~~~~~~
> > * introduce trylock_page_cgroup() to guarantee that lock_page_cgroup()
> > is never called under tree_lock (no strict accounting, but better overall
> > performance)
> > * do not account file cache statistics for the root cgroup (zero
> > overhead for the root cgroup)
> > * fix: evaluate cgroup free pages as the minimum free pages of all
> > its parents
> >
> > Results
> > ~~~~~~~
> > The testcase is a kernel build (2.6.33 x86_64_defconfig) on an Intel Core 2 @
> > 1.2GHz:
> >
> > <before>
> > - root cgroup: 11m51.983s
> > - child cgroup: 11m56.596s
> >
> > <after>
> > - root cgroup: 11m51.742s
> > - child cgroup: 12m5.016s
> >
> > In the previous version of this patchset, using the "complex" locking scheme
> > with the _locked and _unlocked version of mem_cgroup_update_page_stat(), the
> > child cgroup required 11m57.896s and 12m9.920s with lock_page_cgroup()+irq_disabled.
> >
> > With this version there's no overhead for the root cgroup (the small difference
> > is in the error range). I expected to see less overhead for the child cgroup; I'll
> > do more testing and try to figure out what's happening.
> >
> Okay, thanks. This seems like a good result. Optimization for children can be done under
> the -mm tree, I think. (If there is no NACK, this seems ready for testing in -mm.)
OK, I'll wait a bit to see if someone has other fixes or issues and post
a new version soon including these small changes.
Thanks,
-Andrea
* Re: [PATCH -mmotm 0/5] memcg: per cgroup dirty limit (v7)
2010-03-14 23:26 [PATCH -mmotm 0/5] memcg: per cgroup dirty limit (v7) Andrea Righi
` (5 preceding siblings ...)
2010-03-15 2:36 ` [PATCH -mmotm 0/5] memcg: per cgroup dirty limit (v7) KAMEZAWA Hiroyuki
@ 2010-03-15 17:12 ` Vivek Goyal
2010-03-15 17:19 ` Vivek Goyal
2010-03-17 6:44 ` Balbir Singh
7 siblings, 1 reply; 71+ messages in thread
From: Vivek Goyal @ 2010-03-15 17:12 UTC (permalink / raw)
To: Andrea Righi
Cc: KAMEZAWA Hiroyuki, Daisuke Nishimura, Balbir Singh,
Peter Zijlstra, Trond Myklebust, Suleiman Souhlal, Greg Thelen,
Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
linux-mm
On Mon, Mar 15, 2010 at 12:26:37AM +0100, Andrea Righi wrote:
> Control the maximum amount of dirty pages a cgroup can have at any given time.
>
> Per cgroup dirty limit is like fixing the max amount of dirty (hard to reclaim)
> page cache used by any cgroup. So, in case of multiple cgroup writers, they
> will not be able to consume more than their designated share of dirty pages and
> will be forced to perform write-out if they cross that limit.
>
For me even with this version I see that group with 100M limit is getting
much more BW.
root cgroup
==========
#time dd if=/dev/zero of=/root/zerofile bs=4K count=1M
4294967296 bytes (4.3 GB) copied, 55.7979 s, 77.0 MB/s
real 0m56.209s
test1 cgroup with memory limit of 100M
======================================
# time dd if=/dev/zero of=/root/zerofile1 bs=4K count=1M
4294967296 bytes (4.3 GB) copied, 20.9252 s, 205 MB/s
real 0m21.096s
Note, these two jobs are not running in parallel. These are running one
after the other.
Vivek
> The overall design is the following:
>
> - account dirty pages per cgroup
> - limit the number of dirty pages via memory.dirty_ratio / memory.dirty_bytes
> and memory.dirty_background_ratio / memory.dirty_background_bytes in
> cgroupfs
> - start to write-out (background or actively) when the cgroup limits are
> exceeded
>
> This feature is supposed to be strictly connected to any underlying IO
> controller implementation, so we can stop increasing dirty pages in VM layer
> and enforce a write-out before any cgroup will consume the global amount of
> dirty pages defined by the /proc/sys/vm/dirty_ratio|dirty_bytes and
> /proc/sys/vm/dirty_background_ratio|dirty_background_bytes limits.
>
> Changelog (v6 -> v7)
> ~~~~~~~~~~~~~~~~~~~~~~
> * introduce trylock_page_cgroup() to guarantee that lock_page_cgroup()
> is never called under tree_lock (no strict accounting, but better overall
> performance)
> * do not account file cache statistics for the root cgroup (zero
> overhead for the root cgroup)
> * fix: evaluate cgroup free pages as the minimum free pages of all
> its parents
>
> Results
> ~~~~~~~
> The testcase is a kernel build (2.6.33 x86_64_defconfig) on an Intel Core 2 @
> 1.2GHz:
>
> <before>
> - root cgroup: 11m51.983s
> - child cgroup: 11m56.596s
>
> <after>
> - root cgroup: 11m51.742s
> - child cgroup: 12m5.016s
>
> In the previous version of this patchset, using the "complex" locking scheme
> with the _locked and _unlocked version of mem_cgroup_update_page_stat(), the
> child cgroup required 11m57.896s and 12m9.920s with lock_page_cgroup()+irq_disabled.
>
> With this version there's no overhead for the root cgroup (the small difference
> is in the error range). I expected to see less overhead for the child cgroup; I'll
> do more testing and try to figure out what's happening.
>
> In the meanwhile, it would be great if someone could perform some tests on a larger
> system... unfortunately at the moment I don't have a big system available for
> this kind of tests...
>
> Thanks,
> -Andrea
>
> Documentation/cgroups/memory.txt | 36 +++
> fs/nfs/write.c | 4 +
> include/linux/memcontrol.h | 87 ++++++-
> include/linux/page_cgroup.h | 35 +++
> include/linux/writeback.h | 2 -
> mm/filemap.c | 1 +
> mm/memcontrol.c | 542 +++++++++++++++++++++++++++++++++++---
> mm/page-writeback.c | 215 ++++++++++------
> mm/rmap.c | 4 +-
> mm/truncate.c | 1 +
> 10 files changed, 806 insertions(+), 121 deletions(-)
* Re: [PATCH -mmotm 0/5] memcg: per cgroup dirty limit (v7)
2010-03-15 17:12 ` Vivek Goyal
@ 2010-03-15 17:19 ` Vivek Goyal
2010-03-17 11:54 ` Balbir Singh
0 siblings, 1 reply; 71+ messages in thread
From: Vivek Goyal @ 2010-03-15 17:19 UTC (permalink / raw)
To: Andrea Righi
Cc: KAMEZAWA Hiroyuki, Daisuke Nishimura, Balbir Singh,
Peter Zijlstra, Trond Myklebust, Suleiman Souhlal, Greg Thelen,
Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
linux-mm
On Mon, Mar 15, 2010 at 01:12:09PM -0400, Vivek Goyal wrote:
> On Mon, Mar 15, 2010 at 12:26:37AM +0100, Andrea Righi wrote:
> > Control the maximum amount of dirty pages a cgroup can have at any given time.
> >
> > Per cgroup dirty limit is like fixing the max amount of dirty (hard to reclaim)
> > page cache used by any cgroup. So, in case of multiple cgroup writers, they
> > will not be able to consume more than their designated share of dirty pages and
> > will be forced to perform write-out if they cross that limit.
> >
>
> For me even with this version I see that group with 100M limit is getting
> much more BW.
>
> root cgroup
> ==========
> #time dd if=/dev/zero of=/root/zerofile bs=4K count=1M
> 4294967296 bytes (4.3 GB) copied, 55.7979 s, 77.0 MB/s
>
> real 0m56.209s
>
> test1 cgroup with memory limit of 100M
> ======================================
> # time dd if=/dev/zero of=/root/zerofile1 bs=4K count=1M
> 4294967296 bytes (4.3 GB) copied, 20.9252 s, 205 MB/s
>
> real 0m21.096s
>
> Note, these two jobs are not running in parallel. These are running one
> after the other.
>
Ok, here is the strange part. I am seeing similar behavior even without
your patches applied.
root cgroup
==========
#time dd if=/dev/zero of=/root/zerofile bs=4K count=1M
4294967296 bytes (4.3 GB) copied, 56.098 s, 76.6 MB/s
real 0m56.614s
test1 cgroup with memory limit 100M
===================================
# time dd if=/dev/zero of=/root/zerofile1 bs=4K count=1M
4294967296 bytes (4.3 GB) copied, 19.8097 s, 217 MB/s
real 0m19.992s
Vivek
>
> > The overall design is the following:
> >
> > - account dirty pages per cgroup
> > - limit the number of dirty pages via memory.dirty_ratio / memory.dirty_bytes
> > and memory.dirty_background_ratio / memory.dirty_background_bytes in
> > cgroupfs
> > - start to write-out (background or actively) when the cgroup limits are
> > exceeded
> >
> > This feature is supposed to be strictly connected to any underlying IO
> > controller implementation, so we can stop increasing dirty pages in VM layer
> > and enforce a write-out before any cgroup will consume the global amount of
> > dirty pages defined by the /proc/sys/vm/dirty_ratio|dirty_bytes and
> > /proc/sys/vm/dirty_background_ratio|dirty_background_bytes limits.
> >
> > Changelog (v6 -> v7)
> > ~~~~~~~~~~~~~~~~~~~~~~
> > * introduce trylock_page_cgroup() to guarantee that lock_page_cgroup()
> > is never called under tree_lock (no strict accounting, but better overall
> > performance)
> > * do not account file cache statistics for the root cgroup (zero
> > overhead for the root cgroup)
> > * fix: evaluate cgroup free pages as the minimum free pages of all
> > its parents
> >
> > Results
> > ~~~~~~~
> > The testcase is a kernel build (2.6.33 x86_64_defconfig) on an Intel Core 2 @
> > 1.2GHz:
> >
> > <before>
> > - root cgroup: 11m51.983s
> > - child cgroup: 11m56.596s
> >
> > <after>
> > - root cgroup: 11m51.742s
> > - child cgroup: 12m5.016s
> >
> > In the previous version of this patchset, using the "complex" locking scheme
> > with the _locked and _unlocked version of mem_cgroup_update_page_stat(), the
> > child cgroup required 11m57.896s and 12m9.920s with lock_page_cgroup()+irq_disabled.
> >
> > With this version there's no overhead for the root cgroup (the small difference
> > is in the error range). I expected to see less overhead for the child cgroup; I'll
> > do more testing and try to figure out what's happening.
> >
> > In the meanwhile, it would be great if someone could perform some tests on a larger
> > system... unfortunately at the moment I don't have a big system available for
> > this kind of tests...
> >
> > Thanks,
> > -Andrea
> >
> > Documentation/cgroups/memory.txt | 36 +++
> > fs/nfs/write.c | 4 +
> > include/linux/memcontrol.h | 87 ++++++-
> > include/linux/page_cgroup.h | 35 +++
> > include/linux/writeback.h | 2 -
> > mm/filemap.c | 1 +
> > mm/memcontrol.c | 542 +++++++++++++++++++++++++++++++++++---
> > mm/page-writeback.c | 215 ++++++++++------
> > mm/rmap.c | 4 +-
> > mm/truncate.c | 1 +
> > 10 files changed, 806 insertions(+), 121 deletions(-)
* Re: [PATCH -mmotm 0/5] memcg: per cgroup dirty limit (v7)
2010-03-15 17:19 ` Vivek Goyal
@ 2010-03-17 11:54 ` Balbir Singh
2010-03-17 13:34 ` Vivek Goyal
0 siblings, 1 reply; 71+ messages in thread
From: Balbir Singh @ 2010-03-17 11:54 UTC (permalink / raw)
To: Vivek Goyal
Cc: Andrea Righi, KAMEZAWA Hiroyuki, Daisuke Nishimura,
Peter Zijlstra, Trond Myklebust, Suleiman Souhlal, Greg Thelen,
Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
linux-mm
* Vivek Goyal <vgoyal@redhat.com> [2010-03-15 13:19:21]:
> On Mon, Mar 15, 2010 at 01:12:09PM -0400, Vivek Goyal wrote:
> > On Mon, Mar 15, 2010 at 12:26:37AM +0100, Andrea Righi wrote:
> > > Control the maximum amount of dirty pages a cgroup can have at any given time.
> > >
> > > Per cgroup dirty limit is like fixing the max amount of dirty (hard to reclaim)
> > > page cache used by any cgroup. So, in case of multiple cgroup writers, they
> > > will not be able to consume more than their designated share of dirty pages and
> > > will be forced to perform write-out if they cross that limit.
> > >
> >
> > For me even with this version I see that group with 100M limit is getting
> > much more BW.
> >
> > root cgroup
> > ==========
> > #time dd if=/dev/zero of=/root/zerofile bs=4K count=1M
> > 4294967296 bytes (4.3 GB) copied, 55.7979 s, 77.0 MB/s
> >
> > real 0m56.209s
> >
> > test1 cgroup with memory limit of 100M
> > ======================================
> > # time dd if=/dev/zero of=/root/zerofile1 bs=4K count=1M
> > 4294967296 bytes (4.3 GB) copied, 20.9252 s, 205 MB/s
> >
> > real 0m21.096s
> >
> > Note, these two jobs are not running in parallel. These are running one
> > after the other.
> >
>
> Ok, here is the strange part. I am seeing similar behavior even without
> your patches applied.
>
> root cgroup
> ==========
> #time dd if=/dev/zero of=/root/zerofile bs=4K count=1M
> 4294967296 bytes (4.3 GB) copied, 56.098 s, 76.6 MB/s
>
> real 0m56.614s
>
> test1 cgroup with memory limit 100M
> ===================================
> # time dd if=/dev/zero of=/root/zerofile1 bs=4K count=1M
> 4294967296 bytes (4.3 GB) copied, 19.8097 s, 217 MB/s
>
> real 0m19.992s
>
This is strange, did you flush the cache between the two runs?
NOTE: Since the files are the same, we reuse page cache from the
other cgroup.
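For reference, a minimal sketch of such a flush between the two runs (root privileges and /proc/sys/vm/drop_caches are assumed; the usual shell equivalent is a sync followed by writing 3 to that file):

/*
 * Sketch only: write back dirty data, then drop the page cache so the
 * second dd run cannot reuse pages cached by the first.
 */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	FILE *f;

	sync();				/* write dirty pages back first */
	f = fopen("/proc/sys/vm/drop_caches", "w");
	if (!f) {
		perror("drop_caches");
		return 1;
	}
	fputs("3\n", f);		/* 3 = drop pagecache + dentries + inodes */
	fclose(f);
	return 0;
}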
--
Three Cheers,
Balbir
* Re: [PATCH -mmotm 0/5] memcg: per cgroup dirty limit (v7)
2010-03-17 11:54 ` Balbir Singh
@ 2010-03-17 13:34 ` Vivek Goyal
2010-03-17 18:53 ` Balbir Singh
2010-03-17 19:17 ` Balbir Singh
0 siblings, 2 replies; 71+ messages in thread
From: Vivek Goyal @ 2010-03-17 13:34 UTC (permalink / raw)
To: Balbir Singh
Cc: Andrea Righi, KAMEZAWA Hiroyuki, Daisuke Nishimura,
Peter Zijlstra, Trond Myklebust, Suleiman Souhlal, Greg Thelen,
Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
linux-mm
On Wed, Mar 17, 2010 at 05:24:28PM +0530, Balbir Singh wrote:
> * Vivek Goyal <vgoyal@redhat.com> [2010-03-15 13:19:21]:
>
> > On Mon, Mar 15, 2010 at 01:12:09PM -0400, Vivek Goyal wrote:
> > > On Mon, Mar 15, 2010 at 12:26:37AM +0100, Andrea Righi wrote:
> > > > Control the maximum amount of dirty pages a cgroup can have at any given time.
> > > >
> > > > Per cgroup dirty limit is like fixing the max amount of dirty (hard to reclaim)
> > > > page cache used by any cgroup. So, in case of multiple cgroup writers, they
> > > > will not be able to consume more than their designated share of dirty pages and
> > > > will be forced to perform write-out if they cross that limit.
> > > >
> > >
> > > For me even with this version I see that group with 100M limit is getting
> > > much more BW.
> > >
> > > root cgroup
> > > ==========
> > > #time dd if=/dev/zero of=/root/zerofile bs=4K count=1M
> > > 4294967296 bytes (4.3 GB) copied, 55.7979 s, 77.0 MB/s
> > >
> > > real 0m56.209s
> > >
> > > test1 cgroup with memory limit of 100M
> > > ======================================
> > > # time dd if=/dev/zero of=/root/zerofile1 bs=4K count=1M
> > > 4294967296 bytes (4.3 GB) copied, 20.9252 s, 205 MB/s
> > >
> > > real 0m21.096s
> > >
> > > Note, these two jobs are not running in parallel. These are running one
> > > after the other.
> > >
> >
> > Ok, here is the strange part. I am seeing similar behavior even without
> > your patches applied.
> >
> > root cgroup
> > ==========
> > #time dd if=/dev/zero of=/root/zerofile bs=4K count=1M
> > 4294967296 bytes (4.3 GB) copied, 56.098 s, 76.6 MB/s
> >
> > real 0m56.614s
> >
> > test1 cgroup with memory limit 100M
> > ===================================
> > # time dd if=/dev/zero of=/root/zerofile1 bs=4K count=1M
> > 4294967296 bytes (4.3 GB) copied, 19.8097 s, 217 MB/s
> >
> > real 0m19.992s
> >
>
> This is strange, did you flush the cache between the two runs?
> NOTE: Since the files are the same, we reuse page cache from the
> other cgroup.
Files are different. Note suffix "1".
Vivek
* Re: [PATCH -mmotm 0/5] memcg: per cgroup dirty limit (v7)
2010-03-17 13:34 ` Vivek Goyal
@ 2010-03-17 18:53 ` Balbir Singh
2010-03-17 19:15 ` Vivek Goyal
2010-03-17 19:17 ` Balbir Singh
1 sibling, 1 reply; 71+ messages in thread
From: Balbir Singh @ 2010-03-17 18:53 UTC (permalink / raw)
To: Vivek Goyal
Cc: Andrea Righi, KAMEZAWA Hiroyuki, Daisuke Nishimura,
Peter Zijlstra, Trond Myklebust, Suleiman Souhlal, Greg Thelen,
Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
linux-mm
* Vivek Goyal <vgoyal@redhat.com> [2010-03-17 09:34:07]:
> > >
> > > root cgroup
> > > ==========
> > > #time dd if=/dev/zero of=/root/zerofile bs=4K count=1M
> > > 4294967296 bytes (4.3 GB) copied, 56.098 s, 76.6 MB/s
> > >
> > > real 0m56.614s
> > >
> > > test1 cgroup with memory limit 100M
> > > ===================================
> > > # time dd if=/dev/zero of=/root/zerofile1 bs=4K count=1M
> > > 4294967296 bytes (4.3 GB) copied, 19.8097 s, 217 MB/s
> > >
> > > real 0m19.992s
> > >
> >
> > This is strange, did you flush the cache between the two runs?
> > NOTE: Since the files are the same, we reuse page cache from the
> > other cgroup.
>
> Files are different. Note suffix "1".
>
Thanks, I'll get the perf output and see what I get.
--
Three Cheers,
Balbir
* Re: [PATCH -mmotm 0/5] memcg: per cgroup dirty limit (v7)
2010-03-17 18:53 ` Balbir Singh
@ 2010-03-17 19:15 ` Vivek Goyal
0 siblings, 0 replies; 71+ messages in thread
From: Vivek Goyal @ 2010-03-17 19:15 UTC (permalink / raw)
To: Balbir Singh
Cc: Andrea Righi, KAMEZAWA Hiroyuki, Daisuke Nishimura,
Peter Zijlstra, Trond Myklebust, Suleiman Souhlal, Greg Thelen,
Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
linux-mm
On Thu, Mar 18, 2010 at 12:23:27AM +0530, Balbir Singh wrote:
> * Vivek Goyal <vgoyal@redhat.com> [2010-03-17 09:34:07]:
>
> > > >
> > > > root cgroup
> > > > ==========
> > > > #time dd if=/dev/zero of=/root/zerofile bs=4K count=1M
> > > > 4294967296 bytes (4.3 GB) copied, 56.098 s, 76.6 MB/s
> > > >
> > > > real 0m56.614s
> > > >
> > > > test1 cgroup with memory limit 100M
> > > > ===================================
> > > > # time dd if=/dev/zero of=/root/zerofile1 bs=4K count=1M
> > > > 4294967296 bytes (4.3 GB) copied, 19.8097 s, 217 MB/s
> > > >
> > > > real 0m19.992s
> > > >
> > >
> > > This is strange, did you flush the cache between the two runs?
> > > NOTE: Since the files are the same, we reuse page cache from the
> > > other cgroup.
> >
> > Files are different. Note suffix "1".
> >
>
> Thanks, I'll get the perf output and see what I get.
One more thing I noticed: it happens only if we limit the memory of the
cgroup to 100M. If the same cgroup test1 has unlimited memory, it did
not happen.
I also did not notice this happening on another system where I have 4G of
memory. So it also seems to be related only to bigger configurations.
Thanks
Vivek
>
> --
> Three Cheers,
> Balbir
* Re: [PATCH -mmotm 0/5] memcg: per cgroup dirty limit (v7)
2010-03-17 13:34 ` Vivek Goyal
2010-03-17 18:53 ` Balbir Singh
@ 2010-03-17 19:17 ` Balbir Singh
2010-03-17 19:48 ` Vivek Goyal
1 sibling, 1 reply; 71+ messages in thread
From: Balbir Singh @ 2010-03-17 19:17 UTC (permalink / raw)
To: Vivek Goyal
Cc: Andrea Righi, KAMEZAWA Hiroyuki, Daisuke Nishimura,
Peter Zijlstra, Trond Myklebust, Suleiman Souhlal, Greg Thelen,
Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
linux-mm
* Vivek Goyal <vgoyal@redhat.com> [2010-03-17 09:34:07]:
> On Wed, Mar 17, 2010 at 05:24:28PM +0530, Balbir Singh wrote:
> > * Vivek Goyal <vgoyal@redhat.com> [2010-03-15 13:19:21]:
> >
> > > On Mon, Mar 15, 2010 at 01:12:09PM -0400, Vivek Goyal wrote:
> > > > On Mon, Mar 15, 2010 at 12:26:37AM +0100, Andrea Righi wrote:
> > > > > Control the maximum amount of dirty pages a cgroup can have at any given time.
> > > > >
> > > > > Per cgroup dirty limit is like fixing the max amount of dirty (hard to reclaim)
> > > > > page cache used by any cgroup. So, in case of multiple cgroup writers, they
> > > > > will not be able to consume more than their designated share of dirty pages and
> > > > > will be forced to perform write-out if they cross that limit.
> > > > >
> > > >
> > > > For me even with this version I see that group with 100M limit is getting
> > > > much more BW.
> > > >
> > > > root cgroup
> > > > ==========
> > > > #time dd if=/dev/zero of=/root/zerofile bs=4K count=1M
> > > > 4294967296 bytes (4.3 GB) copied, 55.7979 s, 77.0 MB/s
> > > >
> > > > real 0m56.209s
> > > >
> > > > test1 cgroup with memory limit of 100M
> > > > ======================================
> > > > # time dd if=/dev/zero of=/root/zerofile1 bs=4K count=1M
> > > > 4294967296 bytes (4.3 GB) copied, 20.9252 s, 205 MB/s
> > > >
> > > > real 0m21.096s
> > > >
> > > > Note, these two jobs are not running in parallel. These are running one
> > > > after the other.
> > > >
> > >
The data is not always repeatable at my end. Are you able to modify
the order and get repeatable results?
In fact, I saw
for cgroup != root
------------------
4294967296 bytes (4.3 GB) copied, 120.359 s, 35.7 MB/s
for cgroup = root
-----------------
4294967296 bytes (4.3 GB) copied, 84.504 s, 50.8 MB/s
This is without the patches applied.
--
Three Cheers,
Balbir
* Re: [PATCH -mmotm 0/5] memcg: per cgroup dirty limit (v7)
2010-03-17 19:17 ` Balbir Singh
@ 2010-03-17 19:48 ` Vivek Goyal
0 siblings, 0 replies; 71+ messages in thread
From: Vivek Goyal @ 2010-03-17 19:48 UTC (permalink / raw)
To: Balbir Singh
Cc: Andrea Righi, KAMEZAWA Hiroyuki, Daisuke Nishimura,
Peter Zijlstra, Trond Myklebust, Suleiman Souhlal, Greg Thelen,
Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
linux-mm
On Thu, Mar 18, 2010 at 12:47:43AM +0530, Balbir Singh wrote:
> * Vivek Goyal <vgoyal@redhat.com> [2010-03-17 09:34:07]:
>
> > On Wed, Mar 17, 2010 at 05:24:28PM +0530, Balbir Singh wrote:
> > > * Vivek Goyal <vgoyal@redhat.com> [2010-03-15 13:19:21]:
> > >
> > > > On Mon, Mar 15, 2010 at 01:12:09PM -0400, Vivek Goyal wrote:
> > > > > On Mon, Mar 15, 2010 at 12:26:37AM +0100, Andrea Righi wrote:
> > > > > > Control the maximum amount of dirty pages a cgroup can have at any given time.
> > > > > >
> > > > > > Per cgroup dirty limit is like fixing the max amount of dirty (hard to reclaim)
> > > > > > page cache used by any cgroup. So, in case of multiple cgroup writers, they
> > > > > > will not be able to consume more than their designated share of dirty pages and
> > > > > > will be forced to perform write-out if they cross that limit.
> > > > > >
> > > > >
> > > > > For me even with this version I see that group with 100M limit is getting
> > > > > much more BW.
> > > > >
> > > > > root cgroup
> > > > > ==========
> > > > > #time dd if=/dev/zero of=/root/zerofile bs=4K count=1M
> > > > > 4294967296 bytes (4.3 GB) copied, 55.7979 s, 77.0 MB/s
> > > > >
> > > > > real 0m56.209s
> > > > >
> > > > > test1 cgroup with memory limit of 100M
> > > > > ======================================
> > > > > # time dd if=/dev/zero of=/root/zerofile1 bs=4K count=1M
> > > > > 4294967296 bytes (4.3 GB) copied, 20.9252 s, 205 MB/s
> > > > >
> > > > > real 0m21.096s
> > > > >
> > > > > Note, these two jobs are not running in parallel. These are running one
> > > > > after the other.
> > > > >
> > > >
>
> The data is not always repeatable at my end. Are you able to modify
> the order and get repeatable results?
>
> In fact, I saw
>
> for cgroup != root
> ------------------
> 4294967296 bytes (4.3 GB) copied, 120.359 s, 35.7 MB/s
>
> for cgroup = root
> -----------------
> 4294967296 bytes (4.3 GB) copied, 84.504 s, 50.8 MB/s
>
> This is without the patches applied.
I lost access to the big configuration machine, but on that machine I
could reproduce it all the time. On a smaller machine (4 cores, 4G), I
could not.
I will do some more tests later.
Vivek
* Re: [PATCH -mmotm 0/5] memcg: per cgroup dirty limit (v7)
2010-03-14 23:26 [PATCH -mmotm 0/5] memcg: per cgroup dirty limit (v7) Andrea Righi
` (6 preceding siblings ...)
2010-03-15 17:12 ` Vivek Goyal
@ 2010-03-17 6:44 ` Balbir Singh
7 siblings, 0 replies; 71+ messages in thread
From: Balbir Singh @ 2010-03-17 6:44 UTC (permalink / raw)
To: Andrea Righi
Cc: KAMEZAWA Hiroyuki, Daisuke Nishimura, Vivek Goyal, Peter Zijlstra,
Trond Myklebust, Suleiman Souhlal, Greg Thelen,
Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
linux-mm
* Andrea Righi <arighi@develer.com> [2010-03-15 00:26:37]:
> Control the maximum amount of dirty pages a cgroup can have at any given time.
>
> Per cgroup dirty limit is like fixing the max amount of dirty (hard to reclaim)
> page cache used by any cgroup. So, in case of multiple cgroup writers, they
> will not be able to consume more than their designated share of dirty pages and
> will be forced to perform write-out if they cross that limit.
>
> The overall design is the following:
>
> - account dirty pages per cgroup
> - limit the number of dirty pages via memory.dirty_ratio / memory.dirty_bytes
> and memory.dirty_background_ratio / memory.dirty_background_bytes in
> cgroupfs
> - start to write-out (background or actively) when the cgroup limits are
> exceeded
>
> This feature is supposed to be strictly connected to any underlying IO
> controller implementation, so we can stop increasing dirty pages in VM layer
> and enforce a write-out before any cgroup will consume the global amount of
> dirty pages defined by the /proc/sys/vm/dirty_ratio|dirty_bytes and
> /proc/sys/vm/dirty_background_ratio|dirty_background_bytes limits.
>
> Changelog (v6 -> v7)
> ~~~~~~~~~~~~~~~~~~~~~~
> * introduce trylock_page_cgroup() to guarantee that lock_page_cgroup()
> is never called under tree_lock (no strict accounting, but better overall
> performance)
> * do not account file cache statistics for the root cgroup (zero
> overhead for the root cgroup)
> * fix: evaluate cgroup free pages as the minimum free pages of all
> its parents
>
> Results
> ~~~~~~~
> The testcase is a kernel build (2.6.33 x86_64_defconfig) on an Intel Core 2 @
> 1.2GHz:
>
> <before>
> - root cgroup: 11m51.983s
> - child cgroup: 11m56.596s
>
> <after>
> - root cgroup: 11m51.742s
> - child cgroup: 12m5.016s
>
> In the previous version of this patchset, using the "complex" locking scheme
> with the _locked and _unlocked version of mem_cgroup_update_page_stat(), the
> child cgroup required 11m57.896s and 12m9.920s with lock_page_cgroup()+irq_disabled.
>
> With this version there's no overhead for the root cgroup (the small difference
> is in the error range). I expected to see less overhead for the child cgroup; I'll
> do more testing and try to figure out what's happening.
I like that the root overhead is going away.
>
> In the meanwhile, it would be great if someone could perform some tests on a larger
> system... unfortunately at the moment I don't have a big system available for
> this kind of tests...
>
I'll test this; I have a small machine to test on at the moment, and I'll
report back with data.
--
Three Cheers,
Balbir
* [PATCH -mmotm 4/5] memcg: dirty pages accounting and limiting infrastructure
2010-03-09 23:00 [PATCH -mmotm 0/5] memcg: per cgroup dirty limit (v6) Andrea Righi
@ 2010-03-09 23:00 ` Andrea Righi
2010-03-10 22:23 ` Vivek Goyal
0 siblings, 1 reply; 71+ messages in thread
From: Andrea Righi @ 2010-03-09 23:00 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki, Balbir Singh, Daisuke Nishimura
Cc: Vivek Goyal, Peter Zijlstra, Trond Myklebust, Suleiman Souhlal,
Greg Thelen, Kirill A. Shutemov, Andrew Morton, containers,
linux-kernel, linux-mm, Andrea Righi
Infrastructure to account dirty pages per cgroup and add dirty limit
interfaces in the cgroupfs:
- Direct write-out: memory.dirty_ratio, memory.dirty_bytes
- Background write-out: memory.dirty_background_ratio, memory.dirty_background_bytes
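For illustration, these knobs are plain per-cgroup files; a minimal userspace sketch that sets them could look like this (the /cgroup mount point and the "test1" group name are assumptions, not part of the patch):

/*
 * Sketch only: configure the proposed per-cgroup dirty limits from
 * userspace.  Writing a ratio clears the corresponding *_bytes value
 * and vice versa, as implemented by mem_cgroup_dirty_write() below.
 */
#include <stdio.h>

static int cg_write(const char *file, const char *val)
{
	char path[128];
	FILE *f;

	snprintf(path, sizeof(path), "/cgroup/test1/%s", file);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	cg_write("memory.dirty_ratio", "10");		/* direct write-out at 10% */
	cg_write("memory.dirty_background_ratio", "5");	/* background write-out at 5% */
	return 0;
}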
Signed-off-by: Andrea Righi <arighi@develer.com>
---
include/linux/memcontrol.h | 87 +++++++++-
mm/memcontrol.c | 432 ++++++++++++++++++++++++++++++++++++++++----
2 files changed, 480 insertions(+), 39 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 44301c6..0602ec9 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -19,12 +19,55 @@
#ifndef _LINUX_MEMCONTROL_H
#define _LINUX_MEMCONTROL_H
+
+#include <linux/writeback.h>
#include <linux/cgroup.h>
+
struct mem_cgroup;
struct page_cgroup;
struct page;
struct mm_struct;
+/* Cgroup memory statistics items exported to the kernel */
+enum mem_cgroup_read_page_stat_item {
+ MEMCG_NR_DIRTYABLE_PAGES,
+ MEMCG_NR_RECLAIM_PAGES,
+ MEMCG_NR_WRITEBACK,
+ MEMCG_NR_DIRTY_WRITEBACK_PAGES,
+};
+
+/* File cache pages accounting */
+enum mem_cgroup_write_page_stat_item {
+ MEMCG_NR_FILE_MAPPED, /* # of pages charged as file rss */
+ MEMCG_NR_FILE_DIRTY, /* # of dirty pages in page cache */
+ MEMCG_NR_FILE_WRITEBACK, /* # of pages under writeback */
+ MEMCG_NR_FILE_WRITEBACK_TEMP, /* # of pages under writeback using
+ temporary buffers */
+ MEMCG_NR_FILE_UNSTABLE_NFS, /* # of NFS unstable pages */
+
+ MEMCG_NR_FILE_NSTAT,
+};
+
+/* Dirty memory parameters */
+struct vm_dirty_param {
+ int dirty_ratio;
+ int dirty_background_ratio;
+ unsigned long dirty_bytes;
+ unsigned long dirty_background_bytes;
+};
+
+/*
+ * TODO: provide a validation check routine. And retry if validation
+ * fails.
+ */
+static inline void get_global_vm_dirty_param(struct vm_dirty_param *param)
+{
+ param->dirty_ratio = vm_dirty_ratio;
+ param->dirty_bytes = vm_dirty_bytes;
+ param->dirty_background_ratio = dirty_background_ratio;
+ param->dirty_background_bytes = dirty_background_bytes;
+}
+
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
/*
* All "charge" functions with gfp_mask should use GFP_KERNEL or
@@ -117,6 +160,25 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
extern int do_swap_account;
#endif
+extern bool mem_cgroup_has_dirty_limit(void);
+extern void get_vm_dirty_param(struct vm_dirty_param *param);
+extern s64 mem_cgroup_page_stat(enum mem_cgroup_read_page_stat_item item);
+
+extern void mem_cgroup_update_page_stat(struct page *page,
+ enum mem_cgroup_write_page_stat_item idx, bool charge);
+
+static inline void mem_cgroup_inc_page_stat(struct page *page,
+ enum mem_cgroup_write_page_stat_item idx)
+{
+ mem_cgroup_update_page_stat(page, idx, true);
+}
+
+static inline void mem_cgroup_dec_page_stat(struct page *page,
+ enum mem_cgroup_write_page_stat_item idx)
+{
+ mem_cgroup_update_page_stat(page, idx, false);
+}
+
static inline bool mem_cgroup_disabled(void)
{
if (mem_cgroup_subsys.disabled)
@@ -124,7 +186,6 @@ static inline bool mem_cgroup_disabled(void)
return false;
}
-void mem_cgroup_update_file_mapped(struct page *page, int val);
unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
gfp_t gfp_mask, int nid,
int zid);
@@ -294,8 +355,18 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
}
-static inline void mem_cgroup_update_file_mapped(struct page *page,
- int val)
+static inline s64 mem_cgroup_page_stat(enum mem_cgroup_read_page_stat_item item)
+{
+ return -ENOSYS;
+}
+
+static inline void mem_cgroup_inc_page_stat(struct page *page,
+ enum mem_cgroup_write_page_stat_item idx)
+{
+}
+
+static inline void mem_cgroup_dec_page_stat(struct page *page,
+ enum mem_cgroup_write_page_stat_item idx)
{
}
@@ -306,6 +377,16 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
return 0;
}
+static inline bool mem_cgroup_has_dirty_limit(void)
+{
+ return false;
+}
+
+static inline void get_vm_dirty_param(struct vm_dirty_param *param)
+{
+ get_global_vm_dirty_param(param);
+}
+
#endif /* CONFIG_CGROUP_MEM_CONT */
#endif /* _LINUX_MEMCONTROL_H */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a9fd736..ffcf37c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -80,14 +80,21 @@ enum mem_cgroup_stat_index {
/*
* For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
*/
- MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
+ MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
- MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */
+ /* File cache pages accounting */
+ MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
+ MEM_CGROUP_STAT_FILE_DIRTY, /* # of dirty pages in page cache */
+ MEM_CGROUP_STAT_WRITEBACK, /* # of pages under writeback */
+ MEM_CGROUP_STAT_WRITEBACK_TEMP, /* # of pages under writeback using
+ temporary buffers */
+ MEM_CGROUP_STAT_UNSTABLE_NFS, /* # of NFS unstable pages */
+
MEM_CGROUP_STAT_NSTATS,
};
@@ -95,6 +102,19 @@ struct mem_cgroup_stat_cpu {
s64 count[MEM_CGROUP_STAT_NSTATS];
};
+/* Per cgroup page statistics */
+struct mem_cgroup_page_stat {
+ enum mem_cgroup_read_page_stat_item item;
+ s64 value;
+};
+
+enum {
+ MEM_CGROUP_DIRTY_RATIO,
+ MEM_CGROUP_DIRTY_BYTES,
+ MEM_CGROUP_DIRTY_BACKGROUND_RATIO,
+ MEM_CGROUP_DIRTY_BACKGROUND_BYTES,
+};
+
/*
* per-zone information in memory controller.
*/
@@ -208,6 +228,9 @@ struct mem_cgroup {
unsigned int swappiness;
+ /* control memory cgroup dirty pages */
+ struct vm_dirty_param dirty_param;
+
/* set when res.limit == memsw.limit */
bool memsw_is_minimum;
@@ -1033,6 +1056,157 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
return swappiness;
}
+static bool dirty_param_is_valid(struct vm_dirty_param *param)
+{
+ if (param->dirty_ratio && param->dirty_bytes)
+ return false;
+ if (param->dirty_background_ratio && param->dirty_background_bytes)
+ return false;
+ return true;
+}
+
+static void __mem_cgroup_get_dirty_param(struct vm_dirty_param *param,
+ struct mem_cgroup *mem)
+{
+ param->dirty_ratio = mem->dirty_param.dirty_ratio;
+ param->dirty_bytes = mem->dirty_param.dirty_bytes;
+ param->dirty_background_ratio = mem->dirty_param.dirty_background_ratio;
+ param->dirty_background_bytes = mem->dirty_param.dirty_background_bytes;
+}
+
+/*
+ * get_vm_dirty_param() - get dirty memory parameters of the current memcg
+ * @param: a structure that is filled with the dirty memory settings
+ *
+ * The function fills @param with the current memcg dirty memory settings. If
+ * memory cgroup is disabled or in case of error the structure is filled with
+ * the global dirty memory settings.
+ */
+void get_vm_dirty_param(struct vm_dirty_param *param)
+{
+ struct mem_cgroup *memcg;
+
+ if (mem_cgroup_disabled()) {
+ get_global_vm_dirty_param(param);
+ return;
+ }
+ /*
+ * It's possible that "current" may be moved to other cgroup while we
+ * access cgroup. But precise check is meaningless because the task can
+ * be moved after our access and writeback tends to take long time.
+ * At least, "memcg" will not be freed under rcu_read_lock().
+ */
+ while (1) {
+ rcu_read_lock();
+ memcg = mem_cgroup_from_task(current);
+ if (likely(memcg))
+ __mem_cgroup_get_dirty_param(param, memcg);
+ else
+ get_global_vm_dirty_param(param);
+ rcu_read_unlock();
+ /*
+ * Since global and memcg vm_dirty_param are not protected we
+ * try to speculatively read them and retry if we get
+ * inconsistent values.
+ */
+ if (likely(dirty_param_is_valid(param)))
+ break;
+ }
+}
+
+static inline bool mem_cgroup_can_swap(struct mem_cgroup *memcg)
+{
+ if (!do_swap_account)
+ return nr_swap_pages > 0;
+ return !memcg->memsw_is_minimum &&
+ (res_counter_read_u64(&memcg->memsw, RES_LIMIT) > 0);
+}
+
+static s64 mem_cgroup_get_local_page_stat(struct mem_cgroup *memcg,
+ enum mem_cgroup_read_page_stat_item item)
+{
+ s64 ret;
+
+ switch (item) {
+ case MEMCG_NR_DIRTYABLE_PAGES:
+ ret = res_counter_read_u64(&memcg->res, RES_LIMIT) -
+ res_counter_read_u64(&memcg->res, RES_USAGE);
+ /* Translate free memory in pages */
+ ret >>= PAGE_SHIFT;
+ ret += mem_cgroup_read_stat(memcg, LRU_ACTIVE_FILE) +
+ mem_cgroup_read_stat(memcg, LRU_INACTIVE_FILE);
+ if (mem_cgroup_can_swap(memcg))
+ ret += mem_cgroup_read_stat(memcg, LRU_ACTIVE_ANON) +
+ mem_cgroup_read_stat(memcg, LRU_INACTIVE_ANON);
+ break;
+ case MEMCG_NR_RECLAIM_PAGES:
+ ret = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_FILE_DIRTY) +
+ mem_cgroup_read_stat(memcg,
+ MEM_CGROUP_STAT_UNSTABLE_NFS);
+ break;
+ case MEMCG_NR_WRITEBACK:
+ ret = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
+ break;
+ case MEMCG_NR_DIRTY_WRITEBACK_PAGES:
+ ret = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK) +
+ mem_cgroup_read_stat(memcg,
+ MEM_CGROUP_STAT_UNSTABLE_NFS);
+ break;
+ default:
+ BUG();
+ break;
+ }
+ return ret;
+}
+
+/*
+ * mem_cgroup_has_dirty_limit() - check if current memcg has local dirty limits
+ *
+ * Return true if the current memory cgroup has local dirty memory settings,
+ * false otherwise.
+ */
+bool mem_cgroup_has_dirty_limit(void)
+{
+ if (mem_cgroup_disabled())
+ return false;
+ return mem_cgroup_from_task(current) != NULL;
+}
+
+static int mem_cgroup_page_stat_cb(struct mem_cgroup *mem, void *data)
+{
+ struct mem_cgroup_page_stat *stat = (struct mem_cgroup_page_stat *)data;
+
+ stat->value += mem_cgroup_get_local_page_stat(mem, stat->item);
+ return 0;
+}
+
+/*
+ * mem_cgroup_page_stat() - get memory cgroup file cache statistics
+ * @item: memory statistic item exported to the kernel
+ *
+ * Return the accounted statistic value, or a negative value in case of error.
+ */
+s64 mem_cgroup_page_stat(enum mem_cgroup_read_page_stat_item item)
+{
+ struct mem_cgroup_page_stat stat = {};
+ struct mem_cgroup *memcg;
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_task(current);
+ if (memcg) {
+ /*
+ * Recursively evaluate page statistics against all cgroups
+ * under the hierarchy tree
+ */
+ stat.item = item;
+ mem_cgroup_walk_tree(memcg, &stat, mem_cgroup_page_stat_cb);
+ } else
+ stat.value = -EINVAL;
+ rcu_read_unlock();
+
+ return stat.value;
+}
+
static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
{
int *val = data;
@@ -1344,36 +1518,86 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
return true;
}
+static void __mem_cgroup_update_page_stat(struct page_cgroup *pc,
+ enum mem_cgroup_write_page_stat_item idx, bool charge)
+{
+ struct mem_cgroup *mem = pc->mem_cgroup;
+
+ /*
+ * Set the opportune flags of page_cgroup and translate the public
+ * mem_cgroup_page_stat_index into the local mem_cgroup_stat_index.
+ *
+ * In this way we can export to the kernel only a restricted subset of
+ * memcg flags.
+ */
+ switch (idx) {
+ case MEMCG_NR_FILE_MAPPED:
+ if (charge)
+ SetPageCgroupFileMapped(pc);
+ else
+ ClearPageCgroupFileMapped(pc);
+ idx = MEM_CGROUP_STAT_FILE_MAPPED;
+ break;
+ case MEMCG_NR_FILE_DIRTY:
+ if (charge)
+ SetPageCgroupDirty(pc);
+ else
+ ClearPageCgroupDirty(pc);
+ idx = MEM_CGROUP_STAT_FILE_DIRTY;
+ break;
+ case MEMCG_NR_FILE_WRITEBACK:
+ if (charge)
+ SetPageCgroupWriteback(pc);
+ else
+ ClearPageCgroupWriteback(pc);
+ idx = MEM_CGROUP_STAT_WRITEBACK;
+ break;
+ case MEMCG_NR_FILE_WRITEBACK_TEMP:
+ if (charge)
+ SetPageCgroupWritebackTemp(pc);
+ else
+ ClearPageCgroupWritebackTemp(pc);
+ idx = MEM_CGROUP_STAT_WRITEBACK_TEMP;
+ break;
+ case MEMCG_NR_FILE_UNSTABLE_NFS:
+ if (charge)
+ SetPageCgroupUnstableNFS(pc);
+ else
+ ClearPageCgroupUnstableNFS(pc);
+ idx = MEM_CGROUP_STAT_UNSTABLE_NFS;
+ break;
+ default:
+ BUG();
+ break;
+ }
+ __this_cpu_add(mem->stat->count[idx], charge ? 1 : -1);
+}
+
/*
- * Currently used to update mapped file statistics, but the routine can be
- * generalized to update other statistics as well.
+ * mem_cgroup_update_page_stat() - update memcg file cache's accounting
+ * @page: the page involved in a file cache operation.
+ * @idx: the particular file cache statistic.
+ * @charge: true to increment, false to decrement the statistic specified
+ * by @idx.
+ *
+ * Update memory cgroup file cache's accounting.
*/
-void mem_cgroup_update_file_mapped(struct page *page, int val)
+void mem_cgroup_update_page_stat(struct page *page,
+ enum mem_cgroup_write_page_stat_item idx, bool charge)
{
- struct mem_cgroup *mem;
struct page_cgroup *pc;
unsigned long flags;
+ if (mem_cgroup_disabled())
+ return;
pc = lookup_page_cgroup(page);
- if (unlikely(!pc))
+ if (unlikely(!pc) || !PageCgroupUsed(pc))
return;
-
lock_page_cgroup(pc, flags);
- mem = pc->mem_cgroup;
- if (!mem)
- goto done;
-
- if (!PageCgroupUsed(pc))
- goto done;
-
- /*
- * Preemption is already disabled. We can use __this_cpu_xxx
- */
- __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val);
-
-done:
+ __mem_cgroup_update_page_stat(pc, idx, charge);
unlock_page_cgroup(pc, flags);
}
+EXPORT_SYMBOL_GPL(mem_cgroup_update_page_stat_unlocked);
/*
* size of first charge trial. "32" comes from vmscan.c's magic value.
@@ -1785,6 +2009,39 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
memcg_check_events(mem, pc->page);
}
+/* Update file cache accounted statistics on task migration. */
+static void __mem_cgroup_update_file_stat(struct page_cgroup *pc,
+ struct mem_cgroup *from, struct mem_cgroup *to)
+{
+ struct page *page = pc->page;
+
+ if (!page_mapped(page) || PageAnon(page))
+ return;
+
+ if (PageCgroupFileMapped(pc)) {
+ __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+ __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+ }
+ if (PageCgroupDirty(pc)) {
+ __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_DIRTY]);
+ __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_DIRTY]);
+ }
+ if (PageCgroupWriteback(pc)) {
+ __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_WRITEBACK]);
+ __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_WRITEBACK]);
+ }
+ if (PageCgroupWritebackTemp(pc)) {
+ __this_cpu_dec(
+ from->stat->count[MEM_CGROUP_STAT_WRITEBACK_TEMP]);
+ __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_WRITEBACK_TEMP]);
+ }
+ if (PageCgroupUnstableNFS(pc)) {
+ __this_cpu_dec(
+ from->stat->count[MEM_CGROUP_STAT_UNSTABLE_NFS]);
+ __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_UNSTABLE_NFS]);
+ }
+}
+
/**
* __mem_cgroup_move_account - move account of the page
* @pc: page_cgroup of the page.
@@ -1805,22 +2062,14 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
static void __mem_cgroup_move_account(struct page_cgroup *pc,
struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
{
- struct page *page;
-
VM_BUG_ON(from == to);
VM_BUG_ON(PageLRU(pc->page));
VM_BUG_ON(!PageCgroupLocked(pc));
VM_BUG_ON(!PageCgroupUsed(pc));
VM_BUG_ON(pc->mem_cgroup != from);
- page = pc->page;
- if (page_mapped(page) && !PageAnon(page)) {
- /* Update mapped_file data for mem_cgroup */
- preempt_disable();
- __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
- __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
- preempt_enable();
- }
+ __mem_cgroup_update_file_stat(pc, from, to);
+
mem_cgroup_charge_statistics(from, pc, false);
if (uncharge)
/* This is not "cancel", but cancel_charge does all we need. */
@@ -1847,6 +2096,7 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
{
int ret = -EINVAL;
unsigned long flags;
+
lock_page_cgroup(pc, flags);
if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
__mem_cgroup_move_account(pc, from, to, uncharge);
@@ -3125,10 +3375,14 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
enum {
MCS_CACHE,
MCS_RSS,
- MCS_FILE_MAPPED,
MCS_PGPGIN,
MCS_PGPGOUT,
MCS_SWAP,
+ MCS_FILE_MAPPED,
+ MCS_FILE_DIRTY,
+ MCS_WRITEBACK,
+ MCS_WRITEBACK_TEMP,
+ MCS_UNSTABLE_NFS,
MCS_INACTIVE_ANON,
MCS_ACTIVE_ANON,
MCS_INACTIVE_FILE,
@@ -3147,10 +3401,14 @@ struct {
} memcg_stat_strings[NR_MCS_STAT] = {
{"cache", "total_cache"},
{"rss", "total_rss"},
- {"mapped_file", "total_mapped_file"},
{"pgpgin", "total_pgpgin"},
{"pgpgout", "total_pgpgout"},
{"swap", "total_swap"},
+ {"mapped_file", "total_mapped_file"},
+ {"filedirty", "dirty_pages"},
+ {"writeback", "writeback_pages"},
+ {"writeback_tmp", "writeback_temp_pages"},
+ {"nfs", "nfs_unstable"},
{"inactive_anon", "total_inactive_anon"},
{"active_anon", "total_active_anon"},
{"inactive_file", "total_inactive_file"},
@@ -3169,8 +3427,6 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
s->stat[MCS_CACHE] += val * PAGE_SIZE;
val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
s->stat[MCS_RSS] += val * PAGE_SIZE;
- val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
- s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT);
s->stat[MCS_PGPGIN] += val;
val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT);
@@ -3179,6 +3435,16 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
s->stat[MCS_SWAP] += val * PAGE_SIZE;
}
+ val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
+ s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
+ val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_DIRTY);
+ s->stat[MCS_FILE_DIRTY] += val;
+ val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_WRITEBACK);
+ s->stat[MCS_WRITEBACK] += val;
+ val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_WRITEBACK_TEMP);
+ s->stat[MCS_WRITEBACK_TEMP] += val;
+ val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_UNSTABLE_NFS);
+ s->stat[MCS_UNSTABLE_NFS] += val;
/* per zone stat */
val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
@@ -3540,6 +3806,63 @@ unlock:
return ret;
}
+static u64 mem_cgroup_dirty_read(struct cgroup *cgrp, struct cftype *cft)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+
+ switch (cft->private) {
+ case MEM_CGROUP_DIRTY_RATIO:
+ return memcg->dirty_param.dirty_ratio;
+ case MEM_CGROUP_DIRTY_BYTES:
+ return memcg->dirty_param.dirty_bytes;
+ case MEM_CGROUP_DIRTY_BACKGROUND_RATIO:
+ return memcg->dirty_param.dirty_background_ratio;
+ case MEM_CGROUP_DIRTY_BACKGROUND_BYTES:
+ return memcg->dirty_param.dirty_background_bytes;
+ default:
+ BUG();
+ }
+}
+
+static int
+mem_cgroup_dirty_write(struct cgroup *cgrp, struct cftype *cft, u64 val)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+ int type = cft->private;
+
+ if (cgrp->parent == NULL)
+ return -EINVAL;
+ if ((type == MEM_CGROUP_DIRTY_RATIO ||
+ type == MEM_CGROUP_DIRTY_BACKGROUND_RATIO) && val > 100)
+ return -EINVAL;
+ /*
+ * TODO: provide a validation check routine. And retry if validation
+ * fails.
+ */
+ switch (type) {
+ case MEM_CGROUP_DIRTY_RATIO:
+ memcg->dirty_param.dirty_ratio = val;
+ memcg->dirty_param.dirty_bytes = 0;
+ break;
+ case MEM_CGROUP_DIRTY_BYTES:
+ memcg->dirty_param.dirty_ratio = 0;
+ memcg->dirty_param.dirty_bytes = val;
+ break;
+ case MEM_CGROUP_DIRTY_BACKGROUND_RATIO:
+ memcg->dirty_param.dirty_background_ratio = val;
+ memcg->dirty_param.dirty_background_bytes = 0;
+ break;
+ case MEM_CGROUP_DIRTY_BACKGROUND_BYTES:
+ memcg->dirty_param.dirty_background_ratio = 0;
+ memcg->dirty_param.dirty_background_bytes = val;
+ break;
+ default:
+ BUG();
+ break;
+ }
+ return 0;
+}
+
static struct cftype mem_cgroup_files[] = {
{
.name = "usage_in_bytes",
@@ -3591,6 +3914,30 @@ static struct cftype mem_cgroup_files[] = {
.write_u64 = mem_cgroup_swappiness_write,
},
{
+ .name = "dirty_ratio",
+ .read_u64 = mem_cgroup_dirty_read,
+ .write_u64 = mem_cgroup_dirty_write,
+ .private = MEM_CGROUP_DIRTY_RATIO,
+ },
+ {
+ .name = "dirty_bytes",
+ .read_u64 = mem_cgroup_dirty_read,
+ .write_u64 = mem_cgroup_dirty_write,
+ .private = MEM_CGROUP_DIRTY_BYTES,
+ },
+ {
+ .name = "dirty_background_ratio",
+ .read_u64 = mem_cgroup_dirty_read,
+ .write_u64 = mem_cgroup_dirty_write,
+ .private = MEM_CGROUP_DIRTY_BACKGROUND_RATIO,
+ },
+ {
+ .name = "dirty_background_bytes",
+ .read_u64 = mem_cgroup_dirty_read,
+ .write_u64 = mem_cgroup_dirty_write,
+ .private = MEM_CGROUP_DIRTY_BACKGROUND_BYTES,
+ },
+ {
.name = "move_charge_at_immigrate",
.read_u64 = mem_cgroup_move_charge_read,
.write_u64 = mem_cgroup_move_charge_write,
@@ -3849,8 +4196,21 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
mem->last_scanned_child = 0;
spin_lock_init(&mem->reclaim_param_lock);
- if (parent)
+ if (parent) {
mem->swappiness = get_swappiness(parent);
+ mem->dirty_param = parent->dirty_param;
+ } else {
+ while (1) {
+ get_global_vm_dirty_param(&mem->dirty_param);
+ /*
+ * Since global dirty parameters are not protected we
+ * try to speculatively read them and retry if we get
+ * inconsistent values.
+ */
+ if (likely(dirty_param_is_valid(&mem->dirty_param)))
+ break;
+ }
+ }
atomic_set(&mem->refcnt, 1);
mem->move_charge_at_immigrate = 0;
mutex_init(&mem->thresholds_lock);
--
1.6.3.3
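As a rough illustration of how the writeback path is expected to consume these interfaces (compare the get_dirty_writeback_pages() call site reviewed earlier in the thread), the helper could be structured along these lines; this is a sketch, not the code from patch 5/5:

/*
 * Sketch only: prefer the per-memcg dirty+writeback statistic and fall
 * back to the global counters when mem_cgroup_page_stat() returns a
 * negative value (no memcg for the task, or memcg disabled).
 */
static unsigned long get_dirty_writeback_pages(void)
{
	s64 ret = mem_cgroup_page_stat(MEMCG_NR_DIRTY_WRITEBACK_PAGES);

	if (ret < 0)
		ret = global_page_state(NR_UNSTABLE_NFS) +
			global_page_state(NR_WRITEBACK);

	return ret;
}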
* Re: [PATCH -mmotm 4/5] memcg: dirty pages accounting and limiting infrastructure
2010-03-09 23:00 ` [PATCH -mmotm 4/5] memcg: dirty pages accounting and limiting infrastructure Andrea Righi
@ 2010-03-10 22:23 ` Vivek Goyal
2010-03-11 22:27 ` Andrea Righi
0 siblings, 1 reply; 71+ messages in thread
From: Vivek Goyal @ 2010-03-10 22:23 UTC (permalink / raw)
To: Andrea Righi
Cc: KAMEZAWA Hiroyuki, Balbir Singh, Daisuke Nishimura,
Peter Zijlstra, Trond Myklebust, Suleiman Souhlal, Greg Thelen,
Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
linux-mm
On Wed, Mar 10, 2010 at 12:00:35AM +0100, Andrea Righi wrote:
[..]
> - * Currently used to update mapped file statistics, but the routine can be
> - * generalized to update other statistics as well.
> + * mem_cgroup_update_page_stat() - update memcg file cache's accounting
> + * @page: the page involved in a file cache operation.
> + * @idx: the particular file cache statistic.
> + * @charge: true to increment, false to decrement the statistic specified
> + * by @idx.
> + *
> + * Update memory cgroup file cache's accounting.
> */
> -void mem_cgroup_update_file_mapped(struct page *page, int val)
> +void mem_cgroup_update_page_stat(struct page *page,
> + enum mem_cgroup_write_page_stat_item idx, bool charge)
> {
> - struct mem_cgroup *mem;
> struct page_cgroup *pc;
> unsigned long flags;
>
> + if (mem_cgroup_disabled())
> + return;
> pc = lookup_page_cgroup(page);
> - if (unlikely(!pc))
> + if (unlikely(!pc) || !PageCgroupUsed(pc))
> return;
> -
> lock_page_cgroup(pc, flags);
> - mem = pc->mem_cgroup;
> - if (!mem)
> - goto done;
> -
> - if (!PageCgroupUsed(pc))
> - goto done;
> -
> - /*
> - * Preemption is already disabled. We can use __this_cpu_xxx
> - */
> - __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val);
> -
> -done:
> + __mem_cgroup_update_page_stat(pc, idx, charge);
> unlock_page_cgroup(pc, flags);
> }
> +EXPORT_SYMBOL_GPL(mem_cgroup_update_page_stat_unlocked);
CC mm/memcontrol.o
mm/memcontrol.c:1600: error: 'mem_cgroup_update_page_stat_unlocked'
undeclared here (not in a function)
mm/memcontrol.c:1600: warning: type defaults to 'int' in declaration of
'mem_cgroup_update_page_stat_unlocked'
make[1]: *** [mm/memcontrol.o] Error 1
make: *** [mm] Error 2
Thanks
Vivek
* Re: [PATCH -mmotm 4/5] memcg: dirty pages accounting and limiting infrastructure
2010-03-10 22:23 ` Vivek Goyal
@ 2010-03-11 22:27 ` Andrea Righi
0 siblings, 0 replies; 71+ messages in thread
From: Andrea Righi @ 2010-03-11 22:27 UTC (permalink / raw)
To: Vivek Goyal
Cc: KAMEZAWA Hiroyuki, Balbir Singh, Daisuke Nishimura,
Peter Zijlstra, Trond Myklebust, Suleiman Souhlal, Greg Thelen,
Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
linux-mm
On Wed, Mar 10, 2010 at 05:23:39PM -0500, Vivek Goyal wrote:
> On Wed, Mar 10, 2010 at 12:00:35AM +0100, Andrea Righi wrote:
>
> [..]
>
> > - * Currently used to update mapped file statistics, but the routine can be
> > - * generalized to update other statistics as well.
> > + * mem_cgroup_update_page_stat() - update memcg file cache's accounting
> > + * @page: the page involved in a file cache operation.
> > + * @idx: the particular file cache statistic.
> > + * @charge: true to increment, false to decrement the statistic specified
> > + * by @idx.
> > + *
> > + * Update memory cgroup file cache's accounting.
> > */
> > -void mem_cgroup_update_file_mapped(struct page *page, int val)
> > +void mem_cgroup_update_page_stat(struct page *page,
> > + enum mem_cgroup_write_page_stat_item idx, bool charge)
> > {
> > - struct mem_cgroup *mem;
> > struct page_cgroup *pc;
> > unsigned long flags;
> >
> > + if (mem_cgroup_disabled())
> > + return;
> > pc = lookup_page_cgroup(page);
> > - if (unlikely(!pc))
> > + if (unlikely(!pc) || !PageCgroupUsed(pc))
> > return;
> > -
> > lock_page_cgroup(pc, flags);
> > - mem = pc->mem_cgroup;
> > - if (!mem)
> > - goto done;
> > -
> > - if (!PageCgroupUsed(pc))
> > - goto done;
> > -
> > - /*
> > - * Preemption is already disabled. We can use __this_cpu_xxx
> > - */
> > - __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val);
> > -
> > -done:
> > + __mem_cgroup_update_page_stat(pc, idx, charge);
> > unlock_page_cgroup(pc, flags);
> > }
> > +EXPORT_SYMBOL_GPL(mem_cgroup_update_page_stat_unlocked);
>
> CC mm/memcontrol.o
> mm/memcontrol.c:1600: error: 'mem_cgroup_update_page_stat_unlocked'
> undeclared here (not in a function)
> mm/memcontrol.c:1600: warning: type defaults to 'int' in declaration of
> 'mem_cgroup_update_page_stat_unlocked'
> make[1]: *** [mm/memcontrol.o] Error 1
> make: *** [mm] Error 2
Thanks! Will fix in the next version.
(mmh... why didn't I see this? Probably because I'm building a static kernel...)
-Andrea
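Presumably the fix is simply to make the export name match the function that is actually defined, or to drop the stray export altogether, e.g.:

EXPORT_SYMBOL_GPL(mem_cgroup_update_page_stat);	/* sketch: the _unlocked variant does not exist */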