From: Andrea Righi <arighi@develer.com>
To: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>,
containers@lists.linux-foundation.org,
linux-kernel@vger.kernel.org, linux-mm@kvack.org,
Greg@smtp1.linux-foundation.org,
Suleiman Souhlal <suleiman@google.com>,
Andrew Morton <akpm@linux-foundation.org>,
Balbir Singh <balbir@linux.vnet.ibm.com>
Subject: Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation
Date: Wed, 3 Mar 2010 23:03:19 +0100 [thread overview]
Message-ID: <20100303220319.GA2706@linux> (raw)
In-Reply-To: <20100303172132.fc6d9387.kamezawa.hiroyu@jp.fujitsu.com>
On Wed, Mar 03, 2010 at 05:21:32PM +0900, KAMEZAWA Hiroyuki wrote:
> On Wed, 3 Mar 2010 15:15:49 +0900
> KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
>
> > Agreed.
> > Let's try how we can write a code in clean way. (we have time ;)
> > For now, to me, IRQ disabling while lock_page_cgroup() seems to be a little
> > over killing. What I really want is lockless code...but it seems impossible
> > under current implementation.
> >
> > I wonder the fact "the page is never unchareged under us" can give us some chances
> > ...Hmm.
> >
>
> How about this ? Basically, I don't like duplicating information...so,
> # of new pcg_flags may be able to be reduced.
>
> I'm glad this can be a hint for Andrea-san.
>
> ==
> ---
> include/linux/page_cgroup.h | 44 ++++++++++++++++++++-
> mm/memcontrol.c | 91 +++++++++++++++++++++++++++++++++++++++++++-
> 2 files changed, 132 insertions(+), 3 deletions(-)
>
> Index: mmotm-2.6.33-Mar2/include/linux/page_cgroup.h
> ===================================================================
> --- mmotm-2.6.33-Mar2.orig/include/linux/page_cgroup.h
> +++ mmotm-2.6.33-Mar2/include/linux/page_cgroup.h
> @@ -39,6 +39,11 @@ enum {
> PCG_CACHE, /* charged as cache */
> PCG_USED, /* this object is in use. */
> PCG_ACCT_LRU, /* page has been accounted for */
> + PCG_MIGRATE_LOCK, /* used for mutual execution of account migration */
> + PCG_ACCT_DIRTY,
> + PCG_ACCT_WB,
> + PCG_ACCT_WB_TEMP,
> + PCG_ACCT_UNSTABLE,
> };
>
> #define TESTPCGFLAG(uname, lname) \
> @@ -73,6 +78,23 @@ CLEARPCGFLAG(AcctLRU, ACCT_LRU)
> TESTPCGFLAG(AcctLRU, ACCT_LRU)
> TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU)
>
> +SETPCGFLAG(AcctDirty, ACCT_DIRTY);
> +CLEARPCGFLAG(AcctDirty, ACCT_DIRTY);
> +TESTPCGFLAG(AcctDirty, ACCT_DIRTY);
> +
> +SETPCGFLAG(AcctWB, ACCT_WB);
> +CLEARPCGFLAG(AcctWB, ACCT_WB);
> +TESTPCGFLAG(AcctWB, ACCT_WB);
> +
> +SETPCGFLAG(AcctWBTemp, ACCT_WB_TEMP);
> +CLEARPCGFLAG(AcctWBTemp, ACCT_WB_TEMP);
> +TESTPCGFLAG(AcctWBTemp, ACCT_WB_TEMP);
> +
> +SETPCGFLAG(AcctUnstableNFS, ACCT_UNSTABLE);
> +CLEARPCGFLAG(AcctUnstableNFS, ACCT_UNSTABLE);
> +TESTPCGFLAG(AcctUnstableNFS, ACCT_UNSTABLE);
> +
> +
> static inline int page_cgroup_nid(struct page_cgroup *pc)
> {
> return page_to_nid(pc->page);
> @@ -82,7 +104,9 @@ static inline enum zone_type page_cgroup
> {
> return page_zonenum(pc->page);
> }
> -
> +/*
> + * lock_page_cgroup() should not be held under mapping->tree_lock
> + */
> static inline void lock_page_cgroup(struct page_cgroup *pc)
> {
> bit_spin_lock(PCG_LOCK, &pc->flags);
> @@ -93,6 +117,24 @@ static inline void unlock_page_cgroup(st
> bit_spin_unlock(PCG_LOCK, &pc->flags);
> }
>
> +/*
> + * Lock order is
> + * lock_page_cgroup()
> + * lock_page_cgroup_migrate()
> + * This lock is not be lock for charge/uncharge but for account moving.
> + * i.e. overwrite pc->mem_cgroup. The lock owner should guarantee by itself
> + * the page is uncharged while we hold this.
> + */
> +static inline void lock_page_cgroup_migrate(struct page_cgroup *pc)
> +{
> + bit_spin_lock(PCG_MIGRATE_LOCK, &pc->flags);
> +}
> +
> +static inline void unlock_page_cgroup_migrate(struct page_cgroup *pc)
> +{
> + bit_spin_unlock(PCG_MIGRATE_LOCK, &pc->flags);
> +}
> +
> #else /* CONFIG_CGROUP_MEM_RES_CTLR */
> struct page_cgroup;
>
> Index: mmotm-2.6.33-Mar2/mm/memcontrol.c
> ===================================================================
> --- mmotm-2.6.33-Mar2.orig/mm/memcontrol.c
> +++ mmotm-2.6.33-Mar2/mm/memcontrol.c
> @@ -87,6 +87,10 @@ enum mem_cgroup_stat_index {
> MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
> MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
> MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */
> + MEM_CGROUP_STAT_DIRTY,
> + MEM_CGROUP_STAT_WBACK,
> + MEM_CGROUP_STAT_WBACK_TEMP,
> + MEM_CGROUP_STAT_UNSTABLE_NFS,
>
> MEM_CGROUP_STAT_NSTATS,
> };
> @@ -1360,6 +1364,86 @@ done:
> }
>
> /*
> + * Update file cache's status for memcg. Before calling this,
> + * mapping->tree_lock should be held and preemption is disabled.
> + * Then, it's guarnteed that the page is not uncharged while we
> + * access page_cgroup. We can make use of that.
> + */
> +void mem_cgroup_update_stat_locked(struct page *page, int idx, bool set)
> +{
> + struct page_cgroup *pc;
> + struct mem_cgroup *mem;
> +
> + pc = lookup_page_cgroup(page);
> + /* Not accounted ? */
> + if (!PageCgroupUsed(pc))
> + return;
> + lock_page_cgroup_migrate(pc);
> + /*
> + * It's guarnteed that this page is never uncharged.
> + * The only racy problem is moving account among memcgs.
> + */
> + switch (idx) {
> + case MEM_CGROUP_STAT_DIRTY:
> + if (set)
> + SetPageCgroupAcctDirty(pc);
> + else
> + ClearPageCgroupAcctDirty(pc);
> + break;
> + case MEM_CGROUP_STAT_WBACK:
> + if (set)
> + SetPageCgroupAcctWB(pc);
> + else
> + ClearPageCgroupAcctWB(pc);
> + break;
> + case MEM_CGROUP_STAT_WBACK_TEMP:
> + if (set)
> + SetPageCgroupAcctWBTemp(pc);
> + else
> + ClearPageCgroupAcctWBTemp(pc);
> + break;
> + case MEM_CGROUP_STAT_UNSTABLE_NFS:
> + if (set)
> + SetPageCgroupAcctUnstableNFS(pc);
> + else
> + ClearPageCgroupAcctUnstableNFS(pc);
> + break;
> + default:
> + BUG();
> + break;
> + }
> + mem = pc->mem_cgroup;
> + if (set)
> + __this_cpu_inc(mem->stat->count[idx]);
> + else
> + __this_cpu_dec(mem->stat->count[idx]);
> + unlock_page_cgroup_migrate(pc);
> +}
> +
> +static void move_acct_information(struct mem_cgroup *from,
> + struct mem_cgroup *to,
> + struct page_cgroup *pc)
> +{
> + /* preemption is disabled, migration_lock is held. */
> + if (PageCgroupAcctDirty(pc)) {
> + __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_DIRTY]);
> + __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_DIRTY]);
> + }
> + if (PageCgroupAcctWB(pc)) {
> + __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_WBACK]);
> + __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_WBACK]);
> + }
> + if (PageCgroupAcctWBTemp(pc)) {
> + __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_WBACK_TEMP]);
> + __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_WBACK_TEMP]);
> + }
> + if (PageCgroupAcctUnstableNFS(pc)) {
> + __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_UNSTABLE_NFS]);
> + __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_UNSTABLE_NFS]);
> + }
> +}
> +
> +/*
> * size of first charge trial. "32" comes from vmscan.c's magic value.
> * TODO: maybe necessary to use big numbers in big irons.
> */
> @@ -1794,15 +1878,16 @@ static void __mem_cgroup_move_account(st
> VM_BUG_ON(!PageCgroupUsed(pc));
> VM_BUG_ON(pc->mem_cgroup != from);
>
> + preempt_disable();
> + lock_page_cgroup_migrate(pc);
> page = pc->page;
> if (page_mapped(page) && !PageAnon(page)) {
> /* Update mapped_file data for mem_cgroup */
> - preempt_disable();
> __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> - preempt_enable();
> }
> mem_cgroup_charge_statistics(from, pc, false);
> + move_acct_information(from, to, pc);
Kame-san, a question. According to is_target_pte_for_mc() it seems we
don't move file pages across cgroups for now. If !PageAnon(page) we just
return 0 and the page won't be selected for migration in
mem_cgroup_move_charge_pte_range().
So, if I've understood well the code is correct in perspective, but
right now it's unnecessary. File pages are not moved on task migration
across cgroups and, at the moment, there's no way for file page
accounted statistics to go negative.
Or am I missing something?
Thanks,
-Andrea
> if (uncharge)
> /* This is not "cancel", but cancel_charge does all we need. */
> mem_cgroup_cancel_charge(from);
> @@ -1810,6 +1895,8 @@ static void __mem_cgroup_move_account(st
> /* caller should have done css_get */
> pc->mem_cgroup = to;
> mem_cgroup_charge_statistics(to, pc, true);
> + unlock_page_cgroup_migrate(pc);
> + preempt_enable();
> /*
> * We charges against "to" which may not have any tasks. Then, "to"
> * can be under rmdir(). But in current implementation, caller of
>
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
next prev parent reply other threads:[~2010-03-03 22:03 UTC|newest]
Thread overview: 47+ messages / expand[flat|nested] mbox.gz Atom feed top
2010-03-01 21:23 [PATCH -mmotm 0/3] memcg: per cgroup dirty limit (v3) Andrea Righi
2010-03-01 21:23 ` [PATCH -mmotm 1/3] memcg: dirty memory documentation Andrea Righi
2010-03-01 21:23 ` [PATCH -mmotm 2/3] memcg: dirty pages accounting and limiting infrastructure Andrea Righi
2010-03-02 0:20 ` KAMEZAWA Hiroyuki
2010-03-02 10:04 ` Kirill A. Shutemov
2010-03-02 11:00 ` Andrea Righi
2010-03-02 13:02 ` Balbir Singh
2010-03-02 21:50 ` Andrea Righi
2010-03-02 18:08 ` Greg Thelen
2010-03-02 22:24 ` Andrea Righi
2010-03-01 21:23 ` [PATCH -mmotm 3/3] memcg: dirty pages instrumentation Andrea Righi
2010-03-01 22:02 ` Vivek Goyal
2010-03-01 22:18 ` Andrea Righi
2010-03-02 15:05 ` Vivek Goyal
2010-03-02 22:22 ` Andrea Righi
2010-03-02 23:59 ` Vivek Goyal
2010-03-03 11:47 ` Andrea Righi
2010-03-03 11:56 ` Andrea Righi
2010-03-02 0:23 ` KAMEZAWA Hiroyuki
2010-03-02 8:01 ` Andrea Righi
2010-03-02 8:12 ` Daisuke Nishimura
2010-03-02 8:23 ` KAMEZAWA Hiroyuki
2010-03-02 13:50 ` Balbir Singh
2010-03-02 22:18 ` Andrea Righi
2010-03-02 23:21 ` Daisuke Nishimura
2010-03-03 11:48 ` Andrea Righi
2010-03-02 10:11 ` Kirill A. Shutemov
2010-03-02 11:02 ` Andrea Righi
2010-03-02 11:09 ` Kirill A. Shutemov
2010-03-02 11:34 ` Andrea Righi
2010-03-02 13:47 ` Balbir Singh
2010-03-02 13:56 ` Kirill A. Shutemov
2010-03-02 13:48 ` Peter Zijlstra
2010-03-02 15:26 ` Balbir Singh
2010-03-02 15:49 ` Trond Myklebust
2010-03-02 22:14 ` Andrea Righi
2010-03-03 10:07 ` Peter Zijlstra
2010-03-03 12:05 ` Andrea Righi
2010-03-03 2:12 ` Daisuke Nishimura
2010-03-03 3:29 ` KAMEZAWA Hiroyuki
2010-03-03 6:01 ` Daisuke Nishimura
2010-03-03 6:15 ` KAMEZAWA Hiroyuki
2010-03-03 8:21 ` KAMEZAWA Hiroyuki
2010-03-03 11:50 ` Andrea Righi
2010-03-03 22:03 ` Andrea Righi [this message]
2010-03-03 23:25 ` Daisuke Nishimura
2010-03-04 3:45 ` KAMEZAWA Hiroyuki
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20100303220319.GA2706@linux \
--to=arighi@develer.com \
--cc=Greg@smtp1.linux-foundation.org \
--cc=akpm@linux-foundation.org \
--cc=balbir@linux.vnet.ibm.com \
--cc=containers@lists.linux-foundation.org \
--cc=kamezawa.hiroyu@jp.fujitsu.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=nishimura@mxp.nes.nec.co.jp \
--cc=suleiman@google.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).