* [PATCH 1/5] cgroup: do ID allocation under css allocator.
2010-08-25 8:04 [PATCH 0/5] memcg: towards I/O aware memcg v6 KAMEZAWA Hiroyuki
@ 2010-08-25 8:06 ` KAMEZAWA Hiroyuki
2010-08-25 14:15 ` Balbir Singh
2010-08-30 5:44 ` Daisuke Nishimura
2010-08-25 8:07 ` [PATCH 2/5] memcg: quick memcg lookup array KAMEZAWA Hiroyuki
` (3 subsequent siblings)
4 siblings, 2 replies; 19+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-08-25 8:06 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: linux-mm, nishimura@mxp.nes.nec.co.jp, balbir@linux.vnet.ibm.com,
gthelen, m-ikeda, akpm@linux-foundation.org,
linux-kernel@vger.kernel.org, menage@google.com,
lizf@cn.fujitsu.com
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Currently, a css's ID is allocated after ->create() is called. But to make use
of the ID in ->create(), it must be available before ->create() runs.
Put another way: since the ID is tightly coupled with the "css", it should be
allocated when the "css" is allocated.
This patch moves alloc_css_id() into the css allocation routine. At the moment,
only two subsystems, memory and blkio, use IDs (to support complicated
hierarchy walks).
The ID will be used in the memory cgroup's ->create() in a later patch.
Note:
If the rules for css allocation ever change, ID allocation must be moved along
with them.
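For illustration, a minimal sketch of the new convention a subsys follows in
its ->create() handler (the "foo" subsys is hypothetical; the real users are
the mem_cgroup_create() and blkiocg_create() hunks below):

	static struct cgroup_subsys_state *
	foo_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
	{
		struct foo_cgroup *foo;
		int err;

		foo = kzalloc(sizeof(*foo), GFP_KERNEL);
		if (!foo)
			return ERR_PTR(-ENOMEM);
		/* The subsys now asks for its own ID, inside ->create()... */
		err = alloc_css_id(ss, cgrp, &foo->css);
		if (err) {
			kfree(foo);
			return ERR_PTR(err);
		}
		/* ...so css_id(&foo->css) is already usable from here on. */
		return &foo->css;
	}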
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
block/blk-cgroup.c | 9 ++++++++
include/linux/cgroup.h | 16 ++++++++-------
kernel/cgroup.c | 50 ++++++++++++++-----------------------------------
mm/memcontrol.c | 5 ++++
4 files changed, 38 insertions(+), 42 deletions(-)
Index: mmotm-0811/kernel/cgroup.c
===================================================================
--- mmotm-0811.orig/kernel/cgroup.c
+++ mmotm-0811/kernel/cgroup.c
@@ -289,9 +289,6 @@ struct cg_cgroup_link {
static struct css_set init_css_set;
static struct cg_cgroup_link init_css_set_link;
-static int cgroup_init_idr(struct cgroup_subsys *ss,
- struct cgroup_subsys_state *css);
-
/* css_set_lock protects the list of css_set objects, and the
* chain of tasks off each css_set. Nests outside task->alloc_lock
* due to cgroup_iter_start() */
@@ -770,9 +767,6 @@ static struct backing_dev_info cgroup_ba
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
-static int alloc_css_id(struct cgroup_subsys *ss,
- struct cgroup *parent, struct cgroup *child);
-
static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
{
struct inode *inode = new_inode(sb);
@@ -3257,7 +3251,8 @@ static void init_cgroup_css(struct cgrou
css->cgroup = cgrp;
atomic_set(&css->refcnt, 1);
css->flags = 0;
- css->id = NULL;
+ if (!ss->use_id)
+ css->id = NULL;
if (cgrp == dummytop)
set_bit(CSS_ROOT, &css->flags);
BUG_ON(cgrp->subsys[ss->subsys_id]);
@@ -3342,12 +3337,6 @@ static long cgroup_create(struct cgroup
goto err_destroy;
}
init_cgroup_css(css, ss, cgrp);
- if (ss->use_id) {
- err = alloc_css_id(ss, parent, cgrp);
- if (err)
- goto err_destroy;
- }
- /* At error, ->destroy() callback has to free assigned ID. */
}
cgroup_lock_hierarchy(root);
@@ -3709,17 +3698,6 @@ int __init_or_module cgroup_load_subsys(
/* our new subsystem will be attached to the dummy hierarchy. */
init_cgroup_css(css, ss, dummytop);
- /* init_idr must be after init_cgroup_css because it sets css->id. */
- if (ss->use_id) {
- int ret = cgroup_init_idr(ss, css);
- if (ret) {
- dummytop->subsys[ss->subsys_id] = NULL;
- ss->destroy(ss, dummytop);
- subsys[i] = NULL;
- mutex_unlock(&cgroup_mutex);
- return ret;
- }
- }
/*
* Now we need to entangle the css into the existing css_sets. unlike
@@ -3888,8 +3866,6 @@ int __init cgroup_init(void)
struct cgroup_subsys *ss = subsys[i];
if (!ss->early_init)
cgroup_init_subsys(ss);
- if (ss->use_id)
- cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
}
/* Add init_css_set to the hash table */
@@ -4603,8 +4579,8 @@ err_out:
}
-static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
- struct cgroup_subsys_state *rootcss)
+static int cgroup_init_idr(struct cgroup_subsys *ss,
+ struct cgroup_subsys_state *rootcss)
{
struct css_id *newid;
@@ -4616,21 +4592,25 @@ static int __init_or_module cgroup_init_
return PTR_ERR(newid);
newid->stack[0] = newid->id;
- newid->css = rootcss;
- rootcss->id = newid;
+ rcu_assign_pointer(newid->css, rootcss);
+ rcu_assign_pointer(rootcss->id, newid);
return 0;
}
-static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
- struct cgroup *child)
+int alloc_css_id(struct cgroup_subsys *ss,
+ struct cgroup *cgrp, struct cgroup_subsys_state *css)
{
int subsys_id, i, depth = 0;
- struct cgroup_subsys_state *parent_css, *child_css;
+ struct cgroup_subsys_state *parent_css;
+ struct cgroup *parent;
struct css_id *child_id, *parent_id;
+ if (cgrp == dummytop)
+ return cgroup_init_idr(ss, css);
+
+ parent = cgrp->parent;
subsys_id = ss->subsys_id;
parent_css = parent->subsys[subsys_id];
- child_css = child->subsys[subsys_id];
parent_id = parent_css->id;
depth = parent_id->depth + 1;
@@ -4645,7 +4625,7 @@ static int alloc_css_id(struct cgroup_su
* child_id->css pointer will be set after this cgroup is available
* see cgroup_populate_dir()
*/
- rcu_assign_pointer(child_css->id, child_id);
+ rcu_assign_pointer(css->id, child_id);
return 0;
}
Index: mmotm-0811/include/linux/cgroup.h
===================================================================
--- mmotm-0811.orig/include/linux/cgroup.h
+++ mmotm-0811/include/linux/cgroup.h
@@ -583,9 +583,11 @@ int cgroup_attach_task_current_cg(struct
/*
* CSS ID is ID for cgroup_subsys_state structs under subsys. This only works
* if cgroup_subsys.use_id == true. It can be used for looking up and scanning.
- * CSS ID is assigned at cgroup allocation (create) automatically
- * and removed when subsys calls free_css_id() function. This is because
- * the lifetime of cgroup_subsys_state is subsys's matter.
+ * CSS ID must be assigned by the subsys itself at cgroup creation and deleted
+ * when the subsys calls the free_css_id() function. This is because the
+ * lifetime of cgroup_subsys_state is the subsys's matter.
+ *
+ * ID->css lookup is available after the cgroup's directory is populated.
*
* Looking up and scanning function should be called under rcu_read_lock().
* Taking cgroup_mutex()/hierarchy_mutex() is not necessary for following calls.
@@ -593,10 +595,10 @@ int cgroup_attach_task_current_cg(struct
* destroyed". The caller should check css and cgroup's status.
*/
-/*
- * Typically Called at ->destroy(), or somewhere the subsys frees
- * cgroup_subsys_state.
- */
+/* Should be called in ->create() by subsys itself */
+int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *newgr,
+ struct cgroup_subsys_state *css);
+/* Typically Called at ->destroy(), or somewhere the subsys frees css */
void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css);
/* Find a cgroup_subsys_state which has given ID */
Index: mmotm-0811/mm/memcontrol.c
===================================================================
--- mmotm-0811.orig/mm/memcontrol.c
+++ mmotm-0811/mm/memcontrol.c
@@ -4141,6 +4141,11 @@ mem_cgroup_create(struct cgroup_subsys *
if (alloc_mem_cgroup_per_zone_info(mem, node))
goto free_out;
+ error = alloc_css_id(ss, cont, &mem->css);
+ if (error)
+ goto free_out;
+ /* Here, css_id(&mem->css) works, but css_lookup(id)->mem doesn't. */
+
/* root ? */
if (cont->parent == NULL) {
int cpu;
Index: mmotm-0811/block/blk-cgroup.c
===================================================================
--- mmotm-0811.orig/block/blk-cgroup.c
+++ mmotm-0811/block/blk-cgroup.c
@@ -958,9 +958,13 @@ blkiocg_create(struct cgroup_subsys *sub
{
struct blkio_cgroup *blkcg;
struct cgroup *parent = cgroup->parent;
+ int ret;
if (!parent) {
blkcg = &blkio_root_cgroup;
+ ret = alloc_css_id(subsys, cgroup, &blkcg->css);
+ if (ret)
+ return ERR_PTR(ret);
goto done;
}
@@ -971,6 +975,11 @@ blkiocg_create(struct cgroup_subsys *sub
blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
if (!blkcg)
return ERR_PTR(-ENOMEM);
+ ret = alloc_css_id(subsys, cgroup, &blkcg->css);
+ if (ret) {
+ kfree(blkcg);
+ return ERR_PTR(ret);
+ }
blkcg->weight = BLKIO_WEIGHT_DEFAULT;
done:
* Re: [PATCH 1/5] cgroup: do ID allocation under css allocator.
2010-08-25 8:06 ` [PATCH 1/5] cgroup: do ID allocation under css allocator KAMEZAWA Hiroyuki
@ 2010-08-25 14:15 ` Balbir Singh
2010-08-26 0:13 ` KAMEZAWA Hiroyuki
2010-08-30 5:44 ` Daisuke Nishimura
1 sibling, 1 reply; 19+ messages in thread
From: Balbir Singh @ 2010-08-25 14:15 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: linux-mm, nishimura@mxp.nes.nec.co.jp, gthelen, m-ikeda,
akpm@linux-foundation.org, linux-kernel@vger.kernel.org,
menage@google.com, lizf@cn.fujitsu.com
* KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> [2010-08-25 17:06:40]:
> From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
>
> Now, css'id is allocated after ->create() is called. But to make use of ID
> in ->create(), it should be available before ->create().
>
> In another thinking, considering the ID is tightly coupled with "css",
> it should be allocated when "css" is allocated.
> This patch moves alloc_css_id() to css allocation routine. Now, only 2 subsys,
> memory and blkio are useing ID. (To support complicated hierarchy walk.)
^^^^ typo
>
> ID will be used in mem cgroup's ->create(), later.
>
> Note:
> If someone changes rules of css allocation, ID allocation should be moved too.
>
What rules? Could you please elaborate?
Seems cleaner; maybe we need to update cgroups.txt?
--
Three Cheers,
Balbir
* Re: [PATCH 1/5] cgroup: do ID allocation under css allocator.
2010-08-25 14:15 ` Balbir Singh
@ 2010-08-26 0:13 ` KAMEZAWA Hiroyuki
2010-08-31 6:33 ` Balbir Singh
0 siblings, 1 reply; 19+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-08-26 0:13 UTC (permalink / raw)
To: balbir
Cc: linux-mm, nishimura@mxp.nes.nec.co.jp, gthelen, m-ikeda,
akpm@linux-foundation.org, linux-kernel@vger.kernel.org,
menage@google.com, lizf@cn.fujitsu.com
On Wed, 25 Aug 2010 19:45:00 +0530
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> * KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> [2010-08-25 17:06:40]:
>
> > From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> >
> > Now, css'id is allocated after ->create() is called. But to make use of ID
> > in ->create(), it should be available before ->create().
> >
> > In another thinking, considering the ID is tightly coupled with "css",
> > it should be allocated when "css" is allocated.
> > This patch moves alloc_css_id() to css allocation routine. Now, only 2 subsys,
> > memory and blkio are useing ID. (To support complicated hierarchy walk.)
> ^^^^ typo
> >
> > ID will be used in mem cgroup's ->create(), later.
> >
> > Note:
> > If someone changes rules of css allocation, ID allocation should be moved too.
> >
>
> What rules? could you please elaborate?
>
See Paul Menage's mail. He said "allocating css object under kernel/cgroup.c
will make kernel/cgroup.c cleaner." But that change seems too big for my purpose.
> Seems cleaner, may be we need to update cgroups.txt?
Hmm, I will look into it.
Thanks,
-Kame
>
> --
> Three Cheers,
> Balbir
* Re: [PATCH 1/5] cgroup: do ID allocation under css allocator.
2010-08-26 0:13 ` KAMEZAWA Hiroyuki
@ 2010-08-31 6:33 ` Balbir Singh
0 siblings, 0 replies; 19+ messages in thread
From: Balbir Singh @ 2010-08-31 6:33 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: linux-mm, nishimura@mxp.nes.nec.co.jp, gthelen, m-ikeda,
akpm@linux-foundation.org, linux-kernel@vger.kernel.org,
menage@google.com, lizf@cn.fujitsu.com
* KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> [2010-08-26 09:13:46]:
> On Wed, 25 Aug 2010 19:45:00 +0530
> Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>
> > * KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> [2010-08-25 17:06:40]:
> >
> > > From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> > >
> > > Now, css'id is allocated after ->create() is called. But to make use of ID
> > > in ->create(), it should be available before ->create().
> > >
> > > In another thinking, considering the ID is tightly coupled with "css",
> > > it should be allocated when "css" is allocated.
> > > This patch moves alloc_css_id() to css allocation routine. Now, only 2 subsys,
> > > memory and blkio are useing ID. (To support complicated hierarchy walk.)
> > ^^^^ typo
> > >
> > > ID will be used in mem cgroup's ->create(), later.
> > >
> > > Note:
> > > If someone changes rules of css allocation, ID allocation should be moved too.
> > >
> >
> > What rules? could you please elaborate?
> >
> See Paul Menage's mail. He said "allocating css object under kernel/cgroup.c
> will make kernel/cgroup.c cleaner." But it seems too big for my purpose.
>
> > Seems cleaner, may be we need to update cgroups.txt?
>
> Hmm. will look into.
>
OK, otherwise the patch looks good to me.
Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com>
--
Three Cheers,
Balbir
* Re: [PATCH 1/5] cgroup: do ID allocation under css allocator.
2010-08-25 8:06 ` [PATCH 1/5] cgroup: do ID allocation under css allocator KAMEZAWA Hiroyuki
2010-08-25 14:15 ` Balbir Singh
@ 2010-08-30 5:44 ` Daisuke Nishimura
1 sibling, 0 replies; 19+ messages in thread
From: Daisuke Nishimura @ 2010-08-30 5:44 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: linux-mm, balbir@linux.vnet.ibm.com, gthelen, m-ikeda,
akpm@linux-foundation.org, linux-kernel@vger.kernel.org,
menage@google.com, lizf@cn.fujitsu.com, Daisuke Nishimura
On Wed, 25 Aug 2010 17:06:40 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
>
> Now, css'id is allocated after ->create() is called. But to make use of ID
> in ->create(), it should be available before ->create().
>
> In another thinking, considering the ID is tightly coupled with "css",
> it should be allocated when "css" is allocated.
> This patch moves alloc_css_id() to css allocation routine. Now, only 2 subsys,
> memory and blkio are useing ID. (To support complicated hierarchy walk.)
>
> ID will be used in mem cgroup's ->create(), later.
>
> Note:
> If someone changes rules of css allocation, ID allocation should be moved too.
>
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
I think we need some acks from Paul and Li, but anyway
Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Thanks,
Daisuke Nishimura.
> ---
> block/blk-cgroup.c | 9 ++++++++
> include/linux/cgroup.h | 16 ++++++++-------
> kernel/cgroup.c | 50 ++++++++++++++-----------------------------------
> mm/memcontrol.c | 5 ++++
> 4 files changed, 38 insertions(+), 42 deletions(-)
>
> Index: mmotm-0811/kernel/cgroup.c
> ===================================================================
> --- mmotm-0811.orig/kernel/cgroup.c
> +++ mmotm-0811/kernel/cgroup.c
> @@ -289,9 +289,6 @@ struct cg_cgroup_link {
> static struct css_set init_css_set;
> static struct cg_cgroup_link init_css_set_link;
>
> -static int cgroup_init_idr(struct cgroup_subsys *ss,
> - struct cgroup_subsys_state *css);
> -
> /* css_set_lock protects the list of css_set objects, and the
> * chain of tasks off each css_set. Nests outside task->alloc_lock
> * due to cgroup_iter_start() */
> @@ -770,9 +767,6 @@ static struct backing_dev_info cgroup_ba
> .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
> };
>
> -static int alloc_css_id(struct cgroup_subsys *ss,
> - struct cgroup *parent, struct cgroup *child);
> -
> static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
> {
> struct inode *inode = new_inode(sb);
> @@ -3257,7 +3251,8 @@ static void init_cgroup_css(struct cgrou
> css->cgroup = cgrp;
> atomic_set(&css->refcnt, 1);
> css->flags = 0;
> - css->id = NULL;
> + if (!ss->use_id)
> + css->id = NULL;
> if (cgrp == dummytop)
> set_bit(CSS_ROOT, &css->flags);
> BUG_ON(cgrp->subsys[ss->subsys_id]);
> @@ -3342,12 +3337,6 @@ static long cgroup_create(struct cgroup
> goto err_destroy;
> }
> init_cgroup_css(css, ss, cgrp);
> - if (ss->use_id) {
> - err = alloc_css_id(ss, parent, cgrp);
> - if (err)
> - goto err_destroy;
> - }
> - /* At error, ->destroy() callback has to free assigned ID. */
> }
>
> cgroup_lock_hierarchy(root);
> @@ -3709,17 +3698,6 @@ int __init_or_module cgroup_load_subsys(
>
> /* our new subsystem will be attached to the dummy hierarchy. */
> init_cgroup_css(css, ss, dummytop);
> - /* init_idr must be after init_cgroup_css because it sets css->id. */
> - if (ss->use_id) {
> - int ret = cgroup_init_idr(ss, css);
> - if (ret) {
> - dummytop->subsys[ss->subsys_id] = NULL;
> - ss->destroy(ss, dummytop);
> - subsys[i] = NULL;
> - mutex_unlock(&cgroup_mutex);
> - return ret;
> - }
> - }
>
> /*
> * Now we need to entangle the css into the existing css_sets. unlike
> @@ -3888,8 +3866,6 @@ int __init cgroup_init(void)
> struct cgroup_subsys *ss = subsys[i];
> if (!ss->early_init)
> cgroup_init_subsys(ss);
> - if (ss->use_id)
> - cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
> }
>
> /* Add init_css_set to the hash table */
> @@ -4603,8 +4579,8 @@ err_out:
>
> }
>
> -static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
> - struct cgroup_subsys_state *rootcss)
> +static int cgroup_init_idr(struct cgroup_subsys *ss,
> + struct cgroup_subsys_state *rootcss)
> {
> struct css_id *newid;
>
> @@ -4616,21 +4592,25 @@ static int __init_or_module cgroup_init_
> return PTR_ERR(newid);
>
> newid->stack[0] = newid->id;
> - newid->css = rootcss;
> - rootcss->id = newid;
> + rcu_assign_pointer(newid->css, rootcss);
> + rcu_assign_pointer(rootcss->id, newid);
> return 0;
> }
>
> -static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
> - struct cgroup *child)
> +int alloc_css_id(struct cgroup_subsys *ss,
> + struct cgroup *cgrp, struct cgroup_subsys_state *css)
> {
> int subsys_id, i, depth = 0;
> - struct cgroup_subsys_state *parent_css, *child_css;
> + struct cgroup_subsys_state *parent_css;
> + struct cgroup *parent;
> struct css_id *child_id, *parent_id;
>
> + if (cgrp == dummytop)
> + return cgroup_init_idr(ss, css);
> +
> + parent = cgrp->parent;
> subsys_id = ss->subsys_id;
> parent_css = parent->subsys[subsys_id];
> - child_css = child->subsys[subsys_id];
> parent_id = parent_css->id;
> depth = parent_id->depth + 1;
>
> @@ -4645,7 +4625,7 @@ static int alloc_css_id(struct cgroup_su
> * child_id->css pointer will be set after this cgroup is available
> * see cgroup_populate_dir()
> */
> - rcu_assign_pointer(child_css->id, child_id);
> + rcu_assign_pointer(css->id, child_id);
>
> return 0;
> }
> Index: mmotm-0811/include/linux/cgroup.h
> ===================================================================
> --- mmotm-0811.orig/include/linux/cgroup.h
> +++ mmotm-0811/include/linux/cgroup.h
> @@ -583,9 +583,11 @@ int cgroup_attach_task_current_cg(struct
> /*
> * CSS ID is ID for cgroup_subsys_state structs under subsys. This only works
> * if cgroup_subsys.use_id == true. It can be used for looking up and scanning.
> - * CSS ID is assigned at cgroup allocation (create) automatically
> - * and removed when subsys calls free_css_id() function. This is because
> - * the lifetime of cgroup_subsys_state is subsys's matter.
> + * CSS ID must be assigned by subsys itself at cgroup creation and deleted
> + * when subsys calls free_css_id() function. This is because the life time of
> + * of cgroup_subsys_state is subsys's matter.
> + *
> + * ID->css look up is available after cgroup's directory is populated.
> *
> * Looking up and scanning function should be called under rcu_read_lock().
> * Taking cgroup_mutex()/hierarchy_mutex() is not necessary for following calls.
> @@ -593,10 +595,10 @@ int cgroup_attach_task_current_cg(struct
> * destroyed". The caller should check css and cgroup's status.
> */
>
> -/*
> - * Typically Called at ->destroy(), or somewhere the subsys frees
> - * cgroup_subsys_state.
> - */
> +/* Should be called in ->create() by subsys itself */
> +int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *newgr,
> + struct cgroup_subsys_state *css);
> +/* Typically Called at ->destroy(), or somewhere the subsys frees css */
> void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css);
>
> /* Find a cgroup_subsys_state which has given ID */
> Index: mmotm-0811/mm/memcontrol.c
> ===================================================================
> --- mmotm-0811.orig/mm/memcontrol.c
> +++ mmotm-0811/mm/memcontrol.c
> @@ -4141,6 +4141,11 @@ mem_cgroup_create(struct cgroup_subsys *
> if (alloc_mem_cgroup_per_zone_info(mem, node))
> goto free_out;
>
> + error = alloc_css_id(ss, cont, &mem->css);
> + if (error)
> + goto free_out;
> + /* Here, css_id(&mem->css) works. but css_lookup(id)->mem doesn't */
> +
> /* root ? */
> if (cont->parent == NULL) {
> int cpu;
> Index: mmotm-0811/block/blk-cgroup.c
> ===================================================================
> --- mmotm-0811.orig/block/blk-cgroup.c
> +++ mmotm-0811/block/blk-cgroup.c
> @@ -958,9 +958,13 @@ blkiocg_create(struct cgroup_subsys *sub
> {
> struct blkio_cgroup *blkcg;
> struct cgroup *parent = cgroup->parent;
> + int ret;
>
> if (!parent) {
> blkcg = &blkio_root_cgroup;
> + ret = alloc_css_id(subsys, cgroup, &blkcg->css);
> + if (ret)
> + return ERR_PTR(ret);
> goto done;
> }
>
> @@ -971,6 +975,11 @@ blkiocg_create(struct cgroup_subsys *sub
> blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
> if (!blkcg)
> return ERR_PTR(-ENOMEM);
> + ret = alloc_css_id(subsys, cgroup, &blkcg->css);
> + if (ret) {
> + kfree(blkcg);
> + return ERR_PTR(ret);
> + }
>
> blkcg->weight = BLKIO_WEIGHT_DEFAULT;
> done:
>
* [PATCH 2/5] memcg: quick memcg lookup array
2010-08-25 8:04 [PATCH 0/5] memcg: towards I/O aware memcg v6 KAMEZAWA Hiroyuki
2010-08-25 8:06 ` [PATCH 1/5] cgroup: do ID allocation under css allocator KAMEZAWA Hiroyuki
@ 2010-08-25 8:07 ` KAMEZAWA Hiroyuki
2010-08-30 8:03 ` Daisuke Nishimura
2010-08-31 7:14 ` Balbir Singh
2010-08-25 8:09 ` [PATCH 3/5] memcg: use ID instead of pointer in page_cgroup KAMEZAWA Hiroyuki
` (2 subsequent siblings)
4 siblings, 2 replies; 19+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-08-25 8:07 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: linux-mm, nishimura@mxp.nes.nec.co.jp, balbir@linux.vnet.ibm.com,
gthelen, m-ikeda, akpm@linux-foundation.org,
linux-kernel@vger.kernel.org, menage@google.com,
lizf@cn.fujitsu.com
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Now, the memory cgroup has an ID per cgroup and makes use of it for
- hierarchy walks,
- swap recording.
This patch makes more use of it. The final purpose is to replace the
page_cgroup->mem_cgroup pointer with an unsigned short.
This patch caches memcg pointers in an array indexed by ID. With this, we
don't have to call css_lookup(), which requires a radix-tree walk. This saves
some overhead when looking up a memcg via its ID.
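For illustration, the whole fast path now looks like this (condensed from the
try_get_mem_cgroup_from_page() hunk below):

	rcu_read_lock();
	mem = id_to_memcg(id);	/* one array dereference, no radix-tree walk */
	if (mem && !css_tryget(&mem->css))
		mem = NULL;
	rcu_read_unlock();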
Changelog: 20100825
- applied comments.
Changelog: 20100811
- adjusted onto mmotm-2010-08-11
- fixed RCU related parts.
- use attach_id() callback.
Changelog: 20100804
- fixed description in init/Kconfig
Changelog: 20100730
- fixed rcu_read_unlock() placement.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
init/Kconfig | 10 +++++++
mm/memcontrol.c | 75 +++++++++++++++++++++++++++++++++++++++++---------------
2 files changed, 65 insertions(+), 20 deletions(-)
Index: mmotm-0811/mm/memcontrol.c
===================================================================
--- mmotm-0811.orig/mm/memcontrol.c
+++ mmotm-0811/mm/memcontrol.c
@@ -195,6 +195,7 @@ static void mem_cgroup_oom_notify(struct
*/
struct mem_cgroup {
struct cgroup_subsys_state css;
+ int valid; /* for checking validity under RCU access. */
/*
* the counter to account for memory usage
*/
@@ -294,6 +295,29 @@ static bool move_file(void)
&mc.to->move_charge_at_immigrate);
}
+/* 0 is unused */
+static atomic_t mem_cgroup_num;
+#define NR_MEMCG_GROUPS (CONFIG_MEM_CGROUP_MAX_GROUPS + 1)
+static struct mem_cgroup *mem_cgroups[NR_MEMCG_GROUPS] __read_mostly;
+
+/* Must be called under rcu_read_lock */
+static struct mem_cgroup *id_to_memcg(unsigned short id)
+{
+ struct mem_cgroup *mem;
+ /* see mem_cgroup_free() */
+ mem = rcu_dereference_check(mem_cgroups[id], rcu_read_lock_held());
+ if (likely(mem && mem->valid))
+ return mem;
+ return NULL;
+}
+
+static void register_memcg_id(struct mem_cgroup *mem)
+{
+ int id = css_id(&mem->css);
+ rcu_assign_pointer(mem_cgroups[id], mem);
+ VM_BUG_ON(!mem->valid);
+}
+
/*
* Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
* limit reclaim to prevent infinite loops, if they ever occur.
@@ -1847,18 +1871,7 @@ static void mem_cgroup_cancel_charge(str
* it's concern. (dropping refcnt from swap can be called against removed
* memcg.)
*/
-static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
-{
- struct cgroup_subsys_state *css;
- /* ID 0 is unused ID */
- if (!id)
- return NULL;
- css = css_lookup(&mem_cgroup_subsys, id);
- if (!css)
- return NULL;
- return container_of(css, struct mem_cgroup, css);
-}
struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
{
@@ -1879,7 +1892,7 @@ struct mem_cgroup *try_get_mem_cgroup_fr
ent.val = page_private(page);
id = lookup_swap_cgroup(ent);
rcu_read_lock();
- mem = mem_cgroup_lookup(id);
+ mem = id_to_memcg(id);
if (mem && !css_tryget(&mem->css))
mem = NULL;
rcu_read_unlock();
@@ -2231,7 +2244,7 @@ __mem_cgroup_commit_charge_swapin(struct
id = swap_cgroup_record(ent, 0);
rcu_read_lock();
- memcg = mem_cgroup_lookup(id);
+ memcg = id_to_memcg(id);
if (memcg) {
/*
* This recorded memcg can be obsolete one. So, avoid
@@ -2240,9 +2253,10 @@ __mem_cgroup_commit_charge_swapin(struct
if (!mem_cgroup_is_root(memcg))
res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
mem_cgroup_swap_statistics(memcg, false);
+ rcu_read_unlock();
mem_cgroup_put(memcg);
- }
- rcu_read_unlock();
+ } else
+ rcu_read_unlock();
}
/*
* At swapin, we may charge account against cgroup which has no tasks.
@@ -2495,7 +2509,7 @@ void mem_cgroup_uncharge_swap(swp_entry_
id = swap_cgroup_record(ent, 0);
rcu_read_lock();
- memcg = mem_cgroup_lookup(id);
+ memcg = id_to_memcg(id);
if (memcg) {
/*
* We uncharge this because swap is freed.
@@ -2504,9 +2518,10 @@ void mem_cgroup_uncharge_swap(swp_entry_
if (!mem_cgroup_is_root(memcg))
res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
mem_cgroup_swap_statistics(memcg, false);
+ rcu_read_unlock();
mem_cgroup_put(memcg);
- }
- rcu_read_unlock();
+ } else
+ rcu_read_unlock();
}
/**
@@ -4010,6 +4025,9 @@ static struct mem_cgroup *mem_cgroup_all
struct mem_cgroup *mem;
int size = sizeof(struct mem_cgroup);
+ if (atomic_read(&mem_cgroup_num) == NR_MEMCG_GROUPS)
+ return NULL;
+
/* Can be very big if MAX_NUMNODES is very big */
if (size < PAGE_SIZE)
mem = kmalloc(size, GFP_KERNEL);
@@ -4020,6 +4038,7 @@ static struct mem_cgroup *mem_cgroup_all
return NULL;
memset(mem, 0, size);
+ mem->valid = 1;
mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
if (!mem->stat) {
if (size < PAGE_SIZE)
@@ -4049,6 +4068,7 @@ static void __mem_cgroup_free(struct mem
mem_cgroup_remove_from_trees(mem);
free_css_id(&mem_cgroup_subsys, &mem->css);
+ atomic_dec(&mem_cgroup_num);
for_each_node_state(node, N_POSSIBLE)
free_mem_cgroup_per_zone_info(mem, node);
@@ -4059,6 +4079,19 @@ static void __mem_cgroup_free(struct mem
vfree(mem);
}
+static void mem_cgroup_free(struct mem_cgroup *mem)
+{
+ /* No more lookup */
+ mem->valid = 0;
+ rcu_assign_pointer(mem_cgroups[css_id(&mem->css)], NULL);
+ /*
+ * Because we call vfree() etc...use synchronize_rcu() rather than
+ * call_rcu();
+ */
+ synchronize_rcu();
+ __mem_cgroup_free(mem);
+}
+
static void mem_cgroup_get(struct mem_cgroup *mem)
{
atomic_inc(&mem->refcnt);
@@ -4068,7 +4101,7 @@ static void __mem_cgroup_put(struct mem_
{
if (atomic_sub_and_test(count, &mem->refcnt)) {
struct mem_cgroup *parent = parent_mem_cgroup(mem);
- __mem_cgroup_free(mem);
+ mem_cgroup_free(mem);
if (parent)
mem_cgroup_put(parent);
}
@@ -4189,9 +4222,11 @@ mem_cgroup_create(struct cgroup_subsys *
atomic_set(&mem->refcnt, 1);
mem->move_charge_at_immigrate = 0;
mutex_init(&mem->thresholds_lock);
+ atomic_inc(&mem_cgroup_num);
+ register_memcg_id(mem);
return &mem->css;
free_out:
- __mem_cgroup_free(mem);
+ mem_cgroup_free(mem);
root_mem_cgroup = NULL;
return ERR_PTR(error);
}
Index: mmotm-0811/init/Kconfig
===================================================================
--- mmotm-0811.orig/init/Kconfig
+++ mmotm-0811/init/Kconfig
@@ -594,6 +594,16 @@ config CGROUP_MEM_RES_CTLR_SWAP
Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page
size is 4096bytes, 512k per 1Gbytes of swap.
+config MEM_CGROUP_MAX_GROUPS
+ int "Maximum number of memory cgroups on a system"
+ range 1 65535
+ default 8192 if 64BIT
+ default 2048 if 32BIT
+ help
+ The memory cgroup has a limit on the number of groups that can be
+ created. Please select a suitable value. The more groups you allow,
+ the more memory (a pointer per group) will be consumed.
+
menuconfig CGROUP_SCHED
bool "Group CPU scheduler"
depends on EXPERIMENTAL && CGROUPS
* Re: [PATCH 2/5] memcg: quick memcg lookup array
2010-08-25 8:07 ` [PATCH 2/5] memcg: quick memcg lookup array KAMEZAWA Hiroyuki
@ 2010-08-30 8:03 ` Daisuke Nishimura
2010-09-01 0:29 ` KAMEZAWA Hiroyuki
2010-08-31 7:14 ` Balbir Singh
1 sibling, 1 reply; 19+ messages in thread
From: Daisuke Nishimura @ 2010-08-30 8:03 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: linux-mm, balbir@linux.vnet.ibm.com, gthelen, m-ikeda,
akpm@linux-foundation.org, linux-kernel@vger.kernel.org,
menage@google.com, lizf@cn.fujitsu.com, Daisuke Nishimura
> Index: mmotm-0811/mm/memcontrol.c
> ===================================================================
> --- mmotm-0811.orig/mm/memcontrol.c
> +++ mmotm-0811/mm/memcontrol.c
> @@ -195,6 +195,7 @@ static void mem_cgroup_oom_notify(struct
> */
> struct mem_cgroup {
> struct cgroup_subsys_state css;
> + int valid; /* for checking validness under RCU access.*/
> /*
> * the counter to account for memory usage
> */
Do we really need to add this new member?
Can't we safely access "mem (= rcu_dereference(mem_cgroups[id]))" under rcu_read_lock()?
(IOW, "mem" is not freed?)
> @@ -4049,6 +4068,7 @@ static void __mem_cgroup_free(struct mem
> mem_cgroup_remove_from_trees(mem);
> free_css_id(&mem_cgroup_subsys, &mem->css);
>
> + atomic_dec(&mem_cgroup_num);
> for_each_node_state(node, N_POSSIBLE)
> free_mem_cgroup_per_zone_info(mem, node);
>
> @@ -4059,6 +4079,19 @@ static void __mem_cgroup_free(struct mem
> vfree(mem);
> }
>
> +static void mem_cgroup_free(struct mem_cgroup *mem)
> +{
> + /* No more lookup */
> + mem->valid = 0;
> + rcu_assign_pointer(mem_cgroups[css_id(&mem->css)], NULL);
> + /*
> + * Because we call vfree() etc...use synchronize_rcu() rather than
> + * call_rcu();
> + */
> + synchronize_rcu();
> + __mem_cgroup_free(mem);
> +}
> +
> static void mem_cgroup_get(struct mem_cgroup *mem)
> {
> atomic_inc(&mem->refcnt);
> @@ -4068,7 +4101,7 @@ static void __mem_cgroup_put(struct mem_
> {
> if (atomic_sub_and_test(count, &mem->refcnt)) {
> struct mem_cgroup *parent = parent_mem_cgroup(mem);
> - __mem_cgroup_free(mem);
> + mem_cgroup_free(mem);
> if (parent)
> mem_cgroup_put(parent);
> }
> @@ -4189,9 +4222,11 @@ mem_cgroup_create(struct cgroup_subsys *
> atomic_set(&mem->refcnt, 1);
> mem->move_charge_at_immigrate = 0;
> mutex_init(&mem->thresholds_lock);
> + atomic_inc(&mem_cgroup_num);
> + register_memcg_id(mem);
> return &mem->css;
> free_out:
> - __mem_cgroup_free(mem);
> + mem_cgroup_free(mem);
> root_mem_cgroup = NULL;
> return ERR_PTR(error);
> }
I think mem_cgroup_num should be increased at mem_cgroup_alloc(), because it
is decreased at __mem_cgroup_free(). Otherwise, it can be decreased while it
has not been increased, if mem_cgroup_create() fails after mem_cgroup_alloc().
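A minimal sketch of the move I mean (illustrative only, not a tested patch):

	@@ mem_cgroup_alloc()
	 	memset(mem, 0, size);
	 	mem->valid = 1;
	+	/* paired with the atomic_dec() in __mem_cgroup_free() */
	+	atomic_inc(&mem_cgroup_num);
	@@ mem_cgroup_create()
	 	mutex_init(&mem->thresholds_lock);
	-	atomic_inc(&mem_cgroup_num);
	 	register_memcg_id(mem);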
Thanks,
Daisuke Nishimura.
* Re: [PATCH 2/5] memcg: quick memcg lookup array
2010-08-30 8:03 ` Daisuke Nishimura
@ 2010-09-01 0:29 ` KAMEZAWA Hiroyuki
0 siblings, 0 replies; 19+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-09-01 0:29 UTC (permalink / raw)
To: Daisuke Nishimura
Cc: linux-mm, balbir@linux.vnet.ibm.com, gthelen, m-ikeda,
akpm@linux-foundation.org, linux-kernel@vger.kernel.org,
menage@google.com, lizf@cn.fujitsu.com
On Mon, 30 Aug 2010 17:03:24 +0900
Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> > Index: mmotm-0811/mm/memcontrol.c
> > ===================================================================
> > --- mmotm-0811.orig/mm/memcontrol.c
> > +++ mmotm-0811/mm/memcontrol.c
> > @@ -195,6 +195,7 @@ static void mem_cgroup_oom_notify(struct
> > */
> > struct mem_cgroup {
> > struct cgroup_subsys_state css;
> > + int valid; /* for checking validness under RCU access.*/
> > /*
> > * the counter to account for memory usage
> > */
> Do we really need to add this new member ?
> Can't we safely access "mem(=rcu_dereference(mem_cgroup[id]))" under rcu_read_lock() ?
> (iow, "mem" is not freed ?)
>
Maybe this can be removed. I'll check again.
>
> > @@ -4049,6 +4068,7 @@ static void __mem_cgroup_free(struct mem
> > mem_cgroup_remove_from_trees(mem);
> > free_css_id(&mem_cgroup_subsys, &mem->css);
> >
> > + atomic_dec(&mem_cgroup_num);
> > for_each_node_state(node, N_POSSIBLE)
> > free_mem_cgroup_per_zone_info(mem, node);
> >
> > @@ -4059,6 +4079,19 @@ static void __mem_cgroup_free(struct mem
> > vfree(mem);
> > }
> >
> > +static void mem_cgroup_free(struct mem_cgroup *mem)
> > +{
> > + /* No more lookup */
> > + mem->valid = 0;
> > + rcu_assign_pointer(mem_cgroups[css_id(&mem->css)], NULL);
> > + /*
> > + * Because we call vfree() etc...use synchronize_rcu() rather than
> > + * call_rcu();
> > + */
> > + synchronize_rcu();
> > + __mem_cgroup_free(mem);
> > +}
> > +
> > static void mem_cgroup_get(struct mem_cgroup *mem)
> > {
> > atomic_inc(&mem->refcnt);
> > @@ -4068,7 +4101,7 @@ static void __mem_cgroup_put(struct mem_
> > {
> > if (atomic_sub_and_test(count, &mem->refcnt)) {
> > struct mem_cgroup *parent = parent_mem_cgroup(mem);
> > - __mem_cgroup_free(mem);
> > + mem_cgroup_free(mem);
> > if (parent)
> > mem_cgroup_put(parent);
> > }
> > @@ -4189,9 +4222,11 @@ mem_cgroup_create(struct cgroup_subsys *
> > atomic_set(&mem->refcnt, 1);
> > mem->move_charge_at_immigrate = 0;
> > mutex_init(&mem->thresholds_lock);
> > + atomic_inc(&mem_cgroup_num);
> > + register_memcg_id(mem);
> > return &mem->css;
> > free_out:
> > - __mem_cgroup_free(mem);
> > + mem_cgroup_free(mem);
> > root_mem_cgroup = NULL;
> > return ERR_PTR(error);
> > }
> I think mem_cgroup_num should be increased at mem_cgroup_alloc(), because it
> is decreased at __mem_cgroup_free(). Otherwise, it can be decreased while it
> has not been increased, if mem_cgroup_create() fails after mem_cgroup_alloc().
>
Hmm, thank you for checking; I'll fix it.
Thanks,
-Kame
* Re: [PATCH 2/5] memcg: quick memcg lookup array
2010-08-25 8:07 ` [PATCH 2/5] memcg: quick memcg lookup array KAMEZAWA Hiroyuki
2010-08-30 8:03 ` Daisuke Nishimura
@ 2010-08-31 7:14 ` Balbir Singh
2010-09-01 0:21 ` KAMEZAWA Hiroyuki
1 sibling, 1 reply; 19+ messages in thread
From: Balbir Singh @ 2010-08-31 7:14 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: linux-mm, nishimura@mxp.nes.nec.co.jp, gthelen, m-ikeda,
akpm@linux-foundation.org, linux-kernel@vger.kernel.org,
menage@google.com, lizf@cn.fujitsu.com
* KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> [2010-08-25 17:07:41]:
> From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
>
> Now, memory cgroup has an ID per cgroup and make use of it at
> - hierarchy walk,
> - swap recording.
>
> This patch is for making more use of it. The final purpose is
> to replace page_cgroup->mem_cgroup's pointer to an unsigned short.
>
> This patch caches a pointer of memcg in an array. By this, we
> don't have to call css_lookup() which requires radix-hash walk.
> This saves some amount of memory footprint at lookup memcg via id.
>
> Changelog: 20100825
> - applied comments.
>
> Changelog: 20100811
> - adjusted onto mmotm-2010-08-11
> - fixed RCU related parts.
> - use attach_id() callback.
>
> Changelog: 20100804
> - fixed description in init/Kconfig
>
> Changelog: 20100730
> - fixed rcu_read_unlock() placement.
>
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> ---
> init/Kconfig | 10 +++++++
> mm/memcontrol.c | 75 +++++++++++++++++++++++++++++++++++++++++---------------
> 2 files changed, 65 insertions(+), 20 deletions(-)
>
> Index: mmotm-0811/mm/memcontrol.c
> ===================================================================
> --- mmotm-0811.orig/mm/memcontrol.c
> +++ mmotm-0811/mm/memcontrol.c
> @@ -195,6 +195,7 @@ static void mem_cgroup_oom_notify(struct
> */
> struct mem_cgroup {
> struct cgroup_subsys_state css;
> + int valid; /* for checking validness under RCU access.*/
> /*
> * the counter to account for memory usage
> */
> @@ -294,6 +295,29 @@ static bool move_file(void)
> &mc.to->move_charge_at_immigrate);
> }
>
> +/* 0 is unused */
> +static atomic_t mem_cgroup_num;
> +#define NR_MEMCG_GROUPS (CONFIG_MEM_CGROUP_MAX_GROUPS + 1)
> +static struct mem_cgroup *mem_cgroups[NR_MEMCG_GROUPS] __read_mostly;
> +
> +/* Must be called under rcu_read_lock */
> +static struct mem_cgroup *id_to_memcg(unsigned short id)
> +{
> + struct mem_cgroup *mem;
> + /* see mem_cgroup_free() */
> + mem = rcu_dereference_check(mem_cgroups[id], rcu_read_lock_held());
> + if (likely(mem && mem->valid))
> + return mem;
> + return NULL;
> +}
> +
> +static void register_memcg_id(struct mem_cgroup *mem)
> +{
> + int id = css_id(&mem->css);
> + rcu_assign_pointer(mem_cgroups[id], mem);
> + VM_BUG_ON(!mem->valid);
> +}
> +
> /*
> * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
> * limit reclaim to prevent infinite loops, if they ever occur.
> @@ -1847,18 +1871,7 @@ static void mem_cgroup_cancel_charge(str
> * it's concern. (dropping refcnt from swap can be called against removed
> * memcg.)
> */
> -static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
> -{
> - struct cgroup_subsys_state *css;
>
> - /* ID 0 is unused ID */
> - if (!id)
> - return NULL;
> - css = css_lookup(&mem_cgroup_subsys, id);
> - if (!css)
> - return NULL;
> - return container_of(css, struct mem_cgroup, css);
> -}
>
> struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
> {
> @@ -1879,7 +1892,7 @@ struct mem_cgroup *try_get_mem_cgroup_fr
> ent.val = page_private(page);
> id = lookup_swap_cgroup(ent);
> rcu_read_lock();
> - mem = mem_cgroup_lookup(id);
> + mem = id_to_memcg(id);
> if (mem && !css_tryget(&mem->css))
> mem = NULL;
> rcu_read_unlock();
> @@ -2231,7 +2244,7 @@ __mem_cgroup_commit_charge_swapin(struct
>
> id = swap_cgroup_record(ent, 0);
> rcu_read_lock();
> - memcg = mem_cgroup_lookup(id);
> + memcg = id_to_memcg(id);
> if (memcg) {
> /*
> * This recorded memcg can be obsolete one. So, avoid
> @@ -2240,9 +2253,10 @@ __mem_cgroup_commit_charge_swapin(struct
> if (!mem_cgroup_is_root(memcg))
> res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
> mem_cgroup_swap_statistics(memcg, false);
> + rcu_read_unlock();
> mem_cgroup_put(memcg);
> - }
> - rcu_read_unlock();
> + } else
> + rcu_read_unlock();
> }
> /*
> * At swapin, we may charge account against cgroup which has no tasks.
> @@ -2495,7 +2509,7 @@ void mem_cgroup_uncharge_swap(swp_entry_
>
> id = swap_cgroup_record(ent, 0);
> rcu_read_lock();
> - memcg = mem_cgroup_lookup(id);
> + memcg = id_to_memcg(id);
> if (memcg) {
> /*
> * We uncharge this because swap is freed.
> @@ -2504,9 +2518,10 @@ void mem_cgroup_uncharge_swap(swp_entry_
> if (!mem_cgroup_is_root(memcg))
> res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
> mem_cgroup_swap_statistics(memcg, false);
> + rcu_read_unlock();
> mem_cgroup_put(memcg);
> - }
> - rcu_read_unlock();
> + } else
> + rcu_read_unlock();
> }
>
> /**
> @@ -4010,6 +4025,9 @@ static struct mem_cgroup *mem_cgroup_all
> struct mem_cgroup *mem;
> int size = sizeof(struct mem_cgroup);
>
> + if (atomic_read(&mem_cgroup_num) == NR_MEMCG_GROUPS)
> + return NULL;
> +
> /* Can be very big if MAX_NUMNODES is very big */
> if (size < PAGE_SIZE)
> mem = kmalloc(size, GFP_KERNEL);
> @@ -4020,6 +4038,7 @@ static struct mem_cgroup *mem_cgroup_all
> return NULL;
>
> memset(mem, 0, size);
> + mem->valid = 1;
> mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
> if (!mem->stat) {
> if (size < PAGE_SIZE)
> @@ -4049,6 +4068,7 @@ static void __mem_cgroup_free(struct mem
> mem_cgroup_remove_from_trees(mem);
> free_css_id(&mem_cgroup_subsys, &mem->css);
>
> + atomic_dec(&mem_cgroup_num);
> for_each_node_state(node, N_POSSIBLE)
> free_mem_cgroup_per_zone_info(mem, node);
>
> @@ -4059,6 +4079,19 @@ static void __mem_cgroup_free(struct mem
> vfree(mem);
> }
>
> +static void mem_cgroup_free(struct mem_cgroup *mem)
> +{
> + /* No more lookup */
> + mem->valid = 0;
> + rcu_assign_pointer(mem_cgroups[css_id(&mem->css)], NULL);
> + /*
> + * Because we call vfree() etc...use synchronize_rcu() rather than
> + * call_rcu();
> + */
> + synchronize_rcu();
> + __mem_cgroup_free(mem);
> +}
> +
> static void mem_cgroup_get(struct mem_cgroup *mem)
> {
> atomic_inc(&mem->refcnt);
> @@ -4068,7 +4101,7 @@ static void __mem_cgroup_put(struct mem_
> {
> if (atomic_sub_and_test(count, &mem->refcnt)) {
> struct mem_cgroup *parent = parent_mem_cgroup(mem);
> - __mem_cgroup_free(mem);
> + mem_cgroup_free(mem);
> if (parent)
> mem_cgroup_put(parent);
> }
> @@ -4189,9 +4222,11 @@ mem_cgroup_create(struct cgroup_subsys *
> atomic_set(&mem->refcnt, 1);
> mem->move_charge_at_immigrate = 0;
> mutex_init(&mem->thresholds_lock);
> + atomic_inc(&mem_cgroup_num);
> + register_memcg_id(mem);
> return &mem->css;
> free_out:
> - __mem_cgroup_free(mem);
> + mem_cgroup_free(mem);
> root_mem_cgroup = NULL;
> return ERR_PTR(error);
> }
> Index: mmotm-0811/init/Kconfig
> ===================================================================
> --- mmotm-0811.orig/init/Kconfig
> +++ mmotm-0811/init/Kconfig
> @@ -594,6 +594,16 @@ config CGROUP_MEM_RES_CTLR_SWAP
> Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page
> size is 4096bytes, 512k per 1Gbytes of swap.
>
> +config MEM_CGROUP_MAX_GROUPS
> + int "Maximum number of memory cgroups on a system"
> + range 1 65535
> + default 8192 if 64BIT
> + default 2048 if 32BIT
> + help
> + Memory cgroup has limitation of the number of groups created.
> + Please select your favorite value. The more you allow, the more
> + memory(a pointer per group) will be consumed.
> +
Looks good. A quick thought: should we expose memcg->id to user space
through a config file? I don't see any reason to at this point, unless we
do it for all controllers.
--
Three Cheers,
Balbir
* Re: [PATCH 2/5] memcg: quick memcg lookup array
2010-08-31 7:14 ` Balbir Singh
@ 2010-09-01 0:21 ` KAMEZAWA Hiroyuki
0 siblings, 0 replies; 19+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-09-01 0:21 UTC (permalink / raw)
To: balbir
Cc: linux-mm, nishimura@mxp.nes.nec.co.jp, gthelen, m-ikeda,
akpm@linux-foundation.org, linux-kernel@vger.kernel.org,
menage@google.com, lizf@cn.fujitsu.com
On Tue, 31 Aug 2010 12:44:14 +0530
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> * KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> [2010-08-25 17:07:41]:
>
> > From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> >
> > Now, memory cgroup has an ID per cgroup and make use of it at
> > - hierarchy walk,
> > - swap recording.
> >
> > This patch is for making more use of it. The final purpose is
> > to replace page_cgroup->mem_cgroup's pointer to an unsigned short.
> >
> > This patch caches a pointer of memcg in an array. By this, we
> > don't have to call css_lookup() which requires radix-hash walk.
> > This saves some amount of memory footprint at lookup memcg via id.
> >
> > Changelog: 20100825
> > - applied comments.
> >
> > Changelog: 20100811
> > - adjusted onto mmotm-2010-08-11
> > - fixed RCU related parts.
> > - use attach_id() callback.
> >
> > Changelog: 20100804
> > - fixed description in init/Kconfig
> >
> > Changelog: 20100730
> > - fixed rcu_read_unlock() placement.
> >
> > Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> > ---
> > init/Kconfig | 10 +++++++
> > mm/memcontrol.c | 75 +++++++++++++++++++++++++++++++++++++++++---------------
> > 2 files changed, 65 insertions(+), 20 deletions(-)
> >
> > Index: mmotm-0811/mm/memcontrol.c
> > ===================================================================
> > --- mmotm-0811.orig/mm/memcontrol.c
> > +++ mmotm-0811/mm/memcontrol.c
> > @@ -195,6 +195,7 @@ static void mem_cgroup_oom_notify(struct
> > */
> > struct mem_cgroup {
> > struct cgroup_subsys_state css;
> > + int valid; /* for checking validness under RCU access.*/
> > /*
> > * the counter to account for memory usage
> > */
> > @@ -294,6 +295,29 @@ static bool move_file(void)
> > &mc.to->move_charge_at_immigrate);
> > }
> >
> > +/* 0 is unused */
> > +static atomic_t mem_cgroup_num;
> > +#define NR_MEMCG_GROUPS (CONFIG_MEM_CGROUP_MAX_GROUPS + 1)
> > +static struct mem_cgroup *mem_cgroups[NR_MEMCG_GROUPS] __read_mostly;
> > +
> > +/* Must be called under rcu_read_lock */
> > +static struct mem_cgroup *id_to_memcg(unsigned short id)
> > +{
> > + struct mem_cgroup *mem;
> > + /* see mem_cgroup_free() */
> > + mem = rcu_dereference_check(mem_cgroups[id], rcu_read_lock_held());
> > + if (likely(mem && mem->valid))
> > + return mem;
> > + return NULL;
> > +}
> > +
> > +static void register_memcg_id(struct mem_cgroup *mem)
> > +{
> > + int id = css_id(&mem->css);
> > + rcu_assign_pointer(mem_cgroups[id], mem);
> > + VM_BUG_ON(!mem->valid);
> > +}
> > +
> > /*
> > * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
> > * limit reclaim to prevent infinite loops, if they ever occur.
> > @@ -1847,18 +1871,7 @@ static void mem_cgroup_cancel_charge(str
> > * it's concern. (dropping refcnt from swap can be called against removed
> > * memcg.)
> > */
> > -static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
> > -{
> > - struct cgroup_subsys_state *css;
> >
> > - /* ID 0 is unused ID */
> > - if (!id)
> > - return NULL;
> > - css = css_lookup(&mem_cgroup_subsys, id);
> > - if (!css)
> > - return NULL;
> > - return container_of(css, struct mem_cgroup, css);
> > -}
> >
> > struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
> > {
> > @@ -1879,7 +1892,7 @@ struct mem_cgroup *try_get_mem_cgroup_fr
> > ent.val = page_private(page);
> > id = lookup_swap_cgroup(ent);
> > rcu_read_lock();
> > - mem = mem_cgroup_lookup(id);
> > + mem = id_to_memcg(id);
> > if (mem && !css_tryget(&mem->css))
> > mem = NULL;
> > rcu_read_unlock();
> > @@ -2231,7 +2244,7 @@ __mem_cgroup_commit_charge_swapin(struct
> >
> > id = swap_cgroup_record(ent, 0);
> > rcu_read_lock();
> > - memcg = mem_cgroup_lookup(id);
> > + memcg = id_to_memcg(id);
> > if (memcg) {
> > /*
> > * This recorded memcg can be obsolete one. So, avoid
> > @@ -2240,9 +2253,10 @@ __mem_cgroup_commit_charge_swapin(struct
> > if (!mem_cgroup_is_root(memcg))
> > res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
> > mem_cgroup_swap_statistics(memcg, false);
> > + rcu_read_unlock();
> > mem_cgroup_put(memcg);
> > - }
> > - rcu_read_unlock();
> > + } else
> > + rcu_read_unlock();
> > }
> > /*
> > * At swapin, we may charge account against cgroup which has no tasks.
> > @@ -2495,7 +2509,7 @@ void mem_cgroup_uncharge_swap(swp_entry_
> >
> > id = swap_cgroup_record(ent, 0);
> > rcu_read_lock();
> > - memcg = mem_cgroup_lookup(id);
> > + memcg = id_to_memcg(id);
> > if (memcg) {
> > /*
> > * We uncharge this because swap is freed.
> > @@ -2504,9 +2518,10 @@ void mem_cgroup_uncharge_swap(swp_entry_
> > if (!mem_cgroup_is_root(memcg))
> > res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
> > mem_cgroup_swap_statistics(memcg, false);
> > + rcu_read_unlock();
> > mem_cgroup_put(memcg);
> > - }
> > - rcu_read_unlock();
> > + } else
> > + rcu_read_unlock();
> > }
> >
> > /**
> > @@ -4010,6 +4025,9 @@ static struct mem_cgroup *mem_cgroup_all
> > struct mem_cgroup *mem;
> > int size = sizeof(struct mem_cgroup);
> >
> > + if (atomic_read(&mem_cgroup_num) == NR_MEMCG_GROUPS)
> > + return NULL;
> > +
> > /* Can be very big if MAX_NUMNODES is very big */
> > if (size < PAGE_SIZE)
> > mem = kmalloc(size, GFP_KERNEL);
> > @@ -4020,6 +4038,7 @@ static struct mem_cgroup *mem_cgroup_all
> > return NULL;
> >
> > memset(mem, 0, size);
> > + mem->valid = 1;
> > mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
> > if (!mem->stat) {
> > if (size < PAGE_SIZE)
> > @@ -4049,6 +4068,7 @@ static void __mem_cgroup_free(struct mem
> > mem_cgroup_remove_from_trees(mem);
> > free_css_id(&mem_cgroup_subsys, &mem->css);
> >
> > + atomic_dec(&mem_cgroup_num);
> > for_each_node_state(node, N_POSSIBLE)
> > free_mem_cgroup_per_zone_info(mem, node);
> >
> > @@ -4059,6 +4079,19 @@ static void __mem_cgroup_free(struct mem
> > vfree(mem);
> > }
> >
> > +static void mem_cgroup_free(struct mem_cgroup *mem)
> > +{
> > + /* No more lookup */
> > + mem->valid = 0;
> > + rcu_assign_pointer(mem_cgroups[css_id(&mem->css)], NULL);
> > + /*
> > + * Because we call vfree() etc...use synchronize_rcu() rather than
> > + * call_rcu();
> > + */
> > + synchronize_rcu();
> > + __mem_cgroup_free(mem);
> > +}
> > +
> > static void mem_cgroup_get(struct mem_cgroup *mem)
> > {
> > atomic_inc(&mem->refcnt);
> > @@ -4068,7 +4101,7 @@ static void __mem_cgroup_put(struct mem_
> > {
> > if (atomic_sub_and_test(count, &mem->refcnt)) {
> > struct mem_cgroup *parent = parent_mem_cgroup(mem);
> > - __mem_cgroup_free(mem);
> > + mem_cgroup_free(mem);
> > if (parent)
> > mem_cgroup_put(parent);
> > }
> > @@ -4189,9 +4222,11 @@ mem_cgroup_create(struct cgroup_subsys *
> > atomic_set(&mem->refcnt, 1);
> > mem->move_charge_at_immigrate = 0;
> > mutex_init(&mem->thresholds_lock);
> > + atomic_inc(&mem_cgroup_num);
> > + register_memcg_id(mem);
> > return &mem->css;
> > free_out:
> > - __mem_cgroup_free(mem);
> > + mem_cgroup_free(mem);
> > root_mem_cgroup = NULL;
> > return ERR_PTR(error);
> > }
> > Index: mmotm-0811/init/Kconfig
> > ===================================================================
> > --- mmotm-0811.orig/init/Kconfig
> > +++ mmotm-0811/init/Kconfig
> > @@ -594,6 +594,16 @@ config CGROUP_MEM_RES_CTLR_SWAP
> > Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page
> > size is 4096bytes, 512k per 1Gbytes of swap.
> >
> > +config MEM_CGROUP_MAX_GROUPS
> > + int "Maximum number of memory cgroups on a system"
> > + range 1 65535
> > + default 8192 if 64BIT
> > + default 2048 if 32BIT
> > + help
> > + Memory cgroup has limitation of the number of groups created.
> > + Please select your favorite value. The more you allow, the more
> > + memory(a pointer per group) will be consumed.
> > +
>
> Looks good, quick thought - should we expost memcg->id to user space
> through a config file? I don't see any reason at this point, unless we
> do it for all controllers.
>
I wonder whether showing users two interfaces, "path name" and "id", is a
good thing or not. Yes, it's convenient for developers like me, you, and
others, but I don't think it's useful to users, especially novices.
I'd like to avoid exposing memcg->id to users as much as possible. I hate
to say it, but memcg is complicated enough already.
Thanks,
-Kame
* [PATCH 3/5] memcg: use ID instead of pointer in page_cgroup
2010-08-25 8:04 [PATCH 0/5] memcg: towards I/O aware memcg v6 KAMEZAWA Hiroyuki
2010-08-25 8:06 ` [PATCH 1/5] cgroup: do ID allocation under css allocator KAMEZAWA Hiroyuki
2010-08-25 8:07 ` [PATCH 2/5] memcg: quick memcg lookup array KAMEZAWA Hiroyuki
@ 2010-08-25 8:09 ` KAMEZAWA Hiroyuki
2010-08-31 2:14 ` Daisuke Nishimura
2010-08-25 8:10 ` [PATCH 4/5] memcg: lockless update of file stat with move-account safe method KAMEZAWA Hiroyuki
2010-08-25 8:11 ` [PATCH 5/5] memcg: generic file stat accounting interface KAMEZAWA Hiroyuki
4 siblings, 1 reply; 19+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-08-25 8:09 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: linux-mm, nishimura@mxp.nes.nec.co.jp, balbir@linux.vnet.ibm.com,
gthelen, m-ikeda, akpm@linux-foundation.org,
linux-kernel@vger.kernel.org, menage@google.com,
lizf@cn.fujitsu.com
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Now, the address of a memory cgroup can be obtained from its ID by a simple
array lookup, with no complex search.
This patch replaces pc->mem_cgroup, turning it from a pointer into an
unsigned short. On 64-bit architectures, this frees up 6 bytes per
page_cgroup: 2 bytes are used for blkio-cgroup's page tracking, and the
remaining 4 bytes will be used for some lightweight concurrent access.
We may be able to pack this ID into the flags field, but let's go step by step.
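For illustration, the resulting layout on 64-bit (sizes assume x86-64; the
4 spare bytes are currently padding):

	struct page_cgroup {
		unsigned long flags;		/*  8 bytes			*/
		unsigned short mem_cgroup;	/*  2 bytes: memcg ID		*/
		unsigned short blk_cgroup;	/*  2 bytes: blkio ID, unused	*/
						/*  4 bytes of padding, free
						 *  for future lightweight use	*/
		struct page *page;		/*  8 bytes			*/
		struct list_head lru;		/* 16 bytes: per-cgroup LRU	*/
	};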
Changelog: 20100824
- fixed comments, and typo.
Changelog: 20100811
- using new rcu APIs, as rcu_dereference_check() etc.
Changelog: 20100804
- added comments to page_cgroup.h
Changelog: 20100730
- fixed some garbage added by debug code in early stage
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
include/linux/page_cgroup.h | 6 ++++
mm/memcontrol.c | 53 ++++++++++++++++++++++++++++----------------
mm/page_cgroup.c | 2 -
3 files changed, 40 insertions(+), 21 deletions(-)
Index: mmotm-0811/include/linux/page_cgroup.h
===================================================================
--- mmotm-0811.orig/include/linux/page_cgroup.h
+++ mmotm-0811/include/linux/page_cgroup.h
@@ -9,10 +9,14 @@
* page_cgroup helps us identify information about the cgroup
* All page cgroups are allocated at boot or memory hotplug event,
* then the page cgroup for pfn always exists.
+ *
+ * TODO: It seems the cgroup ID could be packed into "flags", but there
+ * would be a race between assigning the ID and set/clear of flags. Please
+ * be careful.
*/
struct page_cgroup {
unsigned long flags;
- struct mem_cgroup *mem_cgroup;
+ unsigned short mem_cgroup; /* ID of assigned memory cgroup */
+ unsigned short blk_cgroup; /* Not used yet, but will be. */
struct page *page;
struct list_head lru; /* per cgroup LRU list */
};
Index: mmotm-0811/mm/page_cgroup.c
===================================================================
--- mmotm-0811.orig/mm/page_cgroup.c
+++ mmotm-0811/mm/page_cgroup.c
@@ -15,7 +15,7 @@ static void __meminit
__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
{
pc->flags = 0;
- pc->mem_cgroup = NULL;
+ pc->mem_cgroup = 0;
pc->page = pfn_to_page(pfn);
INIT_LIST_HEAD(&pc->lru);
}
Index: mmotm-0811/mm/memcontrol.c
===================================================================
--- mmotm-0811.orig/mm/memcontrol.c
+++ mmotm-0811/mm/memcontrol.c
@@ -300,12 +300,16 @@ static atomic_t mem_cgroup_num;
#define NR_MEMCG_GROUPS (CONFIG_MEM_CGROUP_MAX_GROUPS + 1)
static struct mem_cgroup *mem_cgroups[NR_MEMCG_GROUPS] __read_mostly;
-/* Must be called under rcu_read_lock */
-static struct mem_cgroup *id_to_memcg(unsigned short id)
+/*
+ * Must be called under rcu_read_lock(). Set safe==true if you're sure
+ * you're in a safe context, e.g. under lock_page_cgroup().
+ */
+static struct mem_cgroup *id_to_memcg(unsigned short id, bool safe)
{
struct mem_cgroup *mem;
/* see mem_cgroup_free() */
- mem = rcu_dereference_check(mem_cgroups[id], rcu_read_lock_held());
+ mem = rcu_dereference_check(mem_cgroups[id],
+ rcu_read_lock_held() || safe);
if (likely(mem && mem->valid))
return mem;
return NULL;
@@ -381,7 +385,12 @@ struct cgroup_subsys_state *mem_cgroup_c
static struct mem_cgroup_per_zone *
page_cgroup_zoneinfo(struct page_cgroup *pc)
{
- struct mem_cgroup *mem = pc->mem_cgroup;
+ /*
+ * The caller should guarantee this "pc" is under lock. In the typical
+ * case, this function is called by an LRU function with zone->lru_lock
+ * held, so the access is safe.
+ */
+ struct mem_cgroup *mem = id_to_memcg(pc->mem_cgroup, true);
int nid = page_cgroup_nid(pc);
int zid = page_cgroup_zid(pc);
@@ -723,6 +732,11 @@ static inline bool mem_cgroup_is_root(st
return (mem == root_mem_cgroup);
}
+static inline bool mem_cgroup_is_rootid(unsigned short id)
+{
+ return (id == 1);
+}
+
/*
* Following LRU functions are allowed to be used without PCG_LOCK.
* Operations are called by routine of global LRU independently from memcg.
@@ -755,7 +769,7 @@ void mem_cgroup_del_lru_list(struct page
*/
mz = page_cgroup_zoneinfo(pc);
MEM_CGROUP_ZSTAT(mz, lru) -= 1;
- if (mem_cgroup_is_root(pc->mem_cgroup))
+ if (mem_cgroup_is_rootid(pc->mem_cgroup))
return;
VM_BUG_ON(list_empty(&pc->lru));
list_del_init(&pc->lru);
@@ -782,7 +796,7 @@ void mem_cgroup_rotate_lru_list(struct p
*/
smp_rmb();
/* unused or root page is not rotated. */
- if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup))
+ if (!PageCgroupUsed(pc) || mem_cgroup_is_rootid(pc->mem_cgroup))
return;
mz = page_cgroup_zoneinfo(pc);
list_move(&pc->lru, &mz->lists[lru]);
@@ -808,7 +822,7 @@ void mem_cgroup_add_lru_list(struct page
mz = page_cgroup_zoneinfo(pc);
MEM_CGROUP_ZSTAT(mz, lru) += 1;
SetPageCgroupAcctLRU(pc);
- if (mem_cgroup_is_root(pc->mem_cgroup))
+ if (mem_cgroup_is_rootid(pc->mem_cgroup))
return;
list_add(&pc->lru, &mz->lists[lru]);
}
@@ -1497,7 +1511,7 @@ void mem_cgroup_update_file_mapped(struc
return;
lock_page_cgroup(pc);
- mem = pc->mem_cgroup;
+ mem = id_to_memcg(pc->mem_cgroup, true);
if (!mem || !PageCgroupUsed(pc))
goto done;
@@ -1885,14 +1899,14 @@ struct mem_cgroup *try_get_mem_cgroup_fr
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc);
if (PageCgroupUsed(pc)) {
- mem = pc->mem_cgroup;
+ mem = id_to_memcg(pc->mem_cgroup, true);
if (mem && !css_tryget(&mem->css))
mem = NULL;
} else if (PageSwapCache(page)) {
ent.val = page_private(page);
id = lookup_swap_cgroup(ent);
rcu_read_lock();
- mem = id_to_memcg(id);
+ mem = id_to_memcg(id, false);
if (mem && !css_tryget(&mem->css))
mem = NULL;
rcu_read_unlock();
@@ -1921,7 +1935,7 @@ static void __mem_cgroup_commit_charge(s
return;
}
- pc->mem_cgroup = mem;
+ pc->mem_cgroup = css_id(&mem->css);
/*
* We access a page_cgroup asynchronously without lock_page_cgroup().
* Especially when a page_cgroup is taken from a page, pc->mem_cgroup
@@ -1979,7 +1993,7 @@ static void __mem_cgroup_move_account(st
VM_BUG_ON(PageLRU(pc->page));
VM_BUG_ON(!PageCgroupLocked(pc));
VM_BUG_ON(!PageCgroupUsed(pc));
- VM_BUG_ON(pc->mem_cgroup != from);
+ VM_BUG_ON(id_to_memcg(pc->mem_cgroup, true) != from);
if (PageCgroupFileMapped(pc)) {
/* Update mapped_file data for mem_cgroup */
@@ -1994,7 +2008,7 @@ static void __mem_cgroup_move_account(st
mem_cgroup_cancel_charge(from);
/* caller should have done css_get */
- pc->mem_cgroup = to;
+ pc->mem_cgroup = css_id(&to->css);
mem_cgroup_charge_statistics(to, pc, true);
/*
* We charges against "to" which may not have any tasks. Then, "to"
@@ -2014,7 +2028,7 @@ static int mem_cgroup_move_account(struc
{
int ret = -EINVAL;
lock_page_cgroup(pc);
- if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
+ if (PageCgroupUsed(pc) && id_to_memcg(pc->mem_cgroup, true) == from) {
__mem_cgroup_move_account(pc, from, to, uncharge);
ret = 0;
}
@@ -2244,7 +2258,7 @@ __mem_cgroup_commit_charge_swapin(struct
id = swap_cgroup_record(ent, 0);
rcu_read_lock();
- memcg = id_to_memcg(id);
+ memcg = id_to_memcg(id, false);
if (memcg) {
/*
* This recorded memcg can be obsolete one. So, avoid
@@ -2354,7 +2368,7 @@ __mem_cgroup_uncharge_common(struct page
lock_page_cgroup(pc);
- mem = pc->mem_cgroup;
+ mem = id_to_memcg(pc->mem_cgroup, true);
if (!PageCgroupUsed(pc))
goto unlock_out;
@@ -2509,7 +2523,7 @@ void mem_cgroup_uncharge_swap(swp_entry_
id = swap_cgroup_record(ent, 0);
rcu_read_lock();
- memcg = id_to_memcg(id);
+ memcg = id_to_memcg(id, false);
if (memcg) {
/*
* We uncharge this because swap is freed.
@@ -2600,7 +2614,7 @@ int mem_cgroup_prepare_migration(struct
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc);
if (PageCgroupUsed(pc)) {
- mem = pc->mem_cgroup;
+ mem = id_to_memcg(pc->mem_cgroup, true);
css_get(&mem->css);
/*
* At migrating an anonymous page, its mapcount goes down
@@ -4440,7 +4454,8 @@ static int is_target_pte_for_mc(struct v
* mem_cgroup_move_account() checks the pc is valid or not under
* the lock.
*/
- if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
+ if (PageCgroupUsed(pc) &&
+ id_to_memcg(pc->mem_cgroup, true) == mc.from) {
ret = MC_TARGET_PAGE;
if (target)
target->page = page;
* Re: [PATCH 3/5] memcg: use ID instead of pointer in page_cgroup
2010-08-25 8:09 ` [PATCH 3/5] memcg: use ID instead of pointer in page_cgroup KAMEZAWA Hiroyuki
@ 2010-08-31 2:14 ` Daisuke Nishimura
0 siblings, 0 replies; 19+ messages in thread
From: Daisuke Nishimura @ 2010-08-31 2:14 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: linux-mm, balbir@linux.vnet.ibm.com, gthelen, m-ikeda,
akpm@linux-foundation.org, linux-kernel@vger.kernel.org,
menage@google.com, lizf@cn.fujitsu.com, Daisuke Nishimura
On Wed, 25 Aug 2010 17:09:00 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
>
> Now, the address of a memory cgroup can be obtained from its ID without
> complex computation.
> This patch replaces pc->mem_cgroup, turning it from a pointer into an
> unsigned short.
> On 64bit architectures, this frees up 6 more bytes per page_cgroup.
> 2 bytes are used for blkio-cgroup's page tracking; the remaining 4 bytes
> will be used for some light-weight concurrent access.
>
> We may be able to move this id into the flags field, but let's go step by step.
>
> Changelog: 20100824
> - fixed comments, and typo.
> Changelog: 20100811
> - using new rcu APIs, as rcu_dereference_check() etc.
> Changelog: 20100804
> - added comments to page_cgroup.h
> Changelog: 20100730
> - fixed some garbage added by debug code in early stage
>
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
* [PATCH 4/5] memcg: lockless update of file stat with move-account safe method
2010-08-25 8:04 [PATCH 0/5] memcg: towards I/O aware memcg v6 KAMEZAWA Hiroyuki
` (2 preceding siblings ...)
2010-08-25 8:09 ` [PATCH 3/5] memcg: use ID instead of pointer in page_cgroup KAMEZAWA Hiroyuki
@ 2010-08-25 8:10 ` KAMEZAWA Hiroyuki
2010-08-31 3:51 ` Daisuke Nishimura
2010-08-25 8:11 ` [PATCH 5/5] memcg: generic file stat accounting interface KAMEZAWA Hiroyuki
4 siblings, 1 reply; 19+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-08-25 8:10 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: linux-mm, nishimura@mxp.nes.nec.co.jp, balbir@linux.vnet.ibm.com,
gthelen, m-ikeda, akpm@linux-foundation.org,
linux-kernel@vger.kernel.org, menage@google.com,
lizf@cn.fujitsu.com
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
When accounting file events per memory cgroup, we need to find the memory
cgroup via page_cgroup->mem_cgroup. Now, we use lock_page_cgroup().
But, considering the context in which page_cgroups for files are accessed,
we can use an alternative light-weight mutual exclusion in most cases.
When handling file caches, the only race we have to take care of is "moving"
an account, IOW, overwriting page_cgroup->mem_cgroup. Because file status
updates are done while the page cache is in a stable state, we don't have to
take care of races with charge/uncharge.
Unlike charge/uncharge, "move" happens infrequently: only on rmdir() and on
task-moving (with special settings).
This patch adds a race checker for file-cache-status accounting vs. account
moving. A new per-cpu, per-memcg counter, MEM_CGROUP_ON_MOVE, is added.
The routine for account move:
1. Increment it before starting the move.
2. Call synchronize_rcu().
3. Decrement it after the move ends.
With this, the file-status-counting routine can check whether it needs to call
lock_page_cgroup(). In most cases, it doesn't.
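Boiled down to a standalone userspace sketch (a single global counter instead
of per-cpu, and synchronize_rcu() stubbed out; illustrative only, not the
kernel code):

	#include <stdbool.h>
	#include <stdio.h>

	static int on_move;              /* per-cpu and per-memcg in the patch */

	static void synchronize_rcu_stub(void)
	{
		/* in the kernel this waits until every reader that could have
		 * missed the increment has left its RCU read-side section */
	}

	static void start_move(void) { on_move++; synchronize_rcu_stub(); }
	static void end_move(void)   { on_move--; }

	/* reader side; called under rcu_read_lock() in the kernel */
	static bool need_page_cgroup_lock(void) { return on_move > 0; }

	int main(void)
	{
		/* no mover running: the reader skips lock_page_cgroup() */
		printf("take lock? %d\n", need_page_cgroup_lock());
		start_move();
		/* mover active: the reader falls back to lock_page_cgroup() */
		printf("take lock? %d\n", need_page_cgroup_lock());
		end_move();
		return 0;
	}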
Changelog: 20100825
- added a comment about mc.lock
- fixed bad lock.
Changelog: 20100804
- added a comment for possible optimization hint.
Changelog: 20100730
- some cleanup.
Changelog: 20100729
- replaced __this_cpu_xxx() with this_cpu_xxx
(because we don't call spinlock)
- added VM_BUG_ON().
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
mm/memcontrol.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++---------
1 file changed, 69 insertions(+), 12 deletions(-)
Index: mmotm-0811/mm/memcontrol.c
===================================================================
--- mmotm-0811.orig/mm/memcontrol.c
+++ mmotm-0811/mm/memcontrol.c
@@ -90,6 +90,7 @@ enum mem_cgroup_stat_index {
MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */
+ MEM_CGROUP_ON_MOVE, /* A check for locking move account/status */
MEM_CGROUP_STAT_NSTATS,
};
@@ -1089,7 +1090,52 @@ static unsigned int get_swappiness(struc
return swappiness;
}
-/* A routine for testing mem is not under move_account */
+static void mem_cgroup_start_move(struct mem_cgroup *mem)
+{
+ int cpu;
+ /*
+ * reuse mc.lock.
+ */
+ spin_lock(&mc.lock);
+ /* TODO: Can we optimize this by for_each_online_cpu() ? */
+ for_each_possible_cpu(cpu)
+ per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
+ spin_unlock(&mc.lock);
+
+ synchronize_rcu();
+}
+
+static void mem_cgroup_end_move(struct mem_cgroup *mem)
+{
+ int cpu;
+
+ if (!mem)
+ return;
+ /* for fast checking in mem_cgroup_update_file_stat() etc. */
+ spin_lock(&mc.lock);
+ for_each_possible_cpu(cpu)
+ per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
+ spin_unlock(&mc.lock);
+}
+
+/*
+ * mem_cgroup_is_moved -- checks whether a cgroup is the mc.from target;
+ * used for avoiding races.
+ * mem_cgroup_under_move -- checks whether a cgroup is mc.from or mc.to or
+ * under their hierarchy; used for waiting at
+ * memory pressure.
+ * The result of is_moved can be trusted until rcu_read_unlock().
+ * The caller must do
+ * rcu_read_lock();
+ * result = mem_cgroup_is_moved();
+ * .....make use of result here....
+ * rcu_read_unlock();
+ */
+static bool mem_cgroup_is_moved(struct mem_cgroup *mem)
+{
+ VM_BUG_ON(!rcu_read_lock_held());
+ return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
+}
static bool mem_cgroup_under_move(struct mem_cgroup *mem)
{
@@ -1505,29 +1551,36 @@ void mem_cgroup_update_file_mapped(struc
{
struct mem_cgroup *mem;
struct page_cgroup *pc;
+ bool need_lock = false;
pc = lookup_page_cgroup(page);
if (unlikely(!pc))
return;
-
- lock_page_cgroup(pc);
+ rcu_read_lock();
mem = id_to_memcg(pc->mem_cgroup, true);
- if (!mem || !PageCgroupUsed(pc))
+ if (likely(mem)) {
+ if (mem_cgroup_is_moved(mem)) {
+ /* need to serialize with move_account */
+ lock_page_cgroup(pc);
+ need_lock = true;
+ mem = id_to_memcg(pc->mem_cgroup, true);
+ if (unlikely(!mem))
+ goto done;
+ }
+ }
+ if (unlikely(!PageCgroupUsed(pc)))
goto done;
-
- /*
- * Preemption is already disabled. We can use __this_cpu_xxx
- */
if (val > 0) {
- __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+ this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
SetPageCgroupFileMapped(pc);
} else {
- __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+ this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
ClearPageCgroupFileMapped(pc);
}
-
done:
- unlock_page_cgroup(pc);
+ if (need_lock)
+ unlock_page_cgroup(pc);
+ rcu_read_unlock();
}
/*
@@ -3067,6 +3120,7 @@ move_account:
lru_add_drain_all();
drain_all_stock_sync();
ret = 0;
+ mem_cgroup_start_move(mem);
for_each_node_state(node, N_HIGH_MEMORY) {
for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
enum lru_list l;
@@ -3080,6 +3134,7 @@ move_account:
if (ret)
break;
}
+ mem_cgroup_end_move(mem);
memcg_oom_recover(mem);
/* it seems parent cgroup doesn't have enough mem */
if (ret == -ENOMEM)
@@ -4564,6 +4619,7 @@ static void mem_cgroup_clear_mc(void)
mc.to = NULL;
mc.moving_task = NULL;
spin_unlock(&mc.lock);
+ mem_cgroup_end_move(from);
memcg_oom_recover(from);
memcg_oom_recover(to);
wake_up_all(&mc.waitq);
@@ -4594,6 +4650,7 @@ static int mem_cgroup_can_attach(struct
VM_BUG_ON(mc.moved_charge);
VM_BUG_ON(mc.moved_swap);
VM_BUG_ON(mc.moving_task);
+ mem_cgroup_start_move(from);
spin_lock(&mc.lock);
mc.from = from;
mc.to = mem;
* Re: [PATCH 4/5] memcg: lockless update of file stat with move-account safe method
2010-08-25 8:10 ` [PATCH 4/5] memcg: lockless update of file stat with move-account safe method KAMEZAWA Hiroyuki
@ 2010-08-31 3:51 ` Daisuke Nishimura
2010-09-01 0:31 ` KAMEZAWA Hiroyuki
0 siblings, 1 reply; 19+ messages in thread
From: Daisuke Nishimura @ 2010-08-31 3:51 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: linux-mm, balbir@linux.vnet.ibm.com, gthelen, m-ikeda,
akpm@linux-foundation.org, linux-kernel@vger.kernel.org,
menage@google.com, lizf@cn.fujitsu.com, Daisuke Nishimura
On Wed, 25 Aug 2010 17:10:50 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
>
> When accounting file events per memory cgroup, we need to find the memory
> cgroup via page_cgroup->mem_cgroup. Now, we use lock_page_cgroup().
>
> But, considering the context in which page_cgroups for files are accessed,
> we can use an alternative light-weight mutual exclusion in most cases.
> When handling file caches, the only race we have to take care of is "moving"
> an account, IOW, overwriting page_cgroup->mem_cgroup. Because file status
> updates are done while the page cache is in a stable state, we don't have to
> take care of races with charge/uncharge.
>
> Unlike charge/uncharge, "move" happens infrequently: only on rmdir() and on
> task-moving (with special settings).
> This patch adds a race checker for file-cache-status accounting vs. account
> moving. A new per-cpu, per-memcg counter, MEM_CGROUP_ON_MOVE, is added.
> The routine for account move:
> 1. Increment it before starting the move.
> 2. Call synchronize_rcu().
> 3. Decrement it after the move ends.
> With this, the file-status-counting routine can check whether it needs to call
> lock_page_cgroup(). In most cases, it doesn't.
>
> Changelog: 20100825
> - added a comment about mc.lock
> - fixed bad lock.
> Changelog: 20100804
> - added a comment for possible optimization hint.
> Changelog: 20100730
> - some cleanup.
> Changelog: 20100729
> - replaced __this_cpu_xxx() with this_cpu_xxx
> (because we don't call spinlock)
> - added VM_BUG_ON().
>
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
(snip)
> @@ -1505,29 +1551,36 @@ void mem_cgroup_update_file_mapped(struc
> {
> struct mem_cgroup *mem;
> struct page_cgroup *pc;
> + bool need_lock = false;
>
> pc = lookup_page_cgroup(page);
> if (unlikely(!pc))
> return;
> -
> - lock_page_cgroup(pc);
> + rcu_read_lock();
> mem = id_to_memcg(pc->mem_cgroup, true);
It doesn't cause any problem, but I think it would be better to change this to
"id_to_memcg(..., false)". It's now only under rcu_read_lock(), not under the
page_cgroup lock anymore.
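(For reference, the condition this flag feeds, from patch 3, is:

	mem = rcu_dereference_check(mem_cgroups[id],
				    rcu_read_lock_held() || safe);

so with safe == false the annotation documents exactly what protects us
here: the surrounding rcu_read_lock().)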
Otherwise, it looks good to me.
Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Thanks,
Daisuke Nishimura.
* Re: [PATCH 4/5] memcg: lockless update of file stat with move-account safe method
2010-08-31 3:51 ` Daisuke Nishimura
@ 2010-09-01 0:31 ` KAMEZAWA Hiroyuki
0 siblings, 0 replies; 19+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-09-01 0:31 UTC (permalink / raw)
To: Daisuke Nishimura
Cc: linux-mm, balbir@linux.vnet.ibm.com, gthelen, m-ikeda,
akpm@linux-foundation.org, linux-kernel@vger.kernel.org,
menage@google.com, lizf@cn.fujitsu.com
On Tue, 31 Aug 2010 12:51:18 +0900
Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> On Wed, 25 Aug 2010 17:10:50 +0900
> KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
>
> > From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> >
> > When accounting file events per memory cgroup, we need to find the memory
> > cgroup via page_cgroup->mem_cgroup. Now, we use lock_page_cgroup().
> >
> > But, considering the context in which page_cgroups for files are accessed,
> > we can use an alternative light-weight mutual exclusion in most cases.
> > When handling file caches, the only race we have to take care of is "moving"
> > an account, IOW, overwriting page_cgroup->mem_cgroup. Because file status
> > updates are done while the page cache is in a stable state, we don't have to
> > take care of races with charge/uncharge.
> >
> > Unlike charge/uncharge, "move" happens infrequently: only on rmdir() and on
> > task-moving (with special settings).
> > This patch adds a race checker for file-cache-status accounting vs. account
> > moving. A new per-cpu, per-memcg counter, MEM_CGROUP_ON_MOVE, is added.
> > The routine for account move:
> > 1. Increment it before starting the move.
> > 2. Call synchronize_rcu().
> > 3. Decrement it after the move ends.
> > With this, the file-status-counting routine can check whether it needs to call
> > lock_page_cgroup(). In most cases, it doesn't.
> >
> > Changelog: 20100825
> > - added a comment about mc.lock
> > - fixed bad lock.
> > Changelog: 20100804
> > - added a comment for possible optimization hint.
> > Changelog: 20100730
> > - some cleanup.
> > Changelog: 20100729
> > - replaced __this_cpu_xxx() with this_cpu_xxx
> > (because we don't call spinlock)
> > - added VM_BUG_ON().
> >
> > Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
>
> (snip)
>
> > @@ -1505,29 +1551,36 @@ void mem_cgroup_update_file_mapped(struc
> > {
> > struct mem_cgroup *mem;
> > struct page_cgroup *pc;
> > + bool need_lock = false;
> >
> > pc = lookup_page_cgroup(page);
> > if (unlikely(!pc))
> > return;
> > -
> > - lock_page_cgroup(pc);
> > + rcu_read_lock();
> > mem = id_to_memcg(pc->mem_cgroup, true);
> It doesn't cause any problem, but I think it would be better to change this to
> "id_to_memcg(..., false)". It's just under rcu_read_lock(), not under page_cgroup
> lock anymore.
>
ok, I'll apply your suggestion.
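i.e. the hunk will become:

	-	mem = id_to_memcg(pc->mem_cgroup, true);
	+	mem = id_to_memcg(pc->mem_cgroup, false);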
> Otherwise, it looks good to me.
>
> Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
>
Thanks!
-Kame
* [PATCH 5/5] memcg: generic file stat accounting interface
2010-08-25 8:04 [PATCH 0/5] memcg: towards I/O aware memcg v6 KAMEZAWA Hiroyuki
` (3 preceding siblings ...)
2010-08-25 8:10 ` [PATCH 4/5] memcg: lockless update of file stat with move-account safe method KAMEZAWA Hiroyuki
@ 2010-08-25 8:11 ` KAMEZAWA Hiroyuki
2010-08-31 4:33 ` Daisuke Nishimura
4 siblings, 1 reply; 19+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-08-25 8:11 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: linux-mm, nishimura@mxp.nes.nec.co.jp, balbir@linux.vnet.ibm.com,
gthelen, m-ikeda, akpm@linux-foundation.org,
linux-kernel@vger.kernel.org, menage@google.com,
lizf@cn.fujitsu.com
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Preparation for adding new statistics around file caches (dirty, writeback, etc.):
use a unified macro and more generic names.
All counters will follow the same update rule.
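The indexing rule between the stat enum and the page_cgroup flag bits, boiled
down to a standalone sketch (constants are illustrative; FSTAT_DIRTY is a
hypothetical future entry, not part of this patch):

	#include <assert.h>
	#include <stdio.h>

	enum {
		FSTAT_BASE,
		FSTAT_FILE_MAPPED = FSTAT_BASE,
		FSTAT_DIRTY,                 /* hypothetical future entry */
		FSTAT_END,
	};

	#define FSTAT_IDX(i)		((i) - FSTAT_BASE)
	#define PCG_FILE_FLAGS_BASE	5    /* first pc->flags bit for file stats */

	static int fflag_idx(int idx)
	{
		assert(idx < FSTAT_END - FSTAT_BASE);
		return idx + PCG_FILE_FLAGS_BASE;
	}

	int main(void)
	{
		unsigned long flags = 0;
		int i;

		/* move_account can now walk every file stat in one loop */
		for (i = FSTAT_BASE; i < FSTAT_END; i++)
			flags |= 1UL << fflag_idx(FSTAT_IDX(i));
		printf("flags = %#lx\n", flags); /* bits 5 and 6 set */
		return 0;
	}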
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
include/linux/memcontrol.h | 24 ++++++++++++++++++++++
include/linux/page_cgroup.h | 19 ++++++++++++------
mm/memcontrol.c | 46 ++++++++++++++++++--------------------------
3 files changed, 56 insertions(+), 33 deletions(-)
Index: mmotm-0811/include/linux/memcontrol.h
===================================================================
--- mmotm-0811.orig/include/linux/memcontrol.h
+++ mmotm-0811/include/linux/memcontrol.h
@@ -25,6 +25,30 @@ struct page_cgroup;
struct page;
struct mm_struct;
+/*
+ * Per-cpu Statistics for memory cgroup.
+ */
+enum mem_cgroup_stat_index {
+ /*
+ * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
+ */
+ MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
+ MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
+ MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
+ MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
+ MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
+ MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */
+ MEM_CGROUP_ON_MOVE, /* A check for locking move account/status */
+ /* When you add a new file-stat member, please update page_cgroup.h */
+ MEM_CGROUP_FSTAT_BASE,
+ MEM_CGROUP_FSTAT_FILE_MAPPED = MEM_CGROUP_FSTAT_BASE,
+ MEM_CGROUP_FSTAT_END,
+ MEM_CGROUP_STAT_NSTATS = MEM_CGROUP_FSTAT_END,
+};
+
+#define MEMCG_FSTAT_IDX(idx) ((idx) - MEM_CGROUP_FSTAT_BASE)
+#define NR_FILE_FLAGS_MEMCG ((MEM_CGROUP_FSTAT_END - MEM_CGROUP_FSTAT_BASE))
+
extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
struct list_head *dst,
unsigned long *scanned, int order,
Index: mmotm-0811/mm/memcontrol.c
===================================================================
--- mmotm-0811.orig/mm/memcontrol.c
+++ mmotm-0811/mm/memcontrol.c
@@ -76,24 +76,6 @@ static int really_do_swap_account __init
#define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */
#define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */
-/*
- * Statistics for memory cgroup.
- */
-enum mem_cgroup_stat_index {
- /*
- * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
- */
- MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
- MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
- MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
- MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
- MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
- MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
- MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */
- MEM_CGROUP_ON_MOVE, /* A check for locking move account/status */
-
- MEM_CGROUP_STAT_NSTATS,
-};
struct mem_cgroup_stat_cpu {
s64 count[MEM_CGROUP_STAT_NSTATS];
@@ -1547,7 +1529,8 @@ bool mem_cgroup_handle_oom(struct mem_cg
* Currently used to update mapped file statistics, but the routine can be
* generalized to update other statistics as well.
*/
-void mem_cgroup_update_file_mapped(struct page *page, int val)
+static void
+mem_cgroup_update_file_stat(struct page *page, unsigned int idx, int val)
{
struct mem_cgroup *mem;
struct page_cgroup *pc;
@@ -1571,11 +1554,11 @@ void mem_cgroup_update_file_mapped(struc
if (unlikely(!PageCgroupUsed(pc)))
goto done;
if (val > 0) {
- this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
- SetPageCgroupFileMapped(pc);
+ this_cpu_inc(mem->stat->count[idx]);
+ set_bit(fflag_idx(MEMCG_FSTAT_IDX(idx)), &pc->flags);
} else {
- this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
- ClearPageCgroupFileMapped(pc);
+ this_cpu_dec(mem->stat->count[idx]);
+ clear_bit(fflag_idx(MEMCG_FSTAT_IDX(idx)), &pc->flags);
}
done:
if (need_lock)
@@ -1583,6 +1566,12 @@ done:
rcu_read_unlock();
}
+void mem_cgroup_update_file_mapped(struct page *page, int val)
+{
+ return mem_cgroup_update_file_stat(page,
+ MEM_CGROUP_FSTAT_FILE_MAPPED, val);
+}
+
/*
* size of first charge trial. "32" comes from vmscan.c's magic value.
* TODO: maybe necessary to use big numbers in big irons.
@@ -2042,17 +2031,20 @@ static void __mem_cgroup_commit_charge(s
static void __mem_cgroup_move_account(struct page_cgroup *pc,
struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
{
+ int i;
VM_BUG_ON(from == to);
VM_BUG_ON(PageLRU(pc->page));
VM_BUG_ON(!PageCgroupLocked(pc));
VM_BUG_ON(!PageCgroupUsed(pc));
VM_BUG_ON(id_to_memcg(pc->mem_cgroup, true) != from);
- if (PageCgroupFileMapped(pc)) {
+ for (i = MEM_CGROUP_FSTAT_BASE; i < MEM_CGROUP_FSTAT_END; ++i) {
+ if (!test_bit(fflag_idx(MEMCG_FSTAT_IDX(i)), &pc->flags))
+ continue;
/* Update mapped_file data for mem_cgroup */
preempt_disable();
- __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
- __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+ __this_cpu_dec(from->stat->count[i]);
+ __this_cpu_inc(to->stat->count[i]);
preempt_enable();
}
mem_cgroup_charge_statistics(from, pc, false);
@@ -3483,7 +3475,7 @@ static int mem_cgroup_get_local_stat(str
s->stat[MCS_CACHE] += val * PAGE_SIZE;
val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
s->stat[MCS_RSS] += val * PAGE_SIZE;
- val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
+ val = mem_cgroup_read_stat(mem, MEM_CGROUP_FSTAT_FILE_MAPPED);
s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT);
s->stat[MCS_PGPGIN] += val;
Index: mmotm-0811/include/linux/page_cgroup.h
===================================================================
--- mmotm-0811.orig/include/linux/page_cgroup.h
+++ mmotm-0811/include/linux/page_cgroup.h
@@ -3,6 +3,7 @@
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
#include <linux/bit_spinlock.h>
+#include <linux/memcontrol.h> /* for flags */
/*
* Page Cgroup can be considered as an extended mem_map.
* A page_cgroup page is associated with every page descriptor. The
@@ -43,10 +44,22 @@ enum {
PCG_CACHE, /* charged as cache */
PCG_USED, /* this object is in use. */
PCG_ACCT_LRU, /* page has been accounted for */
- PCG_FILE_MAPPED, /* page is accounted as "mapped" */
PCG_MIGRATION, /* under page migration */
+ PCG_FILE_FLAGS_MEMCG, /* see memcontrol.h */
+ PCG_FILE_FLAGS_MEMCG_END
+ = PCG_FILE_FLAGS_MEMCG + NR_FILE_FLAGS_MEMCG - 1,
};
+/*
+ * file-stat flags are defined to correspond to memcg's stat information.
+ * Here, we just define a helper for indexing.
+ */
+static inline int fflag_idx(int idx)
+{
+ VM_BUG_ON((idx) >= NR_FILE_FLAGS_MEMCG);
+ return (idx) + PCG_FILE_FLAGS_MEMCG;
+}
+
#define TESTPCGFLAG(uname, lname) \
static inline int PageCgroup##uname(struct page_cgroup *pc) \
{ return test_bit(PCG_##lname, &pc->flags); }
@@ -79,11 +92,6 @@ CLEARPCGFLAG(AcctLRU, ACCT_LRU)
TESTPCGFLAG(AcctLRU, ACCT_LRU)
TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU)
-
-SETPCGFLAG(FileMapped, FILE_MAPPED)
-CLEARPCGFLAG(FileMapped, FILE_MAPPED)
-TESTPCGFLAG(FileMapped, FILE_MAPPED)
-
SETPCGFLAG(Migration, MIGRATION)
CLEARPCGFLAG(Migration, MIGRATION)
TESTPCGFLAG(Migration, MIGRATION)
* Re: [PATCH 5/5] memcg: generic file stat accounting interface
2010-08-25 8:11 ` [PATCH 5/5] memcg: generic file stat accounting interface KAMEZAWA Hiroyuki
@ 2010-08-31 4:33 ` Daisuke Nishimura
2010-09-01 0:31 ` KAMEZAWA Hiroyuki
0 siblings, 1 reply; 19+ messages in thread
From: Daisuke Nishimura @ 2010-08-31 4:33 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: linux-mm, balbir@linux.vnet.ibm.com, gthelen, m-ikeda,
akpm@linux-foundation.org, linux-kernel@vger.kernel.org,
menage@google.com, lizf@cn.fujitsu.com, Daisuke Nishimura
On Wed, 25 Aug 2010 17:11:40 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
>
> Preparation for adding new statistics around file caches (dirty, writeback, etc.):
> use a unified macro and more generic names.
> All counters will follow the same update rule.
>
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
one nitpick.
> @@ -2042,17 +2031,20 @@ static void __mem_cgroup_commit_charge(s
> static void __mem_cgroup_move_account(struct page_cgroup *pc,
> struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
> {
> + int i;
> VM_BUG_ON(from == to);
> VM_BUG_ON(PageLRU(pc->page));
> VM_BUG_ON(!PageCgroupLocked(pc));
> VM_BUG_ON(!PageCgroupUsed(pc));
> VM_BUG_ON(id_to_memcg(pc->mem_cgroup, true) != from);
>
> - if (PageCgroupFileMapped(pc)) {
> + for (i = MEM_CGROUP_FSTAT_BASE; i < MEM_CGROUP_FSTAT_END; ++i) {
> + if (!test_bit(fflag_idx(MEMCG_FSTAT_IDX(i)), &pc->flags))
> + continue;
> /* Update mapped_file data for mem_cgroup */
It might be better to update this comment too.
/* Update file-stat data for mem_cgroup */
or something?
Thanks,
Daisuke Nishimura.
* Re: [PATCH 5/5] memcg: generic file stat accounting interface
2010-08-31 4:33 ` Daisuke Nishimura
@ 2010-09-01 0:31 ` KAMEZAWA Hiroyuki
0 siblings, 0 replies; 19+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-09-01 0:31 UTC (permalink / raw)
To: Daisuke Nishimura
Cc: linux-mm, balbir@linux.vnet.ibm.com, gthelen, m-ikeda,
akpm@linux-foundation.org, linux-kernel@vger.kernel.org,
menage@google.com, lizf@cn.fujitsu.com
On Tue, 31 Aug 2010 13:33:29 +0900
Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> On Wed, 25 Aug 2010 17:11:40 +0900
> KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
>
> > From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> >
> > Preparation for adding new statistics around file caches (dirty, writeback, etc.):
> > use a unified macro and more generic names.
> > All counters will follow the same update rule.
> >
> > Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
>
> Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
>
> one nitpick.
>
> > @@ -2042,17 +2031,20 @@ static void __mem_cgroup_commit_charge(s
> > static void __mem_cgroup_move_account(struct page_cgroup *pc,
> > struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
> > {
> > + int i;
> > VM_BUG_ON(from == to);
> > VM_BUG_ON(PageLRU(pc->page));
> > VM_BUG_ON(!PageCgroupLocked(pc));
> > VM_BUG_ON(!PageCgroupUsed(pc));
> > VM_BUG_ON(id_to_memcg(pc->mem_cgroup, true) != from);
> >
> > - if (PageCgroupFileMapped(pc)) {
> > + for (i = MEM_CGROUP_FSTAT_BASE; i < MEM_CGROUP_FSTAT_END; ++i) {
> > + if (!test_bit(fflag_idx(MEMCG_FSTAT_IDX(i)), &pc->flags))
> > + continue;
> > /* Update mapped_file data for mem_cgroup */
> It might be better to update this comment too.
>
> /* Update file-stat data for mem_cgroup */
>
> or something ?
>
Nice catch. I'll fix this.
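i.e.:

	-	/* Update mapped_file data for mem_cgroup */
	+	/* Update file-stat data for mem_cgroup */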
-Kame