* [RFC][PATCH 2/2] memcg: hierarchy reclaim with CGROUP ID
2008-11-27 7:05 [RFC][PATCH 1/2] CGROUP ID and Hierarchy Code KAMEZAWA Hiroyuki
@ 2008-11-27 7:08 ` KAMEZAWA Hiroyuki
2008-11-27 7:46 ` [RFC][PATCH 1/2] CGROUP ID and Hierarchy Code Daisuke Nishimura
` (2 subsequent siblings)
3 siblings, 0 replies; 12+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-11-27 7:08 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: linux-kernel@vger.kernel.org, lizf@cn.fujitsu.com,
menage@google.com, balbir@linux.vnet.ibm.com,
nishimura@mxp.nes.nec.co.jp, taka
Implement hierarchy reclaim by cgroup_id.
What changes:
- reclaim is not done by tree-walk algorithm
- mem_cgroup->last_schan_child is ID, not pointer.
- no cgroup_lock.
- scanning order is just defined by ID's order.
(Scan by round-robin logic.)
- Order of scanning can be changed easily(maybe).
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujisu.com>
mm/memcontrol.c | 129 +++++++++++---------------------------------------------
1 file changed, 27 insertions(+), 102 deletions(-)
Index: mmotm-2.6.28-Nov24/mm/memcontrol.c
===================================================================
--- mmotm-2.6.28-Nov24.orig/mm/memcontrol.c
+++ mmotm-2.6.28-Nov24/mm/memcontrol.c
@@ -148,7 +148,7 @@ struct mem_cgroup {
* While reclaiming in a hiearchy, we cache the last child we
* reclaimed from. Protected by cgroup_lock()
*/
- struct mem_cgroup *last_scanned_child;
+ int last_scan_child;
/*
* Should the accounting and control be hierarchical, per subtree?
*/
@@ -472,102 +472,31 @@ unsigned long mem_cgroup_isolate_pages(u
return nr_taken;
}
-#define mem_cgroup_from_res_counter(counter, member) \
- container_of(counter, struct mem_cgroup, member)
-
+#define mem_cgroup_from_res_counter(counter, member) \
+ container_of(counter, struct mem_cgroup, member)
/*
- * This routine finds the DFS walk successor. This routine should be
- * called with cgroup_mutex held
+ * get the cgroup under hierarchy under root. start from root->last_scan_child
+ * and root->last_scanned_child is updated.
*/
static struct mem_cgroup *
-mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem)
-{
- struct cgroup *cgroup, *curr_cgroup, *root_cgroup;
-
- curr_cgroup = curr->css.cgroup;
- root_cgroup = root_mem->css.cgroup;
-
- if (!list_empty(&curr_cgroup->children)) {
- /*
- * Walk down to children
- */
- mem_cgroup_put(curr);
- cgroup = list_entry(curr_cgroup->children.next,
- struct cgroup, sibling);
- curr = mem_cgroup_from_cont(cgroup);
- mem_cgroup_get(curr);
- goto done;
- }
-
-visit_parent:
- if (curr_cgroup == root_cgroup) {
- mem_cgroup_put(curr);
- curr = root_mem;
- mem_cgroup_get(curr);
- goto done;
- }
-
- /*
- * Goto next sibling
- */
- if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) {
- mem_cgroup_put(curr);
- cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup,
- sibling);
- curr = mem_cgroup_from_cont(cgroup);
- mem_cgroup_get(curr);
- goto done;
- }
-
- /*
- * Go up to next parent and next parent's sibling if need be
- */
- curr_cgroup = curr_cgroup->parent;
- goto visit_parent;
-
-done:
- root_mem->last_scanned_child = curr;
- return curr;
-}
-
-/*
- * Visit the first child (need not be the first child as per the ordering
- * of the cgroup list, since we track last_scanned_child) of @mem and use
- * that to reclaim free pages from.
- */
-static struct mem_cgroup *
-mem_cgroup_get_first_node(struct mem_cgroup *root_mem)
+mem_cgroup_get_reclaim_target(struct mem_cgroup *root_mem)
{
struct cgroup *cgroup;
+ struct cgroup *root = root_mem->css.cgroup;
struct mem_cgroup *ret;
- bool obsolete = (root_mem->last_scanned_child &&
- root_mem->last_scanned_child->obsolete);
-
- /*
- * Scan all children under the mem_cgroup mem
- */
- cgroup_lock();
- if (list_empty(&root_mem->css.cgroup->children)) {
- ret = root_mem;
- goto done;
- }
-
- if (!root_mem->last_scanned_child || obsolete) {
-
- if (obsolete)
- mem_cgroup_put(root_mem->last_scanned_child);
+ int id;
- cgroup = list_first_entry(&root_mem->css.cgroup->children,
- struct cgroup, sibling);
+ while (!ret) {
+ rcu_read_lock();
+ cgroup = cgroup_get_next(root_mem->last_scan_child, root, &id);
ret = mem_cgroup_from_cont(cgroup);
- mem_cgroup_get(ret);
- } else
- ret = mem_cgroup_get_next_node(root_mem->last_scanned_child,
- root_mem);
+ rcu_read_unlock();
+ root_mem->last_scan_child = id + 1;
+ if (ret->obsolete)
+ ret = NULL;
+ }
+ mem_cgroup_get(ret);
-done:
- root_mem->last_scanned_child = ret;
- cgroup_unlock();
return ret;
}
@@ -581,7 +510,7 @@ done:
static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
gfp_t gfp_mask, bool noswap)
{
- struct mem_cgroup *next_mem;
+ struct mem_cgroup *next_mem, *start;
int ret = 0;
/*
@@ -595,23 +524,21 @@ static int mem_cgroup_hierarchical_recla
if (res_counter_check_under_limit(&root_mem->res))
return 0;
- next_mem = mem_cgroup_get_first_node(root_mem);
-
- while (next_mem != root_mem) {
+ next_mem = mem_cgroup_get_reclaim_target(root_mem);
+ start = next_mem;
+ do {
if (next_mem->obsolete) {
mem_cgroup_put(next_mem);
- cgroup_lock();
- next_mem = mem_cgroup_get_first_node(root_mem);
- cgroup_unlock();
+ next_mem = mem_cgroup_get_reclaim_target(root_mem);
continue;
}
ret = try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap);
+ mem_cgroup_put(next_mem);
if (res_counter_check_under_limit(&root_mem->res))
- return 0;
- cgroup_lock();
- next_mem = mem_cgroup_get_next_node(next_mem, root_mem);
- cgroup_unlock();
- }
+ break;
+ next_mem = mem_cgroup_get_reclaim_target(root_mem);
+ } while (start != next_mem);
+
return ret;
}
@@ -1959,8 +1886,6 @@ mem_cgroup_create(struct cgroup_subsys *
res_counter_init(&mem->memsw, NULL);
}
- mem->last_scanned_child = NULL;
-
return &mem->css;
free_out:
for_each_node_state(node, N_POSSIBLE)
^ permalink raw reply [flat|nested] 12+ messages in thread* Re: [RFC][PATCH 1/2] CGROUP ID and Hierarchy Code
2008-11-27 7:05 [RFC][PATCH 1/2] CGROUP ID and Hierarchy Code KAMEZAWA Hiroyuki
2008-11-27 7:08 ` [RFC][PATCH 2/2] memcg: hierarchy reclaim with CGROUP ID KAMEZAWA Hiroyuki
@ 2008-11-27 7:46 ` Daisuke Nishimura
2008-11-27 8:00 ` KAMEZAWA Hiroyuki
2008-11-27 18:54 ` Balbir Singh
2008-11-28 1:28 ` Li Zefan
3 siblings, 1 reply; 12+ messages in thread
From: Daisuke Nishimura @ 2008-11-27 7:46 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: linux-kernel@vger.kernel.org, lizf@cn.fujitsu.com,
menage@google.com, balbir@linux.vnet.ibm.com, taka, nishimura
On Thu, 27 Nov 2008 16:05:48 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> Patches for CGROUP ID and Hierarchy Code.
> (based on mmotm Nov/24...I'll update later.)
>
> CGROUP ID is a unique value for each cgroup (struct cgroup)
> Hierarchy Code is array of CGROUP ID which contains all ID od ancestors.
> and tries to remove cgroup_lock() to walk tree.
>
> This is just an example-level patch but I'm now testing/refleshing this.
> Works well with memcg's hierarchy relcaim.
> Any comment is welcome.
>
Great work!
I've not reviewed these patches in detail yet, I agree to this direction
to remove cgroup_lock() at hierarcal reclaim.
Actualy, I was thinking it would be better to manage some array or list
at mkdir/rmdir.
hmm, but should this ID be implemented in cgroup level, not in memcg level?
Thanks,
Daisuke Nishimura.
> -Kame
>
> ==
> A toy patch for Cgroup ID and hierarchy code.
>
> This patch tries to assing a ID to each cgroup. Attach unique ID to each
> cgroup and provides following functions.
>
> - cgroup_lookup(id)
> returns cgroup of id.
> - cgroup_get_next(id, root, foundid)
> returns the next cgroup under "root" by scanning bitmap (not by tree-walk)
> - cgroup_is_ancestor(cgroup, root)
> returns true if "root" is ancestor of cgroup.
>
> There is several reasons to develop this.
>
> - While trying to implement hierarchy in memory cgroup, we had to
> implement "walk under hierarchy" code.
> Now it's consists of cgroup_lock and tree up-down code.
> But taking "cgroup_lock" in walking tree can cause deadlocks.
> cgroup_lock() is too big. Easier way is helpful.
>
> - SwapCgroup uses array of "pointer" to record the owner of swaps.
> By ID, we can reduce this to "short" or "int". This means ID is
> useful for reducing space consumption by pointer if the access cost
> is not problem.
> (I hear bio-cgroup will use the same kind of...)
>
> Example) OOM-Killer under hierarchy.
> do {
> rcu_read_lock();
> next = cgroup_get_next(id, root, nextid);
> /* check sanity of next here */
> /* increment refcnt to "next" */
> rcu_read_unlock();
> if (!next)
> break;
> cgroup_scan_tasks(select_bad_process);
> } while (1);
>
>
> Characteristics:
> - Each cgroup get new ID when created.
> - cgroup ID contains "ID" and "Depth in tree" and hierarchy code.
> - hierarchy code is array of IDs of ancestors.
>
> Consideration:
> - gang lookup if necessary.
> - I'd like to use "(unsigned)short" to cgroup_id for saving space...
> - MAX_DEPTH is bad ?
>
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
>
> include/linux/cgroup.h | 42 +++++++++++
> include/linux/idr.h | 1
> kernel/cgroup.c | 180 ++++++++++++++++++++++++++++++++++++++++++++++++-
> lib/idr.c | 46 ++++++++++++
> 4 files changed, 266 insertions(+), 3 deletions(-)
>
> Index: mmotm-2.6.28-Nov24/include/linux/cgroup.h
> ===================================================================
> --- mmotm-2.6.28-Nov24.orig/include/linux/cgroup.h
> +++ mmotm-2.6.28-Nov24/include/linux/cgroup.h
> @@ -61,6 +61,20 @@ struct cgroup_subsys_state {
> unsigned long flags;
> };
>
> +/*
> + * Cgroup ID for *internal* identification and lookup. For user-land,"path"
> + * of cgroup works well.
> + */
> +#define MAX_CGROUP_DEPTH (10)
> +struct cgroup_id {
> + struct cgroup *myself;
> + unsigned int id;
> + atomic_t refcnt;
> + unsigned int depth;
> + unsigned int hierarchy_code[MAX_CGROUP_DEPTH];
> + struct list_head list;
> +};
> +
> /* bits in struct cgroup_subsys_state flags field */
> enum {
> CSS_ROOT, /* This CSS is the root of the subsystem */
> @@ -145,6 +159,9 @@ struct cgroup {
> int pids_use_count;
> /* Length of the current tasks_pids array */
> int pids_length;
> +
> + /* Cgroup ID */
> + struct cgroup_id *id;
> };
>
> /* A css_set is a structure holding pointers to a set of
> @@ -393,6 +410,31 @@ void cgroup_iter_end(struct cgroup *cgrp
> int cgroup_scan_tasks(struct cgroup_scanner *scan);
> int cgroup_attach_task(struct cgroup *, struct task_struct *);
>
> +/*
> + * For supporting cgroup lookup and hierarchy management.
> + * Giving Flat view of cgroup hierarchy rather than tree.
> + */
> +/* An interface for usual lookup */
> +struct cgroup *cgroup_lookup(int id);
> +/* get next cgroup under tree (for scan) */
> +struct cgroup *cgroup_get_next(int id, struct cgroup *root, int *foundid);
> +
> +static inline bool
> +cgroup_is_ancestor(struct cgroup *cur, struct cgroup *root)
> +{
> + struct cgroup_id *curid = cur->id;
> + struct cgroup_id *rootid = root->id;
> +
> + return (curid->hierarchy_code[rootid->depth] = rootid->id);
> +}
> +
> +static inline int cgroup_myid(struct cgroup *cgrp)
> +{
> + return cgrp->id->id;
> +}
> +
> +/* TODO: gang lookup should be */
> +
> #else /* !CONFIG_CGROUPS */
>
> static inline int cgroup_init_early(void) { return 0; }
> Index: mmotm-2.6.28-Nov24/kernel/cgroup.c
> ===================================================================
> --- mmotm-2.6.28-Nov24.orig/kernel/cgroup.c
> +++ mmotm-2.6.28-Nov24/kernel/cgroup.c
> @@ -46,7 +46,7 @@
> #include <linux/cgroupstats.h>
> #include <linux/hash.h>
> #include <linux/namei.h>
> -
> +#include <linux/idr.h>
> #include <asm/atomic.h>
>
> static DEFINE_MUTEX(cgroup_mutex);
> @@ -547,6 +547,162 @@ void cgroup_unlock(void)
> mutex_unlock(&cgroup_mutex);
> }
>
> +
> +/*
> + * Cgroup ID and lookup functions.
> + * cgid->myself pointer is safe under rcu_read_lock() because d_put() of
> + * cgroup, which finally frees cgroup pointer, uses rcu_synchronize().
> + */
> +
> +static DEFINE_IDR(cgroup_idr);
> +static DEFINE_SPINLOCK(cgroup_idr_lock);
> +
> +static int cgroup_prepare_id(struct cgroup *parent, struct cgroup_id **id)
> +{
> + struct cgroup_id *newid;
> + int myid, error;
> +
> + /* check depth */
> + if (parent && parent->id->depth + 1 >= MAX_CGROUP_DEPTH)
> + return -ENOSPC;
> + newid = kzalloc(sizeof(*newid), GFP_KERNEL);
> + if (!newid)
> + return -ENOMEM;
> + /* get id */
> + if (unlikely(!idr_pre_get(&cgroup_idr, GFP_KERNEL))) {
> + error = -ENOMEM;
> + goto err_out;
> + }
> + spin_lock_irq(&cgroup_idr_lock);
> + error = idr_get_new(&cgroup_idr, newid, &myid);
> + spin_unlock_irq(&cgroup_idr_lock);
> + if (error)
> + goto err_out;
> +
> + newid->id = myid;
> + atomic_set(&newid->refcnt, 1);
> + INIT_LIST_HEAD(&newid->list);
> + *id = newid;
> + return 0;
> +err_out:
> + kfree(newid);
> + return error;
> +}
> +
> +static void cgroup_id_put(struct cgroup_id *cgid)
> +{
> + unsigned long flags;
> +
> + if (atomic_dec_and_test(&cgid->refcnt)) {
> + spin_lock_irqsave(&cgroup_idr_lock, flags);
> + idr_remove(&cgroup_idr, cgid->id);
> + spin_unlock_irqrestore(&cgroup_idr_lock, flags);
> + kfree(cgid);
> + }
> +}
> +
> +static void cgroup_id_attach(struct cgroup_id *cgid,
> + struct cgroup *cg, struct cgroup *parent)
> +{
> + if (parent) {
> + struct cgroup_id *parent_id = parent->id;
> + int i;
> +
> + cgid->depth = parent_id->depth + 1;
> + /* Inherit hierarchy code from parent */
> + for (i = 0; i < cgid->depth; i++) {
> + cgid->hierarchy_code[i] =
> + parent_id->hierarchy_code[i];
> + cgid->hierarchy_code[cgid->depth] = cgid->id;
> + }
> + } else {
> + cgid->depth = 0;
> + cgid->hierarchy_code[0] = cgid->id;
> + }
> + rcu_assign_pointer(cgid->myself, cg);
> + cg->id = cgid;
> +
> + return;
> +}
> +
> +static void cgroup_id_detach(struct cgroup *cg)
> +{
> + struct cgroup_id *id = cg->id;
> +
> + rcu_assign_pointer(id->myself, NULL);
> + cgroup_id_put(id);
> +}
> +
> +/**
> + * cgroup_lookup - lookup cgroup by id
> + * @id: the id of cgroup to be looked up
> + *
> + * Retruns pointer to cgroup if there is vald cgroup with id, NULL if not.
> + * Should be called under rcu_read_lock() or cgroup_lock().
> + */
> +
> +struct cgroup *cgroup_lookup(int id)
> +{
> + struct cgroup *cgrp;
> + struct cgroup_id *cgid;
> +
> + rcu_read_lock();
> + cgid = idr_find(&cgroup_idr, id);
> + rcu_read_unlock();
> +
> + if (unlikely(!cgid))
> + return NULL;
> +
> + cgrp = rcu_dereference(cgid->myself);
> + if (unlikely(!cgrp || test_bit(CGRP_REMOVED, &cgrp->flags)))
> + return NULL;
> +
> + return cgrp;
> +}
> +
> +/**
> + * cgroup_get_next - lookup next cgroup under specified hierarchy.
> + * @id: current position of iteration.
> + * @root: search tree under this. .
> + * @foundid: position of found object.
> + *
> + * Search next cgroup under the specified hierarchy. If "cur" is NULL,
> + * start from root cgroup. Called under rcu_read_lock() or cgroup_lock()
> + * is necessary (to access a found cgroup.).
> + */
> +struct cgroup *cgroup_get_next(int id, struct cgroup *root, int *foundid)
> +{
> + struct cgroup *ret = NULL;
> + struct cgroup_id *tmp, *rootid;
> + int tmpid, depth, targetid;
> + unsigned long flags;
> +
> + rootid = root->id;
> + depth = rootid->depth;
> + targetid = rootid->id;
> + tmpid = id;
> +
> + do {
> + /* scan next entry from bitmap(tree) */
> + spin_lock_irqsave(&cgroup_idr_lock, flags);
> + tmp = idr_get_next(&cgroup_idr, &tmpid);
> + spin_unlock_irqrestore(&cgroup_idr_lock, flags);
> +
> + if (tmp) {
> + ret = rcu_dereference(tmp->myself);
> + /* Sanity check and check hierarchy */
> + if (ret && !test_bit(CGRP_REMOVED, &ret->flags)
> + && tmp->hierarchy_code[depth] == targetid)
> + break;
> + tmpid = tmpid + 1;
> + } else
> + tmpid = 0;
> + } while (1);
> +
> + *foundid = tmpid;
> + return ret;
> +}
> +
> /*
> * A couple of forward declarations required, due to cyclic reference loop:
> * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
> @@ -991,6 +1147,7 @@ static int cgroup_get_sb(struct file_sys
> /* New superblock */
> struct cgroup *cgrp = &root->top_cgroup;
> struct inode *inode;
> + struct cgroup_id *newid;
> int i;
>
> BUG_ON(sb->s_root != NULL);
> @@ -998,6 +1155,11 @@ static int cgroup_get_sb(struct file_sys
> ret = cgroup_get_rootdir(sb);
> if (ret)
> goto drop_new_super;
> +
> + ret = cgroup_prepare_id(NULL, &newid);
> + if (ret)
> + goto drop_new_super;
> +
> inode = sb->s_root->d_inode;
>
> mutex_lock(&inode->i_mutex);
> @@ -1062,7 +1224,7 @@ static int cgroup_get_sb(struct file_sys
> BUG_ON(!list_empty(&cgrp->sibling));
> BUG_ON(!list_empty(&cgrp->children));
> BUG_ON(root->number_of_cgroups != 1);
> -
> + cgroup_id_attach(newid, cgrp, NULL);
> cgroup_populate_dir(cgrp);
> mutex_unlock(&inode->i_mutex);
> mutex_unlock(&cgroup_mutex);
> @@ -1997,6 +2159,7 @@ int cgroup_scan_tasks(struct cgroup_scan
> return 0;
> }
>
> +
> /*
> * Stuff for reading the 'tasks' file.
> *
> @@ -2349,11 +2512,18 @@ static long cgroup_create(struct cgroup
> int err = 0;
> struct cgroup_subsys *ss;
> struct super_block *sb = root->sb;
> + struct cgroup_id *cgid = NULL;
>
> cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
> if (!cgrp)
> return -ENOMEM;
>
> + err = cgroup_prepare_id(parent, &cgid);
> + if (err) {
> + kfree(cgrp);
> + return err;
> + }
> +
> /* Grab a reference on the superblock so the hierarchy doesn't
> * get deleted on unmount if there are child cgroups. This
> * can be done outside cgroup_mutex, since the sb can't
> @@ -2394,6 +2564,8 @@ static long cgroup_create(struct cgroup
> err = cgroup_populate_dir(cgrp);
> /* If err < 0, we have a half-filled directory - oh well ;) */
>
> + cgroup_id_attach(cgid, cgrp, parent);
> +
> mutex_unlock(&cgroup_mutex);
> mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
>
> @@ -2415,7 +2587,7 @@ static long cgroup_create(struct cgroup
>
> /* Release the reference count that we took on the superblock */
> deactivate_super(sb);
> -
> + cgroup_id_put(cgid);
> kfree(cgrp);
> return err;
> }
> @@ -2494,6 +2666,8 @@ static int cgroup_rmdir(struct inode *un
> return -EBUSY;
> }
>
> + cgroup_id_detach(cgrp);
> +
> spin_lock(&release_list_lock);
> set_bit(CGRP_REMOVED, &cgrp->flags);
> if (!list_empty(&cgrp->release_list))
> Index: mmotm-2.6.28-Nov24/include/linux/idr.h
> ===================================================================
> --- mmotm-2.6.28-Nov24.orig/include/linux/idr.h
> +++ mmotm-2.6.28-Nov24/include/linux/idr.h
> @@ -106,6 +106,7 @@ int idr_get_new(struct idr *idp, void *p
> int idr_get_new_above(struct idr *idp, void *ptr, int starting_id, int *id);
> int idr_for_each(struct idr *idp,
> int (*fn)(int id, void *p, void *data), void *data);
> +void *idr_get_next(struct idr *idp, int *nextid);
> void *idr_replace(struct idr *idp, void *ptr, int id);
> void idr_remove(struct idr *idp, int id);
> void idr_remove_all(struct idr *idp);
> Index: mmotm-2.6.28-Nov24/lib/idr.c
> ===================================================================
> --- mmotm-2.6.28-Nov24.orig/lib/idr.c
> +++ mmotm-2.6.28-Nov24/lib/idr.c
> @@ -573,6 +573,52 @@ int idr_for_each(struct idr *idp,
> EXPORT_SYMBOL(idr_for_each);
>
> /**
> + * idr_get_next - lookup next opbject of id to given id.
> + * @idp: idr handle
> + * @id: pointer to lookup key
> + *
> + * Retruns pointer to registered object with id, which is next number to
> + * given id.
> + */
> +
> +void *idr_get_next(struct idr *idp, int *nextidp)
> +{
> + struct idr_layer *p, *pa[MAX_LEVEL];
> + struct idr_layer **paa = &pa[0];
> + int id = *nextidp;
> + int n, max;
> +
> + /* find first ent */
> + n = idp->layers * IDR_BITS;
> + max = 1 << n;
> + p = rcu_dereference(idp->top);
> + if (!p)
> + return NULL;
> +
> + while (id < max) {
> + while (n > 0 && p) {
> + n -= IDR_BITS;
> + *paa++ = p;
> + p = rcu_dereference(p->ary[(id >> n) & IDR_MASK]);
> + }
> +
> + if (p) {
> + *nextidp = id;
> + return p;
> + }
> +
> + id += 1 << n;
> + while (n < fls(id)) {
> + n += IDR_BITS;
> + p = *--paa;
> + }
> + }
> + return NULL;
> +}
> +
> +
> +
> +/**
> * idr_replace - replace pointer for given id
> * @idp: idr handle
> * @ptr: pointer you want associated with the id
>
^ permalink raw reply [flat|nested] 12+ messages in thread* Re: [RFC][PATCH 1/2] CGROUP ID and Hierarchy Code
2008-11-27 7:46 ` [RFC][PATCH 1/2] CGROUP ID and Hierarchy Code Daisuke Nishimura
@ 2008-11-27 8:00 ` KAMEZAWA Hiroyuki
0 siblings, 0 replies; 12+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-11-27 8:00 UTC (permalink / raw)
To: Daisuke Nishimura
Cc: linux-kernel@vger.kernel.org, lizf@cn.fujitsu.com,
menage@google.com, balbir@linux.vnet.ibm.com, taka
On Thu, 27 Nov 2008 16:46:32 +0900
Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> On Thu, 27 Nov 2008 16:05:48 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > Patches for CGROUP ID and Hierarchy Code.
> > (based on mmotm Nov/24...I'll update later.)
> >
> > CGROUP ID is a unique value for each cgroup (struct cgroup)
> > Hierarchy Code is array of CGROUP ID which contains all ID od ancestors.
> > and tries to remove cgroup_lock() to walk tree.
> >
> > This is just an example-level patch but I'm now testing/refleshing this.
> > Works well with memcg's hierarchy relcaim.
> > Any comment is welcome.
> >
> Great work!
>
thanks.
> I've not reviewed these patches in detail yet, I agree to this direction
> to remove cgroup_lock() at hierarcal reclaim.
> Actualy, I was thinking it would be better to manage some array or list
> at mkdir/rmdir.
>
I have to check races in detail before removing RFC ;)
> hmm, but should this ID be implemented in cgroup level, not in memcg level?
>
Yes. 2 reasons.
1. When I added cgroup-ID to swap_cgroup, some people said "please do in cgroup
generic layer".
2. bio-cgroup people will use this.
Thanks,
-Kame
>
> Thanks,
> Daisuke Nishimura.
>
> > -Kame
> >
> > ==
> > A toy patch for Cgroup ID and hierarchy code.
> >
> > This patch tries to assing a ID to each cgroup. Attach unique ID to each
> > cgroup and provides following functions.
> >
> > - cgroup_lookup(id)
> > returns cgroup of id.
> > - cgroup_get_next(id, root, foundid)
> > returns the next cgroup under "root" by scanning bitmap (not by tree-walk)
> > - cgroup_is_ancestor(cgroup, root)
> > returns true if "root" is ancestor of cgroup.
> >
> > There is several reasons to develop this.
> >
> > - While trying to implement hierarchy in memory cgroup, we had to
> > implement "walk under hierarchy" code.
> > Now it's consists of cgroup_lock and tree up-down code.
> > But taking "cgroup_lock" in walking tree can cause deadlocks.
> > cgroup_lock() is too big. Easier way is helpful.
> >
> > - SwapCgroup uses array of "pointer" to record the owner of swaps.
> > By ID, we can reduce this to "short" or "int". This means ID is
> > useful for reducing space consumption by pointer if the access cost
> > is not problem.
> > (I hear bio-cgroup will use the same kind of...)
> >
> > Example) OOM-Killer under hierarchy.
> > do {
> > rcu_read_lock();
> > next = cgroup_get_next(id, root, nextid);
> > /* check sanity of next here */
> > /* increment refcnt to "next" */
> > rcu_read_unlock();
> > if (!next)
> > break;
> > cgroup_scan_tasks(select_bad_process);
> > } while (1);
> >
> >
> > Characteristics:
> > - Each cgroup get new ID when created.
> > - cgroup ID contains "ID" and "Depth in tree" and hierarchy code.
> > - hierarchy code is array of IDs of ancestors.
> >
> > Consideration:
> > - gang lookup if necessary.
> > - I'd like to use "(unsigned)short" to cgroup_id for saving space...
> > - MAX_DEPTH is bad ?
> >
> > Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> >
> > include/linux/cgroup.h | 42 +++++++++++
> > include/linux/idr.h | 1
> > kernel/cgroup.c | 180 ++++++++++++++++++++++++++++++++++++++++++++++++-
> > lib/idr.c | 46 ++++++++++++
> > 4 files changed, 266 insertions(+), 3 deletions(-)
> >
> > Index: mmotm-2.6.28-Nov24/include/linux/cgroup.h
> > ===================================================================
> > --- mmotm-2.6.28-Nov24.orig/include/linux/cgroup.h
> > +++ mmotm-2.6.28-Nov24/include/linux/cgroup.h
> > @@ -61,6 +61,20 @@ struct cgroup_subsys_state {
> > unsigned long flags;
> > };
> >
> > +/*
> > + * Cgroup ID for *internal* identification and lookup. For user-land,"path"
> > + * of cgroup works well.
> > + */
> > +#define MAX_CGROUP_DEPTH (10)
> > +struct cgroup_id {
> > + struct cgroup *myself;
> > + unsigned int id;
> > + atomic_t refcnt;
> > + unsigned int depth;
> > + unsigned int hierarchy_code[MAX_CGROUP_DEPTH];
> > + struct list_head list;
> > +};
> > +
> > /* bits in struct cgroup_subsys_state flags field */
> > enum {
> > CSS_ROOT, /* This CSS is the root of the subsystem */
> > @@ -145,6 +159,9 @@ struct cgroup {
> > int pids_use_count;
> > /* Length of the current tasks_pids array */
> > int pids_length;
> > +
> > + /* Cgroup ID */
> > + struct cgroup_id *id;
> > };
> >
> > /* A css_set is a structure holding pointers to a set of
> > @@ -393,6 +410,31 @@ void cgroup_iter_end(struct cgroup *cgrp
> > int cgroup_scan_tasks(struct cgroup_scanner *scan);
> > int cgroup_attach_task(struct cgroup *, struct task_struct *);
> >
> > +/*
> > + * For supporting cgroup lookup and hierarchy management.
> > + * Giving Flat view of cgroup hierarchy rather than tree.
> > + */
> > +/* An interface for usual lookup */
> > +struct cgroup *cgroup_lookup(int id);
> > +/* get next cgroup under tree (for scan) */
> > +struct cgroup *cgroup_get_next(int id, struct cgroup *root, int *foundid);
> > +
> > +static inline bool
> > +cgroup_is_ancestor(struct cgroup *cur, struct cgroup *root)
> > +{
> > + struct cgroup_id *curid = cur->id;
> > + struct cgroup_id *rootid = root->id;
> > +
> > + return (curid->hierarchy_code[rootid->depth] = rootid->id);
> > +}
> > +
> > +static inline int cgroup_myid(struct cgroup *cgrp)
> > +{
> > + return cgrp->id->id;
> > +}
> > +
> > +/* TODO: gang lookup should be */
> > +
> > #else /* !CONFIG_CGROUPS */
> >
> > static inline int cgroup_init_early(void) { return 0; }
> > Index: mmotm-2.6.28-Nov24/kernel/cgroup.c
> > ===================================================================
> > --- mmotm-2.6.28-Nov24.orig/kernel/cgroup.c
> > +++ mmotm-2.6.28-Nov24/kernel/cgroup.c
> > @@ -46,7 +46,7 @@
> > #include <linux/cgroupstats.h>
> > #include <linux/hash.h>
> > #include <linux/namei.h>
> > -
> > +#include <linux/idr.h>
> > #include <asm/atomic.h>
> >
> > static DEFINE_MUTEX(cgroup_mutex);
> > @@ -547,6 +547,162 @@ void cgroup_unlock(void)
> > mutex_unlock(&cgroup_mutex);
> > }
> >
> > +
> > +/*
> > + * Cgroup ID and lookup functions.
> > + * cgid->myself pointer is safe under rcu_read_lock() because d_put() of
> > + * cgroup, which finally frees cgroup pointer, uses rcu_synchronize().
> > + */
> > +
> > +static DEFINE_IDR(cgroup_idr);
> > +static DEFINE_SPINLOCK(cgroup_idr_lock);
> > +
> > +static int cgroup_prepare_id(struct cgroup *parent, struct cgroup_id **id)
> > +{
> > + struct cgroup_id *newid;
> > + int myid, error;
> > +
> > + /* check depth */
> > + if (parent && parent->id->depth + 1 >= MAX_CGROUP_DEPTH)
> > + return -ENOSPC;
> > + newid = kzalloc(sizeof(*newid), GFP_KERNEL);
> > + if (!newid)
> > + return -ENOMEM;
> > + /* get id */
> > + if (unlikely(!idr_pre_get(&cgroup_idr, GFP_KERNEL))) {
> > + error = -ENOMEM;
> > + goto err_out;
> > + }
> > + spin_lock_irq(&cgroup_idr_lock);
> > + error = idr_get_new(&cgroup_idr, newid, &myid);
> > + spin_unlock_irq(&cgroup_idr_lock);
> > + if (error)
> > + goto err_out;
> > +
> > + newid->id = myid;
> > + atomic_set(&newid->refcnt, 1);
> > + INIT_LIST_HEAD(&newid->list);
> > + *id = newid;
> > + return 0;
> > +err_out:
> > + kfree(newid);
> > + return error;
> > +}
> > +
> > +static void cgroup_id_put(struct cgroup_id *cgid)
> > +{
> > + unsigned long flags;
> > +
> > + if (atomic_dec_and_test(&cgid->refcnt)) {
> > + spin_lock_irqsave(&cgroup_idr_lock, flags);
> > + idr_remove(&cgroup_idr, cgid->id);
> > + spin_unlock_irqrestore(&cgroup_idr_lock, flags);
> > + kfree(cgid);
> > + }
> > +}
> > +
> > +static void cgroup_id_attach(struct cgroup_id *cgid,
> > + struct cgroup *cg, struct cgroup *parent)
> > +{
> > + if (parent) {
> > + struct cgroup_id *parent_id = parent->id;
> > + int i;
> > +
> > + cgid->depth = parent_id->depth + 1;
> > + /* Inherit hierarchy code from parent */
> > + for (i = 0; i < cgid->depth; i++) {
> > + cgid->hierarchy_code[i] =
> > + parent_id->hierarchy_code[i];
> > + cgid->hierarchy_code[cgid->depth] = cgid->id;
> > + }
> > + } else {
> > + cgid->depth = 0;
> > + cgid->hierarchy_code[0] = cgid->id;
> > + }
> > + rcu_assign_pointer(cgid->myself, cg);
> > + cg->id = cgid;
> > +
> > + return;
> > +}
> > +
> > +static void cgroup_id_detach(struct cgroup *cg)
> > +{
> > + struct cgroup_id *id = cg->id;
> > +
> > + rcu_assign_pointer(id->myself, NULL);
> > + cgroup_id_put(id);
> > +}
> > +
> > +/**
> > + * cgroup_lookup - lookup cgroup by id
> > + * @id: the id of cgroup to be looked up
> > + *
> > + * Retruns pointer to cgroup if there is vald cgroup with id, NULL if not.
> > + * Should be called under rcu_read_lock() or cgroup_lock().
> > + */
> > +
> > +struct cgroup *cgroup_lookup(int id)
> > +{
> > + struct cgroup *cgrp;
> > + struct cgroup_id *cgid;
> > +
> > + rcu_read_lock();
> > + cgid = idr_find(&cgroup_idr, id);
> > + rcu_read_unlock();
> > +
> > + if (unlikely(!cgid))
> > + return NULL;
> > +
> > + cgrp = rcu_dereference(cgid->myself);
> > + if (unlikely(!cgrp || test_bit(CGRP_REMOVED, &cgrp->flags)))
> > + return NULL;
> > +
> > + return cgrp;
> > +}
> > +
> > +/**
> > + * cgroup_get_next - lookup next cgroup under specified hierarchy.
> > + * @id: current position of iteration.
> > + * @root: search tree under this. .
> > + * @foundid: position of found object.
> > + *
> > + * Search next cgroup under the specified hierarchy. If "cur" is NULL,
> > + * start from root cgroup. Called under rcu_read_lock() or cgroup_lock()
> > + * is necessary (to access a found cgroup.).
> > + */
> > +struct cgroup *cgroup_get_next(int id, struct cgroup *root, int *foundid)
> > +{
> > + struct cgroup *ret = NULL;
> > + struct cgroup_id *tmp, *rootid;
> > + int tmpid, depth, targetid;
> > + unsigned long flags;
> > +
> > + rootid = root->id;
> > + depth = rootid->depth;
> > + targetid = rootid->id;
> > + tmpid = id;
> > +
> > + do {
> > + /* scan next entry from bitmap(tree) */
> > + spin_lock_irqsave(&cgroup_idr_lock, flags);
> > + tmp = idr_get_next(&cgroup_idr, &tmpid);
> > + spin_unlock_irqrestore(&cgroup_idr_lock, flags);
> > +
> > + if (tmp) {
> > + ret = rcu_dereference(tmp->myself);
> > + /* Sanity check and check hierarchy */
> > + if (ret && !test_bit(CGRP_REMOVED, &ret->flags)
> > + && tmp->hierarchy_code[depth] == targetid)
> > + break;
> > + tmpid = tmpid + 1;
> > + } else
> > + tmpid = 0;
> > + } while (1);
> > +
> > + *foundid = tmpid;
> > + return ret;
> > +}
> > +
> > /*
> > * A couple of forward declarations required, due to cyclic reference loop:
> > * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
> > @@ -991,6 +1147,7 @@ static int cgroup_get_sb(struct file_sys
> > /* New superblock */
> > struct cgroup *cgrp = &root->top_cgroup;
> > struct inode *inode;
> > + struct cgroup_id *newid;
> > int i;
> >
> > BUG_ON(sb->s_root != NULL);
> > @@ -998,6 +1155,11 @@ static int cgroup_get_sb(struct file_sys
> > ret = cgroup_get_rootdir(sb);
> > if (ret)
> > goto drop_new_super;
> > +
> > + ret = cgroup_prepare_id(NULL, &newid);
> > + if (ret)
> > + goto drop_new_super;
> > +
> > inode = sb->s_root->d_inode;
> >
> > mutex_lock(&inode->i_mutex);
> > @@ -1062,7 +1224,7 @@ static int cgroup_get_sb(struct file_sys
> > BUG_ON(!list_empty(&cgrp->sibling));
> > BUG_ON(!list_empty(&cgrp->children));
> > BUG_ON(root->number_of_cgroups != 1);
> > -
> > + cgroup_id_attach(newid, cgrp, NULL);
> > cgroup_populate_dir(cgrp);
> > mutex_unlock(&inode->i_mutex);
> > mutex_unlock(&cgroup_mutex);
> > @@ -1997,6 +2159,7 @@ int cgroup_scan_tasks(struct cgroup_scan
> > return 0;
> > }
> >
> > +
> > /*
> > * Stuff for reading the 'tasks' file.
> > *
> > @@ -2349,11 +2512,18 @@ static long cgroup_create(struct cgroup
> > int err = 0;
> > struct cgroup_subsys *ss;
> > struct super_block *sb = root->sb;
> > + struct cgroup_id *cgid = NULL;
> >
> > cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
> > if (!cgrp)
> > return -ENOMEM;
> >
> > + err = cgroup_prepare_id(parent, &cgid);
> > + if (err) {
> > + kfree(cgrp);
> > + return err;
> > + }
> > +
> > /* Grab a reference on the superblock so the hierarchy doesn't
> > * get deleted on unmount if there are child cgroups. This
> > * can be done outside cgroup_mutex, since the sb can't
> > @@ -2394,6 +2564,8 @@ static long cgroup_create(struct cgroup
> > err = cgroup_populate_dir(cgrp);
> > /* If err < 0, we have a half-filled directory - oh well ;) */
> >
> > + cgroup_id_attach(cgid, cgrp, parent);
> > +
> > mutex_unlock(&cgroup_mutex);
> > mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
> >
> > @@ -2415,7 +2587,7 @@ static long cgroup_create(struct cgroup
> >
> > /* Release the reference count that we took on the superblock */
> > deactivate_super(sb);
> > -
> > + cgroup_id_put(cgid);
> > kfree(cgrp);
> > return err;
> > }
> > @@ -2494,6 +2666,8 @@ static int cgroup_rmdir(struct inode *un
> > return -EBUSY;
> > }
> >
> > + cgroup_id_detach(cgrp);
> > +
> > spin_lock(&release_list_lock);
> > set_bit(CGRP_REMOVED, &cgrp->flags);
> > if (!list_empty(&cgrp->release_list))
> > Index: mmotm-2.6.28-Nov24/include/linux/idr.h
> > ===================================================================
> > --- mmotm-2.6.28-Nov24.orig/include/linux/idr.h
> > +++ mmotm-2.6.28-Nov24/include/linux/idr.h
> > @@ -106,6 +106,7 @@ int idr_get_new(struct idr *idp, void *p
> > int idr_get_new_above(struct idr *idp, void *ptr, int starting_id, int *id);
> > int idr_for_each(struct idr *idp,
> > int (*fn)(int id, void *p, void *data), void *data);
> > +void *idr_get_next(struct idr *idp, int *nextid);
> > void *idr_replace(struct idr *idp, void *ptr, int id);
> > void idr_remove(struct idr *idp, int id);
> > void idr_remove_all(struct idr *idp);
> > Index: mmotm-2.6.28-Nov24/lib/idr.c
> > ===================================================================
> > --- mmotm-2.6.28-Nov24.orig/lib/idr.c
> > +++ mmotm-2.6.28-Nov24/lib/idr.c
> > @@ -573,6 +573,52 @@ int idr_for_each(struct idr *idp,
> > EXPORT_SYMBOL(idr_for_each);
> >
> > /**
> > + * idr_get_next - lookup next opbject of id to given id.
> > + * @idp: idr handle
> > + * @id: pointer to lookup key
> > + *
> > + * Retruns pointer to registered object with id, which is next number to
> > + * given id.
> > + */
> > +
> > +void *idr_get_next(struct idr *idp, int *nextidp)
> > +{
> > + struct idr_layer *p, *pa[MAX_LEVEL];
> > + struct idr_layer **paa = &pa[0];
> > + int id = *nextidp;
> > + int n, max;
> > +
> > + /* find first ent */
> > + n = idp->layers * IDR_BITS;
> > + max = 1 << n;
> > + p = rcu_dereference(idp->top);
> > + if (!p)
> > + return NULL;
> > +
> > + while (id < max) {
> > + while (n > 0 && p) {
> > + n -= IDR_BITS;
> > + *paa++ = p;
> > + p = rcu_dereference(p->ary[(id >> n) & IDR_MASK]);
> > + }
> > +
> > + if (p) {
> > + *nextidp = id;
> > + return p;
> > + }
> > +
> > + id += 1 << n;
> > + while (n < fls(id)) {
> > + n += IDR_BITS;
> > + p = *--paa;
> > + }
> > + }
> > + return NULL;
> > +}
> > +
> > +
> > +
> > +/**
> > * idr_replace - replace pointer for given id
> > * @idp: idr handle
> > * @ptr: pointer you want associated with the id
> >
>
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [RFC][PATCH 1/2] CGROUP ID and Hierarchy Code
2008-11-27 7:05 [RFC][PATCH 1/2] CGROUP ID and Hierarchy Code KAMEZAWA Hiroyuki
2008-11-27 7:08 ` [RFC][PATCH 2/2] memcg: hierarchy reclaim with CGROUP ID KAMEZAWA Hiroyuki
2008-11-27 7:46 ` [RFC][PATCH 1/2] CGROUP ID and Hierarchy Code Daisuke Nishimura
@ 2008-11-27 18:54 ` Balbir Singh
2008-11-28 0:08 ` KAMEZAWA Hiroyuki
2008-11-28 1:28 ` Li Zefan
3 siblings, 1 reply; 12+ messages in thread
From: Balbir Singh @ 2008-11-27 18:54 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: linux-kernel@vger.kernel.org, lizf@cn.fujitsu.com,
menage@google.com, nishimura@mxp.nes.nec.co.jp, taka
KAMEZAWA Hiroyuki wrote:
> Patches for CGROUP ID and Hierarchy Code.
> (based on mmotm Nov/24...I'll update later.)
Hi, Kame, others on the mailing list,
I am currently attending FOSS.IN (http://foss.in), I hope to review/see these
patches and other reports next week.
--
Balbir
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [RFC][PATCH 1/2] CGROUP ID and Hierarchy Code
2008-11-27 18:54 ` Balbir Singh
@ 2008-11-28 0:08 ` KAMEZAWA Hiroyuki
0 siblings, 0 replies; 12+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-11-28 0:08 UTC (permalink / raw)
To: balbir
Cc: linux-kernel@vger.kernel.org, lizf@cn.fujitsu.com,
menage@google.com, nishimura@mxp.nes.nec.co.jp, taka
On Fri, 28 Nov 2008 00:24:16 +0530
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> KAMEZAWA Hiroyuki wrote:
> > Patches for CGROUP ID and Hierarchy Code.
> > (based on mmotm Nov/24...I'll update later.)
>
> Hi, Kame, others on the mailing list,
>
> I am currently attending FOSS.IN (http://foss.in), I hope to review/see these
> patches and other reports next week.
>
Thank you. I'm not in hurry and I'll have to change this version to some extent.
(updated version will be in the next week.)
enjoy FOSS.IN.
Thanks,
-Kame
> --
> Balbir
>
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [RFC][PATCH 1/2] CGROUP ID and Hierarchy Code
2008-11-27 7:05 [RFC][PATCH 1/2] CGROUP ID and Hierarchy Code KAMEZAWA Hiroyuki
` (2 preceding siblings ...)
2008-11-27 18:54 ` Balbir Singh
@ 2008-11-28 1:28 ` Li Zefan
2008-11-28 1:35 ` KAMEZAWA Hiroyuki
3 siblings, 1 reply; 12+ messages in thread
From: Li Zefan @ 2008-11-28 1:28 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: linux-kernel@vger.kernel.org, menage@google.com,
balbir@linux.vnet.ibm.com, nishimura@mxp.nes.nec.co.jp, taka
KAMEZAWA Hiroyuki wrote:
> Patches for CGROUP ID and Hierarchy Code.
> (based on mmotm Nov/24...I'll update later.)
>
> CGROUP ID is a unique value for each cgroup (struct cgroup)
> Hierarchy Code is array of CGROUP ID which contains all ID od ancestors.
> and tries to remove cgroup_lock() to walk tree.
>
> This is just an example-level patch but I'm now testing/refleshing this.
> Works well with memcg's hierarchy relcaim.
> Any comment is welcome.
>
> -Kame
>
> ==
> A toy patch for Cgroup ID and hierarchy code.
>
> This patch tries to assing a ID to each cgroup. Attach unique ID to each
> cgroup and provides following functions.
>
> - cgroup_lookup(id)
> returns cgroup of id.
> - cgroup_get_next(id, root, foundid)
> returns the next cgroup under "root" by scanning bitmap (not by tree-walk)
> - cgroup_is_ancestor(cgroup, root)
> returns true if "root" is ancestor of cgroup.
>
> There is several reasons to develop this.
>
> - While trying to implement hierarchy in memory cgroup, we had to
> implement "walk under hierarchy" code.
> Now it's consists of cgroup_lock and tree up-down code.
> But taking "cgroup_lock" in walking tree can cause deadlocks.
> cgroup_lock() is too big. Easier way is helpful.
>
> - SwapCgroup uses array of "pointer" to record the owner of swaps.
> By ID, we can reduce this to "short" or "int". This means ID is
> useful for reducing space consumption by pointer if the access cost
> is not problem.
> (I hear bio-cgroup will use the same kind of...)
>
> Example) OOM-Killer under hierarchy.
> do {
> rcu_read_lock();
> next = cgroup_get_next(id, root, nextid);
> /* check sanity of next here */
> /* increment refcnt to "next" */
> rcu_read_unlock();
> if (!next)
> break;
It's not safe to use the cgroup returned by cgroup_get_next() outside rcu lock.
---------------------------------------------
cgroup_rmdir()
(check refcnt == 0)
rcu_read_lock();
cgrp = cgroup_get_next()
(inc refcnt)
rcu_read_unlock();
cgroup_id_detach()
cgroup_diput();
kfree(cgrp);
use cgrp
---------------------------------------------
I think the safe way is:
rcu_read_lock();
cgrp = cgroup_get_next()
if (!inc_not_zero(cgrp) {
rcu_read_unlock();
return NULL;
}
rcu_read_unlock();
return cgrp;
But it's also safe to use cgrp = list_entry(&parent->children.next) for the above
scenario, seems you don't have to invent this cgroup_get_next().
> cgroup_scan_tasks(select_bad_process);
> } while (1);
>
>
> Characteristics:
> - Each cgroup get new ID when created.
> - cgroup ID contains "ID" and "Depth in tree" and hierarchy code.
> - hierarchy code is array of IDs of ancestors.
>
> Consideration:
> - gang lookup if necessary.
> - I'd like to use "(unsigned)short" to cgroup_id for saving space...
> - MAX_DEPTH is bad ?
>
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
>
> include/linux/cgroup.h | 42 +++++++++++
> include/linux/idr.h | 1
> kernel/cgroup.c | 180 ++++++++++++++++++++++++++++++++++++++++++++++++-
> lib/idr.c | 46 ++++++++++++
> 4 files changed, 266 insertions(+), 3 deletions(-)
>
> Index: mmotm-2.6.28-Nov24/include/linux/cgroup.h
> ===================================================================
> --- mmotm-2.6.28-Nov24.orig/include/linux/cgroup.h
> +++ mmotm-2.6.28-Nov24/include/linux/cgroup.h
> @@ -61,6 +61,20 @@ struct cgroup_subsys_state {
> unsigned long flags;
> };
>
> +/*
> + * Cgroup ID for *internal* identification and lookup. For user-land,"path"
> + * of cgroup works well.
> + */
> +#define MAX_CGROUP_DEPTH (10)
> +struct cgroup_id {
> + struct cgroup *myself;
> + unsigned int id;
> + atomic_t refcnt;
> + unsigned int depth;
> + unsigned int hierarchy_code[MAX_CGROUP_DEPTH];
> + struct list_head list;
> +};
> +
> /* bits in struct cgroup_subsys_state flags field */
> enum {
> CSS_ROOT, /* This CSS is the root of the subsystem */
> @@ -145,6 +159,9 @@ struct cgroup {
> int pids_use_count;
> /* Length of the current tasks_pids array */
> int pids_length;
> +
> + /* Cgroup ID */
> + struct cgroup_id *id;
> };
>
> /* A css_set is a structure holding pointers to a set of
> @@ -393,6 +410,31 @@ void cgroup_iter_end(struct cgroup *cgrp
> int cgroup_scan_tasks(struct cgroup_scanner *scan);
> int cgroup_attach_task(struct cgroup *, struct task_struct *);
>
> +/*
> + * For supporting cgroup lookup and hierarchy management.
> + * Giving Flat view of cgroup hierarchy rather than tree.
> + */
> +/* An interface for usual lookup */
> +struct cgroup *cgroup_lookup(int id);
> +/* get next cgroup under tree (for scan) */
> +struct cgroup *cgroup_get_next(int id, struct cgroup *root, int *foundid);
> +
> +static inline bool
> +cgroup_is_ancestor(struct cgroup *cur, struct cgroup *root)
> +{
> + struct cgroup_id *curid = cur->id;
> + struct cgroup_id *rootid = root->id;
> +
> + return (curid->hierarchy_code[rootid->depth] = rootid->id);
> +}
> +
> +static inline int cgroup_myid(struct cgroup *cgrp)
> +{
> + return cgrp->id->id;
> +}
> +
> +/* TODO: gang lookup should be */
> +
> #else /* !CONFIG_CGROUPS */
>
> static inline int cgroup_init_early(void) { return 0; }
> Index: mmotm-2.6.28-Nov24/kernel/cgroup.c
> ===================================================================
> --- mmotm-2.6.28-Nov24.orig/kernel/cgroup.c
> +++ mmotm-2.6.28-Nov24/kernel/cgroup.c
> @@ -46,7 +46,7 @@
> #include <linux/cgroupstats.h>
> #include <linux/hash.h>
> #include <linux/namei.h>
> -
> +#include <linux/idr.h>
> #include <asm/atomic.h>
>
> static DEFINE_MUTEX(cgroup_mutex);
> @@ -547,6 +547,162 @@ void cgroup_unlock(void)
> mutex_unlock(&cgroup_mutex);
> }
>
> +
> +/*
> + * Cgroup ID and lookup functions.
> + * cgid->myself pointer is safe under rcu_read_lock() because d_put() of
> + * cgroup, which finally frees cgroup pointer, uses rcu_synchronize().
> + */
> +
> +static DEFINE_IDR(cgroup_idr);
> +static DEFINE_SPINLOCK(cgroup_idr_lock);
> +
> +static int cgroup_prepare_id(struct cgroup *parent, struct cgroup_id **id)
> +{
> + struct cgroup_id *newid;
> + int myid, error;
> +
> + /* check depth */
> + if (parent && parent->id->depth + 1 >= MAX_CGROUP_DEPTH)
> + return -ENOSPC;
> + newid = kzalloc(sizeof(*newid), GFP_KERNEL);
> + if (!newid)
> + return -ENOMEM;
> + /* get id */
> + if (unlikely(!idr_pre_get(&cgroup_idr, GFP_KERNEL))) {
> + error = -ENOMEM;
> + goto err_out;
> + }
> + spin_lock_irq(&cgroup_idr_lock);
> + error = idr_get_new(&cgroup_idr, newid, &myid);
> + spin_unlock_irq(&cgroup_idr_lock);
> + if (error)
> + goto err_out;
> +
> + newid->id = myid;
> + atomic_set(&newid->refcnt, 1);
> + INIT_LIST_HEAD(&newid->list);
> + *id = newid;
> + return 0;
> +err_out:
> + kfree(newid);
> + return error;
> +}
> +
> +static void cgroup_id_put(struct cgroup_id *cgid)
> +{
> + unsigned long flags;
> +
> + if (atomic_dec_and_test(&cgid->refcnt)) {
> + spin_lock_irqsave(&cgroup_idr_lock, flags);
> + idr_remove(&cgroup_idr, cgid->id);
> + spin_unlock_irqrestore(&cgroup_idr_lock, flags);
> + kfree(cgid);
> + }
> +}
> +
> +static void cgroup_id_attach(struct cgroup_id *cgid,
> + struct cgroup *cg, struct cgroup *parent)
> +{
> + if (parent) {
> + struct cgroup_id *parent_id = parent->id;
> + int i;
> +
> + cgid->depth = parent_id->depth + 1;
> + /* Inherit hierarchy code from parent */
> + for (i = 0; i < cgid->depth; i++) {
> + cgid->hierarchy_code[i] =
> + parent_id->hierarchy_code[i];
> + cgid->hierarchy_code[cgid->depth] = cgid->id;
> + }
> + } else {
> + cgid->depth = 0;
> + cgid->hierarchy_code[0] = cgid->id;
> + }
> + rcu_assign_pointer(cgid->myself, cg);
> + cg->id = cgid;
> +
> + return;
> +}
> +
> +static void cgroup_id_detach(struct cgroup *cg)
> +{
> + struct cgroup_id *id = cg->id;
> +
> + rcu_assign_pointer(id->myself, NULL);
> + cgroup_id_put(id);
> +}
> +
> +/**
> + * cgroup_lookup - lookup cgroup by id
> + * @id: the id of cgroup to be looked up
> + *
> + * Retruns pointer to cgroup if there is vald cgroup with id, NULL if not.
> + * Should be called under rcu_read_lock() or cgroup_lock().
> + */
> +
> +struct cgroup *cgroup_lookup(int id)
> +{
> + struct cgroup *cgrp;
> + struct cgroup_id *cgid;
> +
> + rcu_read_lock();
> + cgid = idr_find(&cgroup_idr, id);
> + rcu_read_unlock();
> +
> + if (unlikely(!cgid))
> + return NULL;
> +
> + cgrp = rcu_dereference(cgid->myself);
> + if (unlikely(!cgrp || test_bit(CGRP_REMOVED, &cgrp->flags)))
> + return NULL;
> +
> + return cgrp;
> +}
> +
> +/**
> + * cgroup_get_next - lookup next cgroup under specified hierarchy.
> + * @id: current position of iteration.
> + * @root: search tree under this. .
> + * @foundid: position of found object.
> + *
> + * Search next cgroup under the specified hierarchy. If "cur" is NULL,
> + * start from root cgroup. Called under rcu_read_lock() or cgroup_lock()
> + * is necessary (to access a found cgroup.).
> + */
> +struct cgroup *cgroup_get_next(int id, struct cgroup *root, int *foundid)
> +{
> + struct cgroup *ret = NULL;
> + struct cgroup_id *tmp, *rootid;
> + int tmpid, depth, targetid;
> + unsigned long flags;
> +
> + rootid = root->id;
> + depth = rootid->depth;
> + targetid = rootid->id;
> + tmpid = id;
> +
> + do {
> + /* scan next entry from bitmap(tree) */
> + spin_lock_irqsave(&cgroup_idr_lock, flags);
> + tmp = idr_get_next(&cgroup_idr, &tmpid);
> + spin_unlock_irqrestore(&cgroup_idr_lock, flags);
> +
> + if (tmp) {
> + ret = rcu_dereference(tmp->myself);
> + /* Sanity check and check hierarchy */
> + if (ret && !test_bit(CGRP_REMOVED, &ret->flags)
> + && tmp->hierarchy_code[depth] == targetid)
> + break;
> + tmpid = tmpid + 1;
> + } else
> + tmpid = 0;
> + } while (1);
> +
> + *foundid = tmpid;
> + return ret;
> +}
> +
> /*
> * A couple of forward declarations required, due to cyclic reference loop:
> * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
> @@ -991,6 +1147,7 @@ static int cgroup_get_sb(struct file_sys
> /* New superblock */
> struct cgroup *cgrp = &root->top_cgroup;
> struct inode *inode;
> + struct cgroup_id *newid;
> int i;
>
> BUG_ON(sb->s_root != NULL);
> @@ -998,6 +1155,11 @@ static int cgroup_get_sb(struct file_sys
> ret = cgroup_get_rootdir(sb);
> if (ret)
> goto drop_new_super;
> +
> + ret = cgroup_prepare_id(NULL, &newid);
> + if (ret)
> + goto drop_new_super;
> +
> inode = sb->s_root->d_inode;
>
> mutex_lock(&inode->i_mutex);
> @@ -1062,7 +1224,7 @@ static int cgroup_get_sb(struct file_sys
> BUG_ON(!list_empty(&cgrp->sibling));
> BUG_ON(!list_empty(&cgrp->children));
> BUG_ON(root->number_of_cgroups != 1);
> -
> + cgroup_id_attach(newid, cgrp, NULL);
> cgroup_populate_dir(cgrp);
> mutex_unlock(&inode->i_mutex);
> mutex_unlock(&cgroup_mutex);
> @@ -1997,6 +2159,7 @@ int cgroup_scan_tasks(struct cgroup_scan
> return 0;
> }
>
> +
> /*
> * Stuff for reading the 'tasks' file.
> *
> @@ -2349,11 +2512,18 @@ static long cgroup_create(struct cgroup
> int err = 0;
> struct cgroup_subsys *ss;
> struct super_block *sb = root->sb;
> + struct cgroup_id *cgid = NULL;
>
> cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
> if (!cgrp)
> return -ENOMEM;
>
> + err = cgroup_prepare_id(parent, &cgid);
> + if (err) {
> + kfree(cgrp);
> + return err;
> + }
> +
> /* Grab a reference on the superblock so the hierarchy doesn't
> * get deleted on unmount if there are child cgroups. This
> * can be done outside cgroup_mutex, since the sb can't
> @@ -2394,6 +2564,8 @@ static long cgroup_create(struct cgroup
> err = cgroup_populate_dir(cgrp);
> /* If err < 0, we have a half-filled directory - oh well ;) */
>
> + cgroup_id_attach(cgid, cgrp, parent);
> +
> mutex_unlock(&cgroup_mutex);
> mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
>
> @@ -2415,7 +2587,7 @@ static long cgroup_create(struct cgroup
>
> /* Release the reference count that we took on the superblock */
> deactivate_super(sb);
> -
> + cgroup_id_put(cgid);
> kfree(cgrp);
> return err;
> }
> @@ -2494,6 +2666,8 @@ static int cgroup_rmdir(struct inode *un
> return -EBUSY;
> }
>
> + cgroup_id_detach(cgrp);
> +
> spin_lock(&release_list_lock);
> set_bit(CGRP_REMOVED, &cgrp->flags);
> if (!list_empty(&cgrp->release_list))
> Index: mmotm-2.6.28-Nov24/include/linux/idr.h
> ===================================================================
> --- mmotm-2.6.28-Nov24.orig/include/linux/idr.h
> +++ mmotm-2.6.28-Nov24/include/linux/idr.h
> @@ -106,6 +106,7 @@ int idr_get_new(struct idr *idp, void *p
> int idr_get_new_above(struct idr *idp, void *ptr, int starting_id, int *id);
> int idr_for_each(struct idr *idp,
> int (*fn)(int id, void *p, void *data), void *data);
> +void *idr_get_next(struct idr *idp, int *nextid);
> void *idr_replace(struct idr *idp, void *ptr, int id);
> void idr_remove(struct idr *idp, int id);
> void idr_remove_all(struct idr *idp);
> Index: mmotm-2.6.28-Nov24/lib/idr.c
> ===================================================================
> --- mmotm-2.6.28-Nov24.orig/lib/idr.c
> +++ mmotm-2.6.28-Nov24/lib/idr.c
> @@ -573,6 +573,52 @@ int idr_for_each(struct idr *idp,
> EXPORT_SYMBOL(idr_for_each);
>
> /**
> + * idr_get_next - lookup next opbject of id to given id.
> + * @idp: idr handle
> + * @id: pointer to lookup key
> + *
> + * Retruns pointer to registered object with id, which is next number to
> + * given id.
> + */
> +
> +void *idr_get_next(struct idr *idp, int *nextidp)
> +{
> + struct idr_layer *p, *pa[MAX_LEVEL];
> + struct idr_layer **paa = &pa[0];
> + int id = *nextidp;
> + int n, max;
> +
> + /* find first ent */
> + n = idp->layers * IDR_BITS;
> + max = 1 << n;
> + p = rcu_dereference(idp->top);
> + if (!p)
> + return NULL;
> +
> + while (id < max) {
> + while (n > 0 && p) {
> + n -= IDR_BITS;
> + *paa++ = p;
> + p = rcu_dereference(p->ary[(id >> n) & IDR_MASK]);
> + }
> +
> + if (p) {
> + *nextidp = id;
> + return p;
> + }
> +
> + id += 1 << n;
> + while (n < fls(id)) {
> + n += IDR_BITS;
> + p = *--paa;
> + }
> + }
> + return NULL;
> +}
> +
> +
> +
> +/**
> * idr_replace - replace pointer for given id
> * @idp: idr handle
> * @ptr: pointer you want associated with the id
>
>
^ permalink raw reply [flat|nested] 12+ messages in thread* Re: [RFC][PATCH 1/2] CGROUP ID and Hierarchy Code
2008-11-28 1:28 ` Li Zefan
@ 2008-11-28 1:35 ` KAMEZAWA Hiroyuki
2008-11-28 1:51 ` KAMEZAWA Hiroyuki
2008-11-28 2:18 ` KAMEZAWA Hiroyuki
0 siblings, 2 replies; 12+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-11-28 1:35 UTC (permalink / raw)
To: Li Zefan
Cc: linux-kernel@vger.kernel.org, menage@google.com,
balbir@linux.vnet.ibm.com, nishimura@mxp.nes.nec.co.jp, taka
On Fri, 28 Nov 2008 09:28:33 +0800
Li Zefan <lizf@cn.fujitsu.com> wrote:
> KAMEZAWA Hiroyuki wrote:
> > Patches for CGROUP ID and Hierarchy Code.
> > (based on mmotm Nov/24...I'll update later.)
> >
> > CGROUP ID is a unique value for each cgroup (struct cgroup)
> > Hierarchy Code is array of CGROUP ID which contains all ID od ancestors.
> > and tries to remove cgroup_lock() to walk tree.
> >
> > This is just an example-level patch but I'm now testing/refleshing this.
> > Works well with memcg's hierarchy relcaim.
> > Any comment is welcome.
> >
> > -Kame
> >
> > ==
> > A toy patch for Cgroup ID and hierarchy code.
> >
> > This patch tries to assing a ID to each cgroup. Attach unique ID to each
> > cgroup and provides following functions.
> >
> > - cgroup_lookup(id)
> > returns cgroup of id.
> > - cgroup_get_next(id, root, foundid)
> > returns the next cgroup under "root" by scanning bitmap (not by tree-walk)
> > - cgroup_is_ancestor(cgroup, root)
> > returns true if "root" is ancestor of cgroup.
> >
> > There is several reasons to develop this.
> >
> > - While trying to implement hierarchy in memory cgroup, we had to
> > implement "walk under hierarchy" code.
> > Now it's consists of cgroup_lock and tree up-down code.
> > But taking "cgroup_lock" in walking tree can cause deadlocks.
> > cgroup_lock() is too big. Easier way is helpful.
> >
> > - SwapCgroup uses array of "pointer" to record the owner of swaps.
> > By ID, we can reduce this to "short" or "int". This means ID is
> > useful for reducing space consumption by pointer if the access cost
> > is not problem.
> > (I hear bio-cgroup will use the same kind of...)
> >
> > Example) OOM-Killer under hierarchy.
> > do {
> > rcu_read_lock();
> > next = cgroup_get_next(id, root, nextid);
> > /* check sanity of next here */
> > /* increment refcnt to "next" */
> > rcu_read_unlock();
> > if (!next)
> > break;
>
> It's not safe to use the cgroup returned by cgroup_get_next() outside rcu lock.
>
> ---------------------------------------------
> cgroup_rmdir()
> (check refcnt == 0)
> rcu_read_lock();
> cgrp = cgroup_get_next()
> (inc refcnt)
> rcu_read_unlock();
> cgroup_id_detach()
> cgroup_diput();
> kfree(cgrp);
> use cgrp
> ---------------------------------------------
>
> I think the safe way is:
>
> rcu_read_lock();
> cgrp = cgroup_get_next()
> if (!inc_not_zero(cgrp) {
> rcu_read_unlock();
> return NULL;
> }
> rcu_read_unlock();
> return cgrp;
>
> But it's also safe to use cgrp = list_entry(&parent->children.next) for the above
> scenario, seems you don't have to invent this cgroup_get_next().
>
But list-walk can't provide us view of hierarchy. Up-Down list walk is better ?
please see memcg's code. it met some troubles.
I'll make kfree(cgrp) to be called by RCU.
Thanks,
-Kame
> > cgroup_scan_tasks(select_bad_process);
> > } while (1);
> >
> >
> > Characteristics:
> > - Each cgroup get new ID when created.
> > - cgroup ID contains "ID" and "Depth in tree" and hierarchy code.
> > - hierarchy code is array of IDs of ancestors.
> >
> > Consideration:
> > - gang lookup if necessary.
> > - I'd like to use "(unsigned)short" to cgroup_id for saving space...
> > - MAX_DEPTH is bad ?
> >
> > Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> >
> > include/linux/cgroup.h | 42 +++++++++++
> > include/linux/idr.h | 1
> > kernel/cgroup.c | 180 ++++++++++++++++++++++++++++++++++++++++++++++++-
> > lib/idr.c | 46 ++++++++++++
> > 4 files changed, 266 insertions(+), 3 deletions(-)
> >
> > Index: mmotm-2.6.28-Nov24/include/linux/cgroup.h
> > ===================================================================
> > --- mmotm-2.6.28-Nov24.orig/include/linux/cgroup.h
> > +++ mmotm-2.6.28-Nov24/include/linux/cgroup.h
> > @@ -61,6 +61,20 @@ struct cgroup_subsys_state {
> > unsigned long flags;
> > };
> >
> > +/*
> > + * Cgroup ID for *internal* identification and lookup. For user-land,"path"
> > + * of cgroup works well.
> > + */
> > +#define MAX_CGROUP_DEPTH (10)
> > +struct cgroup_id {
> > + struct cgroup *myself;
> > + unsigned int id;
> > + atomic_t refcnt;
> > + unsigned int depth;
> > + unsigned int hierarchy_code[MAX_CGROUP_DEPTH];
> > + struct list_head list;
> > +};
> > +
> > /* bits in struct cgroup_subsys_state flags field */
> > enum {
> > CSS_ROOT, /* This CSS is the root of the subsystem */
> > @@ -145,6 +159,9 @@ struct cgroup {
> > int pids_use_count;
> > /* Length of the current tasks_pids array */
> > int pids_length;
> > +
> > + /* Cgroup ID */
> > + struct cgroup_id *id;
> > };
> >
> > /* A css_set is a structure holding pointers to a set of
> > @@ -393,6 +410,31 @@ void cgroup_iter_end(struct cgroup *cgrp
> > int cgroup_scan_tasks(struct cgroup_scanner *scan);
> > int cgroup_attach_task(struct cgroup *, struct task_struct *);
> >
> > +/*
> > + * For supporting cgroup lookup and hierarchy management.
> > + * Giving Flat view of cgroup hierarchy rather than tree.
> > + */
> > +/* An interface for usual lookup */
> > +struct cgroup *cgroup_lookup(int id);
> > +/* get next cgroup under tree (for scan) */
> > +struct cgroup *cgroup_get_next(int id, struct cgroup *root, int *foundid);
> > +
> > +static inline bool
> > +cgroup_is_ancestor(struct cgroup *cur, struct cgroup *root)
> > +{
> > + struct cgroup_id *curid = cur->id;
> > + struct cgroup_id *rootid = root->id;
> > +
> > + return (curid->hierarchy_code[rootid->depth] = rootid->id);
> > +}
> > +
> > +static inline int cgroup_myid(struct cgroup *cgrp)
> > +{
> > + return cgrp->id->id;
> > +}
> > +
> > +/* TODO: gang lookup should be */
> > +
> > #else /* !CONFIG_CGROUPS */
> >
> > static inline int cgroup_init_early(void) { return 0; }
> > Index: mmotm-2.6.28-Nov24/kernel/cgroup.c
> > ===================================================================
> > --- mmotm-2.6.28-Nov24.orig/kernel/cgroup.c
> > +++ mmotm-2.6.28-Nov24/kernel/cgroup.c
> > @@ -46,7 +46,7 @@
> > #include <linux/cgroupstats.h>
> > #include <linux/hash.h>
> > #include <linux/namei.h>
> > -
> > +#include <linux/idr.h>
> > #include <asm/atomic.h>
> >
> > static DEFINE_MUTEX(cgroup_mutex);
> > @@ -547,6 +547,162 @@ void cgroup_unlock(void)
> > mutex_unlock(&cgroup_mutex);
> > }
> >
> > +
> > +/*
> > + * Cgroup ID and lookup functions.
> > + * cgid->myself pointer is safe under rcu_read_lock() because d_put() of
> > + * cgroup, which finally frees cgroup pointer, uses rcu_synchronize().
> > + */
> > +
> > +static DEFINE_IDR(cgroup_idr);
> > +static DEFINE_SPINLOCK(cgroup_idr_lock);
> > +
> > +static int cgroup_prepare_id(struct cgroup *parent, struct cgroup_id **id)
> > +{
> > + struct cgroup_id *newid;
> > + int myid, error;
> > +
> > + /* check depth */
> > + if (parent && parent->id->depth + 1 >= MAX_CGROUP_DEPTH)
> > + return -ENOSPC;
> > + newid = kzalloc(sizeof(*newid), GFP_KERNEL);
> > + if (!newid)
> > + return -ENOMEM;
> > + /* get id */
> > + if (unlikely(!idr_pre_get(&cgroup_idr, GFP_KERNEL))) {
> > + error = -ENOMEM;
> > + goto err_out;
> > + }
> > + spin_lock_irq(&cgroup_idr_lock);
> > + error = idr_get_new(&cgroup_idr, newid, &myid);
> > + spin_unlock_irq(&cgroup_idr_lock);
> > + if (error)
> > + goto err_out;
> > +
> > + newid->id = myid;
> > + atomic_set(&newid->refcnt, 1);
> > + INIT_LIST_HEAD(&newid->list);
> > + *id = newid;
> > + return 0;
> > +err_out:
> > + kfree(newid);
> > + return error;
> > +}
> > +
> > +static void cgroup_id_put(struct cgroup_id *cgid)
> > +{
> > + unsigned long flags;
> > +
> > + if (atomic_dec_and_test(&cgid->refcnt)) {
> > + spin_lock_irqsave(&cgroup_idr_lock, flags);
> > + idr_remove(&cgroup_idr, cgid->id);
> > + spin_unlock_irqrestore(&cgroup_idr_lock, flags);
> > + kfree(cgid);
> > + }
> > +}
> > +
> > +static void cgroup_id_attach(struct cgroup_id *cgid,
> > + struct cgroup *cg, struct cgroup *parent)
> > +{
> > + if (parent) {
> > + struct cgroup_id *parent_id = parent->id;
> > + int i;
> > +
> > + cgid->depth = parent_id->depth + 1;
> > + /* Inherit hierarchy code from parent */
> > + for (i = 0; i < cgid->depth; i++) {
> > + cgid->hierarchy_code[i] =
> > + parent_id->hierarchy_code[i];
> > + cgid->hierarchy_code[cgid->depth] = cgid->id;
> > + }
> > + } else {
> > + cgid->depth = 0;
> > + cgid->hierarchy_code[0] = cgid->id;
> > + }
> > + rcu_assign_pointer(cgid->myself, cg);
> > + cg->id = cgid;
> > +
> > + return;
> > +}
> > +
> > +static void cgroup_id_detach(struct cgroup *cg)
> > +{
> > + struct cgroup_id *id = cg->id;
> > +
> > + rcu_assign_pointer(id->myself, NULL);
> > + cgroup_id_put(id);
> > +}
> > +
> > +/**
> > + * cgroup_lookup - lookup cgroup by id
> > + * @id: the id of cgroup to be looked up
> > + *
> > + * Retruns pointer to cgroup if there is vald cgroup with id, NULL if not.
> > + * Should be called under rcu_read_lock() or cgroup_lock().
> > + */
> > +
> > +struct cgroup *cgroup_lookup(int id)
> > +{
> > + struct cgroup *cgrp;
> > + struct cgroup_id *cgid;
> > +
> > + rcu_read_lock();
> > + cgid = idr_find(&cgroup_idr, id);
> > + rcu_read_unlock();
> > +
> > + if (unlikely(!cgid))
> > + return NULL;
> > +
> > + cgrp = rcu_dereference(cgid->myself);
> > + if (unlikely(!cgrp || test_bit(CGRP_REMOVED, &cgrp->flags)))
> > + return NULL;
> > +
> > + return cgrp;
> > +}
> > +
> > +/**
> > + * cgroup_get_next - lookup next cgroup under specified hierarchy.
> > + * @id: current position of iteration.
> > + * @root: search tree under this. .
> > + * @foundid: position of found object.
> > + *
> > + * Search next cgroup under the specified hierarchy. If "cur" is NULL,
> > + * start from root cgroup. Called under rcu_read_lock() or cgroup_lock()
> > + * is necessary (to access a found cgroup.).
> > + */
> > +struct cgroup *cgroup_get_next(int id, struct cgroup *root, int *foundid)
> > +{
> > + struct cgroup *ret = NULL;
> > + struct cgroup_id *tmp, *rootid;
> > + int tmpid, depth, targetid;
> > + unsigned long flags;
> > +
> > + rootid = root->id;
> > + depth = rootid->depth;
> > + targetid = rootid->id;
> > + tmpid = id;
> > +
> > + do {
> > + /* scan next entry from bitmap(tree) */
> > + spin_lock_irqsave(&cgroup_idr_lock, flags);
> > + tmp = idr_get_next(&cgroup_idr, &tmpid);
> > + spin_unlock_irqrestore(&cgroup_idr_lock, flags);
> > +
> > + if (tmp) {
> > + ret = rcu_dereference(tmp->myself);
> > + /* Sanity check and check hierarchy */
> > + if (ret && !test_bit(CGRP_REMOVED, &ret->flags)
> > + && tmp->hierarchy_code[depth] == targetid)
> > + break;
> > + tmpid = tmpid + 1;
> > + } else
> > + tmpid = 0;
> > + } while (1);
> > +
> > + *foundid = tmpid;
> > + return ret;
> > +}
> > +
> > /*
> > * A couple of forward declarations required, due to cyclic reference loop:
> > * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
> > @@ -991,6 +1147,7 @@ static int cgroup_get_sb(struct file_sys
> > /* New superblock */
> > struct cgroup *cgrp = &root->top_cgroup;
> > struct inode *inode;
> > + struct cgroup_id *newid;
> > int i;
> >
> > BUG_ON(sb->s_root != NULL);
> > @@ -998,6 +1155,11 @@ static int cgroup_get_sb(struct file_sys
> > ret = cgroup_get_rootdir(sb);
> > if (ret)
> > goto drop_new_super;
> > +
> > + ret = cgroup_prepare_id(NULL, &newid);
> > + if (ret)
> > + goto drop_new_super;
> > +
> > inode = sb->s_root->d_inode;
> >
> > mutex_lock(&inode->i_mutex);
> > @@ -1062,7 +1224,7 @@ static int cgroup_get_sb(struct file_sys
> > BUG_ON(!list_empty(&cgrp->sibling));
> > BUG_ON(!list_empty(&cgrp->children));
> > BUG_ON(root->number_of_cgroups != 1);
> > -
> > + cgroup_id_attach(newid, cgrp, NULL);
> > cgroup_populate_dir(cgrp);
> > mutex_unlock(&inode->i_mutex);
> > mutex_unlock(&cgroup_mutex);
> > @@ -1997,6 +2159,7 @@ int cgroup_scan_tasks(struct cgroup_scan
> > return 0;
> > }
> >
> > +
> > /*
> > * Stuff for reading the 'tasks' file.
> > *
> > @@ -2349,11 +2512,18 @@ static long cgroup_create(struct cgroup
> > int err = 0;
> > struct cgroup_subsys *ss;
> > struct super_block *sb = root->sb;
> > + struct cgroup_id *cgid = NULL;
> >
> > cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
> > if (!cgrp)
> > return -ENOMEM;
> >
> > + err = cgroup_prepare_id(parent, &cgid);
> > + if (err) {
> > + kfree(cgrp);
> > + return err;
> > + }
> > +
> > /* Grab a reference on the superblock so the hierarchy doesn't
> > * get deleted on unmount if there are child cgroups. This
> > * can be done outside cgroup_mutex, since the sb can't
> > @@ -2394,6 +2564,8 @@ static long cgroup_create(struct cgroup
> > err = cgroup_populate_dir(cgrp);
> > /* If err < 0, we have a half-filled directory - oh well ;) */
> >
> > + cgroup_id_attach(cgid, cgrp, parent);
> > +
> > mutex_unlock(&cgroup_mutex);
> > mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
> >
> > @@ -2415,7 +2587,7 @@ static long cgroup_create(struct cgroup
> >
> > /* Release the reference count that we took on the superblock */
> > deactivate_super(sb);
> > -
> > + cgroup_id_put(cgid);
> > kfree(cgrp);
> > return err;
> > }
> > @@ -2494,6 +2666,8 @@ static int cgroup_rmdir(struct inode *un
> > return -EBUSY;
> > }
> >
> > + cgroup_id_detach(cgrp);
> > +
> > spin_lock(&release_list_lock);
> > set_bit(CGRP_REMOVED, &cgrp->flags);
> > if (!list_empty(&cgrp->release_list))
> > Index: mmotm-2.6.28-Nov24/include/linux/idr.h
> > ===================================================================
> > --- mmotm-2.6.28-Nov24.orig/include/linux/idr.h
> > +++ mmotm-2.6.28-Nov24/include/linux/idr.h
> > @@ -106,6 +106,7 @@ int idr_get_new(struct idr *idp, void *p
> > int idr_get_new_above(struct idr *idp, void *ptr, int starting_id, int *id);
> > int idr_for_each(struct idr *idp,
> > int (*fn)(int id, void *p, void *data), void *data);
> > +void *idr_get_next(struct idr *idp, int *nextid);
> > void *idr_replace(struct idr *idp, void *ptr, int id);
> > void idr_remove(struct idr *idp, int id);
> > void idr_remove_all(struct idr *idp);
> > Index: mmotm-2.6.28-Nov24/lib/idr.c
> > ===================================================================
> > --- mmotm-2.6.28-Nov24.orig/lib/idr.c
> > +++ mmotm-2.6.28-Nov24/lib/idr.c
> > @@ -573,6 +573,52 @@ int idr_for_each(struct idr *idp,
> > EXPORT_SYMBOL(idr_for_each);
> >
> > /**
> > + * idr_get_next - lookup next opbject of id to given id.
> > + * @idp: idr handle
> > + * @id: pointer to lookup key
> > + *
> > + * Retruns pointer to registered object with id, which is next number to
> > + * given id.
> > + */
> > +
> > +void *idr_get_next(struct idr *idp, int *nextidp)
> > +{
> > + struct idr_layer *p, *pa[MAX_LEVEL];
> > + struct idr_layer **paa = &pa[0];
> > + int id = *nextidp;
> > + int n, max;
> > +
> > + /* find first ent */
> > + n = idp->layers * IDR_BITS;
> > + max = 1 << n;
> > + p = rcu_dereference(idp->top);
> > + if (!p)
> > + return NULL;
> > +
> > + while (id < max) {
> > + while (n > 0 && p) {
> > + n -= IDR_BITS;
> > + *paa++ = p;
> > + p = rcu_dereference(p->ary[(id >> n) & IDR_MASK]);
> > + }
> > +
> > + if (p) {
> > + *nextidp = id;
> > + return p;
> > + }
> > +
> > + id += 1 << n;
> > + while (n < fls(id)) {
> > + n += IDR_BITS;
> > + p = *--paa;
> > + }
> > + }
> > + return NULL;
> > +}
> > +
> > +
> > +
> > +/**
> > * idr_replace - replace pointer for given id
> > * @idp: idr handle
> > * @ptr: pointer you want associated with the id
> >
> >
>
^ permalink raw reply [flat|nested] 12+ messages in thread* Re: [RFC][PATCH 1/2] CGROUP ID and Hierarchy Code
2008-11-28 1:35 ` KAMEZAWA Hiroyuki
@ 2008-11-28 1:51 ` KAMEZAWA Hiroyuki
2008-11-28 2:18 ` KAMEZAWA Hiroyuki
1 sibling, 0 replies; 12+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-11-28 1:51 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: Li Zefan, linux-kernel@vger.kernel.org, menage@google.com,
balbir@linux.vnet.ibm.com, nishimura@mxp.nes.nec.co.jp, taka
On Fri, 28 Nov 2008 10:35:33 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> On Fri, 28 Nov 2008 09:28:33 +0800
> Li Zefan <lizf@cn.fujitsu.com> wrote:
>
> > KAMEZAWA Hiroyuki wrote:
> > > Patches for CGROUP ID and Hierarchy Code.
> > > (based on mmotm Nov/24...I'll update later.)
> > >
> > > CGROUP ID is a unique value for each cgroup (struct cgroup)
> > > Hierarchy Code is array of CGROUP ID which contains all ID od ancestors.
> > > and tries to remove cgroup_lock() to walk tree.
> > >
> > > This is just an example-level patch but I'm now testing/refleshing this.
> > > Works well with memcg's hierarchy relcaim.
> > > Any comment is welcome.
> > >
> > > -Kame
> > >
> > > ==
> > > A toy patch for Cgroup ID and hierarchy code.
> > >
> > > This patch tries to assing a ID to each cgroup. Attach unique ID to each
> > > cgroup and provides following functions.
> > >
> > > - cgroup_lookup(id)
> > > returns cgroup of id.
> > > - cgroup_get_next(id, root, foundid)
> > > returns the next cgroup under "root" by scanning bitmap (not by tree-walk)
> > > - cgroup_is_ancestor(cgroup, root)
> > > returns true if "root" is ancestor of cgroup.
> > >
> > > There is several reasons to develop this.
> > >
> > > - While trying to implement hierarchy in memory cgroup, we had to
> > > implement "walk under hierarchy" code.
> > > Now it's consists of cgroup_lock and tree up-down code.
> > > But taking "cgroup_lock" in walking tree can cause deadlocks.
> > > cgroup_lock() is too big. Easier way is helpful.
> > >
> > > - SwapCgroup uses array of "pointer" to record the owner of swaps.
> > > By ID, we can reduce this to "short" or "int". This means ID is
> > > useful for reducing space consumption by pointer if the access cost
> > > is not problem.
> > > (I hear bio-cgroup will use the same kind of...)
> > >
> > > Example) OOM-Killer under hierarchy.
> > > do {
> > > rcu_read_lock();
> > > next = cgroup_get_next(id, root, nextid);
> > > /* check sanity of next here */
> > > /* increment refcnt to "next" */
> > > rcu_read_unlock();
> > > if (!next)
> > > break;
> >
> > It's not safe to use the cgroup returned by cgroup_get_next() outside rcu lock.
> >
> > ---------------------------------------------
> > cgroup_rmdir()
> > (check refcnt == 0)
> > rcu_read_lock();
> > cgrp = cgroup_get_next()
> > (inc refcnt)
> > rcu_read_unlock();
> > cgroup_id_detach()
> > cgroup_diput();
> > kfree(cgrp);
> > use cgrp
> > ---------------------------------------------
> >
> > I think the safe way is:
> >
> > rcu_read_lock();
> > cgrp = cgroup_get_next()
> > if (!inc_not_zero(cgrp) {
> > rcu_read_unlock();
> > return NULL;
> > }
> > rcu_read_unlock();
> > return cgrp;
> >
> > But it's also safe to use cgrp = list_entry(&parent->children.next) for the above
> > scenario, seems you don't have to invent this cgroup_get_next().
> >
> But list-walk can't provide us view of hierarchy. Up-Down list walk is better ?
> please see memcg's code. it met some troubles.
> I'll make kfree(cgrp) to be called by RCU.
>
One more thing, I didn't mentioned that "increment cgroup's refcnt"
My intention was to "increment subsystem's private refcnt".
Sorry for bad text.
"Increaseing cgroup's refcnt" is bad because the user may see unexpected -EBUSY.
Just delaying free() is better (and I'll use free by RCU.)
Thanks,
-Kame
> Thanks,
> -Kame
>
>
> > > cgroup_scan_tasks(select_bad_process);
> > > } while (1);
> > >
> > >
> > > Characteristics:
> > > - Each cgroup get new ID when created.
> > > - cgroup ID contains "ID" and "Depth in tree" and hierarchy code.
> > > - hierarchy code is array of IDs of ancestors.
> > >
> > > Consideration:
> > > - gang lookup if necessary.
> > > - I'd like to use "(unsigned)short" to cgroup_id for saving space...
> > > - MAX_DEPTH is bad ?
> > >
> > > Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> > >
> > > include/linux/cgroup.h | 42 +++++++++++
> > > include/linux/idr.h | 1
> > > kernel/cgroup.c | 180 ++++++++++++++++++++++++++++++++++++++++++++++++-
> > > lib/idr.c | 46 ++++++++++++
> > > 4 files changed, 266 insertions(+), 3 deletions(-)
> > >
> > > Index: mmotm-2.6.28-Nov24/include/linux/cgroup.h
> > > ===================================================================
> > > --- mmotm-2.6.28-Nov24.orig/include/linux/cgroup.h
> > > +++ mmotm-2.6.28-Nov24/include/linux/cgroup.h
> > > @@ -61,6 +61,20 @@ struct cgroup_subsys_state {
> > > unsigned long flags;
> > > };
> > >
> > > +/*
> > > + * Cgroup ID for *internal* identification and lookup. For user-land,"path"
> > > + * of cgroup works well.
> > > + */
> > > +#define MAX_CGROUP_DEPTH (10)
> > > +struct cgroup_id {
> > > + struct cgroup *myself;
> > > + unsigned int id;
> > > + atomic_t refcnt;
> > > + unsigned int depth;
> > > + unsigned int hierarchy_code[MAX_CGROUP_DEPTH];
> > > + struct list_head list;
> > > +};
> > > +
> > > /* bits in struct cgroup_subsys_state flags field */
> > > enum {
> > > CSS_ROOT, /* This CSS is the root of the subsystem */
> > > @@ -145,6 +159,9 @@ struct cgroup {
> > > int pids_use_count;
> > > /* Length of the current tasks_pids array */
> > > int pids_length;
> > > +
> > > + /* Cgroup ID */
> > > + struct cgroup_id *id;
> > > };
> > >
> > > /* A css_set is a structure holding pointers to a set of
> > > @@ -393,6 +410,31 @@ void cgroup_iter_end(struct cgroup *cgrp
> > > int cgroup_scan_tasks(struct cgroup_scanner *scan);
> > > int cgroup_attach_task(struct cgroup *, struct task_struct *);
> > >
> > > +/*
> > > + * For supporting cgroup lookup and hierarchy management.
> > > + * Giving Flat view of cgroup hierarchy rather than tree.
> > > + */
> > > +/* An interface for usual lookup */
> > > +struct cgroup *cgroup_lookup(int id);
> > > +/* get next cgroup under tree (for scan) */
> > > +struct cgroup *cgroup_get_next(int id, struct cgroup *root, int *foundid);
> > > +
> > > +static inline bool
> > > +cgroup_is_ancestor(struct cgroup *cur, struct cgroup *root)
> > > +{
> > > + struct cgroup_id *curid = cur->id;
> > > + struct cgroup_id *rootid = root->id;
> > > +
> > > + return (curid->hierarchy_code[rootid->depth] = rootid->id);
> > > +}
> > > +
> > > +static inline int cgroup_myid(struct cgroup *cgrp)
> > > +{
> > > + return cgrp->id->id;
> > > +}
> > > +
> > > +/* TODO: gang lookup should be */
> > > +
> > > #else /* !CONFIG_CGROUPS */
> > >
> > > static inline int cgroup_init_early(void) { return 0; }
> > > Index: mmotm-2.6.28-Nov24/kernel/cgroup.c
> > > ===================================================================
> > > --- mmotm-2.6.28-Nov24.orig/kernel/cgroup.c
> > > +++ mmotm-2.6.28-Nov24/kernel/cgroup.c
> > > @@ -46,7 +46,7 @@
> > > #include <linux/cgroupstats.h>
> > > #include <linux/hash.h>
> > > #include <linux/namei.h>
> > > -
> > > +#include <linux/idr.h>
> > > #include <asm/atomic.h>
> > >
> > > static DEFINE_MUTEX(cgroup_mutex);
> > > @@ -547,6 +547,162 @@ void cgroup_unlock(void)
> > > mutex_unlock(&cgroup_mutex);
> > > }
> > >
> > > +
> > > +/*
> > > + * Cgroup ID and lookup functions.
> > > + * cgid->myself pointer is safe under rcu_read_lock() because d_put() of
> > > + * cgroup, which finally frees cgroup pointer, uses rcu_synchronize().
> > > + */
> > > +
> > > +static DEFINE_IDR(cgroup_idr);
> > > +static DEFINE_SPINLOCK(cgroup_idr_lock);
> > > +
> > > +static int cgroup_prepare_id(struct cgroup *parent, struct cgroup_id **id)
> > > +{
> > > + struct cgroup_id *newid;
> > > + int myid, error;
> > > +
> > > + /* check depth */
> > > + if (parent && parent->id->depth + 1 >= MAX_CGROUP_DEPTH)
> > > + return -ENOSPC;
> > > + newid = kzalloc(sizeof(*newid), GFP_KERNEL);
> > > + if (!newid)
> > > + return -ENOMEM;
> > > + /* get id */
> > > + if (unlikely(!idr_pre_get(&cgroup_idr, GFP_KERNEL))) {
> > > + error = -ENOMEM;
> > > + goto err_out;
> > > + }
> > > + spin_lock_irq(&cgroup_idr_lock);
> > > + error = idr_get_new(&cgroup_idr, newid, &myid);
> > > + spin_unlock_irq(&cgroup_idr_lock);
> > > + if (error)
> > > + goto err_out;
> > > +
> > > + newid->id = myid;
> > > + atomic_set(&newid->refcnt, 1);
> > > + INIT_LIST_HEAD(&newid->list);
> > > + *id = newid;
> > > + return 0;
> > > +err_out:
> > > + kfree(newid);
> > > + return error;
> > > +}
> > > +
> > > +static void cgroup_id_put(struct cgroup_id *cgid)
> > > +{
> > > + unsigned long flags;
> > > +
> > > + if (atomic_dec_and_test(&cgid->refcnt)) {
> > > + spin_lock_irqsave(&cgroup_idr_lock, flags);
> > > + idr_remove(&cgroup_idr, cgid->id);
> > > + spin_unlock_irqrestore(&cgroup_idr_lock, flags);
> > > + kfree(cgid);
> > > + }
> > > +}
> > > +
> > > +static void cgroup_id_attach(struct cgroup_id *cgid,
> > > + struct cgroup *cg, struct cgroup *parent)
> > > +{
> > > + if (parent) {
> > > + struct cgroup_id *parent_id = parent->id;
> > > + int i;
> > > +
> > > + cgid->depth = parent_id->depth + 1;
> > > + /* Inherit hierarchy code from parent */
> > > + for (i = 0; i < cgid->depth; i++) {
> > > + cgid->hierarchy_code[i] =
> > > + parent_id->hierarchy_code[i];
> > > + cgid->hierarchy_code[cgid->depth] = cgid->id;
> > > + }
> > > + } else {
> > > + cgid->depth = 0;
> > > + cgid->hierarchy_code[0] = cgid->id;
> > > + }
> > > + rcu_assign_pointer(cgid->myself, cg);
> > > + cg->id = cgid;
> > > +
> > > + return;
> > > +}
> > > +
> > > +static void cgroup_id_detach(struct cgroup *cg)
> > > +{
> > > + struct cgroup_id *id = cg->id;
> > > +
> > > + rcu_assign_pointer(id->myself, NULL);
> > > + cgroup_id_put(id);
> > > +}
> > > +
> > > +/**
> > > + * cgroup_lookup - lookup cgroup by id
> > > + * @id: the id of cgroup to be looked up
> > > + *
> > > + * Retruns pointer to cgroup if there is vald cgroup with id, NULL if not.
> > > + * Should be called under rcu_read_lock() or cgroup_lock().
> > > + */
> > > +
> > > +struct cgroup *cgroup_lookup(int id)
> > > +{
> > > + struct cgroup *cgrp;
> > > + struct cgroup_id *cgid;
> > > +
> > > + rcu_read_lock();
> > > + cgid = idr_find(&cgroup_idr, id);
> > > + rcu_read_unlock();
> > > +
> > > + if (unlikely(!cgid))
> > > + return NULL;
> > > +
> > > + cgrp = rcu_dereference(cgid->myself);
> > > + if (unlikely(!cgrp || test_bit(CGRP_REMOVED, &cgrp->flags)))
> > > + return NULL;
> > > +
> > > + return cgrp;
> > > +}
> > > +
> > > +/**
> > > + * cgroup_get_next - lookup next cgroup under specified hierarchy.
> > > + * @id: current position of iteration.
> > > + * @root: search tree under this. .
> > > + * @foundid: position of found object.
> > > + *
> > > + * Search next cgroup under the specified hierarchy. If "cur" is NULL,
> > > + * start from root cgroup. Called under rcu_read_lock() or cgroup_lock()
> > > + * is necessary (to access a found cgroup.).
> > > + */
> > > +struct cgroup *cgroup_get_next(int id, struct cgroup *root, int *foundid)
> > > +{
> > > + struct cgroup *ret = NULL;
> > > + struct cgroup_id *tmp, *rootid;
> > > + int tmpid, depth, targetid;
> > > + unsigned long flags;
> > > +
> > > + rootid = root->id;
> > > + depth = rootid->depth;
> > > + targetid = rootid->id;
> > > + tmpid = id;
> > > +
> > > + do {
> > > + /* scan next entry from bitmap(tree) */
> > > + spin_lock_irqsave(&cgroup_idr_lock, flags);
> > > + tmp = idr_get_next(&cgroup_idr, &tmpid);
> > > + spin_unlock_irqrestore(&cgroup_idr_lock, flags);
> > > +
> > > + if (tmp) {
> > > + ret = rcu_dereference(tmp->myself);
> > > + /* Sanity check and check hierarchy */
> > > + if (ret && !test_bit(CGRP_REMOVED, &ret->flags)
> > > + && tmp->hierarchy_code[depth] == targetid)
> > > + break;
> > > + tmpid = tmpid + 1;
> > > + } else
> > > + tmpid = 0;
> > > + } while (1);
> > > +
> > > + *foundid = tmpid;
> > > + return ret;
> > > +}
> > > +
> > > /*
> > > * A couple of forward declarations required, due to cyclic reference loop:
> > > * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
> > > @@ -991,6 +1147,7 @@ static int cgroup_get_sb(struct file_sys
> > > /* New superblock */
> > > struct cgroup *cgrp = &root->top_cgroup;
> > > struct inode *inode;
> > > + struct cgroup_id *newid;
> > > int i;
> > >
> > > BUG_ON(sb->s_root != NULL);
> > > @@ -998,6 +1155,11 @@ static int cgroup_get_sb(struct file_sys
> > > ret = cgroup_get_rootdir(sb);
> > > if (ret)
> > > goto drop_new_super;
> > > +
> > > + ret = cgroup_prepare_id(NULL, &newid);
> > > + if (ret)
> > > + goto drop_new_super;
> > > +
> > > inode = sb->s_root->d_inode;
> > >
> > > mutex_lock(&inode->i_mutex);
> > > @@ -1062,7 +1224,7 @@ static int cgroup_get_sb(struct file_sys
> > > BUG_ON(!list_empty(&cgrp->sibling));
> > > BUG_ON(!list_empty(&cgrp->children));
> > > BUG_ON(root->number_of_cgroups != 1);
> > > -
> > > + cgroup_id_attach(newid, cgrp, NULL);
> > > cgroup_populate_dir(cgrp);
> > > mutex_unlock(&inode->i_mutex);
> > > mutex_unlock(&cgroup_mutex);
> > > @@ -1997,6 +2159,7 @@ int cgroup_scan_tasks(struct cgroup_scan
> > > return 0;
> > > }
> > >
> > > +
> > > /*
> > > * Stuff for reading the 'tasks' file.
> > > *
> > > @@ -2349,11 +2512,18 @@ static long cgroup_create(struct cgroup
> > > int err = 0;
> > > struct cgroup_subsys *ss;
> > > struct super_block *sb = root->sb;
> > > + struct cgroup_id *cgid = NULL;
> > >
> > > cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
> > > if (!cgrp)
> > > return -ENOMEM;
> > >
> > > + err = cgroup_prepare_id(parent, &cgid);
> > > + if (err) {
> > > + kfree(cgrp);
> > > + return err;
> > > + }
> > > +
> > > /* Grab a reference on the superblock so the hierarchy doesn't
> > > * get deleted on unmount if there are child cgroups. This
> > > * can be done outside cgroup_mutex, since the sb can't
> > > @@ -2394,6 +2564,8 @@ static long cgroup_create(struct cgroup
> > > err = cgroup_populate_dir(cgrp);
> > > /* If err < 0, we have a half-filled directory - oh well ;) */
> > >
> > > + cgroup_id_attach(cgid, cgrp, parent);
> > > +
> > > mutex_unlock(&cgroup_mutex);
> > > mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
> > >
> > > @@ -2415,7 +2587,7 @@ static long cgroup_create(struct cgroup
> > >
> > > /* Release the reference count that we took on the superblock */
> > > deactivate_super(sb);
> > > -
> > > + cgroup_id_put(cgid);
> > > kfree(cgrp);
> > > return err;
> > > }
> > > @@ -2494,6 +2666,8 @@ static int cgroup_rmdir(struct inode *un
> > > return -EBUSY;
> > > }
> > >
> > > + cgroup_id_detach(cgrp);
> > > +
> > > spin_lock(&release_list_lock);
> > > set_bit(CGRP_REMOVED, &cgrp->flags);
> > > if (!list_empty(&cgrp->release_list))
> > > Index: mmotm-2.6.28-Nov24/include/linux/idr.h
> > > ===================================================================
> > > --- mmotm-2.6.28-Nov24.orig/include/linux/idr.h
> > > +++ mmotm-2.6.28-Nov24/include/linux/idr.h
> > > @@ -106,6 +106,7 @@ int idr_get_new(struct idr *idp, void *p
> > > int idr_get_new_above(struct idr *idp, void *ptr, int starting_id, int *id);
> > > int idr_for_each(struct idr *idp,
> > > int (*fn)(int id, void *p, void *data), void *data);
> > > +void *idr_get_next(struct idr *idp, int *nextid);
> > > void *idr_replace(struct idr *idp, void *ptr, int id);
> > > void idr_remove(struct idr *idp, int id);
> > > void idr_remove_all(struct idr *idp);
> > > Index: mmotm-2.6.28-Nov24/lib/idr.c
> > > ===================================================================
> > > --- mmotm-2.6.28-Nov24.orig/lib/idr.c
> > > +++ mmotm-2.6.28-Nov24/lib/idr.c
> > > @@ -573,6 +573,52 @@ int idr_for_each(struct idr *idp,
> > > EXPORT_SYMBOL(idr_for_each);
> > >
> > > /**
> > > + * idr_get_next - lookup next opbject of id to given id.
> > > + * @idp: idr handle
> > > + * @id: pointer to lookup key
> > > + *
> > > + * Retruns pointer to registered object with id, which is next number to
> > > + * given id.
> > > + */
> > > +
> > > +void *idr_get_next(struct idr *idp, int *nextidp)
> > > +{
> > > + struct idr_layer *p, *pa[MAX_LEVEL];
> > > + struct idr_layer **paa = &pa[0];
> > > + int id = *nextidp;
> > > + int n, max;
> > > +
> > > + /* find first ent */
> > > + n = idp->layers * IDR_BITS;
> > > + max = 1 << n;
> > > + p = rcu_dereference(idp->top);
> > > + if (!p)
> > > + return NULL;
> > > +
> > > + while (id < max) {
> > > + while (n > 0 && p) {
> > > + n -= IDR_BITS;
> > > + *paa++ = p;
> > > + p = rcu_dereference(p->ary[(id >> n) & IDR_MASK]);
> > > + }
> > > +
> > > + if (p) {
> > > + *nextidp = id;
> > > + return p;
> > > + }
> > > +
> > > + id += 1 << n;
> > > + while (n < fls(id)) {
> > > + n += IDR_BITS;
> > > + p = *--paa;
> > > + }
> > > + }
> > > + return NULL;
> > > +}
> > > +
> > > +
> > > +
> > > +/**
> > > * idr_replace - replace pointer for given id
> > > * @idp: idr handle
> > > * @ptr: pointer you want associated with the id
> > >
> > >
> >
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>
^ permalink raw reply [flat|nested] 12+ messages in thread* Re: [RFC][PATCH 1/2] CGROUP ID and Hierarchy Code
2008-11-28 1:35 ` KAMEZAWA Hiroyuki
2008-11-28 1:51 ` KAMEZAWA Hiroyuki
@ 2008-11-28 2:18 ` KAMEZAWA Hiroyuki
2008-11-28 2:38 ` Li Zefan
1 sibling, 1 reply; 12+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-11-28 2:18 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: Li Zefan, linux-kernel@vger.kernel.org, menage@google.com,
balbir@linux.vnet.ibm.com, nishimura@mxp.nes.nec.co.jp, taka
On Fri, 28 Nov 2008 10:35:33 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> On Fri, 28 Nov 2008 09:28:33 +0800
> Li Zefan <lizf@cn.fujitsu.com> wrote:
> > I think the safe way is:
> >
> > rcu_read_lock();
> > cgrp = cgroup_get_next()
> > if (!inc_not_zero(cgrp) {
> > rcu_read_unlock();
> > return NULL;
> > }
> > rcu_read_unlock();
> > return cgrp;
> >
> > But it's also safe to use cgrp = list_entry(&parent->children.next) for the above
> > scenario, seems you don't have to invent this cgroup_get_next().
> >
> But list-walk can't provide us view of hierarchy. Up-Down list walk is better ?
> please see memcg's code. it met some troubles.
> I'll make kfree(cgrp) to be called by RCU.
>
Ah please, I have a question.
Following is right ?
- until mount, cgroup's subsystem root is tied to dummy? root cgroup.
- cgroup cannot be unmounted while there are any children.
- cgroup_root is freed/allocated at umount/mount, then I have to handle
this event in cgroup-id.
(maybe current code has leak? in umount.)
Thanks,
-Kame
^ permalink raw reply [flat|nested] 12+ messages in thread* Re: [RFC][PATCH 1/2] CGROUP ID and Hierarchy Code
2008-11-28 2:18 ` KAMEZAWA Hiroyuki
@ 2008-11-28 2:38 ` Li Zefan
2008-11-28 2:41 ` KAMEZAWA Hiroyuki
0 siblings, 1 reply; 12+ messages in thread
From: Li Zefan @ 2008-11-28 2:38 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: linux-kernel@vger.kernel.org, menage@google.com,
balbir@linux.vnet.ibm.com, nishimura@mxp.nes.nec.co.jp, taka
KAMEZAWA Hiroyuki wrote:
> On Fri, 28 Nov 2008 10:35:33 +0900
> KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
>
>> On Fri, 28 Nov 2008 09:28:33 +0800
>> Li Zefan <lizf@cn.fujitsu.com> wrote:
>
>>> I think the safe way is:
>>>
>>> rcu_read_lock();
>>> cgrp = cgroup_get_next()
>>> if (!inc_not_zero(cgrp) {
>>> rcu_read_unlock();
>>> return NULL;
>>> }
>>> rcu_read_unlock();
>>> return cgrp;
>>>
>>> But it's also safe to use cgrp = list_entry(&parent->children.next) for the above
>>> scenario, seems you don't have to invent this cgroup_get_next().
>>>
>> But list-walk can't provide us view of hierarchy. Up-Down list walk is better ?
>> please see memcg's code. it met some troubles.
>> I'll make kfree(cgrp) to be called by RCU.
>>
>
> Ah please, I have a question.
> Following is right ?
> - until mount, cgroup's subsystem root is tied to dummy? root cgroup.
Yes.
> - cgroup cannot be unmounted while there are any children.
No, you can unmount. You can have it a try. :)
> - cgroup_root is freed/allocated at umount/mount, then I have to handle
No, kill_sb() is called when deactivate_super() decreases sb->s_active to 0, and
whenever we create a cgroup, sb->s_active is increased, so if there are sub-dirs
the cgroup_root won't be freed at umount.
> this event in cgroup-id.
> (maybe current code has leak? in umount.)
>
So I think there is no leak. ;)
> Thanks,
> -Kame
>
>
>
^ permalink raw reply [flat|nested] 12+ messages in thread* Re: [RFC][PATCH 1/2] CGROUP ID and Hierarchy Code
2008-11-28 2:38 ` Li Zefan
@ 2008-11-28 2:41 ` KAMEZAWA Hiroyuki
0 siblings, 0 replies; 12+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-11-28 2:41 UTC (permalink / raw)
To: Li Zefan
Cc: linux-kernel@vger.kernel.org, menage@google.com,
balbir@linux.vnet.ibm.com, nishimura@mxp.nes.nec.co.jp, taka
On Fri, 28 Nov 2008 10:38:26 +0800
Li Zefan <lizf@cn.fujitsu.com> wrote:
> KAMEZAWA Hiroyuki wrote:
> > On Fri, 28 Nov 2008 10:35:33 +0900
> > KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> >
> >> On Fri, 28 Nov 2008 09:28:33 +0800
> >> Li Zefan <lizf@cn.fujitsu.com> wrote:
> >
> >>> I think the safe way is:
> >>>
> >>> rcu_read_lock();
> >>> cgrp = cgroup_get_next()
> >>> if (!inc_not_zero(cgrp) {
> >>> rcu_read_unlock();
> >>> return NULL;
> >>> }
> >>> rcu_read_unlock();
> >>> return cgrp;
> >>>
> >>> But it's also safe to use cgrp = list_entry(&parent->children.next) for the above
> >>> scenario, seems you don't have to invent this cgroup_get_next().
> >>>
> >> But list-walk can't provide us view of hierarchy. Up-Down list walk is better ?
> >> please see memcg's code. it met some troubles.
> >> I'll make kfree(cgrp) to be called by RCU.
> >>
> >
> > Ah please, I have a question.
> > Following is right ?
> > - until mount, cgroup's subsystem root is tied to dummy? root cgroup.
>
> Yes.
>
> > - cgroup cannot be unmounted while there are any children.
>
> No, you can unmount. You can have it a try. :)
>
I tried and did ;)
> > - cgroup_root is freed/allocated at umount/mount, then I have to handle
>
> No, kill_sb() is called when deactivate_super() decreases sb->s_active to 0, and
> whenever we create a cgroup, sb->s_active is increased, so if there are sub-dirs
> the cgroup_root won't be freed at umount.
>
Thank you! very helpful.
So, I should take care ofe only kill_sb() case.
(it frees pointer of root (and root->topcgroup.) and I have to update it.)
Regards,
-Kame
> > this event in cgroup-id.
> > (maybe current code has leak? in umount.)
> >
>
> So I think there is no leak. ;)
>
> > Thanks,
> > -Kame
> >
> >
> >
>
^ permalink raw reply [flat|nested] 12+ messages in thread