* [PATCH v2 2/6] sched: split out css_online/css_offline from tg creation/destruction
[not found] ` <5100D4FE.9080205-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
@ 2013-01-24 6:30 ` Li Zefan
[not found] ` <5100D518.8000103-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
2013-01-24 6:31 ` [PATCH v2 3/6] sched: remove redundant NULL cgroup check in task_group_path() Li Zefan
` (4 subsequent siblings)
5 siblings, 1 reply; 14+ messages in thread
From: Li Zefan @ 2013-01-24 6:30 UTC (permalink / raw)
To: Tejun Heo; +Cc: Andrew Morton, Peter Zijlstra, Ingo Molnar, LKML, Cgroups
This is a preparation for later patches.
- What do we gain from cpu_cgroup_css_online():
After ss->css_alloc() and before ss->css_online(), there's a small
window that tg->css.cgroup is NULL. With this change, tg won't be seen
before ss->css_online(), where it's added to the global list, so we're
guaranteed we'll never see NULL tg->css.cgroup.
- What do we gain from cpu_cgroup_css_offline():
tg is freed via RCU, so is cgroup. Without this change, this is how
synchronization works:
cgroup_rmdir()
no ss->css_offline()
diput()
synchronize_rcu()
ss->css_free() <-- unregister tg, and free it via call_rcu()
kfree_rcu(cgroup) <-- wait possible refs to cgroup, and free cgroup
We can't just kfree(cgroup), because tg might access tg->css.cgroup.
With this change:
cgroup_rmdir()
ss->css_offline() <-- unregister tg
diput()
synchronize_rcu() <-- wait possible refs to tg and cgroup
ss->css_free() <-- free tg
kfree_rcu(cgroup) <-- free cgroup
As you see, kfree_rcu() is redundant now.
Signed-off-by: Li Zefan <lizefan-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
---
v2: changelog rephrased
---
include/linux/sched.h | 3 +++
kernel/sched/auto_group.c | 3 +++
kernel/sched/core.c | 49 +++++++++++++++++++++++++++++++++++++----------
3 files changed, 45 insertions(+), 10 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 206bb08..577eb97 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2750,7 +2750,10 @@ extern void normalize_rt_tasks(void);
extern struct task_group root_task_group;
extern struct task_group *sched_create_group(struct task_group *parent);
+extern void sched_online_group(struct task_group *tg,
+ struct task_group *parent);
extern void sched_destroy_group(struct task_group *tg);
+extern void sched_offline_group(struct task_group *tg);
extern void sched_move_task(struct task_struct *tsk);
#ifdef CONFIG_FAIR_GROUP_SCHED
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 0984a21..64de5f8 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -35,6 +35,7 @@ static inline void autogroup_destroy(struct kref *kref)
ag->tg->rt_se = NULL;
ag->tg->rt_rq = NULL;
#endif
+ sched_offline_group(ag->tg);
sched_destroy_group(ag->tg);
}
@@ -76,6 +77,8 @@ static inline struct autogroup *autogroup_create(void)
if (IS_ERR(tg))
goto out_free;
+ sched_online_group(tg, &root_task_group);
+
kref_init(&ag->kref);
init_rwsem(&ag->lock);
ag->id = atomic_inc_return(&autogroup_seq_nr);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 257002c..1061672 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7159,7 +7159,6 @@ static void free_sched_group(struct task_group *tg)
struct task_group *sched_create_group(struct task_group *parent)
{
struct task_group *tg;
- unsigned long flags;
tg = kzalloc(sizeof(*tg), GFP_KERNEL);
if (!tg)
@@ -7171,6 +7170,17 @@ struct task_group *sched_create_group(struct task_group *parent)
if (!alloc_rt_sched_group(tg, parent))
goto err;
+ return tg;
+
+err:
+ free_sched_group(tg);
+ return ERR_PTR(-ENOMEM);
+}
+
+void sched_online_group(struct task_group *tg, struct task_group *parent)
+{
+ unsigned long flags;
+
spin_lock_irqsave(&task_group_lock, flags);
list_add_rcu(&tg->list, &task_groups);
@@ -7180,12 +7190,6 @@ struct task_group *sched_create_group(struct task_group *parent)
INIT_LIST_HEAD(&tg->children);
list_add_rcu(&tg->siblings, &parent->children);
spin_unlock_irqrestore(&task_group_lock, flags);
-
- return tg;
-
-err:
- free_sched_group(tg);
- return ERR_PTR(-ENOMEM);
}
/* rcu callback to free various structures associated with a task group */
@@ -7198,6 +7202,12 @@ static void free_sched_group_rcu(struct rcu_head *rhp)
/* Destroy runqueue etc associated with a task group */
void sched_destroy_group(struct task_group *tg)
{
+ /* wait for possible concurrent references to cfs_rqs complete */
+ call_rcu(&tg->rcu, free_sched_group_rcu);
+}
+
+void sched_offline_group(struct task_group *tg)
+{
unsigned long flags;
int i;
@@ -7209,9 +7219,6 @@ void sched_destroy_group(struct task_group *tg)
list_del_rcu(&tg->list);
list_del_rcu(&tg->siblings);
spin_unlock_irqrestore(&task_group_lock, flags);
-
- /* wait for possible concurrent references to cfs_rqs complete */
- call_rcu(&tg->rcu, free_sched_group_rcu);
}
/* change task's runqueue when it moves between groups.
@@ -7563,6 +7570,19 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
return &tg->css;
}
+static int cpu_cgroup_css_online(struct cgroup *cgrp)
+{
+ struct task_group *tg = cgroup_tg(cgrp);
+ struct task_group *parent;
+
+ if (!cgrp->parent)
+ return 0;
+
+ parent = cgroup_tg(cgrp->parent);
+ sched_online_group(tg, parent);
+ return 0;
+}
+
static void cpu_cgroup_css_free(struct cgroup *cgrp)
{
struct task_group *tg = cgroup_tg(cgrp);
@@ -7570,6 +7590,13 @@ static void cpu_cgroup_css_free(struct cgroup *cgrp)
sched_destroy_group(tg);
}
+static void cpu_cgroup_css_offline(struct cgroup *cgrp)
+{
+ struct task_group *tg = cgroup_tg(cgrp);
+
+ sched_offline_group(tg);
+}
+
static int cpu_cgroup_can_attach(struct cgroup *cgrp,
struct cgroup_taskset *tset)
{
@@ -7925,6 +7952,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
.name = "cpu",
.css_alloc = cpu_cgroup_css_alloc,
.css_free = cpu_cgroup_css_free,
+ .css_online = cpu_cgroup_css_online,
+ .css_offline = cpu_cgroup_css_offline,
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
.exit = cpu_cgroup_exit,
--
1.8.0.2
^ permalink raw reply related [flat|nested] 14+ messages in thread* [PATCH v2 3/6] sched: remove redundant NULL cgroup check in task_group_path()
[not found] ` <5100D4FE.9080205-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
2013-01-24 6:30 ` [PATCH v2 2/6] sched: split out css_online/css_offline from tg creation/destruction Li Zefan
@ 2013-01-24 6:31 ` Li Zefan
2013-01-24 6:31 ` [PATCH v2 4/6] cgroup: remove duplicate RCU free on struct cgroup Li Zefan
` (3 subsequent siblings)
5 siblings, 0 replies; 14+ messages in thread
From: Li Zefan @ 2013-01-24 6:31 UTC (permalink / raw)
To: Tejun Heo; +Cc: Andrew Morton, Peter Zijlstra, Ingo Molnar, LKML, Cgroups
A task_group won't be online (thus no one can see it) until
cpu_cgroup_css_online(), and at that time tg->css.cgroup has
been initialized, so this NULL check is redundant.
Signed-off-by: Li Zefan <lizefan-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
---
kernel/sched/debug.c | 7 -------
1 file changed, 7 deletions(-)
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2cd3c1b..38df0db 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -110,13 +110,6 @@ static char *task_group_path(struct task_group *tg)
if (autogroup_path(tg, group_path, PATH_MAX))
return group_path;
- /*
- * May be NULL if the underlying cgroup isn't fully-created yet
- */
- if (!tg->css.cgroup) {
- group_path[0] = '\0';
- return group_path;
- }
cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
return group_path;
}
--
1.8.0.2
^ permalink raw reply related [flat|nested] 14+ messages in thread* [PATCH v2 4/6] cgroup: remove duplicate RCU free on struct cgroup
[not found] ` <5100D4FE.9080205-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
2013-01-24 6:30 ` [PATCH v2 2/6] sched: split out css_online/css_offline from tg creation/destruction Li Zefan
2013-01-24 6:31 ` [PATCH v2 3/6] sched: remove redundant NULL cgroup check in task_group_path() Li Zefan
@ 2013-01-24 6:31 ` Li Zefan
2013-01-24 6:31 ` [PATCH v2 5/6] cgroup: remove synchronize_rcu() from cgroup_diput() Li Zefan
` (2 subsequent siblings)
5 siblings, 0 replies; 14+ messages in thread
From: Li Zefan @ 2013-01-24 6:31 UTC (permalink / raw)
To: Tejun Heo; +Cc: Andrew Morton, Peter Zijlstra, Ingo Molnar, LKML, Cgroups
When destroying a cgroup, though in cgroup_diput() we've called
synchronize_rcu(), we then still have to free it via call_rcu().
The story is, long ago to fix a race between reading /proc/sched_debug
and freeing cgroup, the code was changed to utilize call_rcu(). See
commit a47295e6bc42ad35f9c15ac66f598aa24debd4e2 ("cgroups: make
cgroup_path() RCU-safe")
As we've fixed the cpu cgroup so that cpu_cgroup_css_offline() is used
to unregister a task_group, there won't be concurrent access
to this task_group after synchronize_rcu() in diput(). Now we can
just kfree(cgrp).
Signed-off-by: Li Zefan <lizefan-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
---
kernel/cgroup.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b9a76e2..a39e2b0 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -892,7 +892,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
simple_xattrs_free(&cgrp->xattrs);
ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
- kfree_rcu(cgrp, rcu_head);
+ kfree(cgrp);
} else {
struct cfent *cfe = __d_cfe(dentry);
struct cgroup *cgrp = dentry->d_parent->d_fsdata;
--
1.8.0.2
^ permalink raw reply related [flat|nested] 14+ messages in thread* [PATCH v2 5/6] cgroup: remove synchronize_rcu() from cgroup_diput()
[not found] ` <5100D4FE.9080205-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
` (2 preceding siblings ...)
2013-01-24 6:31 ` [PATCH v2 4/6] cgroup: remove duplicate RCU free on struct cgroup Li Zefan
@ 2013-01-24 6:31 ` Li Zefan
2013-01-24 6:32 ` [PATCH v2 6/6] cgroup: remove bogus comments in cgroup_diput() Li Zefan
2013-01-24 20:06 ` [PATCH v2 1/6] cgroup: initialize cgrp->dentry before css_alloc() Tejun Heo
5 siblings, 0 replies; 14+ messages in thread
From: Li Zefan @ 2013-01-24 6:31 UTC (permalink / raw)
To: Tejun Heo; +Cc: Andrew Morton, Peter Zijlstra, Ingo Molnar, LKML, Cgroups
Free cgroup via call_rcu(). The actual work is done through
workqueue.
Signed-off-by: Li Zefan <lizefan-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
---
v2: move INIT_WORK() from cgroup_create() to init_cgroup_housekeeping()
---
include/linux/cgroup.h | 1 +
kernel/cgroup.c | 72 ++++++++++++++++++++++++++++++--------------------
2 files changed, 44 insertions(+), 29 deletions(-)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 8118a31..900af59 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -203,6 +203,7 @@ struct cgroup {
/* For RCU-protected deletion */
struct rcu_head rcu_head;
+ struct work_struct free_work;
/* List of events which userspace want to receive */
struct list_head event_list;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a39e2b0..f18b0d9 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -852,12 +852,52 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
return inode;
}
+static void cgroup_free_fn(struct work_struct *work)
+{
+ struct cgroup *cgrp = container_of(work, struct cgroup, free_work);
+ struct cgroup_subsys *ss;
+
+ mutex_lock(&cgroup_mutex);
+ /*
+ * Release the subsystem state objects.
+ */
+ for_each_subsys(cgrp->root, ss)
+ ss->css_free(cgrp);
+
+ cgrp->root->number_of_cgroups--;
+ mutex_unlock(&cgroup_mutex);
+
+ /*
+ * Drop the active superblock reference that we took when we
+ * created the cgroup
+ */
+ deactivate_super(cgrp->root->sb);
+
+ /*
+ * if we're getting rid of the cgroup, refcount should ensure
+ * that there are no pidlists left.
+ */
+ BUG_ON(!list_empty(&cgrp->pidlists));
+
+ simple_xattrs_free(&cgrp->xattrs);
+
+ ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
+ kfree(cgrp);
+}
+
+static void cgroup_free_rcu(struct rcu_head *head)
+{
+ struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
+
+ schedule_work(&cgrp->free_work);
+}
+
static void cgroup_diput(struct dentry *dentry, struct inode *inode)
{
/* is dentry a directory ? if so, kfree() associated cgroup */
if (S_ISDIR(inode->i_mode)) {
struct cgroup *cgrp = dentry->d_fsdata;
- struct cgroup_subsys *ss;
+
BUG_ON(!(cgroup_is_removed(cgrp)));
/* It's possible for external users to be holding css
* reference counts on a cgroup; css_put() needs to
@@ -865,34 +905,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
* the reference count in order to know if it needs to
* queue the cgroup to be handled by the release
* agent */
- synchronize_rcu();
-
- mutex_lock(&cgroup_mutex);
- /*
- * Release the subsystem state objects.
- */
- for_each_subsys(cgrp->root, ss)
- ss->css_free(cgrp);
-
- cgrp->root->number_of_cgroups--;
- mutex_unlock(&cgroup_mutex);
-
- /*
- * Drop the active superblock reference that we took when we
- * created the cgroup
- */
- deactivate_super(cgrp->root->sb);
-
- /*
- * if we're getting rid of the cgroup, refcount should ensure
- * that there are no pidlists left.
- */
- BUG_ON(!list_empty(&cgrp->pidlists));
-
- simple_xattrs_free(&cgrp->xattrs);
-
- ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
- kfree(cgrp);
+ call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
} else {
struct cfent *cfe = __d_cfe(dentry);
struct cgroup *cgrp = dentry->d_parent->d_fsdata;
@@ -1391,6 +1404,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->allcg_node);
INIT_LIST_HEAD(&cgrp->release_list);
INIT_LIST_HEAD(&cgrp->pidlists);
+ INIT_WORK(&cgrp->free_work, cgroup_free_fn);
mutex_init(&cgrp->pidlist_mutex);
INIT_LIST_HEAD(&cgrp->event_list);
spin_lock_init(&cgrp->event_list_lock);
--
1.8.0.2
^ permalink raw reply related [flat|nested] 14+ messages in thread* [PATCH v2 6/6] cgroup: remove bogus comments in cgroup_diput()
[not found] ` <5100D4FE.9080205-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
` (3 preceding siblings ...)
2013-01-24 6:31 ` [PATCH v2 5/6] cgroup: remove synchronize_rcu() from cgroup_diput() Li Zefan
@ 2013-01-24 6:32 ` Li Zefan
[not found] ` <5100D562.6050407-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
2013-01-24 20:06 ` [PATCH v2 1/6] cgroup: initialize cgrp->dentry before css_alloc() Tejun Heo
5 siblings, 1 reply; 14+ messages in thread
From: Li Zefan @ 2013-01-24 6:32 UTC (permalink / raw)
To: Tejun Heo; +Cc: Andrew Morton, Peter Zijlstra, Ingo Molnar, LKML, Cgroups
Since commit 48ddbe194623ae089cc0576e60363f2d2e85662a
("cgroup: make css->refcnt clearing on cgroup removal optional"),
each css holds a ref on cgroup's dentry, so cgroup_diput() won't be
called until all css' refs go down to 0, which invalidates the comments.
Signed-off-by: Li Zefan <lizefan-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
---
kernel/cgroup.c | 6 ------
1 file changed, 6 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f18b0d9..368cff1 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -899,12 +899,6 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
struct cgroup *cgrp = dentry->d_fsdata;
BUG_ON(!(cgroup_is_removed(cgrp)));
- /* It's possible for external users to be holding css
- * reference counts on a cgroup; css_put() needs to
- * be able to access the cgroup after decrementing
- * the reference count in order to know if it needs to
- * queue the cgroup to be handled by the release
- * agent */
call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
} else {
struct cfent *cfe = __d_cfe(dentry);
--
1.8.0.2
^ permalink raw reply related [flat|nested] 14+ messages in thread* Re: [PATCH v2 1/6] cgroup: initialize cgrp->dentry before css_alloc()
[not found] ` <5100D4FE.9080205-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
` (4 preceding siblings ...)
2013-01-24 6:32 ` [PATCH v2 6/6] cgroup: remove bogus comments in cgroup_diput() Li Zefan
@ 2013-01-24 20:06 ` Tejun Heo
5 siblings, 0 replies; 14+ messages in thread
From: Tejun Heo @ 2013-01-24 20:06 UTC (permalink / raw)
To: Li Zefan; +Cc: Andrew Morton, Peter Zijlstra, Ingo Molnar, LKML, Cgroups
On Thu, Jan 24, 2013 at 02:30:22PM +0800, Li Zefan wrote:
> With this change, we're guaranteed that cgroup_path() won't see NULL
> cgrp->dentry, and thus we can remove the NULL check in it.
>
> (Well, it's not true, because dummytop.dentry is always NULL)
>
> Signed-off-by: Li Zefan <lizefan-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
Appled 1-6 to cgroup/for-3.9.
Thanks.
--
tejun
^ permalink raw reply [flat|nested] 14+ messages in thread