Linux Power Management development
 help / color / mirror / Atom feed
* [PATCH 4/9] cgroup_freezer: trivial cleanups
From: Tejun Heo @ 2012-11-03  8:38 UTC (permalink / raw)
  To: lizefan-hv44wF8Li93QT0dZR+AlfA, mhocko-AlSwsSmVLrQ,
	rjw-KKrjLPT3xs0
  Cc: linux-pm-u79uwXL29TY76Z2rM5mHXA, fweisbec-Re5JQEeQqe8AvxtiuMwx3w,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Tejun Heo,
	cgroups-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1351931915-1701-1-git-send-email-tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>

* Clean-up indentation and line-breaks.  Drop the invalid comment
  about freezer->lock.

* Make all internal functions take @freezer instead of both @cgroup
  and @freezer.

Signed-off-by: Tejun Heo <tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
---
 kernel/cgroup_freezer.c | 41 +++++++++++++++++++----------------------
 1 file changed, 19 insertions(+), 22 deletions(-)

diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index bedefd9..975b3d8 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -29,17 +29,15 @@ enum freezer_state {
 };
 
 struct freezer {
-	struct cgroup_subsys_state css;
-	enum freezer_state state;
-	spinlock_t lock; /* protects _writes_ to state */
+	struct cgroup_subsys_state	css;
+	enum freezer_state		state;
+	spinlock_t			lock;
 };
 
-static inline struct freezer *cgroup_freezer(
-		struct cgroup *cgroup)
+static inline struct freezer *cgroup_freezer(struct cgroup *cgroup)
 {
-	return container_of(
-		cgroup_subsys_state(cgroup, freezer_subsys_id),
-		struct freezer, css);
+	return container_of(cgroup_subsys_state(cgroup, freezer_subsys_id),
+			    struct freezer, css);
 }
 
 static inline struct freezer *task_freezer(struct task_struct *task)
@@ -180,8 +178,9 @@ out:
  * migrated into or out of @cgroup, so we can't verify task states against
  * @freezer state here.  See freezer_attach() for details.
  */
-static void update_if_frozen(struct cgroup *cgroup, struct freezer *freezer)
+static void update_if_frozen(struct freezer *freezer)
 {
+	struct cgroup *cgroup = freezer->css.cgroup;
 	struct cgroup_iter it;
 	struct task_struct *task;
 
@@ -211,12 +210,11 @@ notyet:
 static int freezer_read(struct cgroup *cgroup, struct cftype *cft,
 			struct seq_file *m)
 {
-	struct freezer *freezer;
+	struct freezer *freezer = cgroup_freezer(cgroup);
 	enum freezer_state state;
 
-	freezer = cgroup_freezer(cgroup);
 	spin_lock_irq(&freezer->lock);
-	update_if_frozen(cgroup, freezer);
+	update_if_frozen(freezer);
 	state = freezer->state;
 	spin_unlock_irq(&freezer->lock);
 
@@ -225,8 +223,9 @@ static int freezer_read(struct cgroup *cgroup, struct cftype *cft,
 	return 0;
 }
 
-static void freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
+static void freeze_cgroup(struct freezer *freezer)
 {
+	struct cgroup *cgroup = freezer->css.cgroup;
 	struct cgroup_iter it;
 	struct task_struct *task;
 
@@ -236,8 +235,9 @@ static void freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
 	cgroup_iter_end(cgroup, &it);
 }
 
-static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
+static void unfreeze_cgroup(struct freezer *freezer)
 {
+	struct cgroup *cgroup = freezer->css.cgroup;
 	struct cgroup_iter it;
 	struct task_struct *task;
 
@@ -247,11 +247,9 @@ static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
 	cgroup_iter_end(cgroup, &it);
 }
 
-static void freezer_change_state(struct cgroup *cgroup,
+static void freezer_change_state(struct freezer *freezer,
 				 enum freezer_state goal_state)
 {
-	struct freezer *freezer = cgroup_freezer(cgroup);
-
 	/* also synchronizes against task migration, see freezer_attach() */
 	spin_lock_irq(&freezer->lock);
 
@@ -260,13 +258,13 @@ static void freezer_change_state(struct cgroup *cgroup,
 		if (freezer->state != CGROUP_THAWED)
 			atomic_dec(&system_freezing_cnt);
 		freezer->state = CGROUP_THAWED;
-		unfreeze_cgroup(cgroup, freezer);
+		unfreeze_cgroup(freezer);
 		break;
 	case CGROUP_FROZEN:
 		if (freezer->state == CGROUP_THAWED)
 			atomic_inc(&system_freezing_cnt);
 		freezer->state = CGROUP_FREEZING;
-		freeze_cgroup(cgroup, freezer);
+		freeze_cgroup(freezer);
 		break;
 	default:
 		BUG();
@@ -275,8 +273,7 @@ static void freezer_change_state(struct cgroup *cgroup,
 	spin_unlock_irq(&freezer->lock);
 }
 
-static int freezer_write(struct cgroup *cgroup,
-			 struct cftype *cft,
+static int freezer_write(struct cgroup *cgroup, struct cftype *cft,
 			 const char *buffer)
 {
 	enum freezer_state goal_state;
@@ -288,7 +285,7 @@ static int freezer_write(struct cgroup *cgroup,
 	else
 		return -EINVAL;
 
-	freezer_change_state(cgroup, goal_state);
+	freezer_change_state(cgroup_freezer(cgroup), goal_state);
 	return 0;
 }
 
-- 
1.7.11.7

^ permalink raw reply related

* [PATCH 3/9] cgroup: implement generic child / descendant walk macros
From: Tejun Heo @ 2012-11-03  8:38 UTC (permalink / raw)
  To: lizefan-hv44wF8Li93QT0dZR+AlfA, mhocko-AlSwsSmVLrQ,
	rjw-KKrjLPT3xs0
  Cc: linux-pm-u79uwXL29TY76Z2rM5mHXA, fweisbec-Re5JQEeQqe8AvxtiuMwx3w,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Tejun Heo,
	cgroups-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1351931915-1701-1-git-send-email-tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>

Currently, cgroup doesn't provide any generic helper for walking a
given cgroup's children or descendants.  This patch adds the following
three macros.

* cgroup_for_each_child() - walk immediate children of a cgroup.

* cgroup_for_each_descendant_pre() - visit all descendants of a cgroup
  in pre-order tree traversal.

* cgroup_for_each_descendant_post() - visit all descendants of a
  cgroup in post-order tree traversal.

All three only require the user to hold RCU read lock during
traversal.  Verifying that each iterated cgroup is online is the
responsibility of the user.  When used with proper synchronization,
cgroup_for_each_descendant_pre() can be used to propagate config
updates to descendants in reliable way.  See comments for details.

Signed-off-by: Tejun Heo <tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
---
 include/linux/cgroup.h | 82 +++++++++++++++++++++++++++++++++++++++++++++++
 kernel/cgroup.c        | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 168 insertions(+)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 90c33eb..0020329 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -534,6 +534,88 @@ static inline struct cgroup* task_cgroup(struct task_struct *task,
 	return task_subsys_state(task, subsys_id)->cgroup;
 }
 
+/**
+ * cgroup_for_each_child - iterate through children of a cgroup
+ * @pos: the cgroup * to use as the loop cursor
+ * @cgroup: cgroup whose children to walk
+ *
+ * Walk @cgroup's children.  Must be called under rcu_read_lock().  A child
+ * cgroup which hasn't finished ->post_create() or already has finished
+ * ->pre_destroy() may show up during traversal and it's each subsystem's
+ * responsibility to verify that each @pos is alive.
+ *
+ * If a subsystem synchronizes against the parent in its ->post_create()
+ * and before starting iterating, a cgroup which finished ->post_create()
+ * is guaranteed to be visible in the future iterations.
+ */
+#define cgroup_for_each_child(pos, cgroup)				\
+	list_for_each_entry_rcu(pos, &(cgroup)->children, sibling)
+
+struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
+					  struct cgroup *cgroup);
+
+/**
+ * cgroup_for_each_descendant_pre - pre-order walk of a cgroup's descendants
+ * @pos: the cgroup * to use as the loop cursor
+ * @cgroup: cgroup whose descendants to walk
+ *
+ * Walk @cgroup's descendants.  Must be called under rcu_read_lock().  A
+ * descendant cgroup which hasn't finished ->post_create() or already has
+ * finished ->pre_destroy() may show up during traversal and it's each
+ * subsystem's responsibility to verify that each @pos is alive.
+ *
+ * If a subsystem synchronizes against the parent in its ->post_create()
+ * and before starting iterating, and synchronizes against @pos on each
+ * iteration, any descendant cgroup which finished ->post_create() is
+ * guaranteed to be visible in the future iterations.
+ *
+ * In other words, the following guarantees that a descendant can't escape
+ * configuration of its ancestors.
+ *
+ * my_post_create(@cgrp)
+ * {
+ *	Lock @cgrp->parent and @cgrp;
+ *	Inherit config from @cgrp->parent;
+ *	Unlock both.
+ * }
+ *
+ * my_update_config(@cgrp)
+ * {
+ *	Lock @cgrp;
+ *	Update @cgrp's config;
+ *	Unlock @cgrp;
+ *
+ *	cgroup_for_each_descendant_pre(@pos, @cgrp) {
+ *		Lock @pos;
+ *		Verify @pos is alive and inherit config from @pos->parent;
+ *		Unlock @pos;
+ *	}
+ * }
+ *
+ * Alternatively, a subsystem may choose to use a single global lock to
+ * synchronize ->post_create() and ->pre_destroy() against tree-walking
+ * operations.
+ */
+#define cgroup_for_each_descendant_pre(pos, cgroup)			\
+	for (pos = cgroup_next_descendant_pre(NULL, (cgroup)); (pos);	\
+	     pos = cgroup_next_descendant_pre((pos), (cgroup)))
+
+struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
+					   struct cgroup *cgroup);
+
+/**
+ * cgroup_for_each_descendant_post - post-order walk of a cgroup's descendants
+ * @pos: the cgroup * to use as the loop cursor
+ * @cgroup: cgroup whose descendants to walk
+ *
+ * Similar to cgroup_for_each_descendant_pre() but performs post-order
+ * traversal instead.  Note that the walk visibility guarantee described in
+ * pre-order walk doesn't apply the same to post-order walks.
+ */
+#define cgroup_for_each_descendant_post(pos, cgroup)			\
+	for (pos = cgroup_next_descendant_post(NULL, (cgroup)); (pos);	\
+	     pos = cgroup_next_descendant_post((pos), (cgroup)))
+
 /* A cgroup_iter should be treated as an opaque object */
 struct cgroup_iter {
 	struct list_head *cg_link;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index cc5d2a0..8bd662c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2985,6 +2985,92 @@ static void cgroup_enable_task_cg_lists(void)
 	write_unlock(&css_set_lock);
 }
 
+/**
+ * cgroup_next_descendant_pre - find the next descendant for pre-order walk
+ * @pos: the current position (%NULL to initiate traversal)
+ * @cgroup: cgroup whose descendants to walk
+ *
+ * To be used by cgroup_for_each_descendant_pre().  Find the next
+ * descendant to visit for pre-order traversal of @cgroup's descendants.
+ */
+struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
+					  struct cgroup *cgroup)
+{
+	struct cgroup *next;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	/* if first iteration, pretend we just visited @cgroup */
+	if (!pos) {
+		if (list_empty(&cgroup->children))
+			return NULL;
+		pos = cgroup;
+	}
+
+	/* visit the first child if exists */
+	next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling);
+	if (next)
+		return next;
+
+	/* no child, visit my or the closest ancestor's next sibling */
+	do {
+		next = list_entry_rcu(pos->sibling.next, struct cgroup,
+				      sibling);
+		if (&next->sibling != &pos->parent->children)
+			return next;
+
+		pos = pos->parent;
+	} while (pos != cgroup);
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
+
+static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
+{
+	struct cgroup *last;
+
+	do {
+		last = pos;
+		pos = list_first_or_null_rcu(&pos->children, struct cgroup,
+					     sibling);
+	} while (pos);
+
+	return last;
+}
+
+/**
+ * cgroup_next_descendant_post - find the next descendant for post-order walk
+ * @pos: the current position (%NULL to initiate traversal)
+ * @cgroup: cgroup whose descendants to walk
+ *
+ * To be used by cgroup_for_each_descendant_post().  Find the next
+ * descendant to visit for post-order traversal of @cgroup's descendants.
+ */
+struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
+					   struct cgroup *cgroup)
+{
+	struct cgroup *next;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	/* if first iteration, visit the leftmost descendant */
+	if (!pos) {
+		next = cgroup_leftmost_descendant(cgroup);
+		return next != cgroup ? next : NULL;
+	}
+
+	/* if there's an unvisited sibling, visit its leftmost descendant */
+	next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
+	if (&next->sibling != &pos->parent->children)
+		return cgroup_leftmost_descendant(next);
+
+	/* no sibling left, visit parent */
+	next = pos->parent;
+	return next != cgroup ? next : NULL;
+}
+EXPORT_SYMBOL_GPL(cgroup_next_descendant_post);
+
 void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
 	__acquires(css_set_lock)
 {
-- 
1.7.11.7

^ permalink raw reply related

* [PATCH 2/9] cgroup: Use rculist ops for cgroup->children
From: Tejun Heo @ 2012-11-03  8:38 UTC (permalink / raw)
  To: lizefan-hv44wF8Li93QT0dZR+AlfA, mhocko-AlSwsSmVLrQ,
	rjw-KKrjLPT3xs0
  Cc: linux-pm-u79uwXL29TY76Z2rM5mHXA, fweisbec-Re5JQEeQqe8AvxtiuMwx3w,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Tejun Heo,
	cgroups-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1351931915-1701-1-git-send-email-tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>

Use RCU safe list operations for cgroup->children.  This will be used
to implement cgroup children / descendant walking which can be used by
controllers.

Note that cgroup_create() now puts a new cgroup at the end of the
->children list instead of head.  This isn't strictly necessary but is
done so that the iteration order is more conventional.

Signed-off-by: Tejun Heo <tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
---
 include/linux/cgroup.h | 1 +
 kernel/cgroup.c        | 8 +++-----
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index b442122..90c33eb 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -12,6 +12,7 @@
 #include <linux/cpumask.h>
 #include <linux/nodemask.h>
 #include <linux/rcupdate.h>
+#include <linux/rculist.h>
 #include <linux/cgroupstats.h>
 #include <linux/prio_heap.h>
 #include <linux/rwsem.h>
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f05d992..cc5d2a0 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1650,7 +1650,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 
 		free_cg_links(&tmp_cg_links);
 
-		BUG_ON(!list_empty(&root_cgrp->sibling));
 		BUG_ON(!list_empty(&root_cgrp->children));
 		BUG_ON(root->number_of_cgroups != 1);
 
@@ -1699,7 +1698,6 @@ static void cgroup_kill_sb(struct super_block *sb) {
 
 	BUG_ON(root->number_of_cgroups != 1);
 	BUG_ON(!list_empty(&cgrp->children));
-	BUG_ON(!list_empty(&cgrp->sibling));
 
 	mutex_lock(&cgroup_mutex);
 	mutex_lock(&cgroup_root_mutex);
@@ -4053,7 +4051,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 		}
 	}
 
-	list_add(&cgrp->sibling, &cgrp->parent->children);
+	list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
 	root->number_of_cgroups++;
 
 	err = cgroup_create_dir(cgrp, dentry, mode);
@@ -4084,7 +4082,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 
  err_remove:
 
-	list_del(&cgrp->sibling);
+	list_del_rcu(&cgrp->sibling);
 	root->number_of_cgroups--;
 
  err_destroy:
@@ -4210,7 +4208,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	raw_spin_unlock(&release_list_lock);
 
 	/* delete this cgroup from parent->children */
-	list_del_init(&cgrp->sibling);
+	list_del_rcu(&cgrp->sibling);
 
 	list_del_init(&cgrp->allcg_node);
 
-- 
1.7.11.7

^ permalink raw reply related

* [PATCH 1/9] cgroup: add cgroup_subsys->post_create()
From: Tejun Heo @ 2012-11-03  8:38 UTC (permalink / raw)
  To: lizefan-hv44wF8Li93QT0dZR+AlfA, mhocko-AlSwsSmVLrQ,
	rjw-KKrjLPT3xs0
  Cc: linux-pm-u79uwXL29TY76Z2rM5mHXA, fweisbec-Re5JQEeQqe8AvxtiuMwx3w,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Tejun Heo,
	cgroups-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1351931915-1701-1-git-send-email-tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>

Currently, there's no way for a controller to find out whether a new
cgroup finished all ->create() allocatinos successfully and is
considered "live" by cgroup.

This becomes a problem later when we add generic descendants walking
to cgroup which can be used by controllers as controllers don't have a
synchronization point where it can synchronize against new cgroups
appearing in such walks.

This patch adds ->post_create().  It's called after all ->create()
succeeded and the cgroup is linked into the generic cgroup hierarchy.
This plays the counterpart of ->pre_destroy().

Signed-off-by: Tejun Heo <tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
Cc: Glauber Costa <glommer-bzQdu9zFT3WakBO8gow8eQ@public.gmane.org>
---
 include/linux/cgroup.h |  1 +
 kernel/cgroup.c        | 12 ++++++++++--
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index fe876a7..b442122 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -438,6 +438,7 @@ int cgroup_taskset_size(struct cgroup_taskset *tset);
 
 struct cgroup_subsys {
 	struct cgroup_subsys_state *(*create)(struct cgroup *cgrp);
+	void (*post_create)(struct cgroup *cgrp);
 	void (*pre_destroy)(struct cgroup *cgrp);
 	void (*destroy)(struct cgroup *cgrp);
 	int (*can_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e3045ad..f05d992 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4060,10 +4060,15 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	if (err < 0)
 		goto err_remove;
 
-	/* each css holds a ref to the cgroup's dentry */
-	for_each_subsys(root, ss)
+	for_each_subsys(root, ss) {
+		/* each css holds a ref to the cgroup's dentry */
 		dget(dentry);
 
+		/* creation succeeded, notify subsystems */
+		if (ss->post_create)
+			ss->post_create(cgrp);
+	}
+
 	/* The cgroup directory was pre-locked for us */
 	BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
 
@@ -4281,6 +4286,9 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 
 	ss->active = 1;
 
+	if (ss->post_create)
+		ss->post_create(&ss->root->top_cgroup);
+
 	/* this function shouldn't be used with modular subsystems, since they
 	 * need to register a subsys_id, among other things */
 	BUG_ON(ss->module);
-- 
1.7.11.7

^ permalink raw reply related

* [PATCHSET cgroup/for-3.8] cgroup_freezer: implement proper hierarchy support
From: Tejun Heo @ 2012-11-03  8:38 UTC (permalink / raw)
  To: lizefan-hv44wF8Li93QT0dZR+AlfA, mhocko-AlSwsSmVLrQ,
	rjw-KKrjLPT3xs0
  Cc: cgroups-u79uwXL29TY76Z2rM5mHXA,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	fweisbec-Re5JQEeQqe8AvxtiuMwx3w,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-pm-u79uwXL29TY76Z2rM5mHXA

Hello,

This patchset implement proper hierarchy support for cgroup_freezer as
discussed in "[RFC] cgroup TODOs"[1].

The patchset first implements generic cgroup iteration macros -
cgroup_for_each_children(), cgroup_for_each_descendant_{pre|post}().
Combined with the newly introduced ->post_create() callback, this
allows controllers to implement reliable iteration over descendants
without messing with cgroup internal locking.  Controllers can perform
reliable walking using simple hierarchy-wide locking or finer-grained
parent-children locking.

Using the iteration macros and ->post_create(), cgroup_freezer is
updated to propagate state updates to and collect FROZEN completions
from the descendants.  This removes .broken_hierarchy marking from
cgroup_freezer.

cgroup_freezer hierarchy support is implemented using finer-grained
locking not necessarily because it's necessary but more because I
wanted an example controller doing that.

This patchset contains the following nine patches.

 0001-cgroup-add-cgroup_subsys-post_create.patch
 0002-cgroup-Use-rculist-ops-for-cgroup-children.patch
 0003-cgroup-implement-generic-child-descendant-walk-macro.patch
 0004-cgroup_freezer-trivial-cleanups.patch
 0005-cgroup_freezer-prepare-freezer_change_state-for-full.patch
 0006-cgroup_freezer-make-freezer-state-mask-of-flags.patch
 0007-cgroup_freezer-introduce-CGROUP_FREEZING_-SELF-PAREN.patch
 0008-cgroup_freezer-add-post_create-and-pre_destroy-and-t.patch
 0009-cgroup_freezer-implement-proper-hierarchy-support.patch

0001-0003 implement cgroup descendant iterators.

0004-0008 prepare cgroup_freezer for hierarchy support.  0009
implements it.

This patchset is on top of

 v3.6 (a0d271cbfe)
+ [2] the first three patches of
      "memcg/cgroup: do not fail fail on pre_destroy callbacks" patchset
+ [3] "cgroup: simplify cgroup removal path" v2 patchset

with cgroup/for-3.8 pulled into it.  The branch is rather floaty at
the moment so it would be the easiest to pull the following branch for
review.

 git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git review-cgroup_freezer-hierarchy

Thanks.

 kernel/cgroup.c         |  106 +++++++++++++-
 kernel/cgroup_freezer.c |  359 +++++++++++++++++++++++++++++++++++-------------
 3 files changed, 445 insertions(+), 104 deletions(-)

--
tejun

[1] http://thread.gmane.org/gmane.linux.kernel.containers/23698
[2] http://thread.gmane.org/gmane.linux.kernel.cgroups/4757
[3] http://thread.gmane.org/gmane.linux.kernel.cgroups/4861

^ permalink raw reply

* [PATCH v4 6/6] USB: forbid memory allocation with I/O during bus reset
From: Ming Lei @ 2012-11-03  8:35 UTC (permalink / raw)
  To: linux-kernel
  Cc: Alan Stern, Oliver Neukum, Minchan Kim, Greg Kroah-Hartman,
	Rafael J. Wysocki, Jens Axboe, David S. Miller, Andrew Morton,
	netdev, linux-usb, linux-pm, linux-mm, Ming Lei
In-Reply-To: <1351931714-11689-1-git-send-email-ming.lei@canonical.com>

If one storage interface or usb network interface(iSCSI case)
exists in current configuration, memory allocation with
GFP_KERNEL during usb_device_reset() might trigger I/O transfer
on the storage interface itself and cause deadlock because
the 'us->dev_mutex' is held in .pre_reset() and the storage
interface can't do I/O transfer when the reset is triggered
by other interface, or the error handling can't be completed
if the reset is triggered by the storage itself(error handling path).

Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Oliver Neukum <oneukum@suse.de>
Signed-off-by: Ming Lei <ming.lei@canonical.com>
---
v4:
	- mark current memalloc_noio for every usb device reset
---
 drivers/usb/core/hub.c |   13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index 5b131b6..788e652 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -5044,6 +5044,7 @@ int usb_reset_device(struct usb_device *udev)
 {
 	int ret;
 	int i;
+	unsigned int noio_flag;
 	struct usb_host_config *config = udev->actconfig;
 
 	if (udev->state == USB_STATE_NOTATTACHED ||
@@ -5053,6 +5054,17 @@ int usb_reset_device(struct usb_device *udev)
 		return -EINVAL;
 	}
 
+	/*
+	 * Don't allocate memory with GFP_KERNEL in current
+	 * context to avoid possible deadlock if usb mass
+	 * storage interface or usbnet interface(iSCSI case)
+	 * is included in current configuration. The easist
+	 * approach is to do it for every device reset,
+	 * because the device 'memalloc_noio' flag may have
+	 * not been set before reseting the usb device.
+	 */
+	memalloc_noio_save(noio_flag);
+
 	/* Prevent autosuspend during the reset */
 	usb_autoresume_device(udev);
 
@@ -5097,6 +5109,7 @@ int usb_reset_device(struct usb_device *udev)
 	}
 
 	usb_autosuspend_device(udev);
+	memalloc_noio_restore(noio_flag);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(usb_reset_device);
-- 
1.7.9.5

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH v4 5/6] PM / Runtime: force memory allocation with no I/O during Runtime PM callbcack
From: Ming Lei @ 2012-11-03  8:35 UTC (permalink / raw)
  To: linux-kernel
  Cc: Alan Stern, Oliver Neukum, Minchan Kim, Greg Kroah-Hartman,
	Rafael J. Wysocki, Jens Axboe, David S. Miller, Andrew Morton,
	netdev, linux-usb, linux-pm, linux-mm, Ming Lei
In-Reply-To: <1351931714-11689-1-git-send-email-ming.lei@canonical.com>

This patch applies the introduced memalloc_noio_save() and
memalloc_noio_restore() to force memory allocation with no I/O
during runtime_resume/runtime_suspend callback on device with
the flag of 'memalloc_noio' set.

Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Oliver Neukum <oneukum@suse.de>
Cc: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Ming Lei <ming.lei@canonical.com>
---
v4:
	- runtime_suspend need this too because rpm_resume may wait for
	completion of concurrent runtime_suspend, so deadlock still may
	be triggered in runtime_suspend path.
---
 drivers/base/power/runtime.c |   32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index d477924..7ed17a9 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -368,6 +368,7 @@ static int rpm_suspend(struct device *dev, int rpmflags)
 	int (*callback)(struct device *);
 	struct device *parent = NULL;
 	int retval;
+	unsigned int noio_flag;
 
 	trace_rpm_suspend(dev, rpmflags);
 
@@ -477,7 +478,20 @@ static int rpm_suspend(struct device *dev, int rpmflags)
 	if (!callback && dev->driver && dev->driver->pm)
 		callback = dev->driver->pm->runtime_suspend;
 
-	retval = rpm_callback(callback, dev);
+	/*
+	 * Deadlock might be caused if memory allocation with GFP_KERNEL
+	 * happens inside runtime_suspend callback of one block device's
+	 * ancestor or the block device itself. Network device might be
+	 * thought as part of iSCSI block device, so network device and
+	 * its ancestor should be marked as memalloc_noio.
+	 */
+	if (dev->power.memalloc_noio) {
+		memalloc_noio_save(noio_flag);
+		retval = rpm_callback(callback, dev);
+		memalloc_noio_restore(noio_flag);
+	} else {
+		retval = rpm_callback(callback, dev);
+	}
 	if (retval)
 		goto fail;
 
@@ -560,6 +574,7 @@ static int rpm_resume(struct device *dev, int rpmflags)
 	int (*callback)(struct device *);
 	struct device *parent = NULL;
 	int retval = 0;
+	unsigned int noio_flag;
 
 	trace_rpm_resume(dev, rpmflags);
 
@@ -709,7 +724,20 @@ static int rpm_resume(struct device *dev, int rpmflags)
 	if (!callback && dev->driver && dev->driver->pm)
 		callback = dev->driver->pm->runtime_resume;
 
-	retval = rpm_callback(callback, dev);
+	/*
+	 * Deadlock might be caused if memory allocation with GFP_KERNEL
+	 * happens inside runtime_resume callback of one block device's
+	 * ancestor or the block device itself. Network device might be
+	 * thought as part of iSCSI block device, so network device and
+	 * its ancestor should be marked as memalloc_noio.
+	 */
+	if (dev->power.memalloc_noio) {
+		memalloc_noio_save(noio_flag);
+		retval = rpm_callback(callback, dev);
+		memalloc_noio_restore(noio_flag);
+	} else {
+		retval = rpm_callback(callback, dev);
+	}
 	if (retval) {
 		__update_runtime_status(dev, RPM_SUSPENDED);
 		pm_runtime_cancel_pending(dev);
-- 
1.7.9.5

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH v4 4/6] net/core: apply pm_runtime_set_memalloc_noio on network devices
From: Ming Lei @ 2012-11-03  8:35 UTC (permalink / raw)
  To: linux-kernel
  Cc: Alan Stern, Oliver Neukum, Minchan Kim, Greg Kroah-Hartman,
	Rafael J. Wysocki, Jens Axboe, David S. Miller, Andrew Morton,
	netdev, linux-usb, linux-pm, linux-mm, Ming Lei, Eric Dumazet,
	David Decotigny, Tom Herbert, Ingo Molnar
In-Reply-To: <1351931714-11689-1-git-send-email-ming.lei@canonical.com>

Deadlock might be caused by allocating memory with GFP_KERNEL in
runtime_resume and runtime_suspend callback of network devices in
iSCSI situation, so mark network devices and its ancestor as
'memalloc_noio' with the introduced pm_runtime_set_memalloc_noio().

Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: David Decotigny <david.decotigny@google.com>
Cc: Tom Herbert <therbert@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Ming Lei <ming.lei@canonical.com>
---
v4:
	 - call pm_runtime_set_memalloc_noio(ddev, true) after
	   device_add
---
 net/core/net-sysfs.c |    5 +++++
 1 file changed, 5 insertions(+)

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index bcf02f6..a55d255 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -22,6 +22,7 @@
 #include <linux/vmalloc.h>
 #include <linux/export.h>
 #include <linux/jiffies.h>
+#include <linux/pm_runtime.h>
 #include <net/wext.h>
 
 #include "net-sysfs.h"
@@ -1386,6 +1387,8 @@ void netdev_unregister_kobject(struct net_device * net)
 
 	remove_queue_kobjects(net);
 
+	pm_runtime_set_memalloc_noio(dev, false);
+
 	device_del(dev);
 }
 
@@ -1421,6 +1424,8 @@ int netdev_register_kobject(struct net_device *net)
 		return error;
 	}
 
+	pm_runtime_set_memalloc_noio(dev, true);
+
 	return error;
 }
 
-- 
1.7.9.5

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH v4 3/6] block/genhd.c: apply pm_runtime_set_memalloc_noio on block devices
From: Ming Lei @ 2012-11-03  8:35 UTC (permalink / raw)
  To: linux-kernel
  Cc: Alan Stern, Oliver Neukum, Minchan Kim, Greg Kroah-Hartman,
	Rafael J. Wysocki, Jens Axboe, David S. Miller, Andrew Morton,
	netdev, linux-usb, linux-pm, linux-mm, Ming Lei
In-Reply-To: <1351931714-11689-1-git-send-email-ming.lei@canonical.com>

This patch applyes the introduced pm_runtime_set_memalloc_noio on
block device so that PM core will teach mm to not allocate memory with
GFP_IOFS when calling the runtime_resume and runtime_suspend callback
for block devices and its ancestors.

Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Ming Lei <ming.lei@canonical.com>
---
v4:
	- call pm_runtime_set_memalloc_noio(ddev, true) after device_add
---
 block/genhd.c |    9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/block/genhd.c b/block/genhd.c
index 9e02cd6..f3fe3aa 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -18,6 +18,7 @@
 #include <linux/mutex.h>
 #include <linux/idr.h>
 #include <linux/log2.h>
+#include <linux/pm_runtime.h>
 
 #include "blk.h"
 
@@ -532,6 +533,13 @@ static void register_disk(struct gendisk *disk)
 			return;
 		}
 	}
+
+	/* avoid probable deadlock caused by allocating memory with
+	 * GFP_KERNEL in runtime_resume callback of its all ancestor
+	 * deivces
+	 */
+	pm_runtime_set_memalloc_noio(ddev, true);
+
 	disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
 	disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
 
@@ -661,6 +669,7 @@ void del_gendisk(struct gendisk *disk)
 	disk->driverfs_dev = NULL;
 	if (!sysfs_deprecated)
 		sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
+	pm_runtime_set_memalloc_noio(disk_to_dev(disk), false);
 	device_del(disk_to_dev(disk));
 }
 EXPORT_SYMBOL(del_gendisk);
-- 
1.7.9.5

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH v4 2/6] PM / Runtime: introduce pm_runtime_set_memalloc_noio()
From: Ming Lei @ 2012-11-03  8:35 UTC (permalink / raw)
  To: linux-kernel
  Cc: Alan Stern, Oliver Neukum, Minchan Kim, Greg Kroah-Hartman,
	Rafael J. Wysocki, Jens Axboe, David S. Miller, Andrew Morton,
	netdev, linux-usb, linux-pm, linux-mm, Ming Lei
In-Reply-To: <1351931714-11689-1-git-send-email-ming.lei@canonical.com>

The patch introduces the flag of memalloc_noio in 'struct dev_pm_info'
to help PM core to teach mm not allocating memory with GFP_KERNEL
flag for avoiding probable deadlock.

As explained in the comment, any GFP_KERNEL allocation inside
runtime_resume() or runtime_suspend() on any one of device in
the path from one block or network device to the root device
in the device tree may cause deadlock, the introduced
pm_runtime_set_memalloc_noio() sets or clears the flag on
device in the path recursively.

Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Ming Lei <ming.lei@canonical.com>
---
v4:
	- rename memalloc_noio_resume as memalloc_noio
	- remove pm_runtime_get_memalloc_noio()
	- add comments on pm_runtime_set_memalloc_noio
v3:
	- introduce pm_runtime_get_memalloc_noio()
	- hold one global lock on pm_runtime_set_memalloc_noio
	- hold device power lock when accessing memalloc_noio_resume
	  flag suggested by Alan Stern
	- implement pm_runtime_set_memalloc_noio without recursion
	  suggested by Alan Stern
v2:
	- introduce pm_runtime_set_memalloc_noio()
---
 drivers/base/power/runtime.c |   57 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/pm.h           |    1 +
 include/linux/pm_runtime.h   |    3 +++
 3 files changed, 61 insertions(+)

diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index 3148b10..d477924 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -124,6 +124,63 @@ unsigned long pm_runtime_autosuspend_expiration(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(pm_runtime_autosuspend_expiration);
 
+static int dev_memalloc_noio(struct device *dev, void *data)
+{
+	return dev->power.memalloc_noio;
+}
+
+/*
+ * pm_runtime_set_memalloc_noio - Set a device's memalloc_noio flag.
+ * @dev: Device to handle.
+ * @enable: True for setting the flag and False for clearing the flag.
+ *
+ * Set the flag for all devices in the path from the device to the
+ * root device in the device tree if @enable is true, otherwise clear
+ * the flag for devices in the path whose siblings don't set the flag.
+ *
+ * The function should only be called by block device, or network
+ * device driver for solving the deadlock problem during runtime
+ * resume/suspend:
+ * 	if memory allocation with GFP_KERNEL is called inside runtime
+ * 	resume/suspend callback of any one of its ancestors(or the
+ * 	block device itself), the deadlock may be triggered inside the
+ * 	memory allocation since it might not complete until the block
+ * 	device becomes active and the involed page I/O finishes. The
+ * 	situation is pointed out first by Alan Stern. Network device
+ * 	are involved in iSCSI kind of situation.
+ *
+ * The lock of dev_hotplug_mutex is held in the function for handling
+ * hotplug race because pm_runtime_set_memalloc_noio() may be called
+ * in async probe().
+ *
+ * The function should be called between device_add() and device_del()
+ * on the affected device(block/network device).
+ */
+void pm_runtime_set_memalloc_noio(struct device *dev, bool enable)
+{
+	static DEFINE_MUTEX(dev_hotplug_mutex);
+
+	mutex_lock(&dev_hotplug_mutex);
+	for(;;) {
+		/* hold power lock since bitfield is not SMP-safe. */
+		spin_lock_irq(&dev->power.lock);
+		dev->power.memalloc_noio = enable;
+		spin_unlock_irq(&dev->power.lock);
+
+		dev = dev->parent;
+
+		/* only clear the flag for one device if all
+		 * children of the device don't set the flag.
+		 */
+		if (!dev || (!enable &&
+			     device_for_each_child(dev, NULL,
+						   dev_memalloc_noio)))
+			break;
+	}
+	mutex_unlock(&dev_hotplug_mutex);
+}
+EXPORT_SYMBOL_GPL(pm_runtime_set_memalloc_noio);
+
 /**
  * rpm_check_suspend_allowed - Test whether a device may be suspended.
  * @dev: Device to test.
diff --git a/include/linux/pm.h b/include/linux/pm.h
index 03d7bb1..1a8a69d 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -538,6 +538,7 @@ struct dev_pm_info {
 	unsigned int		irq_safe:1;
 	unsigned int		use_autosuspend:1;
 	unsigned int		timer_autosuspends:1;
+	unsigned int		memalloc_noio:1;
 	enum rpm_request	request;
 	enum rpm_status		runtime_status;
 	int			runtime_error;
diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
index f271860..775e063 100644
--- a/include/linux/pm_runtime.h
+++ b/include/linux/pm_runtime.h
@@ -47,6 +47,7 @@ extern void pm_runtime_set_autosuspend_delay(struct device *dev, int delay);
 extern unsigned long pm_runtime_autosuspend_expiration(struct device *dev);
 extern void pm_runtime_update_max_time_suspended(struct device *dev,
 						 s64 delta_ns);
+extern void pm_runtime_set_memalloc_noio(struct device *dev, bool enable);
 
 static inline bool pm_children_suspended(struct device *dev)
 {
@@ -149,6 +150,8 @@ static inline void pm_runtime_set_autosuspend_delay(struct device *dev,
 						int delay) {}
 static inline unsigned long pm_runtime_autosuspend_expiration(
 				struct device *dev) { return 0; }
+static inline void pm_runtime_set_memalloc_noio(struct device *dev,
+						bool enable){}
 
 #endif /* !CONFIG_PM_RUNTIME */
 
-- 
1.7.9.5

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH v4 1/6] mm: teach mm by current context info to not do I/O during memory allocation
From: Ming Lei @ 2012-11-03  8:35 UTC (permalink / raw)
  To: linux-kernel
  Cc: Alan Stern, Oliver Neukum, Minchan Kim, Greg Kroah-Hartman,
	Rafael J. Wysocki, Jens Axboe, David S. Miller, Andrew Morton,
	netdev, linux-usb, linux-pm, linux-mm, Ming Lei, Jiri Kosina,
	Mel Gorman, KAMEZAWA Hiroyuki, Michal Hocko, Ingo Molnar,
	Peter Zijlstra
In-Reply-To: <1351931714-11689-1-git-send-email-ming.lei@canonical.com>

This patch introduces PF_MEMALLOC_NOIO on process flag('flags' field of
'struct task_struct'), so that the flag can be set by one task
to avoid doing I/O inside memory allocation in the task's context.

The patch trys to solve one deadlock problem caused by block device,
and the problem may happen at least in the below situations:

- during block device runtime resume, if memory allocation with
GFP_KERNEL is called inside runtime resume callback of any one
of its ancestors(or the block device itself), the deadlock may be
triggered inside the memory allocation since it might not complete
until the block device becomes active and the involed page I/O finishes.
The situation is pointed out first by Alan Stern. It is not a good
approach to convert all GFP_KERNEL[1] in the path into GFP_NOIO because
several subsystems may be involved(for example, PCI, USB and SCSI may
be involved for usb mass stoarage device, network devices involved too
in the iSCSI case)

- during block device runtime suspend, because runtime resume need
to wait for completion of concurrent runtime suspend.

- during error handling of usb mass storage deivce, USB bus reset
will be put on the device, so there shouldn't have any
memory allocation with GFP_KERNEL during USB bus reset, otherwise
the deadlock similar with above may be triggered. Unfortunately, any
usb device may include one mass storage interface in theory, so it
requires all usb interface drivers to handle the situation. In fact,
most usb drivers don't know how to handle bus reset on the device
and don't provide .pre_set() and .post_reset() callback at all, so
USB core has to unbind and bind driver for these devices. So it
is still not practical to resort to GFP_NOIO for solving the problem.

Also the introduced solution can be used by block subsystem or block
drivers too, for example, set the PF_MEMALLOC_NOIO flag before doing
actual I/O transfer.

It is not a good idea to convert all these GFP_KERNEL in the
affected path into GFP_NOIO because these functions doing that may be
implemented as library and will be called in many other contexts.

In fact, memalloc_noio() can convert some of current static GFP_NOIO
allocation into GFP_KERNEL back in other non-affected contexts, at least
almost all GFP_NOIO in USB subsystem can be converted into GFP_KERNEL
after applying the approach and make allocation with GFP_IO
only happen in runtime resume/bus reset/block I/O transfer contexts
generally.

[1], several GFP_KERNEL allocation examples in runtime resume path

- pci subsystem
acpi_os_allocate
	<-acpi_ut_allocate
		<-ACPI_ALLOCATE_ZEROED
			<-acpi_evaluate_object
				<-__acpi_bus_set_power
					<-acpi_bus_set_power
						<-acpi_pci_set_power_state
							<-platform_pci_set_power_state
								<-pci_platform_power_transition
									<-__pci_complete_power_transition
										<-pci_set_power_state
											<-pci_restore_standard_config
												<-pci_pm_runtime_resume
- usb subsystem
usb_get_status
	<-finish_port_resume
		<-usb_port_resume
			<-generic_resume
				<-usb_resume_device
					<-usb_resume_both
						<-usb_runtime_resume

- some individual usb drivers
usblp, uvc, gspca, most of dvb-usb-v2 media drivers, cpia2, az6007, ....

That is just what I have found.  Unfortunately, this allocation can
only be found by human being now, and there should be many not found
since any function in the resume path(call tree) may allocate memory
with GFP_KERNEL.

Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Oliver Neukum <oneukum@suse.de>
Cc: Jiri Kosina <jiri.kosina@suse.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Ming Lei <ming.lei@canonical.com>
---
v4:
	- fix comment
v3:
	- no change
v2:
        - remove changes on 'may_writepage' and 'may_swap' because that
          isn't related with the patchset, and can't introduce I/O in
          allocation path if GFP_IOFS is unset, so handing 'may_swap'
          and may_writepage on GFP_NOIO or GFP_NOFS  should be a
          mm internal thing, and let mm guys deal with that, :-).

          Looks clearing the two may_XXX flag only excludes dirty pages
	  	  and anon pages for relaiming, and the behaviour should be decided
          by GFP FLAG, IMO.

        - unset GFP_IOFS in try_to_free_pages() path since
          alloc_page_buffers()
          and dma_alloc_from_contiguous may drop into the path, as
          pointed by KAMEZAWA Hiroyuki
v1:
        - take Minchan's change to avoid the check in alloc_page hot
          path

        - change the helpers' style into save/restore as suggested by
          Alan Stern
---
 include/linux/sched.h |   10 ++++++++++
 mm/page_alloc.c       |   10 +++++++++-
 mm/vmscan.c           |   12 ++++++++++++
 3 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index fb27acd..283fe86 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1805,6 +1805,7 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
 #define PF_FROZEN	0x00010000	/* frozen for system suspend */
 #define PF_FSTRANS	0x00020000	/* inside a filesystem transaction */
 #define PF_KSWAPD	0x00040000	/* I am kswapd */
+#define PF_MEMALLOC_NOIO 0x00080000	/* Allocating memory without IO involved */
 #define PF_LESS_THROTTLE 0x00100000	/* Throttle me less: I clean memory */
 #define PF_KTHREAD	0x00200000	/* I am a kernel thread */
 #define PF_RANDOMIZE	0x00400000	/* randomize virtual address space */
@@ -1842,6 +1843,15 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
 #define tsk_used_math(p) ((p)->flags & PF_USED_MATH)
 #define used_math() tsk_used_math(current)
 
+#define memalloc_noio() (current->flags & PF_MEMALLOC_NOIO)
+#define memalloc_noio_save(flag) do { \
+	(flag) = current->flags & PF_MEMALLOC_NOIO; \
+	current->flags |= PF_MEMALLOC_NOIO; \
+} while (0)
+#define memalloc_noio_restore(flag) do { \
+	current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flag; \
+} while (0)
+
 /*
  * task->jobctl flags
  */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 45c916b..3c47049 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2634,10 +2634,18 @@ retry_cpuset:
 	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
 			zonelist, high_zoneidx, alloc_flags,
 			preferred_zone, migratetype);
-	if (unlikely(!page))
+	if (unlikely(!page)) {
+		/*
+		 * Runtime PM, block IO and its error handling path
+		 * can deadlock because I/O on the device might not
+		 * complete.
+		 */
+		if (unlikely(memalloc_noio()))
+			gfp_mask &= ~GFP_IOFS;
 		page = __alloc_pages_slowpath(gfp_mask, order,
 				zonelist, high_zoneidx, nodemask,
 				preferred_zone, migratetype);
+	}
 
 	trace_mm_page_alloc(page, order, gfp_mask, migratetype);
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 10090c8..035088a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2304,6 +2304,12 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 		.gfp_mask = sc.gfp_mask,
 	};
 
+	if (unlikely(memalloc_noio())) {
+		gfp_mask &= ~GFP_IOFS;
+		sc.gfp_mask = gfp_mask;
+		shrink.gfp_mask = sc.gfp_mask;
+	}
+
 	throttle_direct_reclaim(gfp_mask, zonelist, nodemask);
 
 	/*
@@ -3304,6 +3310,12 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	};
 	unsigned long nr_slab_pages0, nr_slab_pages1;
 
+	if (unlikely(memalloc_noio())) {
+		gfp_mask &= ~GFP_IOFS;
+		sc.gfp_mask = gfp_mask;
+		shrink.gfp_mask = sc.gfp_mask;
+	}
+
 	cond_resched();
 	/*
 	 * We need to be able to allocate from the reserves for RECLAIM_SWAP
-- 
1.7.9.5

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH v4 0/6] solve deadlock caused by memory allocation with I/O
From: Ming Lei @ 2012-11-03  8:35 UTC (permalink / raw)
  To: linux-kernel
  Cc: Alan Stern, Oliver Neukum, Minchan Kim, Greg Kroah-Hartman,
	Rafael J. Wysocki, Jens Axboe, David S. Miller, Andrew Morton,
	netdev, linux-usb, linux-pm, linux-mm

This patchset try to solve one deadlock problem which might be caused
by memory allocation with block I/O during runtime PM and block device
error handling path. Traditionly, the problem is addressed by passing
GFP_NOIO statically to mm, but that is not a effective solution, see
detailed description in patch 1's commit log.

This patch set introduces one process flag and trys to fix the deadlock
problem on block device/network device during runtime PM or usb bus reset.

The 1st one is the change on include/sched.h and mm.

The 2nd patch introduces the flag of memalloc_noio on 'dev_pm_info',
and pm_runtime_set_memalloc_noio(), so that PM Core can teach mm to not
allocate mm with GFP_IOFS during the runtime_resume callback only on
device with the flag set.

The following 2 patches apply the introduced pm_runtime_set_memalloc_noio()
to mark all devices as memalloc_noio_resume in the path from the block or
network device to the root device in device tree.

The last 2 patches are applied again PM and USB subsystem to demonstrate
how to use the introduced mechanism to fix the deadlock problem.

Change logs:
V4:
	- patches from the 2nd to the 6th changed	
	- call pm_runtime_set_memalloc_noio() after device_add() as pointed
	by Alan
	- set PF_MEMALLOC_NOIO during runtime_suspend()

V3:
        - patch 2/6 and 5/6 changed, see their commit log
        - remove RFC from title since several guys have expressed that
        it is a reasonable solution
V2:
        - remove changes on 'may_writepage' and 'may_swap'(1/6)
        - unset GFP_IOFS in try_to_free_pages() path(1/6)
        - introduce pm_runtime_set_memalloc_noio()
        - only apply the meachnism on block/network device and its ancestors
        for runtime resume context
V1:
        - take Minchan's change to avoid the check in alloc_page hot path
        - change the helpers' style into save/restore as suggested by Alan
        - memory allocation with no io in usb bus reset path for all devices
        as suggested by Greg and Oliver

 block/genhd.c                |    9 +++++
 drivers/base/power/runtime.c |   89 +++++++++++++++++++++++++++++++++++++++++-
 drivers/usb/core/hub.c       |   13 ++++++
 include/linux/pm.h           |    1 +
 include/linux/pm_runtime.h   |    3 ++
 include/linux/sched.h        |   10 +++++
 mm/page_alloc.c              |   10 ++++-
 mm/vmscan.c                  |   12 ++++++
 net/core/net-sysfs.c         |    5 +++


Thanks,
--
Ming Lei

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [BUGFIX 1/2] PCI/PM: Fix deadlock when unbind device if its parent in D3cold
From: Huang Ying @ 2012-11-03  5:06 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: linux-kernel, linux-pci, linux-pm, Rafael J. Wysocki,
	Zhang Yanmin
In-Reply-To: <CAErSpo7hiFQOC+OPZretkyp5EcL84zEWCAiQgEE5Ewq--gUziQ@mail.gmail.com>

On Fri, 2012-11-02 at 10:50 -0600, Bjorn Helgaas wrote:
> On Wed, Oct 24, 2012 at 12:54 AM, Huang Ying <ying.huang@intel.com> wrote:
> > If a PCI device and its parents are put into D3cold, unbinding the
> > device will trigger deadlock as follow:
> >
> > - driver_unbind
> >   - device_release_driver
> >     - device_lock(dev)                          <--- previous lock here
> >     - __device_release_driver
> >       - pm_runtime_get_sync
> >         ...
> >           - rpm_resume(dev)
> >             - rpm_resume(dev->parent)
> >               ...
> >                 - pci_pm_runtime_resume
> >                   ...
> >                   - pci_set_power_state
> >                     - __pci_start_power_transition
> >                       - pci_wakeup_bus(dev->parent->subordinate)
> >                         - pci_walk_bus
> >                           - device_lock(dev)    <--- dead lock here
> >
> >
> > If we do not do device_lock in pci_walk_bus, we can avoid dead lock.
> > Device_lock in pci_walk_bus is introduced in commit:
> > d71374dafbba7ec3f67371d3b7e9f6310a588808, corresponding email thread
> > is: https://lkml.org/lkml/2006/5/26/38.  The patch author Zhang Yanmin
> > said device_lock is added to pci_walk_bus because:
> >
> >   Some error handling functions call pci_walk_bus. For example, PCIe
> >   aer. Here we lock the device, so the driver wouldn't detach from the
> >   device, as the cb might call driver's callback function.
> >
> > So I fixed the dead lock as follow:
> >
> > - remove device_lock from pci_walk_bus
> > - add device_lock into callback if callback will call driver's callback
> >
> > I checked pci_walk_bus users one by one, and found only PCIe aer needs
> > device lock.
> 
> Is there a problem report or bugzilla for this issue?

No.  I found this issue when I developed kexec fixing.

Best Regards,
Huang Ying

> > Signed-off-by: Huang Ying <ying.huang@intel.com>
> > Cc: Zhang Yanmin <yanmin.zhang@intel.com>
> 
> Should this go to stable as well?  The D3cold support appeared in
> v3.6, so my guess is that this fix could go to v3.6.x.
> 
> > ---
> >  drivers/pci/bus.c                  |    3 ---
> >  drivers/pci/pcie/aer/aerdrv_core.c |   20 ++++++++++++++++----
> >  2 files changed, 16 insertions(+), 7 deletions(-)
> >
> > --- a/drivers/pci/bus.c
> > +++ b/drivers/pci/bus.c
> > @@ -320,10 +320,7 @@ void pci_walk_bus(struct pci_bus *top, i
> >                 } else
> >                         next = dev->bus_list.next;
> >
> > -               /* Run device routines with the device locked */
> > -               device_lock(&dev->dev);
> >                 retval = cb(dev, userdata);
> > -               device_unlock(&dev->dev);
> >                 if (retval)
> >                         break;
> >         }
> > --- a/drivers/pci/pcie/aer/aerdrv_core.c
> > +++ b/drivers/pci/pcie/aer/aerdrv_core.c
> > @@ -213,6 +213,7 @@ static int report_error_detected(struct
> >         struct aer_broadcast_data *result_data;
> >         result_data = (struct aer_broadcast_data *) data;
> >
> > +       device_lock(&dev->dev);
> >         dev->error_state = result_data->state;
> >
> >         if (!dev->driver ||
> > @@ -231,12 +232,14 @@ static int report_error_detected(struct
> >                                    dev->driver ?
> >                                    "no AER-aware driver" : "no driver");
> >                 }
> > -               return 0;
> > +               goto out;
> >         }
> >
> >         err_handler = dev->driver->err_handler;
> >         vote = err_handler->error_detected(dev, result_data->state);
> >         result_data->result = merge_result(result_data->result, vote);
> > +out:
> > +       device_unlock(&dev->dev);
> >         return 0;
> >  }
> >
> > @@ -247,14 +250,17 @@ static int report_mmio_enabled(struct pc
> >         struct aer_broadcast_data *result_data;
> >         result_data = (struct aer_broadcast_data *) data;
> >
> > +       device_lock(&dev->dev);
> >         if (!dev->driver ||
> >                 !dev->driver->err_handler ||
> >                 !dev->driver->err_handler->mmio_enabled)
> > -               return 0;
> > +               goto out;
> >
> >         err_handler = dev->driver->err_handler;
> >         vote = err_handler->mmio_enabled(dev);
> >         result_data->result = merge_result(result_data->result, vote);
> > +out:
> > +       device_unlock(&dev->dev);
> >         return 0;
> >  }
> >
> > @@ -265,14 +271,17 @@ static int report_slot_reset(struct pci_
> >         struct aer_broadcast_data *result_data;
> >         result_data = (struct aer_broadcast_data *) data;
> >
> > +       device_lock(&dev->dev);
> >         if (!dev->driver ||
> >                 !dev->driver->err_handler ||
> >                 !dev->driver->err_handler->slot_reset)
> > -               return 0;
> > +               goto out;
> >
> >         err_handler = dev->driver->err_handler;
> >         vote = err_handler->slot_reset(dev);
> >         result_data->result = merge_result(result_data->result, vote);
> > +out:
> > +       device_unlock(&dev->dev);
> >         return 0;
> >  }
> >
> > @@ -280,15 +289,18 @@ static int report_resume(struct pci_dev
> >  {
> >         const struct pci_error_handlers *err_handler;
> >
> > +       device_lock(&dev->dev);
> >         dev->error_state = pci_channel_io_normal;
> >
> >         if (!dev->driver ||
> >                 !dev->driver->err_handler ||
> >                 !dev->driver->err_handler->resume)
> > -               return 0;
> > +               goto out;
> >
> >         err_handler = dev->driver->err_handler;
> >         err_handler->resume(dev);
> > +out:
> > +       device_unlock(&dev->dev);
> >         return 0;
> >  }
> >



^ permalink raw reply

* Re: [BUGFIX 2/2] PCI/PM: Resume device before shutdown
From: Huang Ying @ 2012-11-03  5:05 UTC (permalink / raw)
  To: Bjorn Helgaas; +Cc: linux-kernel, linux-pci, linux-pm, Rafael J. Wysocki
In-Reply-To: <CAErSpo5bKDrfOC86Cxv+0yFiyj1bMgWF=6XuY+DWFjaMBTQC0A@mail.gmail.com>

On Fri, 2012-11-02 at 10:52 -0600, Bjorn Helgaas wrote:
> On Wed, Oct 24, 2012 at 12:54 AM, Huang Ying <ying.huang@intel.com> wrote:
> > Some actions during shutdown need device to be in D0 state, such as
> > MSI shutdown etc, so resume device before shutdown.
> 
> Is there a problem report or bugzilla for this issue?  What are the
> symptoms by which a user could figure out that he needs this fix?

No bugzilla for this issue.  This issue will cause the corresponding
device lost in kexeced kernel.

Best Regards,
Huang Ying

> Should this be put in the stable tree as well?  If so, for v3.6 only?
> 
> > Signed-off-by: Huang Ying <ying.huang@intel.com>
> > ---
> >  drivers/pci/pci-driver.c |   12 ++----------
> >  1 file changed, 2 insertions(+), 10 deletions(-)
> >
> > --- a/drivers/pci/pci-driver.c
> > +++ b/drivers/pci/pci-driver.c
> > @@ -398,6 +398,8 @@ static void pci_device_shutdown(struct d
> >         struct pci_dev *pci_dev = to_pci_dev(dev);
> >         struct pci_driver *drv = pci_dev->driver;
> >
> > +       pm_runtime_resume(dev);
> > +
> >         if (drv && drv->shutdown)
> >                 drv->shutdown(pci_dev);
> >         pci_msi_shutdown(pci_dev);
> > @@ -408,16 +410,6 @@ static void pci_device_shutdown(struct d
> >          * continue to do DMA
> >          */
> >         pci_disable_device(pci_dev);
> > -
> > -       /*
> > -        * Devices may be enabled to wake up by runtime PM, but they need not
> > -        * be supposed to wake up the system from its "power off" state (e.g.
> > -        * ACPI S5).  Therefore disable wakeup for all devices that aren't
> > -        * supposed to wake up the system at this point.  The state argument
> > -        * will be ignored by pci_enable_wake().
> > -        */
> > -       if (!device_may_wakeup(dev))
> > -               pci_enable_wake(pci_dev, PCI_UNKNOWN, false);
> >  }
> >
> >  #ifdef CONFIG_PM

^ permalink raw reply

* Re: [RFC] Suspend/resume without VT switches
From: Jesse Barnes @ 2012-11-03  0:22 UTC (permalink / raw)
  To: Rafael J. Wysocki; +Cc: intel-gfx, linux-kernel, Linux PM list
In-Reply-To: <2324361.keTRK1rkYL@vostro.rjw.lan>

On 11/2/2012 4:38 PM, Rafael J. Wysocki wrote:
> On Friday, November 02, 2012 04:29:37 PM Jesse Barnes wrote:
>> On Fri, 02 Nov 2012 22:51:07 +0100
>> "Rafael J. Wysocki"<rjw@sisk.pl>  wrote:
>>
>>> On Friday, November 02, 2012 02:43:39 PM Jesse Barnes wrote:
>>>> I've lightly tested this with X and it definitely makes my
>>>> suspend/resume sequence a bit prettier.  It should speed things up
>>>> trivally as well, but most of those gains come from other changes to the
>>>> i915 driver (posted earlier to intel-gfx).
>>>>
>>>> Any thoughts?
>>>
>>> I like the idea.
>>>
>>>> I suspect we'll have to be more defensive about the
>>>> resume configuration in case the BIOS did something weird, but overall I
>>>> think we should be able to do this one way or another.
>>>
>>> Perhaps patch [1/2] should be [2/2] and vice versa? :-)
>>
>> But then it wouldn't compile?  I added the variable first, defaulting
>> to the current behavior, then made i915 support it and set the variable
>> to false there...  At least, that's what I intended to do.
>
> OK
>
> So what happens if there are multiple graphics adapters in the system?
> Including such that aren't handled by i915?  pm_vt_switch is global, so isn't
> there any problem with that?

Yeah I guess it depends on configuration.  Maybe only the driver that's 
currently driving the VT should be able to set this, then clear it if 
it's no longer in charge?  Not sure how to do that yet (ugg means I get 
to dig into the console layer again...)

Thanks,
Jesse


^ permalink raw reply

* [GIT PULL] Power management update for v3.7-rc4
From: Rafael J. Wysocki @ 2012-11-02 23:55 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Linux PM list, LKML, Andreas Herrmann

Hi Linus,

Please pull from the git repository at

  git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git pm-for-3.7-rc4

to receive one power management update for v3.7-rc4 as commit
29c4bcddaa62e2b9fd2ba85668f6718b0b43f0e3

  cpufreq / powernow-k8: Change maintainer's email address

on top of commit 8f0d8163b50e01f398b14bcd4dc039ac5ab18d64

  Linux 3.7-rc3

It changes the email address of Andreas Herrmann in the
drivers/cpufreq/powernow-k8.c file that he is the maintainer of.

Thanks!


 drivers/cpufreq/powernow-k8.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

---------------

Andreas Herrmann (1):
      cpufreq / powernow-k8: Change maintainer's email address


-- 
I speak only for myself.
Rafael J. Wysocki, Intel Open Source Technology Center.

^ permalink raw reply

* Re: [RFC] Suspend/resume without VT switches
From: Rafael J. Wysocki @ 2012-11-02 23:38 UTC (permalink / raw)
  To: Jesse Barnes; +Cc: intel-gfx, linux-kernel, Linux PM list
In-Reply-To: <20121102162937.1ec88cf1@jbarnes-desktop>

On Friday, November 02, 2012 04:29:37 PM Jesse Barnes wrote:
> On Fri, 02 Nov 2012 22:51:07 +0100
> "Rafael J. Wysocki" <rjw@sisk.pl> wrote:
> 
> > On Friday, November 02, 2012 02:43:39 PM Jesse Barnes wrote:
> > > I've lightly tested this with X and it definitely makes my
> > > suspend/resume sequence a bit prettier.  It should speed things up
> > > trivally as well, but most of those gains come from other changes to the
> > > i915 driver (posted earlier to intel-gfx).
> > > 
> > > Any thoughts?
> > 
> > I like the idea.
> > 
> > > I suspect we'll have to be more defensive about the
> > > resume configuration in case the BIOS did something weird, but overall I
> > > think we should be able to do this one way or another.
> > 
> > Perhaps patch [1/2] should be [2/2] and vice versa? :-)
> 
> But then it wouldn't compile?  I added the variable first, defaulting
> to the current behavior, then made i915 support it and set the variable
> to false there...  At least, that's what I intended to do.

OK

So what happens if there are multiple graphics adapters in the system?
Including such that aren't handled by i915?  pm_vt_switch is global, so isn't
there any problem with that?

Rafael


-- 
I speak only for myself.
Rafael J. Wysocki, Intel Open Source Technology Center.

^ permalink raw reply

* Re: Linux 3.7-rc3
From: Linus Torvalds @ 2012-11-02 23:10 UTC (permalink / raw)
  To: Rafael J. Wysocki
  Cc: Daniel Vetter, Linux Kernel Mailing List, Linux PM list,
	Greg Kroah-Hartman, David Airlie, Michal Hocko, Jiri Kosina,
	Ingo Molnar, Peter Zijlstra
In-Reply-To: <2859519.IO8aRDjQvR@vostro.rjw.lan>

On Fri, Nov 2, 2012 at 3:43 PM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
>
> Well, not everything is rosy in the suspend land, though.  This is a
> failure to freeze khubd during the second in a row attempt to suspend to
> RAM (your current tree):

Ugh. So khubd is blocked in usb_start_wait_urb(), and apparently the
timeout for that block is longer than the freezing timeout.

There's a comment about why khubd needs to be freezable, but I wonder
if that whole thing isn't doing something wrong. Causing the suspend
to fail is definitely always the wrong thing.

Greg?

> [  125.780766] [ INFO: suspicious RCU usage. ]
> [  125.780804] 3.7.0-rc3+ #988 Not tainted
> [  125.780838] -------------------------------
> [  125.780875] /home/rafael/src/linux/kernel/sched/core.c:4497 suspicious rcu_dereference_check() usage!

Heh. The RCU usage is from the debug printout from sched_show_task(),
so it's "related", but it's a totally independent issue.

It's apparently because we've not done a "rcu_read_lock()" around that
sequence, but I seriously doubt we care. But it's technically a real
bug - even if the fix might be to just not print out the parent pid
(or to just ignore the bug and turn the rcu dereference into an
ACCESS_ONCE() or something.

Ingo, Peter, any comments about that sched/core.c:4497 RCU usage?

                Linus

^ permalink raw reply

* Re: Linux 3.7-rc3
From: Rafael J. Wysocki @ 2012-11-02 22:43 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Daniel Vetter, Linux Kernel Mailing List, Linux PM list,
	Greg Kroah-Hartman, David Airlie, Michal Hocko, Jiri Kosina
In-Reply-To: <CA+55aFzfbvb4HsY0viCXC0FnVt7fcyhzyZgR4R6tu5uYC8qzPA@mail.gmail.com>

On Friday, November 02, 2012 03:26:22 PM Linus Torvalds wrote:
> On Fri, Nov 2, 2012 at 3:23 PM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> >
> > Well, it appears that Tumbleweed has acquired a broken s2disk binary.
> > Replace it with one built from upstream sources and it magically starts to
> > work. Grumble.
> >
> > Sorry for the noise.
> 
> Heh. This was the report that so far made me most worried about the
> current state of the -rc process, so I'll curse you and be very
> relieved at the same time ;)

Well, not everything is rosy in the suspend land, though.  This is a
failure to freeze khubd during the second in a row attempt to suspend to
RAM (your current tree):

[  105.679281] PM: Syncing filesystems ... done.
[  105.693722] PM: Preparing system for mem sleep
[  105.760289] Freezing user space processes ... (elapsed 0.01 seconds) done.
[  115.398131] usb 5-2: device descriptor read/64, error -110
[  115.601126] usb 5-2: new full-speed USB device number 4 using uhci_hcd
[  105.775886] Freezing remaining freezable tasks ... 
[  125.780627] Freezing of tasks failed after 20.00 seconds (1 tasks refusing to freeze, wq_busy=0):
[  125.780718] khubd           D ffff88007c920000 
[  125.780730] ===============================
[  125.780766] [ INFO: suspicious RCU usage. ]
[  125.780804] 3.7.0-rc3+ #988 Not tainted
[  125.780838] -------------------------------
[  125.780875] /home/rafael/src/linux/kernel/sched/core.c:4497 suspicious rcu_dereference_check() usage!
[  125.780946] 
[  125.780946] other info that might help us debug this:
[  125.780946] 
[  125.781031] 
[  125.781031] rcu_scheduler_active = 1, debug_locks = 0
[  125.781087] 4 locks held by s2ram/4211:
[  125.781120]  #0:  (&buffer->mutex){+.+.+.}, at: [<ffffffff811e2acf>] sysfs_write_file+0x3f/0x160
[  125.781233]  #1:  (s_active#94){.+.+.+}, at: [<ffffffff811e2b58>] sysfs_write_file+0xc8/0x160
[  125.781339]  #2:  (pm_mutex){+.+.+.}, at: [<ffffffff81090a81>] pm_suspend+0x81/0x230
[  125.781439]  #3:  (tasklist_lock){.?.?..}, at: [<ffffffff8108feed>] try_to_freeze_tasks+0x2cd/0x3f0
[  125.781543] 
[  125.781543] stack backtrace:
[  125.781584] Pid: 4211, comm: s2ram Not tainted 3.7.0-rc3+ #988
[  125.781632] Call Trace:
[  125.781662]  [<ffffffff810a3c73>] lockdep_rcu_suspicious+0x103/0x140
[  125.781719]  [<ffffffff8107cf21>] sched_show_task+0x121/0x180
[  125.781770]  [<ffffffff8108ffb4>] try_to_freeze_tasks+0x394/0x3f0
[  125.781823]  [<ffffffff810903b5>] freeze_kernel_threads+0x25/0x80
[  125.781876]  [<ffffffff81090b65>] pm_suspend+0x165/0x230
[  125.781924]  [<ffffffff8108fa29>] state_store+0x99/0x100
[  125.781975]  [<ffffffff812f5867>] kobj_attr_store+0x17/0x20
[  125.782038]  [<ffffffff811e2b71>] sysfs_write_file+0xe1/0x160
[  125.782091]  [<ffffffff811667a6>] vfs_write+0xc6/0x180
[  125.782138]  [<ffffffff81166ada>] sys_write+0x5a/0xa0
[  125.782185]  [<ffffffff812ff6ae>] ? trace_hardirqs_on_thunk+0x3a/0x3f
[  125.782242]  [<ffffffff81669dd2>] system_call_fastpath+0x16/0x1b
[  125.782294]     0    29      2 0x00000000
[  125.782329]  ffff88007cb099d8 0000000000000046 ffff88007cb09998 ffffffff810a865e
[  125.784196]  ffff88007cb09fd8 ffff88007cb06040 ffff88007cb09fd8 ffff88007cb08000
[  125.786048]  ffff88007cb09fd8 ffff88007cb08000 00000000000131c0 ffff88007cb09fd8
[  125.787879] Call Trace:
[  125.789655]  [<ffffffff810a865e>] ? mark_held_locks+0x6e/0x130
[  125.791465]  [<ffffffff81667854>] schedule+0x24/0x70
[  125.793278]  [<ffffffff81664af3>] schedule_timeout+0x173/0x480
[  125.795118]  [<ffffffff810507c0>] ? lock_timer_base+0x70/0x70
[  125.796975]  [<ffffffff81666cc1>] wait_for_common+0xd1/0x170
[  125.798836]  [<ffffffff8107d6a0>] ? try_to_wake_up+0x320/0x320
[  125.800665]  [<ffffffff81666dee>] wait_for_completion_timeout+0xe/0x10
[  125.802524]  [<ffffffff814903ab>] usb_start_wait_urb+0xdb/0x170
[  125.804382]  [<ffffffff8149067c>] usb_control_msg+0xdc/0x110
[  125.806208]  [<ffffffff814873a9>] hub_port_init+0x649/0xa50
[  125.808002]  [<ffffffff810a88bd>] ? trace_hardirqs_on+0xd/0x10
[  125.809749]  [<ffffffff8148a38f>] hub_thread+0x6ff/0x1670
[  125.811448]  [<ffffffff81069a50>] ? wake_up_bit+0x40/0x40
[  125.813111]  [<ffffffff81489c90>] ? usb_new_device+0x2c0/0x2c0
[  125.814774]  [<ffffffff810691f6>] kthread+0xd6/0xe0
[  125.816440]  [<ffffffff81069120>] ? flush_kthread_worker+0x1a0/0x1a0
[  125.818117]  [<ffffffff81669d2c>] ret_from_fork+0x7c/0xb0
[  125.819797]  [<ffffffff81069120>] ? flush_kthread_worker+0x1a0/0x1a0
[  125.821521] 
[  125.823147] Restarting kernel threads ... done.
[  125.825210] Restarting tasks ... done.

Rafael


-- 
I speak only for myself.
Rafael J. Wysocki, Intel Open Source Technology Center.

^ permalink raw reply

* Re: Linux 3.7-rc3
From: Linus Torvalds @ 2012-11-02 22:26 UTC (permalink / raw)
  To: Rafael J. Wysocki
  Cc: Daniel Vetter, Linux Kernel Mailing List, Linux PM list,
	Greg Kroah-Hartman, David Airlie, Michal Hocko, Jiri Kosina
In-Reply-To: <2471412.ilAVQkroSv@vostro.rjw.lan>

On Fri, Nov 2, 2012 at 3:23 PM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
>
> Well, it appears that Tumbleweed has acquired a broken s2disk binary.
> Replace it with one built from upstream sources and it magically starts to
> work. Grumble.
>
> Sorry for the noise.

Heh. This was the report that so far made me most worried about the
current state of the -rc process, so I'll curse you and be very
relieved at the same time ;)

                Linus

^ permalink raw reply

* Re: Linux 3.7-rc3
From: Rafael J. Wysocki @ 2012-11-02 22:23 UTC (permalink / raw)
  To: Daniel Vetter
  Cc: Linus Torvalds, Linux Kernel Mailing List, Linux PM list,
	Greg Kroah-Hartman, David Airlie, Michal Hocko, Jiri Kosina
In-Reply-To: <1490901.1Nt8OOcfdR@vostro.rjw.lan>

On Friday, November 02, 2012 10:56:22 PM Rafael J. Wysocki wrote:
> On Friday, November 02, 2012 10:43:02 PM Daniel Vetter wrote:
> > On Fri, Nov 2, 2012 at 10:40 PM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> > > Can you point me to a range of commits worth checking?
> > 
> > 8c3f929b6147e142..57df2ae9df6e335a98969cac is the core rework (plus a
> > bit related follow-up) without any other merges interfering. Might
> > oops if you open/close the lid without a suspend/resume cycle (since
> > that's only fixed later on), should otherwise work.
> 
> Well, as it turns out, v3.6.0 doesn't work too with my .config, so never
> mind (and thanks for help :-)).
> 
> I wonder what's wrong with my .config, though ...

Well, it appears that Tumbleweed has acquired a broken s2disk binary.
Replace it with one built from upstream sources and it magically starts to
work. Grumble.

Sorry for the noise.

Rafael


-- 
I speak only for myself.
Rafael J. Wysocki, Intel Open Source Technology Center.

^ permalink raw reply

* Re: Linux 3.7-rc3
From: Rafael J. Wysocki @ 2012-11-02 21:56 UTC (permalink / raw)
  To: Daniel Vetter
  Cc: Linus Torvalds, Linux Kernel Mailing List, Linux PM list,
	Greg Kroah-Hartman, David Airlie
In-Reply-To: <CAKMK7uENP_XWQ7UWjd+PnqTH-S-jYRjrnfY1TwocfbT6abL61w@mail.gmail.com>

On Friday, November 02, 2012 10:43:02 PM Daniel Vetter wrote:
> On Fri, Nov 2, 2012 at 10:40 PM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> > Can you point me to a range of commits worth checking?
> 
> 8c3f929b6147e142..57df2ae9df6e335a98969cac is the core rework (plus a
> bit related follow-up) without any other merges interfering. Might
> oops if you open/close the lid without a suspend/resume cycle (since
> that's only fixed later on), should otherwise work.

Well, as it turns out, v3.6.0 doesn't work too with my .config, so never
mind (and thanks for help :-)).

I wonder what's wrong with my .config, though ...

Rafael


-- 
I speak only for myself.
Rafael J. Wysocki, Intel Open Source Technology Center.

^ permalink raw reply

* Re: Linux 3.7-rc3
From: Daniel Vetter @ 2012-11-02 21:43 UTC (permalink / raw)
  To: Rafael J. Wysocki
  Cc: Linus Torvalds, Linux Kernel Mailing List, Linux PM list,
	Greg Kroah-Hartman, David Airlie
In-Reply-To: <4911582.xWBr8PDhoE@vostro.rjw.lan>

On Fri, Nov 2, 2012 at 10:40 PM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> Can you point me to a range of commits worth checking?

8c3f929b6147e142..57df2ae9df6e335a98969cac is the core rework (plus a
bit related follow-up) without any other merges interfering. Might
oops if you open/close the lid without a suspend/resume cycle (since
that's only fixed later on), should otherwise work.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply

* Re: Linux 3.7-rc3
From: Rafael J. Wysocki @ 2012-11-02 21:40 UTC (permalink / raw)
  To: Daniel Vetter
  Cc: Linus Torvalds, Linux Kernel Mailing List, Linux PM list,
	Greg Kroah-Hartman, David Airlie
In-Reply-To: <CAKMK7uEgnpTET0_VXWm7JyGePnXL-q_nHgSWq4g=qWodwrg38Q@mail.gmail.com>

On Friday, November 02, 2012 10:07:10 PM Daniel Vetter wrote:
> On Fri, Nov 2, 2012 at 9:25 PM, Linus Torvalds
> <torvalds@linux-foundation.org> wrote:
> > On Mon, Oct 29, 2012 at 5:10 PM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> >>
> >> Unfortunately, s2disk is broken with this one and previous -rc.  In the
> >> majority of cases it just hangs the machine during hibernation, although
> >> sometimes it returns to user space reporting freezing problems, suspicions
> >> RCU usage and similar stuff, pretty much without any useful debug information.
> >
> > Ugh. I was hoping that we'd get more reports on this, right now
> > there's not enough information to even make a guess about what's up.
> >
> > I also note that no actual i915 people were cc'd, despite your
> > graphics driver suspect. Adding Daniel and Dave (one of the millions)
> > to the cc since they probably do have lkml but probably don't react
> > unless something gets pointed out.
> >
> > Daniel/Dave, do you have any reports of hibernation failing (or VT
> > switching problems) with i915 after 3.6?
> 
> Not that I've heard of. We did though rewrite the entire modeset
> sequence driving code in i915 for 3.7, removing massive amounts of
> hacks and stupid "let's disable/enable things harder" code simply
> because the old code couldn't keep track of the hw state well enough.
> So I wouldn't be surprised at all if that unearthed a hidden bug
> somewhere, but we've also added ridiculous amounts of self-checks. And
> thus far they caught all issues due to that rework (and some ancient
> bugs on top) when either the i915 driver or the bios did something
> nefarious ...
> 
> I've rechecked the git log and otherwise there's nothing else which
> pops up that would fit VT-switching/hibernate going bad but the
> modeset rework.

Can you point me to a range of commits worth checking?

Rafael


-- 
I speak only for myself.
Rafael J. Wysocki, Intel Open Source Technology Center.

^ permalink raw reply

* Re: Linux 3.7-rc3
From: Daniel Vetter @ 2012-11-02 21:07 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Rafael J. Wysocki, Linux Kernel Mailing List, Linux PM list,
	Greg Kroah-Hartman, David Airlie
In-Reply-To: <CA+55aFxm0D77c0RYJuAiq2EDJxn+hGKTEGYJnHXNM8XkVAYOTA@mail.gmail.com>

On Fri, Nov 2, 2012 at 9:25 PM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
> On Mon, Oct 29, 2012 at 5:10 PM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
>>
>> Unfortunately, s2disk is broken with this one and previous -rc.  In the
>> majority of cases it just hangs the machine during hibernation, although
>> sometimes it returns to user space reporting freezing problems, suspicions
>> RCU usage and similar stuff, pretty much without any useful debug information.
>
> Ugh. I was hoping that we'd get more reports on this, right now
> there's not enough information to even make a guess about what's up.
>
> I also note that no actual i915 people were cc'd, despite your
> graphics driver suspect. Adding Daniel and Dave (one of the millions)
> to the cc since they probably do have lkml but probably don't react
> unless something gets pointed out.
>
> Daniel/Dave, do you have any reports of hibernation failing (or VT
> switching problems) with i915 after 3.6?

Not that I've heard of. We did though rewrite the entire modeset
sequence driving code in i915 for 3.7, removing massive amounts of
hacks and stupid "let's disable/enable things harder" code simply
because the old code couldn't keep track of the hw state well enough.
So I wouldn't be surprised at all if that unearthed a hidden bug
somewhere, but we've also added ridiculous amounts of self-checks. And
thus far they caught all issues due to that rework (and some ancient
bugs on top) when either the i915 driver or the bios did something
nefarious ...

I've rechecked the git log and otherwise there's nothing else which
pops up that would fit VT-switching/hibernate going bad but the
modeset rework.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox