All of lore.kernel.org

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v3 09/11] memcg: add cgroupfs interface to memcg dirty limits
From: Greg Thelen @ 2010-10-19  0:39 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, linux-mm, containers, Andrea Righi, Balbir Singh,
	KAMEZAWA Hiroyuki, Daisuke Nishimura, Minchan Kim, Ciju Rajan K,
	David Rientjes, Greg Thelen
In-Reply-To: <1287448784-25684-1-git-send-email-gthelen@google.com>

Add cgroupfs interface to memcg dirty page limits:
  Direct write-out is controlled with:
  - memory.dirty_ratio
  - memory.dirty_limit_in_bytes

  Background write-out is controlled with:
  - memory.dirty_background_ratio
  - memory.dirty_background_limit_in_bytes

Other memcg cgroupfs files support 'M', 'm', 'k', 'K', 'g'
and 'G' suffixes for byte counts.  This patch provides the
same functionality for memory.dirty_limit_in_bytes and
memory.dirty_background_limit_in_bytes.

Signed-off-by: Andrea Righi <arighi@develer.com>
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Signed-off-by: Greg Thelen <gthelen@google.com>
---

Changelog since v1:
- Renamed newly created cgroupfs files:
  - memory.dirty_bytes -> memory.dirty_limit_in_bytes
  - memory.dirty_background_bytes -> memory.dirty_background_limit_in_bytes
- Allow [kKmMgG] suffixes for newly created dirty limit value cgroupfs files.

 mm/memcontrol.c |  116 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 116 insertions(+), 0 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 412ce73..580e665 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -100,6 +100,13 @@ enum mem_cgroup_stat_index {
 	MEM_CGROUP_STAT_NSTATS,
 };
 
+enum {
+	MEM_CGROUP_DIRTY_RATIO,
+	MEM_CGROUP_DIRTY_LIMIT_IN_BYTES,
+	MEM_CGROUP_DIRTY_BACKGROUND_RATIO,
+	MEM_CGROUP_DIRTY_BACKGROUND_LIMIT_IN_BYTES,
+};
+
 struct mem_cgroup_stat_cpu {
 	s64 count[MEM_CGROUP_STAT_NSTATS];
 };
@@ -4311,6 +4318,91 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
 	return 0;
 }
 
+static u64 mem_cgroup_dirty_read(struct cgroup *cgrp, struct cftype *cft)
+{
+	struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
+	bool root;
+
+	root = mem_cgroup_is_root(mem);
+
+	switch (cft->private) {
+	case MEM_CGROUP_DIRTY_RATIO:
+		return root ? vm_dirty_ratio : mem->dirty_param.dirty_ratio;
+	case MEM_CGROUP_DIRTY_LIMIT_IN_BYTES:
+		return root ? vm_dirty_bytes : mem->dirty_param.dirty_bytes;
+	case MEM_CGROUP_DIRTY_BACKGROUND_RATIO:
+		return root ? dirty_background_ratio :
+			mem->dirty_param.dirty_background_ratio;
+	case MEM_CGROUP_DIRTY_BACKGROUND_LIMIT_IN_BYTES:
+		return root ? dirty_background_bytes :
+			mem->dirty_param.dirty_background_bytes;
+	default:
+		BUG();
+	}
+}
+
+static int
+mem_cgroup_dirty_write_string(struct cgroup *cgrp, struct cftype *cft,
+				const char *buffer)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+	int type = cft->private;
+	int ret = -EINVAL;
+	unsigned long long val;
+
+	if (cgrp->parent == NULL)
+		return ret;
+
+	switch (type) {
+	case MEM_CGROUP_DIRTY_LIMIT_IN_BYTES:
+		/* This function does all necessary parse...reuse it */
+		ret = res_counter_memparse_write_strategy(buffer, &val);
+		if (ret)
+			break;
+		memcg->dirty_param.dirty_bytes = val;
+		memcg->dirty_param.dirty_ratio  = 0;
+		break;
+	case MEM_CGROUP_DIRTY_BACKGROUND_LIMIT_IN_BYTES:
+		ret = res_counter_memparse_write_strategy(buffer, &val);
+		if (ret)
+			break;
+		memcg->dirty_param.dirty_background_bytes = val;
+		memcg->dirty_param.dirty_background_ratio = 0;
+		break;
+	default:
+		BUG();
+		break;
+	}
+	return ret;
+}
+
+static int
+mem_cgroup_dirty_write(struct cgroup *cgrp, struct cftype *cft, u64 val)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+	int type = cft->private;
+
+	if (cgrp->parent == NULL)
+		return -EINVAL;
+	if ((type == MEM_CGROUP_DIRTY_RATIO ||
+	     type == MEM_CGROUP_DIRTY_BACKGROUND_RATIO) && val > 100)
+		return -EINVAL;
+	switch (type) {
+	case MEM_CGROUP_DIRTY_RATIO:
+		memcg->dirty_param.dirty_ratio = val;
+		memcg->dirty_param.dirty_bytes = 0;
+		break;
+	case MEM_CGROUP_DIRTY_BACKGROUND_RATIO:
+		memcg->dirty_param.dirty_background_ratio = val;
+		memcg->dirty_param.dirty_background_bytes = 0;
+		break;
+	default:
+		BUG();
+		break;
+	}
+	return 0;
+}
+
 static struct cftype mem_cgroup_files[] = {
 	{
 		.name = "usage_in_bytes",
@@ -4374,6 +4466,30 @@ static struct cftype mem_cgroup_files[] = {
 		.unregister_event = mem_cgroup_oom_unregister_event,
 		.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
 	},
+	{
+		.name = "dirty_ratio",
+		.read_u64 = mem_cgroup_dirty_read,
+		.write_u64 = mem_cgroup_dirty_write,
+		.private = MEM_CGROUP_DIRTY_RATIO,
+	},
+	{
+		.name = "dirty_limit_in_bytes",
+		.read_u64 = mem_cgroup_dirty_read,
+		.write_string = mem_cgroup_dirty_write_string,
+		.private = MEM_CGROUP_DIRTY_LIMIT_IN_BYTES,
+	},
+	{
+		.name = "dirty_background_ratio",
+		.read_u64 = mem_cgroup_dirty_read,
+		.write_u64 = mem_cgroup_dirty_write,
+		.private = MEM_CGROUP_DIRTY_BACKGROUND_RATIO,
+	},
+	{
+		.name = "dirty_background_limit_in_bytes",
+		.read_u64 = mem_cgroup_dirty_read,
+		.write_string = mem_cgroup_dirty_write_string,
+		.private = MEM_CGROUP_DIRTY_BACKGROUND_LIMIT_IN_BYTES,
+	},
 };
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
-- 
1.7.1

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH v3 08/11] memcg: CPU hotplug lockdep warning fix
From: Greg Thelen @ 2010-10-19  0:39 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, linux-mm, containers, Andrea Righi, Balbir Singh,
	KAMEZAWA Hiroyuki, Daisuke Nishimura, Minchan Kim, Ciju Rajan K,
	David Rientjes
In-Reply-To: <1287448784-25684-1-git-send-email-gthelen@google.com>

From: Balbir Singh <balbir@linux.vnet.ibm.com>

memcg has lockdep warnings (sleep inside rcu lock)

From: Balbir Singh <balbir@linux.vnet.ibm.com>

The recent move to get_online_cpus() ends up calling get_online_cpus() from
mem_cgroup_read_stat().  However, mem_cgroup_read_stat() is called under the
rcu read lock, and get_online_cpus() can sleep.  The dirty limit patches
expose this BUG more readily due to their usage of mem_cgroup_page_stat().

This patch addresses this issue, as identified by lockdep, by moving the
hotplug protection to a higher layer.  This might increase the time
required to hotplug, but not by much.

Warning messages

BUG: sleeping function called from invalid context at kernel/cpu.c:62
in_atomic(): 0, irqs_disabled(): 0, pid: 6325, name: pagetest
2 locks held by pagetest/6325:
do_page_fault+0x27d/0x4a0
mem_cgroup_page_stat+0x0/0x23f
Pid: 6325, comm: pagetest Not tainted 2.6.36-rc5-mm1+ #201
Call Trace:
[<ffffffff81041224>] __might_sleep+0x12d/0x131
[<ffffffff8104f4af>] get_online_cpus+0x1c/0x51
[<ffffffff8110eedb>] mem_cgroup_read_stat+0x27/0xa3
[<ffffffff811125d2>] mem_cgroup_page_stat+0x131/0x23f
[<ffffffff811124a1>] ? mem_cgroup_page_stat+0x0/0x23f
[<ffffffff810d57c3>] global_dirty_limits+0x42/0xf8
[<ffffffff810d58b3>] throttle_vm_writeout+0x3a/0xb4
[<ffffffff810dc2f8>] shrink_zone+0x3e6/0x3f8
[<ffffffff81074a35>] ? ktime_get_ts+0xb2/0xbf
[<ffffffff810dd1aa>] do_try_to_free_pages+0x106/0x478
[<ffffffff810dd601>] try_to_free_mem_cgroup_pages+0xe5/0x14c
[<ffffffff8110f947>] mem_cgroup_hierarchical_reclaim+0x314/0x3a2
[<ffffffff81111b31>] __mem_cgroup_try_charge+0x29b/0x593
[<ffffffff8111194a>] ? __mem_cgroup_try_charge+0xb4/0x593
[<ffffffff81071258>] ? local_clock+0x40/0x59
[<ffffffff81009015>] ? sched_clock+0x9/0xd
[<ffffffff810710d5>] ? sched_clock_local+0x1c/0x82
[<ffffffff8111398a>] mem_cgroup_charge_common+0x4b/0x76
[<ffffffff81141469>] ? bio_add_page+0x36/0x38
[<ffffffff81113ba9>] mem_cgroup_cache_charge+0x1f4/0x214
[<ffffffff810cd195>] add_to_page_cache_locked+0x4a/0x148
....

Acked-by: Greg Thelen <gthelen@google.com>
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
---
 mm/memcontrol.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f876919..412ce73 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -588,7 +588,6 @@ static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
 	int cpu;
 	s64 val = 0;
 
-	get_online_cpus();
 	for_each_online_cpu(cpu)
 		val += per_cpu(mem->stat->count[idx], cpu);
 #ifdef CONFIG_HOTPLUG_CPU
@@ -596,7 +595,6 @@ static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
 	val += mem->nocpu_base.count[idx];
 	spin_unlock(&mem->pcp_counter_lock);
 #endif
-	put_online_cpus();
 	return val;
 }
 
@@ -1300,6 +1298,7 @@ s64 mem_cgroup_page_stat(enum mem_cgroup_nr_pages_item item)
 	struct mem_cgroup *iter;
 	s64 value;
 
+	get_online_cpus();
 	rcu_read_lock();
 	mem = mem_cgroup_from_task(current);
 	if (mem && !mem_cgroup_is_root(mem)) {
@@ -1321,6 +1320,7 @@ s64 mem_cgroup_page_stat(enum mem_cgroup_nr_pages_item item)
 	} else
 		value = -EINVAL;
 	rcu_read_unlock();
+	put_online_cpus();
 
 	return value;
 }
-- 
1.7.1

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH v3 07/11] memcg: add dirty limits to mem_cgroup
From: Greg Thelen @ 2010-10-19  0:39 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, linux-mm, containers, Andrea Righi, Balbir Singh,
	KAMEZAWA Hiroyuki, Daisuke Nishimura, Minchan Kim, Ciju Rajan K,
	David Rientjes, Greg Thelen
In-Reply-To: <1287448784-25684-1-git-send-email-gthelen@google.com>

Extend mem_cgroup to contain dirty page limits.  Also add routines
allowing the kernel to query the dirty usage of a memcg.

These interfaces are not used by the kernel yet.  A subsequent commit
will add kernel calls to utilize these new routines.

Signed-off-by: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrea Righi <arighi@develer.com>
---

Changelog since v1:
- Rename (for clarity):
  - mem_cgroup_write_page_stat_item -> mem_cgroup_page_stat_item
  - mem_cgroup_read_page_stat_item -> mem_cgroup_nr_pages_item
- Removed unnecessary get_ prefix from get_xxx() functions.
- Avoid lockdep warnings by using rcu_read_[un]lock() in
  mem_cgroup_has_dirty_limit().

 include/linux/memcontrol.h |   44 ++++++++++
 mm/memcontrol.c            |  186 +++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 229 insertions(+), 1 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index ef2eec7..6f3a136 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -19,6 +19,7 @@
 
 #ifndef _LINUX_MEMCONTROL_H
 #define _LINUX_MEMCONTROL_H
+#include <linux/writeback.h>
 #include <linux/cgroup.h>
 struct mem_cgroup;
 struct page_cgroup;
@@ -33,6 +34,30 @@ enum mem_cgroup_page_stat_item {
 	MEMCG_NR_FILE_UNSTABLE_NFS, /* # of NFS unstable pages */
 };
 
+/* Cgroup memory statistics items exported to the kernel. */
+enum mem_cgroup_nr_pages_item {
+	MEMCG_NR_DIRTYABLE_PAGES,
+	MEMCG_NR_RECLAIM_PAGES,
+	MEMCG_NR_WRITEBACK,
+	MEMCG_NR_DIRTY_WRITEBACK_PAGES,
+};
+
+/* Dirty memory parameters */
+struct vm_dirty_param {
+	int dirty_ratio;
+	int dirty_background_ratio;
+	unsigned long dirty_bytes;
+	unsigned long dirty_background_bytes;
+};
+
+static inline void global_vm_dirty_param(struct vm_dirty_param *param)
+{
+	param->dirty_ratio = vm_dirty_ratio;
+	param->dirty_bytes = vm_dirty_bytes;
+	param->dirty_background_ratio = dirty_background_ratio;
+	param->dirty_background_bytes = dirty_background_bytes;
+}
+
 extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 					struct list_head *dst,
 					unsigned long *scanned, int order,
@@ -145,6 +170,10 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
 	mem_cgroup_update_page_stat(page, idx, -1);
 }
 
+bool mem_cgroup_has_dirty_limit(void);
+void vm_dirty_param(struct vm_dirty_param *param);
+s64 mem_cgroup_page_stat(enum mem_cgroup_nr_pages_item item);
+
 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 						gfp_t gfp_mask);
 u64 mem_cgroup_get_limit(struct mem_cgroup *mem);
@@ -326,6 +355,21 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
 {
 }
 
+static inline bool mem_cgroup_has_dirty_limit(void)
+{
+	return false;
+}
+
+static inline void vm_dirty_param(struct vm_dirty_param *param)
+{
+	global_vm_dirty_param(param);
+}
+
+static inline s64 mem_cgroup_page_stat(enum mem_cgroup_nr_pages_item item)
+{
+	return -ENOSYS;
+}
+
 static inline
 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 					    gfp_t gfp_mask)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3ac2693..f876919 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -233,6 +233,10 @@ struct mem_cgroup {
 	atomic_t	refcnt;
 
 	unsigned int	swappiness;
+
+	/* control memory cgroup dirty pages */
+	struct vm_dirty_param dirty_param;
+
 	/* OOM-Killer disable */
 	int		oom_kill_disable;
 
@@ -1149,6 +1153,178 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
 	return swappiness;
 }
 
+/*
+ * Returns a snapshot of the current dirty limits which is not synchronized with
+ * the routines that change the dirty limits.  If this routine races with an
+ * update to the dirty bytes/ratio value, then the caller must handle the case
+ * where both dirty_[background_]ratio and _bytes are set.
+ */
+static void __mem_cgroup_dirty_param(struct vm_dirty_param *param,
+				     struct mem_cgroup *mem)
+{
+	if (mem && !mem_cgroup_is_root(mem)) {
+		param->dirty_ratio = mem->dirty_param.dirty_ratio;
+		param->dirty_bytes = mem->dirty_param.dirty_bytes;
+		param->dirty_background_ratio =
+			mem->dirty_param.dirty_background_ratio;
+		param->dirty_background_bytes =
+			mem->dirty_param.dirty_background_bytes;
+	} else {
+		global_vm_dirty_param(param);
+	}
+}
+
+/*
+ * Get dirty memory parameters of the current memcg or global values (if memory
+ * cgroups are disabled or querying the root cgroup).
+ *
+ * The current task may be moved to other cgroup while we access cgroup changing
+ * the task's dirty limit.  But a precise check is meaningless because the task
+ * can be moved after our access and writeback tends to take long time.  At
+ * least, "memcg" will not be freed while holding rcu_read_lock().
+ */
+void vm_dirty_param(struct vm_dirty_param *param)
+{
+	struct mem_cgroup *memcg;
+
+	if (mem_cgroup_disabled()) {
+		global_vm_dirty_param(param);
+		return;
+	}
+
+	rcu_read_lock();
+	memcg = mem_cgroup_from_task(current);
+	__mem_cgroup_dirty_param(param, memcg);
+	rcu_read_unlock();
+}
+
+/*
+ * Return true if the current memory cgroup has local dirty memory settings.
+ * There is an allowed race between the current task migrating in-to/out-of the
+ * root cgroup while this routine runs.  So the return value may be incorrect if
+ * the current task is being simultaneously migrated.
+ */
+bool mem_cgroup_has_dirty_limit(void)
+{
+	struct mem_cgroup *mem;
+	bool ret;
+
+	if (mem_cgroup_disabled())
+		return false;
+
+	rcu_read_lock();
+	mem = mem_cgroup_from_task(current);
+	ret = mem && !mem_cgroup_is_root(mem);
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static inline bool mem_cgroup_can_swap(struct mem_cgroup *memcg)
+{
+	if (!do_swap_account)
+		return nr_swap_pages > 0;
+	return !memcg->memsw_is_minimum &&
+		(res_counter_read_u64(&memcg->memsw, RES_LIMIT) > 0);
+}
+
+static s64 mem_cgroup_local_page_stat(struct mem_cgroup *mem,
+				      enum mem_cgroup_nr_pages_item item)
+{
+	s64 ret;
+
+	switch (item) {
+	case MEMCG_NR_DIRTYABLE_PAGES:
+		ret = mem_cgroup_read_stat(mem, LRU_ACTIVE_FILE) +
+			mem_cgroup_read_stat(mem, LRU_INACTIVE_FILE);
+		if (mem_cgroup_can_swap(mem))
+			ret += mem_cgroup_read_stat(mem, LRU_ACTIVE_ANON) +
+				mem_cgroup_read_stat(mem, LRU_INACTIVE_ANON);
+		break;
+	case MEMCG_NR_RECLAIM_PAGES:
+		ret = mem_cgroup_read_stat(mem,	MEM_CGROUP_STAT_FILE_DIRTY) +
+			mem_cgroup_read_stat(mem,
+					     MEM_CGROUP_STAT_FILE_UNSTABLE_NFS);
+		break;
+	case MEMCG_NR_WRITEBACK:
+		ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_WRITEBACK);
+		break;
+	case MEMCG_NR_DIRTY_WRITEBACK_PAGES:
+		ret = mem_cgroup_read_stat(mem,
+					   MEM_CGROUP_STAT_FILE_WRITEBACK) +
+			mem_cgroup_read_stat(mem,
+					     MEM_CGROUP_STAT_FILE_UNSTABLE_NFS);
+		break;
+	default:
+		BUG();
+		break;
+	}
+	return ret;
+}
+
+static unsigned long long
+memcg_hierarchical_free_pages(struct mem_cgroup *mem)
+{
+	struct cgroup *cgroup;
+	unsigned long long min_free, free;
+
+	min_free = res_counter_read_u64(&mem->res, RES_LIMIT) -
+		res_counter_read_u64(&mem->res, RES_USAGE);
+	cgroup = mem->css.cgroup;
+	if (!mem->use_hierarchy)
+		goto out;
+
+	while (cgroup->parent) {
+		cgroup = cgroup->parent;
+		mem = mem_cgroup_from_cont(cgroup);
+		if (!mem->use_hierarchy)
+			break;
+		free = res_counter_read_u64(&mem->res, RES_LIMIT) -
+			res_counter_read_u64(&mem->res, RES_USAGE);
+		min_free = min(min_free, free);
+	}
+out:
+	/* Translate free memory in pages */
+	return min_free >> PAGE_SHIFT;
+}
+
+/*
+ * mem_cgroup_page_stat() - get memory cgroup file cache statistics
+ * @item:      memory statistic item exported to the kernel
+ *
+ * Return the accounted statistic value.
+ */
+s64 mem_cgroup_page_stat(enum mem_cgroup_nr_pages_item item)
+{
+	struct mem_cgroup *mem;
+	struct mem_cgroup *iter;
+	s64 value;
+
+	rcu_read_lock();
+	mem = mem_cgroup_from_task(current);
+	if (mem && !mem_cgroup_is_root(mem)) {
+		/*
+		 * If we're looking for dirtyable pages we need to evaluate
+		 * free pages depending on the limit and usage of the parents
+		 * first of all.
+		 */
+		if (item == MEMCG_NR_DIRTYABLE_PAGES)
+			value = memcg_hierarchical_free_pages(mem);
+		else
+			value = 0;
+		/*
+		 * Recursively evaluate page statistics against all cgroup
+		 * under hierarchy tree
+		 */
+		for_each_mem_cgroup_tree(iter, mem)
+			value += mem_cgroup_local_page_stat(iter, item);
+	} else
+		value = -EINVAL;
+	rcu_read_unlock();
+
+	return value;
+}
+
 static void mem_cgroup_start_move(struct mem_cgroup *mem)
 {
 	int cpu;
@@ -4457,8 +4633,16 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	spin_lock_init(&mem->reclaim_param_lock);
 	INIT_LIST_HEAD(&mem->oom_notify);
 
-	if (parent)
+	if (parent) {
 		mem->swappiness = get_swappiness(parent);
+		__mem_cgroup_dirty_param(&mem->dirty_param, parent);
+	} else {
+		/*
+		 * The root cgroup dirty_param field is not used, instead,
+		 * system-wide dirty limits are used.
+		 */
+	}
+
 	atomic_set(&mem->refcnt, 1);
 	mem->move_charge_at_immigrate = 0;
 	mutex_init(&mem->thresholds_lock);
-- 
1.7.1

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH v3 06/11] memcg: add kernel calls for memcg dirty page stats
From: Greg Thelen @ 2010-10-19  0:39 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, linux-mm, containers, Andrea Righi, Balbir Singh,
	KAMEZAWA Hiroyuki, Daisuke Nishimura, Minchan Kim, Ciju Rajan K,
	David Rientjes, Greg Thelen
In-Reply-To: <1287448784-25684-1-git-send-email-gthelen@google.com>

Add calls into memcg dirty page accounting.  Notify memcg when pages
transition between clean, file dirty, writeback, and unstable nfs.
This allows the memory controller to maintain an accurate view of
the amount of its memory that is dirty.

Signed-off-by: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrea Righi <arighi@develer.com>
---
 fs/nfs/write.c      |    4 ++++
 mm/filemap.c        |    1 +
 mm/page-writeback.c |    4 ++++
 mm/truncate.c       |    1 +
 4 files changed, 10 insertions(+), 0 deletions(-)

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 4c14c17..a3c39f7 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -450,6 +450,7 @@ nfs_mark_request_commit(struct nfs_page *req)
 			NFS_PAGE_TAG_COMMIT);
 	nfsi->ncommit++;
 	spin_unlock(&inode->i_lock);
+	mem_cgroup_inc_page_stat(req->wb_page, MEMCG_NR_FILE_UNSTABLE_NFS);
 	inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
 	inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
 	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
@@ -461,6 +462,7 @@ nfs_clear_request_commit(struct nfs_page *req)
 	struct page *page = req->wb_page;
 
 	if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) {
+		mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_UNSTABLE_NFS);
 		dec_zone_page_state(page, NR_UNSTABLE_NFS);
 		dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE);
 		return 1;
@@ -1316,6 +1318,8 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how)
 		req = nfs_list_entry(head->next);
 		nfs_list_remove_request(req);
 		nfs_mark_request_commit(req);
+		mem_cgroup_dec_page_stat(req->wb_page,
+					 MEMCG_NR_FILE_UNSTABLE_NFS);
 		dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
 		dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
 				BDI_RECLAIMABLE);
diff --git a/mm/filemap.c b/mm/filemap.c
index 49b2d2e..f6bd6f2 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -146,6 +146,7 @@ void __remove_from_page_cache(struct page *page)
 	 * having removed the page entirely.
 	 */
 	if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
+		mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_DIRTY);
 		dec_zone_page_state(page, NR_FILE_DIRTY);
 		dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
 	}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index b840afa..820eb66 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1114,6 +1114,7 @@ int __set_page_dirty_no_writeback(struct page *page)
 void account_page_dirtied(struct page *page, struct address_space *mapping)
 {
 	if (mapping_cap_account_dirty(mapping)) {
+		mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_DIRTY);
 		__inc_zone_page_state(page, NR_FILE_DIRTY);
 		__inc_zone_page_state(page, NR_DIRTIED);
 		__inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
@@ -1303,6 +1304,7 @@ int clear_page_dirty_for_io(struct page *page)
 		 * for more comments.
 		 */
 		if (TestClearPageDirty(page)) {
+			mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_DIRTY);
 			dec_zone_page_state(page, NR_FILE_DIRTY);
 			dec_bdi_stat(mapping->backing_dev_info,
 					BDI_RECLAIMABLE);
@@ -1333,6 +1335,7 @@ int test_clear_page_writeback(struct page *page)
 				__dec_bdi_stat(bdi, BDI_WRITEBACK);
 				__bdi_writeout_inc(bdi);
 			}
+			mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_WRITEBACK);
 		}
 		spin_unlock_irqrestore(&mapping->tree_lock, flags);
 	} else {
@@ -1360,6 +1363,7 @@ int test_set_page_writeback(struct page *page)
 						PAGECACHE_TAG_WRITEBACK);
 			if (bdi_cap_account_writeback(bdi))
 				__inc_bdi_stat(bdi, BDI_WRITEBACK);
+			mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_WRITEBACK);
 		}
 		if (!PageDirty(page))
 			radix_tree_tag_clear(&mapping->page_tree,
diff --git a/mm/truncate.c b/mm/truncate.c
index cd94607..54cca83 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -76,6 +76,7 @@ void cancel_dirty_page(struct page *page, unsigned int account_size)
 	if (TestClearPageDirty(page)) {
 		struct address_space *mapping = page->mapping;
 		if (mapping && mapping_cap_account_dirty(mapping)) {
+			mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_DIRTY);
 			dec_zone_page_state(page, NR_FILE_DIRTY);
 			dec_bdi_stat(mapping->backing_dev_info,
 					BDI_RECLAIMABLE);
-- 
1.7.1

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH v3 05/11] memcg: add dirty page accounting infrastructure
From: Greg Thelen @ 2010-10-19  0:39 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, linux-mm, containers, Andrea Righi, Balbir Singh,
	KAMEZAWA Hiroyuki, Daisuke Nishimura, Minchan Kim, Ciju Rajan K,
	David Rientjes, Greg Thelen
In-Reply-To: <1287448784-25684-1-git-send-email-gthelen@google.com>

Add memcg routines to track dirty, writeback, and unstable_NFS pages.
These routines are not yet used by the kernel to count such pages.
A later change adds kernel calls to these new routines.

Signed-off-by: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrea Righi <arighi@develer.com>
---

Changelog since v1:
- Renamed "nfs"/"total_nfs" to "nfs_unstable"/"total_nfs_unstable" in per cgroup
  memory.stat to match /proc/meminfo.
- Rename (for clarity):
  - mem_cgroup_write_page_stat_item -> mem_cgroup_page_stat_item
  - mem_cgroup_read_page_stat_item -> mem_cgroup_nr_pages_item
- Remove redundant comments.
- Made mem_cgroup_move_account_page_stat() inline.

 include/linux/memcontrol.h |    3 ++
 mm/memcontrol.c            |   86 +++++++++++++++++++++++++++++++++++++++----
 2 files changed, 81 insertions(+), 8 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 067115c..ef2eec7 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -28,6 +28,9 @@ struct mm_struct;
 /* Stats that can be updated by kernel. */
 enum mem_cgroup_page_stat_item {
 	MEMCG_NR_FILE_MAPPED, /* # of pages charged as file rss */
+	MEMCG_NR_FILE_DIRTY, /* # of dirty pages in page cache */
+	MEMCG_NR_FILE_WRITEBACK, /* # of pages under writeback */
+	MEMCG_NR_FILE_UNSTABLE_NFS, /* # of NFS unstable pages */
 };
 
 extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 697f7b8..3ac2693 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -85,10 +85,13 @@ enum mem_cgroup_stat_index {
 	 */
 	MEM_CGROUP_STAT_CACHE, 	   /* # of pages charged as cache */
 	MEM_CGROUP_STAT_RSS,	   /* # of pages charged as anon rss */
-	MEM_CGROUP_STAT_FILE_MAPPED,  /* # of pages charged as file rss */
 	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
 	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
 	MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
+	MEM_CGROUP_STAT_FILE_MAPPED,  /* # of pages charged as file rss */
+	MEM_CGROUP_STAT_FILE_DIRTY,	/* # of dirty pages in page cache */
+	MEM_CGROUP_STAT_FILE_WRITEBACK,		/* # of pages under writeback */
+	MEM_CGROUP_STAT_FILE_UNSTABLE_NFS,	/* # of NFS unstable pages */
 	MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
 	/* incremented at every  pagein/pageout */
 	MEM_CGROUP_EVENTS = MEM_CGROUP_STAT_DATA,
@@ -1642,6 +1645,44 @@ void mem_cgroup_update_page_stat(struct page *page,
 			ClearPageCgroupFileMapped(pc);
 		idx = MEM_CGROUP_STAT_FILE_MAPPED;
 		break;
+
+	case MEMCG_NR_FILE_DIRTY:
+		/* Use Test{Set,Clear} to only un/charge the memcg once. */
+		if (val > 0) {
+			if (TestSetPageCgroupFileDirty(pc))
+				val = 0;
+		} else {
+			if (!TestClearPageCgroupFileDirty(pc))
+				val = 0;
+		}
+		idx = MEM_CGROUP_STAT_FILE_DIRTY;
+		break;
+
+	case MEMCG_NR_FILE_WRITEBACK:
+		/*
+		 * This counter is adjusted while holding the mapping's
+		 * tree_lock.  Therefore there is no race between settings and
+		 * clearing of this flag.
+		 */
+		if (val > 0)
+			SetPageCgroupFileWriteback(pc);
+		else
+			ClearPageCgroupFileWriteback(pc);
+		idx = MEM_CGROUP_STAT_FILE_WRITEBACK;
+		break;
+
+	case MEMCG_NR_FILE_UNSTABLE_NFS:
+		/* Use Test{Set,Clear} to only un/charge the memcg once. */
+		if (val > 0) {
+			if (TestSetPageCgroupFileUnstableNFS(pc))
+				val = 0;
+		} else {
+			if (!TestClearPageCgroupFileUnstableNFS(pc))
+				val = 0;
+		}
+		idx = MEM_CGROUP_STAT_FILE_UNSTABLE_NFS;
+		break;
+
 	default:
 		BUG();
 	}
@@ -2146,6 +2187,17 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 	memcg_check_events(mem, pc->page);
 }
 
+static inline
+void mem_cgroup_move_account_page_stat(struct mem_cgroup *from,
+				       struct mem_cgroup *to,
+				       enum mem_cgroup_stat_index idx)
+{
+	preempt_disable();
+	__this_cpu_dec(from->stat->count[idx]);
+	__this_cpu_inc(to->stat->count[idx]);
+	preempt_enable();
+}
+
 /**
  * __mem_cgroup_move_account - move account of the page
  * @pc:	page_cgroup of the page.
@@ -2172,13 +2224,18 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
 	VM_BUG_ON(!PageCgroupUsed(pc));
 	VM_BUG_ON(pc->mem_cgroup != from);
 
-	if (PageCgroupFileMapped(pc)) {
-		/* Update mapped_file data for mem_cgroup */
-		preempt_disable();
-		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
-		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
-		preempt_enable();
-	}
+	if (PageCgroupFileMapped(pc))
+		mem_cgroup_move_account_page_stat(from, to,
+					MEM_CGROUP_STAT_FILE_MAPPED);
+	if (PageCgroupFileDirty(pc))
+		mem_cgroup_move_account_page_stat(from, to,
+					MEM_CGROUP_STAT_FILE_DIRTY);
+	if (PageCgroupFileWriteback(pc))
+		mem_cgroup_move_account_page_stat(from, to,
+					MEM_CGROUP_STAT_FILE_WRITEBACK);
+	if (PageCgroupFileUnstableNFS(pc))
+		mem_cgroup_move_account_page_stat(from, to,
+					MEM_CGROUP_STAT_FILE_UNSTABLE_NFS);
 	mem_cgroup_charge_statistics(from, pc, false);
 	if (uncharge)
 		/* This is not "cancel", but cancel_charge does all we need. */
@@ -3557,6 +3614,9 @@ enum {
 	MCS_PGPGIN,
 	MCS_PGPGOUT,
 	MCS_SWAP,
+	MCS_FILE_DIRTY,
+	MCS_WRITEBACK,
+	MCS_UNSTABLE_NFS,
 	MCS_INACTIVE_ANON,
 	MCS_ACTIVE_ANON,
 	MCS_INACTIVE_FILE,
@@ -3579,6 +3639,9 @@ struct {
 	{"pgpgin", "total_pgpgin"},
 	{"pgpgout", "total_pgpgout"},
 	{"swap", "total_swap"},
+	{"dirty", "total_dirty"},
+	{"writeback", "total_writeback"},
+	{"nfs_unstable", "total_nfs_unstable"},
 	{"inactive_anon", "total_inactive_anon"},
 	{"active_anon", "total_active_anon"},
 	{"inactive_file", "total_inactive_file"},
@@ -3608,6 +3671,13 @@ mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
 		s->stat[MCS_SWAP] += val * PAGE_SIZE;
 	}
 
+	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_DIRTY);
+	s->stat[MCS_FILE_DIRTY] += val * PAGE_SIZE;
+	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_WRITEBACK);
+	s->stat[MCS_WRITEBACK] += val * PAGE_SIZE;
+	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_UNSTABLE_NFS);
+	s->stat[MCS_UNSTABLE_NFS] += val * PAGE_SIZE;
+
 	/* per zone stat */
 	val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
 	s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
-- 
1.7.1

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH v3 04/11] memcg: add lock to synchronize page accounting and migration
From: Greg Thelen @ 2010-10-19  0:39 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, linux-mm, containers, Andrea Righi, Balbir Singh,
	KAMEZAWA Hiroyuki, Daisuke Nishimura, Minchan Kim, Ciju Rajan K,
	David Rientjes, Greg Thelen
In-Reply-To: <1287448784-25684-1-git-send-email-gthelen@google.com>

From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

Introduce a new bit spin lock, PCG_MOVE_LOCK, to synchronize
the page accounting and migration code.  This reworks the
locking scheme of _update_stat() and _move_account() by
adding new lock bit PCG_MOVE_LOCK, which is always taken
under IRQ disable.

1. If pages are being migrated from a memcg, then updates to
   that memcg page statistics are protected by grabbing
   PCG_MOVE_LOCK using move_lock_page_cgroup().  In an
   upcoming commit, memcg dirty page accounting will be
   updating memcg page accounting (specifically: num
   writeback pages) from IRQ context (softirq).  Avoid a
   deadlocking nested spin lock attempt by disabling irq on
   the local processor when grabbing the PCG_MOVE_LOCK.

2. lock for update_page_stat is used only for avoiding race
   with move_account().  So, IRQ awareness of
   lock_page_cgroup() itself is not a problem.  The problem
   is between mem_cgroup_update_page_stat() and
   mem_cgroup_move_account_page().

Trade-off:
  * Changing lock_page_cgroup() to always disable IRQ (or
    local_bh) has some impacts on performance and I think
    it's bad to disable IRQ when it's not necessary.
  * adding a new lock makes move_account() slower.  The measured
    cost is shown below.

Performance Impact: moving a 8G anon process.

Before:
	real    0m0.792s
	user    0m0.000s
	sys     0m0.780s

After:
	real    0m0.854s
	user    0m0.000s
	sys     0m0.842s

This slowdown is undesirable, but planned optimization patches can
reduce this impact.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Greg Thelen <gthelen@google.com>
---
 include/linux/page_cgroup.h |   31 ++++++++++++++++++++++++++++---
 mm/memcontrol.c             |    9 +++++++--
 2 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index b59c298..509452e 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -35,15 +35,18 @@ struct page_cgroup *lookup_page_cgroup(struct page *page);
 
 enum {
 	/* flags for mem_cgroup */
-	PCG_LOCK,  /* page cgroup is locked */
+	PCG_LOCK,  /* Lock for pc->mem_cgroup and following bits. */
 	PCG_CACHE, /* charged as cache */
 	PCG_USED, /* this object is in use. */
-	PCG_ACCT_LRU, /* page has been accounted for */
+	PCG_MIGRATION, /* under page migration */
+	/* flags for mem_cgroup and file and I/O status */
+	PCG_MOVE_LOCK, /* For race between move_account v.s. following bits */
 	PCG_FILE_MAPPED, /* page is accounted as "mapped" */
 	PCG_FILE_DIRTY, /* page is dirty */
 	PCG_FILE_WRITEBACK, /* page is under writeback */
 	PCG_FILE_UNSTABLE_NFS, /* page is NFS unstable */
-	PCG_MIGRATION, /* under page migration */
+	/* No lock in page_cgroup */
+	PCG_ACCT_LRU, /* page has been accounted for (under lru_lock) */
 };
 
 #define TESTPCGFLAG(uname, lname)			\
@@ -119,6 +122,10 @@ static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc)
 
 static inline void lock_page_cgroup(struct page_cgroup *pc)
 {
+	/*
+	 * Don't take this lock in IRQ context.
+	 * This lock is for pc->mem_cgroup, USED, CACHE, MIGRATION
+	 */
 	bit_spin_lock(PCG_LOCK, &pc->flags);
 }
 
@@ -127,6 +134,24 @@ static inline void unlock_page_cgroup(struct page_cgroup *pc)
 	bit_spin_unlock(PCG_LOCK, &pc->flags);
 }
 
+static inline void move_lock_page_cgroup(struct page_cgroup *pc,
+	unsigned long *flags)
+{
+	/*
+	 * We know updates to pc->flags of page cache's stats are from both of
+	 * usual context or IRQ context. Disable IRQ to avoid deadlock.
+	 */
+	local_irq_save(*flags);
+	bit_spin_lock(PCG_MOVE_LOCK, &pc->flags);
+}
+
+static inline void move_unlock_page_cgroup(struct page_cgroup *pc,
+	unsigned long *flags)
+{
+	bit_spin_unlock(PCG_MOVE_LOCK, &pc->flags);
+	local_irq_restore(*flags);
+}
+
 #else /* CONFIG_CGROUP_MEM_RES_CTLR */
 struct page_cgroup;
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 369879a..697f7b8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1615,6 +1615,7 @@ void mem_cgroup_update_page_stat(struct page *page,
 	struct mem_cgroup *mem;
 	struct page_cgroup *pc = lookup_page_cgroup(page);
 	bool need_unlock = false;
+	unsigned long uninitialized_var(flags);
 
 	if (unlikely(!pc))
 		return;
@@ -1626,7 +1627,7 @@ void mem_cgroup_update_page_stat(struct page *page,
 	/* pc->mem_cgroup is unstable ? */
 	if (unlikely(mem_cgroup_stealed(mem))) {
 		/* take a lock against to access pc->mem_cgroup */
-		lock_page_cgroup(pc);
+		move_lock_page_cgroup(pc, &flags);
 		need_unlock = true;
 		mem = pc->mem_cgroup;
 		if (!mem || !PageCgroupUsed(pc))
@@ -1649,7 +1650,7 @@ void mem_cgroup_update_page_stat(struct page *page,
 
 out:
 	if (unlikely(need_unlock))
-		unlock_page_cgroup(pc);
+		move_unlock_page_cgroup(pc, &flags);
 	rcu_read_unlock();
 	return;
 }
@@ -2203,9 +2204,13 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
 		struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
 {
 	int ret = -EINVAL;
+	unsigned long flags;
+
 	lock_page_cgroup(pc);
 	if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
+		move_lock_page_cgroup(pc, &flags);
 		__mem_cgroup_move_account(pc, from, to, uncharge);
+		move_unlock_page_cgroup(pc, &flags);
 		ret = 0;
 	}
 	unlock_page_cgroup(pc);
-- 
1.7.1

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH v3 03/11] memcg: create extensible page stat update routines
From: Greg Thelen @ 2010-10-19  0:39 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, linux-mm, containers, Andrea Righi, Balbir Singh,
	KAMEZAWA Hiroyuki, Daisuke Nishimura, Minchan Kim, Ciju Rajan K,
	David Rientjes, Greg Thelen
In-Reply-To: <1287448784-25684-1-git-send-email-gthelen@google.com>

Replace usage of the mem_cgroup_update_file_mapped() memcg
statistic update routine with two new routines:
* mem_cgroup_inc_page_stat()
* mem_cgroup_dec_page_stat()

As before, only the file_mapped statistic is managed.  However,
these more general interfaces allow for new statistics to be
more easily added.  New statistics are added with memcg dirty
page accounting.

Signed-off-by: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrea Righi <arighi@develer.com>
---

Changelog since v1:
- Rename (for clarity):
  - mem_cgroup_write_page_stat_item -> mem_cgroup_page_stat_item
  - mem_cgroup_read_page_stat_item -> mem_cgroup_nr_pages_item

 include/linux/memcontrol.h |   31 ++++++++++++++++++++++++++++---
 mm/memcontrol.c            |   16 +++++++---------
 mm/rmap.c                  |    4 ++--
 3 files changed, 37 insertions(+), 14 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 159a076..067115c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -25,6 +25,11 @@ struct page_cgroup;
 struct page;
 struct mm_struct;
 
+/* Stats that can be updated by kernel. */
+enum mem_cgroup_page_stat_item {
+	MEMCG_NR_FILE_MAPPED, /* # of pages charged as file rss */
+};
+
 extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 					struct list_head *dst,
 					unsigned long *scanned, int order,
@@ -121,7 +126,22 @@ static inline bool mem_cgroup_disabled(void)
 	return false;
 }
 
-void mem_cgroup_update_file_mapped(struct page *page, int val);
+void mem_cgroup_update_page_stat(struct page *page,
+				 enum mem_cgroup_page_stat_item idx,
+				 int val);
+
+static inline void mem_cgroup_inc_page_stat(struct page *page,
+					    enum mem_cgroup_page_stat_item idx)
+{
+	mem_cgroup_update_page_stat(page, idx, 1);
+}
+
+static inline void mem_cgroup_dec_page_stat(struct page *page,
+					    enum mem_cgroup_page_stat_item idx)
+{
+	mem_cgroup_update_page_stat(page, idx, -1);
+}
+
 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 						gfp_t gfp_mask);
 u64 mem_cgroup_get_limit(struct mem_cgroup *mem);
@@ -293,8 +313,13 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 {
 }
 
-static inline void mem_cgroup_update_file_mapped(struct page *page,
-							int val)
+static inline void mem_cgroup_inc_page_stat(struct page *page,
+					    enum mem_cgroup_page_stat_item idx)
+{
+}
+
+static inline void mem_cgroup_dec_page_stat(struct page *page,
+					    enum mem_cgroup_page_stat_item idx)
 {
 }
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a4034b6..369879a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1609,7 +1609,8 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
  * possibility of race condition. If there is, we take a lock.
  */
 
-static void mem_cgroup_update_file_stat(struct page *page, int idx, int val)
+void mem_cgroup_update_page_stat(struct page *page,
+				 enum mem_cgroup_page_stat_item idx, int val)
 {
 	struct mem_cgroup *mem;
 	struct page_cgroup *pc = lookup_page_cgroup(page);
@@ -1632,30 +1633,27 @@ static void mem_cgroup_update_file_stat(struct page *page, int idx, int val)
 			goto out;
 	}
 
-	this_cpu_add(mem->stat->count[idx], val);
-
 	switch (idx) {
-	case MEM_CGROUP_STAT_FILE_MAPPED:
+	case MEMCG_NR_FILE_MAPPED:
 		if (val > 0)
 			SetPageCgroupFileMapped(pc);
 		else if (!page_mapped(page))
 			ClearPageCgroupFileMapped(pc);
+		idx = MEM_CGROUP_STAT_FILE_MAPPED;
 		break;
 	default:
 		BUG();
 	}
 
+	this_cpu_add(mem->stat->count[idx], val);
+
 out:
 	if (unlikely(need_unlock))
 		unlock_page_cgroup(pc);
 	rcu_read_unlock();
 	return;
 }
-
-void mem_cgroup_update_file_mapped(struct page *page, int val)
-{
-	mem_cgroup_update_file_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, val);
-}
+EXPORT_SYMBOL(mem_cgroup_update_page_stat);
 
 /*
  * size of first charge trial. "32" comes from vmscan.c's magic value.
diff --git a/mm/rmap.c b/mm/rmap.c
index 1a8bf76..a66ab76 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -911,7 +911,7 @@ void page_add_file_rmap(struct page *page)
 {
 	if (atomic_inc_and_test(&page->_mapcount)) {
 		__inc_zone_page_state(page, NR_FILE_MAPPED);
-		mem_cgroup_update_file_mapped(page, 1);
+		mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED);
 	}
 }
 
@@ -949,7 +949,7 @@ void page_remove_rmap(struct page *page)
 		__dec_zone_page_state(page, NR_ANON_PAGES);
 	} else {
 		__dec_zone_page_state(page, NR_FILE_MAPPED);
-		mem_cgroup_update_file_mapped(page, -1);
+		mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED);
 	}
 	/*
 	 * It would be tidy to reset the PageAnon mapping here,
-- 
1.7.1

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH v3 02/11] memcg: document cgroup dirty memory interfaces
From: Greg Thelen @ 2010-10-19  0:39 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, linux-mm, containers, Andrea Righi, Balbir Singh,
	KAMEZAWA Hiroyuki, Daisuke Nishimura, Minchan Kim, Ciju Rajan K,
	David Rientjes, Greg Thelen
In-Reply-To: <1287448784-25684-1-git-send-email-gthelen@google.com>

Document cgroup dirty memory interfaces and statistics.

Signed-off-by: Andrea Righi <arighi@develer.com>
Signed-off-by: Greg Thelen <gthelen@google.com>
---

Changelog since v1:
- Renamed "nfs"/"total_nfs" to "nfs_unstable"/"total_nfs_unstable" in per cgroup
  memory.stat to match /proc/meminfo.

- Allow [kKmMgG] suffixes for newly created dirty limit value cgroupfs files.

- Describe a situation where a cgroup can exceed its dirty limit.

 Documentation/cgroups/memory.txt |   60 ++++++++++++++++++++++++++++++++++++++
 1 files changed, 60 insertions(+), 0 deletions(-)

diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 7781857..02bbd6f 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -385,6 +385,10 @@ mapped_file	- # of bytes of mapped file (includes tmpfs/shmem)
 pgpgin		- # of pages paged in (equivalent to # of charging events).
 pgpgout		- # of pages paged out (equivalent to # of uncharging events).
 swap		- # of bytes of swap usage
+dirty		- # of bytes that are waiting to get written back to the disk.
+writeback	- # of bytes that are actively being written back to the disk.
+nfs_unstable	- # of bytes sent to the NFS server, but not yet committed to
+		the actual storage.
 inactive_anon	- # of bytes of anonymous memory and swap cache memory on
 		LRU list.
 active_anon	- # of bytes of anonymous and swap cache memory on active
@@ -453,6 +457,62 @@ memory under it will be reclaimed.
 You can reset failcnt by writing 0 to failcnt file.
 # echo 0 > .../memory.failcnt
 
+5.5 dirty memory
+
+Control the maximum amount of dirty pages a cgroup can have at any given time.
+
+Limiting dirty memory is like fixing the max amount of dirty (hard to reclaim)
+page cache used by a cgroup.  So, in case of multiple cgroup writers, they will
+not be able to consume more than their designated share of dirty pages and will
+be forced to perform write-out if they cross that limit.
+
+The interface is equivalent to the procfs interface: /proc/sys/vm/dirty_*.  It
+is possible to configure a limit to trigger either a direct writeback or a
+background writeback performed by per-bdi flusher threads.  The root cgroup
+memory.dirty_* control files are read-only and match the contents of
+the /proc/sys/vm/dirty_* files.
+
+Per-cgroup dirty limits can be set using the following files in the cgroupfs:
+
+- memory.dirty_ratio: the amount of dirty memory (expressed as a percentage of
+  cgroup memory) at which a process generating dirty pages will itself start
+  writing out dirty data.
+
+- memory.dirty_limit_in_bytes: the amount of dirty memory (expressed in bytes)
+  in the cgroup at which a process generating dirty pages will start itself
+  writing out dirty data.  Suffix (k, K, m, M, g, or G) can be used to indicate
+  that value is kilo, mega or gigabytes.
+
+  Note: memory.dirty_limit_in_bytes is the counterpart of memory.dirty_ratio.
+  Only one of them may be specified at a time.  When one is written it is
+  immediately taken into account to evaluate the dirty memory limits and the
+  other appears as 0 when read.
+
+- memory.dirty_background_ratio: the amount of dirty memory of the cgroup
+  (expressed as a percentage of cgroup memory) at which background writeback
+  kernel threads will start writing out dirty data.
+
+- memory.dirty_background_limit_in_bytes: the amount of dirty memory (expressed
+  in bytes) in the cgroup at which background writeback kernel threads will
+  start writing out dirty data.  Suffix (k, K, m, M, g, or G) can be used to
+  indicate that value is kilo, mega or gigabytes.
+
+  Note: memory.dirty_background_limit_in_bytes is the counterpart of
+  memory.dirty_background_ratio.  Only one of them may be specified at a time.
+  When one is written it is immediately taken into account to evaluate the dirty
+  memory limits and the other appears as 0 when read.
+
+A cgroup may contain more dirty memory than its dirty limit.  This is possible
+because of the principle that the first cgroup to touch a page is charged for
+it.  Subsequent page counting events (dirty, writeback, nfs_unstable) are also
+counted to the originally charged cgroup.
+
+Example: If page is allocated by a cgroup A task, then the page is charged to
+cgroup A.  If the page is later dirtied by a task in cgroup B, then the cgroup A
+dirty count will be incremented.  If cgroup A is over its dirty limit but cgroup
+B is not, then dirtying a cgroup A page from a cgroup B task may push cgroup A
+over its dirty limit without throttling the dirtying cgroup B task.
+
 6. Hierarchy support
 
 The memory controller supports a deep hierarchy and hierarchical accounting.
-- 
1.7.1

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH v3 01/11] memcg: add page_cgroup flags for dirty page tracking
From: Greg Thelen @ 2010-10-19  0:39 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, linux-mm, containers, Andrea Righi, Balbir Singh,
	KAMEZAWA Hiroyuki, Daisuke Nishimura, Minchan Kim, Ciju Rajan K,
	David Rientjes, Greg Thelen
In-Reply-To: <1287448784-25684-1-git-send-email-gthelen@google.com>

Add additional flags to page_cgroup to track dirty pages
within a mem_cgroup.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrea Righi <arighi@develer.com>
Signed-off-by: Greg Thelen <gthelen@google.com>
---
 include/linux/page_cgroup.h |   23 +++++++++++++++++++++++
 1 files changed, 23 insertions(+), 0 deletions(-)

diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index 5bb13b3..b59c298 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -40,6 +40,9 @@ enum {
 	PCG_USED, /* this object is in use. */
 	PCG_ACCT_LRU, /* page has been accounted for */
 	PCG_FILE_MAPPED, /* page is accounted as "mapped" */
+	PCG_FILE_DIRTY, /* page is dirty */
+	PCG_FILE_WRITEBACK, /* page is under writeback */
+	PCG_FILE_UNSTABLE_NFS, /* page is NFS unstable */
 	PCG_MIGRATION, /* under page migration */
 };
 
@@ -59,6 +62,10 @@ static inline void ClearPageCgroup##uname(struct page_cgroup *pc)	\
 static inline int TestClearPageCgroup##uname(struct page_cgroup *pc)	\
 	{ return test_and_clear_bit(PCG_##lname, &pc->flags);  }
 
+#define TESTSETPCGFLAG(uname, lname)			\
+static inline int TestSetPageCgroup##uname(struct page_cgroup *pc)	\
+	{ return test_and_set_bit(PCG_##lname, &pc->flags);  }
+
 TESTPCGFLAG(Locked, LOCK)
 
 /* Cache flag is set only once (at allocation) */
@@ -80,6 +87,22 @@ SETPCGFLAG(FileMapped, FILE_MAPPED)
 CLEARPCGFLAG(FileMapped, FILE_MAPPED)
 TESTPCGFLAG(FileMapped, FILE_MAPPED)
 
+SETPCGFLAG(FileDirty, FILE_DIRTY)
+CLEARPCGFLAG(FileDirty, FILE_DIRTY)
+TESTPCGFLAG(FileDirty, FILE_DIRTY)
+TESTCLEARPCGFLAG(FileDirty, FILE_DIRTY)
+TESTSETPCGFLAG(FileDirty, FILE_DIRTY)
+
+SETPCGFLAG(FileWriteback, FILE_WRITEBACK)
+CLEARPCGFLAG(FileWriteback, FILE_WRITEBACK)
+TESTPCGFLAG(FileWriteback, FILE_WRITEBACK)
+
+SETPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS)
+CLEARPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS)
+TESTPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS)
+TESTCLEARPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS)
+TESTSETPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS)
+
 SETPCGFLAG(Migration, MIGRATION)
 CLEARPCGFLAG(Migration, MIGRATION)
 TESTPCGFLAG(Migration, MIGRATION)
-- 
1.7.1

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH v3 00/11] memcg: per cgroup dirty page accounting
From: Greg Thelen @ 2010-10-19  0:39 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, linux-mm, containers, Andrea Righi, Balbir Singh,
	KAMEZAWA Hiroyuki, Daisuke Nishimura, Minchan Kim, Ciju Rajan K,
	David Rientjes, Greg Thelen

Changes since v2:
- Rather than disabling softirq in lock_page_cgroup(), introduce a separate lock
  to synchronize between memcg page accounting and migration.  This only affects
  patch 4 of the series.  Patch 4 used to disable softirq, now it introduces the
  new lock.

Changes since v1:
- Renamed "nfs"/"total_nfs" to "nfs_unstable"/"total_nfs_unstable" in per cgroup
  memory.stat to match /proc/meminfo.
- Avoid lockdep warnings by using rcu_read_[un]lock() in
  mem_cgroup_has_dirty_limit().
- Fixed lockdep issue in mem_cgroup_read_stat() which is exposed by these
  patches.
- Remove redundant comments.
- Rename (for clarity):
  - mem_cgroup_write_page_stat_item -> mem_cgroup_page_stat_item
  - mem_cgroup_read_page_stat_item -> mem_cgroup_nr_pages_item
- Renamed newly created proc files:
  - memory.dirty_bytes -> memory.dirty_limit_in_bytes
  - memory.dirty_background_bytes -> memory.dirty_background_limit_in_bytes
- Removed unnecessary get_ prefix from get_xxx() functions.
- Allow [kKmMgG] suffixes for newly created dirty limit value cgroupfs files.
- Disable softirq rather than hardirq in lock_page_cgroup()
- Made mem_cgroup_move_account_page_stat() inline.
- Ported patches to mmotm-2010-10-13-17-13.

This patch set provides the ability for each cgroup to have independent dirty
page limits.

Limiting dirty memory is like fixing the max amount of dirty (hard to reclaim)
page cache used by a cgroup.  So, in case of multiple cgroup writers, they will
not be able to consume more than their designated share of dirty pages and will
be forced to perform write-out if they cross that limit.

The patches are based on a series proposed by Andrea Righi in Mar 2010.


Overview:
- Add page_cgroup flags to record when pages are dirty, in writeback, or nfs
  unstable.

- Extend mem_cgroup to record the total number of pages in each of the 
  interesting dirty states (dirty, writeback, unstable_nfs).  

- Add dirty parameters similar to the system-wide  /proc/sys/vm/dirty_*
  limits to mem_cgroup.  The mem_cgroup dirty parameters are accessible
  via cgroupfs control files.

- Consider both system and per-memcg dirty limits in page writeback when
  deciding to queue background writeback or block for foreground writeback.


Known shortcomings:
- When a cgroup dirty limit is exceeded, then bdi writeback is employed to
  writeback dirty inodes.  Bdi writeback considers inodes from any cgroup, not
  just inodes contributing dirty pages to the cgroup exceeding its limit.  


Performance data:
- A page fault microbenchmark workload was used to measure performance, which
  can be called in read or write mode:
        f = open(foo. $cpu)
        truncate(f, 4096)
        alarm(60)
        while (1) {
                p = mmap(f, 4096)
                if (write)
			*p = 1
		else
			x = *p
                munmap(p)
        }

- The workload was called for several points in the patch series in different
  modes:
  - s_read is a single threaded reader
  - s_write is a single threaded writer
  - p_read is a 16 thread reader, each operating on a different file
  - p_write is a 16 thread writer, each operating on a different file

- Measurements were collected on a 16 core non-numa system using "perf stat
  --repeat 3".  The -a option was used for parallel (p_*) runs.

- All numbers are page fault rate (M/sec).  Higher is better.

- To see the effect of these patches on a kernel without memcg, compare the
  first and last rows; neither has memcg configured.  The first row does not
  include any of these memcg patches.

- To compare the performance of using memcg dirty limits, compare the baseline
  (2nd row titled "w/ memcg") with the code and memcg enabled (2nd to last
  row titled "all patches").

                           root_cgroup                     child_cgroup
                 s_read s_write p_read p_write    s_read s_write p_read p_write
mmotm w/o memcg   0.424  0.400   0.420  0.395
w/ memcg          0.419  0.390   0.395  0.371      0.413  0.385   0.385  0.361
all patches       0.421  0.384   0.395  0.362      0.418  0.380   0.396  0.360
all patches       0.425  0.396   0.423  0.388
  w/o memcg


Balbir Singh (1):
  memcg: CPU hotplug lockdep warning fix

Greg Thelen (9):
  memcg: add page_cgroup flags for dirty page tracking
  memcg: document cgroup dirty memory interfaces
  memcg: create extensible page stat update routines
  memcg: add dirty page accounting infrastructure
  memcg: add kernel calls for memcg dirty page stats
  memcg: add dirty limits to mem_cgroup
  memcg: add cgroupfs interface to memcg dirty limits
  writeback: make determine_dirtyable_memory() static.
  memcg: check memcg dirty limits in page writeback

KAMEZAWA Hiroyuki (1):
  memcg: add lock to synchronize page accounting and migration

 Documentation/cgroups/memory.txt |   60 ++++++
 fs/nfs/write.c                   |    4 +
 include/linux/memcontrol.h       |   78 +++++++-
 include/linux/page_cgroup.h      |   54 +++++-
 include/linux/writeback.h        |    2 -
 mm/filemap.c                     |    1 +
 mm/memcontrol.c                  |  417 ++++++++++++++++++++++++++++++++++++--
 mm/page-writeback.c              |  213 +++++++++++++-------
 mm/rmap.c                        |    4 +-
 mm/truncate.c                    |    1 +
 10 files changed, 726 insertions(+), 108 deletions(-)

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* a SimpleMessenger fix
From: Colin P. McCabe @ 2010-10-19  0:39 UTC (permalink / raw)
  To: Gregory Farnum; +Cc: Sage Weil, ceph-devel

cc'ing list

On Mon, Oct 18, 2010 at 5:12 PM, Gregory Farnum <gregf@hq.newdream.net> wrote:
> On Mon, Oct 18, 2010 at 4:57 PM, Colin P. McCabe <colinm@hq.newdream.net> wrote:
>> Hi guys,
>>
>> See if you agree with this patch. My understanding is that if we're
>> doing a non-replace open operation, "existing" should always be zero.
>> However, it is possible that it is unintentionally set to something
>> else because of a previous run through the while(1) loop. This should
>> fix that.
>>
>> diff --git a/src/msg/SimpleMessenger.cc b/src/msg/SimpleMessenger.cc
>> index f826448..277acdb 100644
>> --- a/src/msg/SimpleMessenger.cc
>> +++ b/src/msg/SimpleMessenger.cc
>> @@ -782,6 +782,7 @@ int SimpleMessenger::Pipe::accept()
>>      } else {
>>        // new session
>>        dout(10) << "accept new session" << dendl;
>> +      existing = NULL;
>>        goto open;
>>      }
>>      assert(0);
>> --
>> 1.6.6.1
>>
> I'm pretty sure that actually can't happen -- the existing pipe isn't
> removed from queues[1] until after you've exited the loop. :) No
> reason we can't make it explicit, though!
> -Greg
>
> [1]:
>  replace:
>  replace = true;
>  if (connect.features & CEPH_FEATURE_RECONNECT_SEQ) {
>    reply_tag = CEPH_MSGR_TAG_SEQ;
>  }
>  dout(10) << "accept replacing " << existing << dendl;
>  existing->stop();
>  existing->unregister_pipe();
>
>  if (!existing->policy.lossy) { /* if we're lossy, we can lose messages and
>                                    should let the daemon handle it itself.
>    Otherwise, take over other Connection so we don't lose older messages */
>    existing->connection_state->clear_pipe();
>    existing->connection_state->pipe = get();
>    existing->connection_state->put();
>    existing->connection_state = NULL;
>  }
>

Well, what if the while(1) loop executes two times? The first time, we
take the branch

>    if (messenger->rank_pipe.count(peer_addr)) {
>      existing = messenger->rank_pipe[peer_addr];
>   ...etc etc...

but then for some reason we "goto reply", which brings us back up to
the top of the loop.
Then, the next time, we take the other branch of the if statement, and
end up doing "goto open". (You would know better than I whether it is
possible for us to take the other branch the second time around.
However, since we've dropped the messenger lock before doing "goto
reply", it seems reasonable to me that the messenger's state might
have changed).
Meanwhile, "existing" is still set to the value it had during the
previous loop iteration.

Then tcp_write fails, and we encounter:
>    if (existing)
>      existing->pipe_lock.Unlock();

Which tries to unlock a mutex that we don't have.

Anyway, that's the theory. I guess like you said, we should take the
patch since it clarifies things.

cheers,
Colin McCabe
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] omap4: pandaboard: fix up mmc card detect logic
From: Tony Lindgren @ 2010-10-19  0:31 UTC (permalink / raw)
  To: Madhusudhan
  Cc: 'Nishanth Menon', 'l-o', 'Dave',
	'Adrian Hunter', 'Samuel Ortiz'
In-Reply-To: <2E5804DAE93D403D9F7FCEDBF6367E59@am.dhcp.ti.com>

* Madhusudhan <madhu.cr@ti.com> [101018 09:57]:
> 
> 
> > -----Original Message-----
> > From: Nishanth Menon [mailto:nm@ti.com]
> > Sent: Friday, October 15, 2010 8:25 AM
> > To: l-o
> > Cc: Dave; Nishanth Menon; Tony Lindgren; Madhusudhan Chikkature; Adrian
> > Hunter; Samuel Ortiz
> > Subject: [PATCH] omap4: pandaboard: fix up mmc card detect logic
> > 
> > For MMC1 Controller, card detect interrupt source is
> > twl6030 which is non-gpio. The card detect call back function provides
> > card present/absent status by reading MMC Control register present
> > on twl6030. This functionality was introduced in mfd tree on
> > track to kernel.org
> > 
> > Sync pandaboard to the same and make mmc work.
> > 
> > Cc: Tony Lindgren <tony@atomide.com>
> > Cc: Madhusudhan Chikkature <madhu.cr@ti.com>
> > Cc: Adrian Hunter <adrian.hunter@nokia.com>
> > Cc: Samuel Ortiz <sameo@linux.intel.com>
> > 
> > Acked-by: Kishore Kadiyala <kishore.kadiyala@ti.com>
> 
> The patch looks good.
> Acked-by: Madhusudhan Chikkature <madhu.cr@ti.com>
> 
> > Signed-off-by: Nishanth Menon <nm@ti.com>
> > ---
> > 
> > Depends on
> > http://git.kernel.org/?p=linux/kernel/git/sameo/mfd-
> > 2.6.git;a=commitdiff;h=1bf5197061a4aec99e9fd4f92d4a543310f83585;hp=0c9b33e
> > 5a23e2053165c9e30ffff3b3a3cf1b2b8
> > This patch probably should be squashed to that.

Nishanth, please post one more time with linux-arm-kernel list
also cc'd for review and add all the acks.

After that, Samuel, can you please apply it to your queue because
of the dependency above?

Acked-by: Tony Lindgren <tony@atomide.com>

^ permalink raw reply

* RE: linux-next: build failure after merge of the sound-asoc tree
From: Peter Hsiang @ 2010-10-19  0:30 UTC (permalink / raw)
  To: Mark Brown
  Cc: Stephen Rothwell, Liam Girdwood, linux-next@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <20101019000730.GA27248@opensource.wolfsonmicro.com>

On Mon, Oct 18, 2010, Mark Brown wrote:
> On Mon, Oct 18, 2010 at 01:25:12PM -0700, Peter Hsiang wrote:
> 
> > This message seems to be saying that this header file is missing.
> > However, this header file was included in the patch that was submitted,
> > and it did compile fine here before the submit.
> > I don't understand why it's not there after it was applied?
> 
> Your patch was apparently not generated against the mainline Makefile or
> Kconfig and the automated conflict resolution tool I used (which is
> normally fairly reliable) appears to have misplaced the header.  I've
> restored the driver with the readdition of the header.

.../pub/scm/linux/kernel/git/next/linux-next.git  Right?
Thanks!

Peter

^ permalink raw reply

* Re: [RFC/PATCH] reset: accept "git reset <removed file>"
From: Jonathan Nieder @ 2010-10-19  0:23 UTC (permalink / raw)
  To: Junio C Hamano
  Cc: Thore Husfeldt, git, Scott Chacon, Matthieu Moy, Jakub Narebski,
	Sverre Rabbelier
In-Reply-To: <7viq0z2gxj.fsf@alter.siamese.dyndns.org>

Junio C Hamano wrote:

> Makes me wonder
>
>  - if we can/want to have a logic like this inside verify_filename();

Yes, I think so.  I was worried that this would be confusing for some
command that looks to the worktree, like git grep without --cached,
but I suspect that worry was unfounded.

The one case I am worried about is "git rev-parse".  What is
"git rev-parse <path>" supposed to be used for?

>  - if we need a corresponding logic in either the previous else/if cascade
>    that calls verify_non_filename(), or in verify_non_filename() itself.

Yes.

Is it safe to load the index so early?  I can imagine a person trying
"git reset" to recover from a corrupted index; are we regressing in
that respect and how would one check for it?

^ permalink raw reply

* [PATCH net-next] socket: localize functions
From: Stephen Hemminger @ 2010-10-19  0:27 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

A couple of functions in socket.c are only used there and
should be localized.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

--- a/include/linux/socket.h	2010-10-18 17:06:18.455644277 -0700
+++ b/include/linux/socket.h	2010-10-18 17:06:26.763083909 -0700
@@ -326,7 +326,6 @@ extern long verify_iovec(struct msghdr *
 extern int memcpy_toiovec(struct iovec *v, unsigned char *kdata, int len);
 extern int memcpy_toiovecend(const struct iovec *v, unsigned char *kdata,
 			     int offset, int len);
-extern int move_addr_to_user(struct sockaddr *kaddr, int klen, void __user *uaddr, int __user *ulen);
 extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr *kaddr);
 extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data);
 
--- a/net/socket.c	2010-10-18 17:06:18.475642930 -0700
+++ b/net/socket.c	2010-10-18 17:12:53.001522923 -0700
@@ -209,8 +209,8 @@ int move_addr_to_kernel(void __user *uad
  *	specified. Zero is returned for a success.
  */
 
-int move_addr_to_user(struct sockaddr *kaddr, int klen, void __user *uaddr,
-		      int __user *ulen)
+static int move_addr_to_user(struct sockaddr *kaddr, int klen,
+			     void __user *uaddr, int __user *ulen)
 {
 	int err;
 	int len;
@@ -661,7 +661,8 @@ void __sock_recv_timestamp(struct msghdr
 }
 EXPORT_SYMBOL_GPL(__sock_recv_timestamp);
 
-inline void sock_recv_drops(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
+static inline void sock_recv_drops(struct msghdr *msg, struct sock *sk,
+				   struct sk_buff *skb)
 {
 	if (sock_flag(sk, SOCK_RXQ_OVFL) && skb && skb->dropcount)
 		put_cmsg(msg, SOL_SOCKET, SO_RXQ_OVFL,

^ permalink raw reply

* Re: [Qemu-devel] Snapshots ide0-hd0 issue
From: Ubuntu Explorer @ 2010-10-19  0:18 UTC (permalink / raw)
  To: qemu-devel
In-Reply-To: <AANLkTikxUc3nyehb+BRs_1RhLAmypW6vLOz=uYKjM-96@mail.gmail.com>

[-- Attachment #1: Type: text/plain, Size: 2157 bytes --]

I also read the following

VM snapshots currently have the following known limitations:
They cannot cope with removable devices if they are removed or inserted
after a snapshot is done.
A few device drivers still have incomplete snapshot support so their state
is not saved or restored properly (in particular USB).

I am using an ide0-hd0 device option with removable=0 and ro=0.
Are there any additional options to be set?
--


On Tue, Oct 19, 2010 at 8:51 AM, Ubuntu Explorer
<ubuntuexplorer@gmail.com>wrote:

> Thanks for your help.
>
> But, after commenting out snapshot option, I still cannot save the VM state
> into the ide0-hd0 block device.
>
> Here is some more information about the problem.
>
> I am trying to do the following
> a. info block
>     shows virtio, ide0-hd0
> b. savevm snapshot_name
> c. info snapshots
>     Shows snapshot_name under virtio
> d. commit ide0-hd0
> e. quit
> f. Check timestamp of ide0 file - no change. ( I assume that qemu would
> write something to this file)
> g. restart qemu.
> h. info snapshots
> i. No snapshots in "virtio"
>
> I will try to run qemu in gdb mode to see why commit is not committing the
> changes to the ide0-hd0 block device.
> But any other information will be helpful as well. I have googled a lot
> without much luck.
>
> Regards
> UE.
>
>
> On Mon, Oct 18, 2010 at 3:17 PM, Stefan Hajnoczi <stefanha@gmail.com>wrote:
>
>> On Mon, Oct 18, 2010 at 12:37 AM, Ubuntu Explorer
>> <ubuntuexplorer@gmail.com> wrote:
>> > I am trying to implement snapshot saving and loading from command line
>> using
>> > qemu. I am using both the drive and disk options as follows.
>> > <qemu exe> \
>> > --disk <path to disk file> \
>> > ...other options \
>> > -drive file=<path to drive file>,
>> > index=0,media=disk,snapshot=on,if=ide,type=drive,cache=writethrough
>>
>> Remove snapshot=on.  See the documentation about -snapshot versus
>> savevm snapshots:
>>
>> http://wiki.qemu.org/download/qemu-doc.html#vm_005fsnapshots
>>
>> "When using the (unrelated) -snapshot option (Snapshot mode), you can
>> always make VM snapshots, but they are deleted as soon as you exit
>> QEMU."
>>
>> Stefan
>>
>
>

[-- Attachment #2: Type: text/html, Size: 4358 bytes --]

^ permalink raw reply

* Re: [PATCH 0/2] Series short description
From: David Gibson @ 2010-10-19  0:27 UTC (permalink / raw)
  To: John Bonesio; +Cc: devicetree-discuss-uLR06cmDAlY/bJ5BZ2RsiQ
In-Reply-To: <20101018220341.25805.71490.stgit@riker>

On Mon, Oct 18, 2010 at 03:09:07PM -0700, John Bonesio wrote:
> The following series implements...
> 
> a new dtsi file for mpc5200 systems, and modifies media5200.dts file
> to use it.
> 
> The diff for the media5200.dts file may be confusing. I'll send the
> changed file in a separate email as an attachment as well.
> 
> This should at least let us see how the changes to the dts syntax look
> in a real example.

Um.. what is this patch actually against?  It doesn't appear to be
either the dtc tree or the kernel...

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

^ permalink raw reply

* Re: pnfs-nfs-utils compile errors
From: Peng Tao @ 2010-10-19  0:26 UTC (permalink / raw)
  To: Jim Rees; +Cc: linux-nfs
In-Reply-To: <20101018160953.GA19529-8f4Pc2RrbJmHXe+LvDLADg@public.gmane.org>

On Tue, Oct 19, 2010 at 12:09 AM, Jim Rees <rees@umich.edu> wrote:
> Peng Tao wrote:
>
>  I fail to compile with pnfs-nfs-utils master branch (commit
>  48a8d6b13b2f0b720267b1af104addb7426e7cc9). It fails in blkmapd
>  directory. Are the files cfg.c and cfg.h missing or deleted?
>
> Yes, they were removed in commit fb9c48c5e1ce2023d3dc2e279537e158130a6ba0,
> "blkmapd: get rid of config file and instead examine all block devices."
> Maybe you need to do a autogen.sh or make distclean?
Thanks, Jim. After running autogen.sh, I compile successfully.
>

-- 
Thanks,
-Bergwolf

^ permalink raw reply

* Re: [PATCH v2 06/12] regulator: add driver for tps6524x regulator
From: Mark Brown @ 2010-10-19  0:22 UTC (permalink / raw)
  To: Cyril Chemparathy
  Cc: grant.likely-s3s/WqlpOiPyB63q8FvJNQ,
	spi-devel-general-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	davinci-linux-open-source-VycZQUHpC/PFrsHnngEfi1aTQe2KTcn/,
	dbrownell-Rn4VEauK+AKRv+LV9MX5uipxlwaOVQ5f,
	lrg-kDsPt+C1G03kYMGBc/C6ZA
In-Reply-To: <1287429922-18870-7-git-send-email-cyril-l0cyMroinI0@public.gmane.org>

On Mon, Oct 18, 2010 at 03:25:16PM -0400, Cyril Chemparathy wrote:
> TPS6524X provides three step-down converters and two general-purpose LDO
> voltage regulators.  This device is interfaced using SPI.
> 
> Signed-off-by: Cyril Chemparathy <cyril-l0cyMroinI0@public.gmane.org>

Acked-by: Mark Brown <broonie-yzvPICuk2AATkU/dhu1WVueM+bqZidxxQQ4Iyu8u01E@public.gmane.org>

^ permalink raw reply

* Re: [PATCH] [PERF] (Userspace Tools) Fix a compilation error with -fstack-protector and -Werror
From: Frederic Weisbecker @ 2010-10-19  0:20 UTC (permalink / raw)
  To: Brian Gitonga Marete, Ingo Molnar, Arnaldo Carvalho de Melo
  Cc: LKML, Peter Zijlstra
In-Reply-To: <AANLkTi=Ssz4WdMDXq-93GE9bPHoaQ8QyTSrWXDSCa7eH@mail.gmail.com>

On Tue, Oct 19, 2010 at 03:06:41AM +0300, Brian Gitonga Marete wrote:
> On Tue, Oct 19, 2010 at 2:38 AM, Frederic Weisbecker <fweisbec@gmail.com> wrote:
> > On Tue, Oct 19, 2010 at 02:24:00AM +0300, Brian Gitonga Marete wrote:
> >> The following patch fixes compilation of the perf user-space tools on,
> >> for example, gcc version 4.3.3 (Ubuntu 4.3.3-5ubuntu4) . It should not
> >> break anything else.
> >
> >
> >
> > Hi,
> >
> > What kind of warning did you encounter, and why does this patch fix it?
> > Can you describe that in your changelog?
> >
> 
> Hello Frederic,
> 
> Some versions of gcc, e.g. gcc version 4.3.3 (Ubuntu 4.3.3-5ubuntu4), have
> the (default) minimum size of buffers protected by `-fstack-protector' set
> to 8. But in perf, there exist much smaller automatic buffers.
> 
> This in combination with the -fstack-protector-all, -Werror and
> -Wstack-protector causes the compile to fail for such a compiler.
> 
> The error encountered with the above-mentioned compiler is:
> 
> gcc -o util/ui/util.o -c -ggdb3 -Wall -Wextra -std=gnu99 -Werror -O6
> -D_FORTIFY_SOURCE=2 -Wformat -Wformat-security -Wformat-y2k -Wshadow
> -Winit-self -Wpacked -Wredundant-decls -Wstack-protector
> -Wstrict-aliasing=3 -Wswitch-default -Wswitch-enum -Wno-system-headers
> -Wundef -Wvolatile-register-var -Wwrite-strings -Wbad-function-cast
> -Wmissing-declarations -Wmissing-prototypes -Wnested-externs
> -Wold-style-definition -Wstrict-prototypes
> -Wdeclaration-after-statement  -fstack-protector-all
> -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -Iutil/include
> -Iarch/x86/include -DLIBELF_NO_MMAP -I/usr/include/elfutils
> -DDWARF_SUPPORT -I/usr/include/slang -DSHA1_HEADER='<openssl/sha.h>'
> util/ui/util.c
> cc1: warnings being treated as errors
> util/ui/util.c: In function ‘ui__dialog_yesno’:
> util/ui/util.c:110: error: not protecting function: no buffer at least
> 8 bytes long


Doh! So that was the reason for this warning. Yeah, looks like the right fix.
And that fixes the issue for me.

Tested-by: Frederic Weisbecker <fweisbec@gmail.com>

Thanks!


> 
> Signed-off-by: Brian Gitonga Marete <marete@toshnix.com>
> ---
>  tools/perf/Makefile |    2 +-
>  1 files changed, 1 insertions(+), 1 deletions(-)
> 
> diff --git a/tools/perf/Makefile b/tools/perf/Makefile
> index 1950e19..64eb2ea 100644
> --- a/tools/perf/Makefile
> +++ b/tools/perf/Makefile
> @@ -288,7 +288,7 @@ endif
>  -include feature-tests.mak
> 
>  ifeq ($(call try-cc,$(SOURCE_HELLO),-Werror -fstack-protector-all),y)
> -	CFLAGS := $(CFLAGS) -fstack-protector-all
> +	CFLAGS := $(CFLAGS) -fstack-protector-all --param ssp-buffer-size=1
>  endif
> 
> 
> -- 
> 1.6.0.4
> 
> 
> -- 
> Brian Gitonga Marete
> Toshnix Systems
> Tel: +254722151590


^ permalink raw reply

* [PATCH net-next-2.6 3/5] jme: Prevent possible read re-order error
From: Guo-Fu Tseng @ 2010-10-19  0:10 UTC (permalink / raw)
  To: David Miller; +Cc: Guo-Fu Tseng, linux-netdev
In-Reply-To: <1287447044-24471-1-git-send-email-cooldavid@cooldavid.org>

From: Guo-Fu Tseng <cooldavid@cooldavid.org>

Add a read memory barrier between reading the flag and reading the data
of receive descriptors. This prevents the data from being read before the
hardware has finished writing the information.

Reported-by: Stefan Hajnoczi <stefanha@gmail.com>
Signed-off-by: Guo-Fu Tseng <cooldavid@cooldavid.org>
---
 drivers/net/jme.c |    1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/drivers/net/jme.c b/drivers/net/jme.c
index 0ea0da3..095f899 100644
--- a/drivers/net/jme.c
+++ b/drivers/net/jme.c
@@ -989,6 +989,7 @@ jme_process_receive(struct jme_adapter *jme, int limit)
 			goto out;
 		--limit;
 
+		rmb();
 		desccnt = rxdesc->descwb.desccnt & RXWBDCNT_DCNT;
 
 		if (unlikely(desccnt > 1 ||
-- 
1.7.2.2


^ permalink raw reply related

* [PATCH net-next-2.6 4/5] jme: Adding mii-tool support
From: Guo-Fu Tseng @ 2010-10-19  0:10 UTC (permalink / raw)
  To: David Miller; +Cc: Guo-Fu Tseng, linux-netdev
In-Reply-To: <1287447044-24471-1-git-send-email-cooldavid@cooldavid.org>

From: Guo-Fu Tseng <cooldavid@cooldavid.org>

Add mii-tool support, because some distributions only have mii-tool
installed by default.

Signed-off-by: Guo-Fu Tseng <cooldavid@cooldavid.org>
---
 drivers/net/jme.c |   34 +++++++++++++++++++++++++++++++++-
 1 files changed, 33 insertions(+), 1 deletions(-)

diff --git a/drivers/net/jme.c b/drivers/net/jme.c
index 095f899..c34c70f 100644
--- a/drivers/net/jme.c
+++ b/drivers/net/jme.c
@@ -2411,8 +2411,37 @@ jme_set_settings(struct net_device *netdev,
 	if (!rc) {
 		if (fdc)
 			jme_reset_link(jme);
-		set_bit(JME_FLAG_SSET, &jme->flags);
 		jme->old_ecmd = *ecmd;
+		set_bit(JME_FLAG_SSET, &jme->flags);
+	}
+
+	return rc;
+}
+
+static int
+jme_ioctl(struct net_device *netdev, struct ifreq *rq, int cmd)
+{
+	int rc;
+	struct jme_adapter *jme = netdev_priv(netdev);
+	struct mii_ioctl_data *mii_data = if_mii(rq);
+	unsigned int duplex_chg;
+
+	if (cmd == SIOCSMIIREG) {
+		u16 val = mii_data->val_in;
+		if (!(val & (BMCR_RESET|BMCR_ANENABLE)) &&
+		    (val & BMCR_SPEED1000))
+			return -EINVAL;
+	}
+
+	spin_lock_bh(&jme->phy_lock);
+	rc = generic_mii_ioctl(&jme->mii_if, mii_data, cmd, &duplex_chg);
+	spin_unlock_bh(&jme->phy_lock);
+
+	if (!rc && (cmd == SIOCSMIIREG)) {
+		if (duplex_chg)
+			jme_reset_link(jme);
+		jme_get_settings(netdev, &jme->old_ecmd);
+		set_bit(JME_FLAG_SSET, &jme->flags);
 	}
 
 	return rc;
@@ -2692,6 +2721,7 @@ static const struct net_device_ops jme_netdev_ops = {
 	.ndo_open		= jme_open,
 	.ndo_stop		= jme_close,
 	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_do_ioctl		= jme_ioctl,
 	.ndo_start_xmit		= jme_start_xmit,
 	.ndo_set_mac_address	= jme_set_macaddr,
 	.ndo_set_multicast_list	= jme_set_multi,
@@ -2883,6 +2913,8 @@ jme_init_one(struct pci_dev *pdev,
 		jme->mii_if.supports_gmii = true;
 	else
 		jme->mii_if.supports_gmii = false;
+	jme->mii_if.phy_id_mask = 0x1F;
+	jme->mii_if.reg_num_mask = 0x1F;
 	jme->mii_if.mdio_read = jme_mdio_read;
 	jme->mii_if.mdio_write = jme_mdio_write;
 
-- 
1.7.2.2


^ permalink raw reply related

* [PATCH net-next-2.6 2/5] jme: Add comment in jme_set_settings
From: Guo-Fu Tseng @ 2010-10-19  0:10 UTC (permalink / raw)
  To: David Miller; +Cc: Guo-Fu Tseng, linux-netdev
In-Reply-To: <1287447044-24471-1-git-send-email-cooldavid@cooldavid.org>

From: Guo-Fu Tseng <cooldavid@cooldavid.org>

Explains what `fdc` variable is for.

Signed-off-by: Guo-Fu Tseng <cooldavid@cooldavid.org>
---
 drivers/net/jme.c |    9 ++++++---
 1 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/net/jme.c b/drivers/net/jme.c
index e04f180..0ea0da3 100644
--- a/drivers/net/jme.c
+++ b/drivers/net/jme.c
@@ -2394,6 +2394,10 @@ jme_set_settings(struct net_device *netdev,
 	if (ecmd->speed == SPEED_1000 && ecmd->autoneg != AUTONEG_ENABLE)
 		return -EINVAL;
 
+	/*
+	 * Check If user changed duplex only while force_media.
+	 * Hardware would not generate link change interrupt.
+	 */
 	if (jme->mii_if.force_media &&
 	ecmd->autoneg != AUTONEG_ENABLE &&
 	(jme->mii_if.full_duplex != ecmd->duplex))
@@ -2403,10 +2407,9 @@ jme_set_settings(struct net_device *netdev,
 	rc = mii_ethtool_sset(&(jme->mii_if), ecmd);
 	spin_unlock_bh(&jme->phy_lock);
 
-	if (!rc && fdc)
-		jme_reset_link(jme);
-
 	if (!rc) {
+		if (fdc)
+			jme_reset_link(jme);
 		set_bit(JME_FLAG_SSET, &jme->flags);
 		jme->old_ecmd = *ecmd;
 	}
-- 
1.7.2.2


^ permalink raw reply related

* [PATCH net-next-2.6 5/5] jme: Advance version number
From: Guo-Fu Tseng @ 2010-10-19  0:10 UTC (permalink / raw)
  To: David Miller; +Cc: Guo-Fu Tseng, linux-netdev
In-Reply-To: <1287447044-24471-1-git-send-email-cooldavid@cooldavid.org>

From: Guo-Fu Tseng <cooldavid@cooldavid.org>

Advance version number and update copyright info

Signed-off-by: Guo-Fu Tseng <cooldavid@cooldavid.org>
---
 drivers/net/jme.c |    1 +
 drivers/net/jme.h |    3 ++-
 2 files changed, 3 insertions(+), 1 deletions(-)

diff --git a/drivers/net/jme.c b/drivers/net/jme.c
index c34c70f..d7a975e 100644
--- a/drivers/net/jme.c
+++ b/drivers/net/jme.c
@@ -3,6 +3,7 @@
  *
  * Copyright 2008 JMicron Technology Corporation
  * http://www.jmicron.com/
+ * Copyright (c) 2009 - 2010 Guo-Fu Tseng <cooldavid@cooldavid.org>
  *
  * Author: Guo-Fu Tseng <cooldavid@cooldavid.org>
  *
diff --git a/drivers/net/jme.h b/drivers/net/jme.h
index 1360f68..eac0926 100644
--- a/drivers/net/jme.h
+++ b/drivers/net/jme.h
@@ -3,6 +3,7 @@
  *
  * Copyright 2008 JMicron Technology Corporation
  * http://www.jmicron.com/
+ * Copyright (c) 2009 - 2010 Guo-Fu Tseng <cooldavid@cooldavid.org>
  *
  * Author: Guo-Fu Tseng <cooldavid@cooldavid.org>
  *
@@ -25,7 +26,7 @@
 #define __JME_H_INCLUDED__
 
 #define DRV_NAME	"jme"
-#define DRV_VERSION	"1.0.6"
+#define DRV_VERSION	"1.0.7"
 #define PFX		DRV_NAME ": "
 
 #define PCI_DEVICE_ID_JMICRON_JMC250	0x0250
-- 
1.7.2.2


^ permalink raw reply related

* [PATCH net-next-2.6 1/5] jme: Fix PHY power-off error
From: Guo-Fu Tseng @ 2010-10-19  0:10 UTC (permalink / raw)
  To: David Miller; +Cc: Guo-Fu Tseng, linux-netdev, stable

From: Guo-Fu Tseng <cooldavid@cooldavid.org>

Add phy_on as the counterpart to phy_off: it clears the BMCR power-down bit.

Signed-off-by: Guo-Fu Tseng <cooldavid@cooldavid.org>
Cc: <stable@kernel.org>
---
 drivers/net/jme.c |   22 ++++++++++++++++++----
 1 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/drivers/net/jme.c b/drivers/net/jme.c
index c04c096..e04f180 100644
--- a/drivers/net/jme.c
+++ b/drivers/net/jme.c
@@ -1574,6 +1574,16 @@ jme_free_irq(struct jme_adapter *jme)
 	}
 }
 
+static inline void
+jme_phy_on(struct jme_adapter *jme)
+{
+	u32 bmcr;
+
+	bmcr = jme_mdio_read(jme->dev, jme->mii_if.phy_id, MII_BMCR);
+	bmcr &= ~BMCR_PDOWN;
+	jme_mdio_write(jme->dev, jme->mii_if.phy_id, MII_BMCR, bmcr);
+}
+
 static int
 jme_open(struct net_device *netdev)
 {
@@ -1594,10 +1604,12 @@ jme_open(struct net_device *netdev)
 
 	jme_start_irq(jme);
 
-	if (test_bit(JME_FLAG_SSET, &jme->flags))
+	if (test_bit(JME_FLAG_SSET, &jme->flags)) {
+		jme_phy_on(jme);
 		jme_set_settings(netdev, &jme->old_ecmd);
-	else
+	} else {
 		jme_reset_phy_processor(jme);
+	}
 
 	jme_reset_link(jme);
 
@@ -3005,10 +3017,12 @@ jme_resume(struct pci_dev *pdev)
 	jme_clear_pm(jme);
 	pci_restore_state(pdev);
 
-	if (test_bit(JME_FLAG_SSET, &jme->flags))
+	if (test_bit(JME_FLAG_SSET, &jme->flags)) {
+		jme_phy_on(jme);
 		jme_set_settings(netdev, &jme->old_ecmd);
-	else
+	} else {
 		jme_reset_phy_processor(jme);
+	}
 
 	jme_start_irq(jme);
 	netif_device_attach(netdev);
-- 
1.7.2.2


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.