linux-mm.kvack.org archive mirror
* [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure
  2010-03-04 10:40 [PATCH -mmotm 0/4] memcg: per cgroup dirty limit (v4) Andrea Righi
@ 2010-03-04 10:40 ` Andrea Righi
  2010-03-04 11:54   ` Kirill A. Shutemov
  2010-03-05  1:12   ` Daisuke Nishimura
  0 siblings, 2 replies; 41+ messages in thread
From: Andrea Righi @ 2010-03-04 10:40 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki, Balbir Singh
  Cc: Vivek Goyal, Peter Zijlstra, Trond Myklebust, Suleiman Souhlal,
	Greg Thelen, Daisuke Nishimura, Kirill A. Shutemov, Andrew Morton,
	containers, linux-kernel, linux-mm, Andrea Righi

Infrastructure to account dirty pages per cgroup and add dirty limit
interfaces in the cgroupfs:

 - Direct write-out: memory.dirty_ratio, memory.dirty_bytes

 - Background write-out: memory.dirty_background_ratio, memory.dirty_background_bytes
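
For illustration only (not part of this patch set): a minimal userspace
sketch that configures the new files. The mount point and cgroup name
"/cgroups/memory/foo" are just examples.

#include <stdio.h>
#include <stdlib.h>

/* Assumed location of the memory cgroup to configure. */
#define MEMCG "/cgroups/memory/foo"

static void set_knob(const char *file, const char *val)
{
	char path[128];
	FILE *f;

	snprintf(path, sizeof(path), MEMCG "/%s", file);
	f = fopen(path, "w");
	if (!f) {
		perror(path);
		exit(1);
	}
	fprintf(f, "%s\n", val);
	fclose(f);
}

int main(void)
{
	/* start direct write-out at 10% of the cgroup's dirtyable memory */
	set_knob("memory.dirty_ratio", "10");
	/* start background write-out at 5% */
	set_knob("memory.dirty_background_ratio", "5");
	return 0;
}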

Signed-off-by: Andrea Righi <arighi@develer.com>
---
 include/linux/memcontrol.h |   80 ++++++++-
 mm/memcontrol.c            |  420 +++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 450 insertions(+), 50 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 1f9b119..cc3421b 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -19,12 +19,66 @@
 
 #ifndef _LINUX_MEMCONTROL_H
 #define _LINUX_MEMCONTROL_H
+
+#include <linux/writeback.h>
 #include <linux/cgroup.h>
+
 struct mem_cgroup;
 struct page_cgroup;
 struct page;
 struct mm_struct;
 
+/* Cgroup memory statistics items exported to the kernel */
+enum mem_cgroup_page_stat_item {
+	MEMCG_NR_DIRTYABLE_PAGES,
+	MEMCG_NR_RECLAIM_PAGES,
+	MEMCG_NR_WRITEBACK,
+	MEMCG_NR_DIRTY_WRITEBACK_PAGES,
+};
+
+/* Dirty memory parameters */
+struct dirty_param {
+	int dirty_ratio;
+	unsigned long dirty_bytes;
+	int dirty_background_ratio;
+	unsigned long dirty_background_bytes;
+};
+
+/*
+ * Statistics for memory cgroup.
+ */
+enum mem_cgroup_stat_index {
+	/*
+	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
+	 */
+	MEM_CGROUP_STAT_CACHE,	   /* # of pages charged as cache */
+	MEM_CGROUP_STAT_RSS,	   /* # of pages charged as anon rss */
+	MEM_CGROUP_STAT_FILE_MAPPED,  /* # of pages charged as file rss */
+	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
+	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
+	MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
+	MEM_CGROUP_EVENTS,	/* incremented at every  pagein/pageout */
+	MEM_CGROUP_STAT_FILE_DIRTY,   /* # of dirty pages in page cache */
+	MEM_CGROUP_STAT_WRITEBACK,   /* # of pages under writeback */
+	MEM_CGROUP_STAT_WRITEBACK_TEMP,   /* # of pages under writeback using
+						temporary buffers */
+	MEM_CGROUP_STAT_UNSTABLE_NFS,   /* # of NFS unstable pages */
+
+	MEM_CGROUP_STAT_NSTATS,
+};
+
+/*
+ * TODO: provide a validation check routine. And retry if validation
+ * fails.
+ */
+static inline void get_global_dirty_param(struct dirty_param *param)
+{
+	param->dirty_ratio = vm_dirty_ratio;
+	param->dirty_bytes = vm_dirty_bytes;
+	param->dirty_background_ratio = dirty_background_ratio;
+	param->dirty_background_bytes = dirty_background_bytes;
+}
+
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 /*
  * All "charge" functions with gfp_mask should use GFP_KERNEL or
@@ -117,6 +171,10 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
 extern int do_swap_account;
 #endif
 
+extern bool mem_cgroup_has_dirty_limit(void);
+extern void get_dirty_param(struct dirty_param *param);
+extern s64 mem_cgroup_page_stat(enum mem_cgroup_page_stat_item item);
+
 static inline bool mem_cgroup_disabled(void)
 {
 	if (mem_cgroup_subsys.disabled)
@@ -125,7 +183,8 @@ static inline bool mem_cgroup_disabled(void)
 }
 
 extern bool mem_cgroup_oom_called(struct task_struct *task);
-void mem_cgroup_update_file_mapped(struct page *page, int val);
+void mem_cgroup_update_stat(struct page *page,
+			enum mem_cgroup_stat_index idx, int val);
 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 						gfp_t gfp_mask, int nid,
 						int zid);
@@ -300,8 +359,8 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 {
 }
 
-static inline void mem_cgroup_update_file_mapped(struct page *page,
-							int val)
+static inline void mem_cgroup_update_stat(struct page *page,
+			enum mem_cgroup_stat_index idx, int val)
 {
 }
 
@@ -312,6 +371,21 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 	return 0;
 }
 
+static inline bool mem_cgroup_has_dirty_limit(void)
+{
+	return false;
+}
+
+static inline void get_dirty_param(struct dirty_param *param)
+{
+	get_global_dirty_param(param);
+}
+
+static inline s64 mem_cgroup_page_stat(enum mem_cgroup_page_stat_item item)
+{
+	return -ENOSYS;
+}
+
 #endif /* CONFIG_CGROUP_MEM_CONT */
 
 #endif /* _LINUX_MEMCONTROL_H */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 497b6f7..9842e7b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -73,28 +73,23 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/
 #define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */
 #define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */
 
-/*
- * Statistics for memory cgroup.
- */
-enum mem_cgroup_stat_index {
-	/*
-	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
-	 */
-	MEM_CGROUP_STAT_CACHE, 	   /* # of pages charged as cache */
-	MEM_CGROUP_STAT_RSS,	   /* # of pages charged as anon rss */
-	MEM_CGROUP_STAT_FILE_MAPPED,  /* # of pages charged as file rss */
-	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
-	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
-	MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
-	MEM_CGROUP_EVENTS,	/* incremented at every  pagein/pageout */
-
-	MEM_CGROUP_STAT_NSTATS,
-};
-
 struct mem_cgroup_stat_cpu {
 	s64 count[MEM_CGROUP_STAT_NSTATS];
 };
 
+/* Per cgroup page statistics */
+struct mem_cgroup_page_stat {
+	enum mem_cgroup_page_stat_item item;
+	s64 value;
+};
+
+enum {
+	MEM_CGROUP_DIRTY_RATIO,
+	MEM_CGROUP_DIRTY_BYTES,
+	MEM_CGROUP_DIRTY_BACKGROUND_RATIO,
+	MEM_CGROUP_DIRTY_BACKGROUND_BYTES,
+};
+
 /*
  * per-zone information in memory controller.
  */
@@ -208,6 +203,9 @@ struct mem_cgroup {
 
 	unsigned int	swappiness;
 
+	/* control memory cgroup dirty pages */
+	struct dirty_param dirty_param;
+
 	/* set when res.limit == memsw.limit */
 	bool		memsw_is_minimum;
 
@@ -1033,6 +1031,156 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
 	return swappiness;
 }
 
+static bool dirty_param_is_valid(struct dirty_param *param)
+{
+	if (param->dirty_ratio && param->dirty_bytes)
+		return false;
+	if (param->dirty_background_ratio && param->dirty_background_bytes)
+		return false;
+	return true;
+}
+
+static void
+__mem_cgroup_get_dirty_param(struct dirty_param *param, struct mem_cgroup *mem)
+{
+	param->dirty_ratio = mem->dirty_param.dirty_ratio;
+	param->dirty_bytes = mem->dirty_param.dirty_bytes;
+	param->dirty_background_ratio = mem->dirty_param.dirty_background_ratio;
+	param->dirty_background_bytes = mem->dirty_param.dirty_background_bytes;
+}
+
+/*
+ * get_dirty_param() - get dirty memory parameters of the current memcg
+ * @param:	a structure to be filled with the dirty memory settings
+ *
+ * The function fills @param with the current memcg dirty memory settings. If
+ * the memory cgroup is disabled, or in case of error, the structure is filled
+ * with the global dirty memory settings.
+ */
+void get_dirty_param(struct dirty_param *param)
+{
+	struct mem_cgroup *memcg;
+
+	if (mem_cgroup_disabled()) {
+		get_global_dirty_param(param);
+		return;
+	}
+	/*
+	 * It's possible that "current" may be moved to other cgroup while we
+	 * access cgroup. But precise check is meaningless because the task can
+	 * be moved after our access and writeback tends to take long time.
+	 * At least, "memcg" will not be freed under rcu_read_lock().
+	 */
+	while (1) {
+		rcu_read_lock();
+		memcg = mem_cgroup_from_task(current);
+		if (likely(memcg))
+			__mem_cgroup_get_dirty_param(param, memcg);
+		else
+			get_global_dirty_param(param);
+		rcu_read_unlock();
+		/*
+		 * Since global and memcg dirty_param are not protected we try
+		 * to speculatively read them and retry if we get inconsistent
+		 * values.
+		 */
+		if (likely(dirty_param_is_valid(param)))
+			break;
+	}
+}
+
+static inline bool mem_cgroup_can_swap(struct mem_cgroup *memcg)
+{
+	if (!do_swap_account)
+		return nr_swap_pages > 0;
+	return !memcg->memsw_is_minimum &&
+		(res_counter_read_u64(&memcg->memsw, RES_LIMIT) > 0);
+}
+
+static s64 mem_cgroup_get_local_page_stat(struct mem_cgroup *memcg,
+				enum mem_cgroup_page_stat_item item)
+{
+	s64 ret;
+
+	switch (item) {
+	case MEMCG_NR_DIRTYABLE_PAGES:
+		ret = res_counter_read_u64(&memcg->res, RES_LIMIT) -
+			res_counter_read_u64(&memcg->res, RES_USAGE);
+		/* Translate free memory in pages */
+		ret >>= PAGE_SHIFT;
+		ret += mem_cgroup_read_stat(memcg, LRU_ACTIVE_FILE) +
+			mem_cgroup_read_stat(memcg, LRU_INACTIVE_FILE);
+		if (mem_cgroup_can_swap(memcg))
+			ret += mem_cgroup_read_stat(memcg, LRU_ACTIVE_ANON) +
+				mem_cgroup_read_stat(memcg, LRU_INACTIVE_ANON);
+		break;
+	case MEMCG_NR_RECLAIM_PAGES:
+		ret = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_FILE_DIRTY) +
+			mem_cgroup_read_stat(memcg,
+					MEM_CGROUP_STAT_UNSTABLE_NFS);
+		break;
+	case MEMCG_NR_WRITEBACK:
+		ret = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
+		break;
+	case MEMCG_NR_DIRTY_WRITEBACK_PAGES:
+		ret = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK) +
+			mem_cgroup_read_stat(memcg,
+				MEM_CGROUP_STAT_UNSTABLE_NFS);
+		break;
+	default:
+		BUG_ON(1);
+	}
+	return ret;
+}
+
+static int mem_cgroup_page_stat_cb(struct mem_cgroup *mem, void *data)
+{
+	struct mem_cgroup_page_stat *stat = (struct mem_cgroup_page_stat *)data;
+
+	stat->value += mem_cgroup_get_local_page_stat(mem, stat->item);
+	return 0;
+}
+
+/*
+ * mem_cgroup_has_dirty_limit() - check if current memcg has local dirty limits
+ *
+ * Return true if the current memory cgroup has local dirty memory settings,
+ * false otherwise.
+ */
+bool mem_cgroup_has_dirty_limit(void)
+{
+	if (mem_cgroup_disabled())
+		return false;
+	return mem_cgroup_from_task(current) != NULL;
+}
+
+/*
+ * mem_cgroup_page_stat() - get memory cgroup file cache statistics
+ * @item:	memory statistic item exported to the kernel
+ *
+ * Return the accounted statistic value, or a negative value in case of error.
+ */
+s64 mem_cgroup_page_stat(enum mem_cgroup_page_stat_item item)
+{
+	struct mem_cgroup_page_stat stat = {};
+	struct mem_cgroup *memcg;
+
+	rcu_read_lock();
+	memcg = mem_cgroup_from_task(current);
+	if (memcg) {
+		/*
+		 * Recursively evaluate page statistics against all cgroups
+		 * under the hierarchy tree
+		 */
+		stat.item = item;
+		mem_cgroup_walk_tree(memcg, &stat, mem_cgroup_page_stat_cb);
+	} else
+		stat.value = -EINVAL;
+	rcu_read_unlock();
+
+	return stat.value;
+}
+
 static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
 {
 	int *val = data;
@@ -1275,34 +1423,70 @@ static void record_last_oom(struct mem_cgroup *mem)
 }
 
 /*
- * Currently used to update mapped file statistics, but the routine can be
- * generalized to update other statistics as well.
+ * Generalized routine to update file cache's status for memcg.
+ *
+ * Before calling this, mapping->tree_lock should be held and preemption is
+ * disabled.  Then, it's guaranteed that the page is not uncharged while we
+ * access page_cgroup. We can make use of that.
  */
-void mem_cgroup_update_file_mapped(struct page *page, int val)
+void mem_cgroup_update_stat(struct page *page,
+			enum mem_cgroup_stat_index idx, int val)
 {
 	struct mem_cgroup *mem;
 	struct page_cgroup *pc;
 
+	if (mem_cgroup_disabled())
+		return;
 	pc = lookup_page_cgroup(page);
-	if (unlikely(!pc))
+	if (unlikely(!pc) || !PageCgroupUsed(pc))
 		return;
 
-	lock_page_cgroup(pc);
-	mem = pc->mem_cgroup;
-	if (!mem)
-		goto done;
-
-	if (!PageCgroupUsed(pc))
-		goto done;
-
+	lock_page_cgroup_migrate(pc);
 	/*
-	 * Preemption is already disabled. We can use __this_cpu_xxx
-	 */
-	__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val);
-
-done:
-	unlock_page_cgroup(pc);
+	 * It's guaranteed that this page is never uncharged.
+	 * The only racy problem is moving the account among memcgs.
+	 */
+	switch (idx) {
+	case MEM_CGROUP_STAT_FILE_MAPPED:
+		if (val > 0)
+			SetPageCgroupFileMapped(pc);
+		else
+			ClearPageCgroupFileMapped(pc);
+		break;
+	case MEM_CGROUP_STAT_FILE_DIRTY:
+		if (val > 0)
+			SetPageCgroupDirty(pc);
+		else
+			ClearPageCgroupDirty(pc);
+		break;
+	case MEM_CGROUP_STAT_WRITEBACK:
+		if (val > 0)
+			SetPageCgroupWriteback(pc);
+		else
+			ClearPageCgroupWriteback(pc);
+		break;
+	case MEM_CGROUP_STAT_WRITEBACK_TEMP:
+		if (val > 0)
+			SetPageCgroupWritebackTemp(pc);
+		else
+			ClearPageCgroupWritebackTemp(pc);
+		break;
+	case MEM_CGROUP_STAT_UNSTABLE_NFS:
+		if (val > 0)
+			SetPageCgroupUnstableNFS(pc);
+		else
+			ClearPageCgroupUnstableNFS(pc);
+		break;
+	default:
+		BUG();
+		break;
+	}
+	mem = pc->mem_cgroup;
+	if (likely(mem))
+		__this_cpu_add(mem->stat->count[idx], val);
+	unlock_page_cgroup_migrate(pc);
 }
+EXPORT_SYMBOL_GPL(mem_cgroup_update_stat);
 
 /*
  * size of first charge trial. "32" comes from vmscan.c's magic value.
@@ -1701,6 +1885,45 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 	memcg_check_events(mem, pc->page);
 }
 
+/*
+ * Update file cache accounted statistics on task migration.
+ *
+ * TODO: We don't move charges of file (including shmem/tmpfs) pages for now.
+ * So, at the moment this function simply returns without updating accounted
+ * statistics, because we deal only with anonymous pages here.
+ */
+static void __mem_cgroup_update_file_stat(struct page_cgroup *pc,
+	struct mem_cgroup *from, struct mem_cgroup *to)
+{
+	struct page *page = pc->page;
+
+	if (!page_mapped(page) || PageAnon(page))
+		return;
+
+	if (PageCgroupFileMapped(pc)) {
+		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+	}
+	if (PageCgroupDirty(pc)) {
+		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_DIRTY]);
+		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_DIRTY]);
+	}
+	if (PageCgroupWriteback(pc)) {
+		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_WRITEBACK]);
+		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_WRITEBACK]);
+	}
+	if (PageCgroupWritebackTemp(pc)) {
+		__this_cpu_dec(
+			from->stat->count[MEM_CGROUP_STAT_WRITEBACK_TEMP]);
+		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_WRITEBACK_TEMP]);
+	}
+	if (PageCgroupUnstableNFS(pc)) {
+		__this_cpu_dec(
+			from->stat->count[MEM_CGROUP_STAT_UNSTABLE_NFS]);
+		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_UNSTABLE_NFS]);
+	}
+}
+
 /**
  * __mem_cgroup_move_account - move account of the page
  * @pc:	page_cgroup of the page.
@@ -1721,22 +1944,16 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 static void __mem_cgroup_move_account(struct page_cgroup *pc,
 	struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
 {
-	struct page *page;
-
 	VM_BUG_ON(from == to);
 	VM_BUG_ON(PageLRU(pc->page));
 	VM_BUG_ON(!PageCgroupLocked(pc));
 	VM_BUG_ON(!PageCgroupUsed(pc));
 	VM_BUG_ON(pc->mem_cgroup != from);
 
-	page = pc->page;
-	if (page_mapped(page) && !PageAnon(page)) {
-		/* Update mapped_file data for mem_cgroup */
-		preempt_disable();
-		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
-		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
-		preempt_enable();
-	}
+	preempt_disable();
+	lock_page_cgroup_migrate(pc);
+	__mem_cgroup_update_file_stat(pc, from, to);
+
 	mem_cgroup_charge_statistics(from, pc, false);
 	if (uncharge)
 		/* This is not "cancel", but cancel_charge does all we need. */
@@ -1745,6 +1962,8 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
 	/* caller should have done css_get */
 	pc->mem_cgroup = to;
 	mem_cgroup_charge_statistics(to, pc, true);
+	unlock_page_cgroup_migrate(pc);
+	preempt_enable();
 	/*
 	 * We charges against "to" which may not have any tasks. Then, "to"
 	 * can be under rmdir(). But in current implementation, caller of
@@ -3042,6 +3261,10 @@ enum {
 	MCS_PGPGIN,
 	MCS_PGPGOUT,
 	MCS_SWAP,
+	MCS_FILE_DIRTY,
+	MCS_WRITEBACK,
+	MCS_WRITEBACK_TEMP,
+	MCS_UNSTABLE_NFS,
 	MCS_INACTIVE_ANON,
 	MCS_ACTIVE_ANON,
 	MCS_INACTIVE_FILE,
@@ -3064,6 +3287,10 @@ struct {
 	{"pgpgin", "total_pgpgin"},
 	{"pgpgout", "total_pgpgout"},
 	{"swap", "total_swap"},
+	{"filedirty", "dirty_pages"},
+	{"writeback", "writeback_pages"},
+	{"writeback_tmp", "writeback_temp_pages"},
+	{"nfs", "nfs_unstable"},
 	{"inactive_anon", "total_inactive_anon"},
 	{"active_anon", "total_active_anon"},
 	{"inactive_file", "total_inactive_file"},
@@ -3092,6 +3319,14 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
 		val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
 		s->stat[MCS_SWAP] += val * PAGE_SIZE;
 	}
+	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_DIRTY);
+	s->stat[MCS_FILE_DIRTY] += val;
+	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_WRITEBACK);
+	s->stat[MCS_WRITEBACK] += val;
+	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_WRITEBACK_TEMP);
+	s->stat[MCS_WRITEBACK_TEMP] += val;
+	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_UNSTABLE_NFS);
+	s->stat[MCS_UNSTABLE_NFS] += val;
 
 	/* per zone stat */
 	val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
@@ -3453,6 +3688,60 @@ unlock:
 	return ret;
 }
 
+static u64 mem_cgroup_dirty_read(struct cgroup *cgrp, struct cftype *cft)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+
+	switch (cft->private) {
+	case MEM_CGROUP_DIRTY_RATIO:
+		return memcg->dirty_param.dirty_ratio;
+	case MEM_CGROUP_DIRTY_BYTES:
+		return memcg->dirty_param.dirty_bytes;
+	case MEM_CGROUP_DIRTY_BACKGROUND_RATIO:
+		return memcg->dirty_param.dirty_background_ratio;
+	case MEM_CGROUP_DIRTY_BACKGROUND_BYTES:
+		return memcg->dirty_param.dirty_background_bytes;
+	default:
+		BUG();
+	}
+}
+
+static int
+mem_cgroup_dirty_write(struct cgroup *cgrp, struct cftype *cft, u64 val)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+	int type = cft->private;
+
+	if (cgrp->parent == NULL)
+		return -EINVAL;
+	if ((type == MEM_CGROUP_DIRTY_RATIO ||
+		type == MEM_CGROUP_DIRTY_BACKGROUND_RATIO) && val > 100)
+		return -EINVAL;
+	/*
+	 * TODO: provide a validation check routine. And retry if validation
+	 * fails.
+	 */
+	switch (type) {
+	case MEM_CGROUP_DIRTY_RATIO:
+		memcg->dirty_param.dirty_ratio = val;
+		memcg->dirty_param.dirty_bytes = 0;
+		break;
+	case MEM_CGROUP_DIRTY_BYTES:
+		memcg->dirty_param.dirty_ratio  = 0;
+		memcg->dirty_param.dirty_bytes = val;
+		break;
+	case MEM_CGROUP_DIRTY_BACKGROUND_RATIO:
+		memcg->dirty_param.dirty_background_ratio = val;
+		memcg->dirty_param.dirty_background_bytes = 0;
+		break;
+	case MEM_CGROUP_DIRTY_BACKGROUND_BYTES:
+		memcg->dirty_param.dirty_background_ratio = 0;
+		memcg->dirty_param.dirty_background_bytes = val;
+		break;
+	}
+	return 0;
+}
+
 static struct cftype mem_cgroup_files[] = {
 	{
 		.name = "usage_in_bytes",
@@ -3504,6 +3793,30 @@ static struct cftype mem_cgroup_files[] = {
 		.write_u64 = mem_cgroup_swappiness_write,
 	},
 	{
+		.name = "dirty_ratio",
+		.read_u64 = mem_cgroup_dirty_read,
+		.write_u64 = mem_cgroup_dirty_write,
+		.private = MEM_CGROUP_DIRTY_RATIO,
+	},
+	{
+		.name = "dirty_bytes",
+		.read_u64 = mem_cgroup_dirty_read,
+		.write_u64 = mem_cgroup_dirty_write,
+		.private = MEM_CGROUP_DIRTY_BYTES,
+	},
+	{
+		.name = "dirty_background_ratio",
+		.read_u64 = mem_cgroup_dirty_read,
+		.write_u64 = mem_cgroup_dirty_write,
+		.private = MEM_CGROUP_DIRTY_BACKGROUND_RATIO,
+	},
+	{
+		.name = "dirty_background_bytes",
+		.read_u64 = mem_cgroup_dirty_read,
+		.write_u64 = mem_cgroup_dirty_write,
+		.private = MEM_CGROUP_DIRTY_BACKGROUND_BYTES,
+	},
+	{
 		.name = "move_charge_at_immigrate",
 		.read_u64 = mem_cgroup_move_charge_read,
 		.write_u64 = mem_cgroup_move_charge_write,
@@ -3762,8 +4075,21 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	mem->last_scanned_child = 0;
 	spin_lock_init(&mem->reclaim_param_lock);
 
-	if (parent)
+	if (parent) {
 		mem->swappiness = get_swappiness(parent);
+		mem->dirty_param = parent->dirty_param;
+	} else {
+		while (1) {
+			get_global_dirty_param(&mem->dirty_param);
+			/*
+			 * Since global dirty parameters are not protected we
+			 * try to speculatively read them and retry if we get
+			 * inconsistent values.
+			 */
+			if (likely(dirty_param_is_valid(&mem->dirty_param)))
+				break;
+		}
+	}
 	atomic_set(&mem->refcnt, 1);
 	mem->move_charge_at_immigrate = 0;
 	mutex_init(&mem->thresholds_lock);
-- 
1.6.3.3



* Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure
  2010-03-04 10:40 ` [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure Andrea Righi
@ 2010-03-04 11:54   ` Kirill A. Shutemov
  2010-03-05  1:12   ` Daisuke Nishimura
  1 sibling, 0 replies; 41+ messages in thread
From: Kirill A. Shutemov @ 2010-03-04 11:54 UTC (permalink / raw)
  To: Andrea Righi
  Cc: KAMEZAWA Hiroyuki, Balbir Singh, Vivek Goyal, Peter Zijlstra,
	Trond Myklebust, Suleiman Souhlal, Greg Thelen, Daisuke Nishimura,
	Andrew Morton, containers, linux-kernel, linux-mm

On Thu, Mar 4, 2010 at 12:40 PM, Andrea Righi <arighi@develer.com> wrote:
> Infrastructure to account dirty pages per cgroup and add dirty limit
> interfaces in the cgroupfs:
>
>  - Direct write-out: memory.dirty_ratio, memory.dirty_bytes
>
>  - Background write-out: memory.dirty_background_ratio, memory.dirty_background_bytes
>
> Signed-off-by: Andrea Righi <arighi@develer.com>
> ---
>  include/linux/memcontrol.h |   80 ++++++++-
>  mm/memcontrol.c            |  420 +++++++++++++++++++++++++++++++++++++++-----
>  2 files changed, 450 insertions(+), 50 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 1f9b119..cc3421b 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -19,12 +19,66 @@
>
>  #ifndef _LINUX_MEMCONTROL_H
>  #define _LINUX_MEMCONTROL_H
> +
> +#include <linux/writeback.h>
>  #include <linux/cgroup.h>
> +
>  struct mem_cgroup;
>  struct page_cgroup;
>  struct page;
>  struct mm_struct;
>
> +/* Cgroup memory statistics items exported to the kernel */
> +enum mem_cgroup_page_stat_item {
> +       MEMCG_NR_DIRTYABLE_PAGES,
> +       MEMCG_NR_RECLAIM_PAGES,
> +       MEMCG_NR_WRITEBACK,
> +       MEMCG_NR_DIRTY_WRITEBACK_PAGES,
> +};
> +
> +/* Dirty memory parameters */
> +struct dirty_param {
> +       int dirty_ratio;
> +       unsigned long dirty_bytes;
> +       int dirty_background_ratio;
> +       unsigned long dirty_background_bytes;
> +};
> +
> +/*
> + * Statistics for memory cgroup.
> + */
> +enum mem_cgroup_stat_index {
> +       /*
> +        * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
> +        */
> +       MEM_CGROUP_STAT_CACHE,     /* # of pages charged as cache */
> +       MEM_CGROUP_STAT_RSS,       /* # of pages charged as anon rss */
> +       MEM_CGROUP_STAT_FILE_MAPPED,  /* # of pages charged as file rss */
> +       MEM_CGROUP_STAT_PGPGIN_COUNT,   /* # of pages paged in */
> +       MEM_CGROUP_STAT_PGPGOUT_COUNT,  /* # of pages paged out */
> +       MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
> +       MEM_CGROUP_EVENTS,      /* incremented at every  pagein/pageout */
> +       MEM_CGROUP_STAT_FILE_DIRTY,   /* # of dirty pages in page cache */
> +       MEM_CGROUP_STAT_WRITEBACK,   /* # of pages under writeback */
> +       MEM_CGROUP_STAT_WRITEBACK_TEMP,   /* # of pages under writeback using
> +                                               temporary buffers */
> +       MEM_CGROUP_STAT_UNSTABLE_NFS,   /* # of NFS unstable pages */
> +
> +       MEM_CGROUP_STAT_NSTATS,
> +};
> +
> +/*
> + * TODO: provide a validation check routine. And retry if validation
> + * fails.
> + */
> +static inline void get_global_dirty_param(struct dirty_param *param)
> +{
> +       param->dirty_ratio = vm_dirty_ratio;
> +       param->dirty_bytes = vm_dirty_bytes;
> +       param->dirty_background_ratio = dirty_background_ratio;
> +       param->dirty_background_bytes = dirty_background_bytes;
> +}
> +
>  #ifdef CONFIG_CGROUP_MEM_RES_CTLR
>  /*
>  * All "charge" functions with gfp_mask should use GFP_KERNEL or
> @@ -117,6 +171,10 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
>  extern int do_swap_account;
>  #endif
>
> +extern bool mem_cgroup_has_dirty_limit(void);
> +extern void get_dirty_param(struct dirty_param *param);
> +extern s64 mem_cgroup_page_stat(enum mem_cgroup_page_stat_item item);
> +
>  static inline bool mem_cgroup_disabled(void)
>  {
>        if (mem_cgroup_subsys.disabled)
> @@ -125,7 +183,8 @@ static inline bool mem_cgroup_disabled(void)
>  }
>
>  extern bool mem_cgroup_oom_called(struct task_struct *task);
> -void mem_cgroup_update_file_mapped(struct page *page, int val);
> +void mem_cgroup_update_stat(struct page *page,
> +                       enum mem_cgroup_stat_index idx, int val);
>  unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
>                                                gfp_t gfp_mask, int nid,
>                                                int zid);
> @@ -300,8 +359,8 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
>  {
>  }
>
> -static inline void mem_cgroup_update_file_mapped(struct page *page,
> -                                                       int val)
> +static inline void mem_cgroup_update_stat(struct page *page,
> +                       enum mem_cgroup_stat_index idx, int val)
>  {
>  }
>
> @@ -312,6 +371,21 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
>        return 0;
>  }
>
> +static inline bool mem_cgroup_has_dirty_limit(void)
> +{
> +       return false;
> +}
> +
> +static inline void get_dirty_param(struct dirty_param *param)
> +{
> +       get_global_dirty_param(param);
> +}
> +
> +static inline s64 mem_cgroup_page_stat(enum mem_cgroup_page_stat_item item)
> +{
> +       return -ENOSYS;
> +}
> +
>  #endif /* CONFIG_CGROUP_MEM_CONT */
>
>  #endif /* _LINUX_MEMCONTROL_H */
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 497b6f7..9842e7b 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -73,28 +73,23 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/
>  #define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */
>  #define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */
>
> -/*
> - * Statistics for memory cgroup.
> - */
> -enum mem_cgroup_stat_index {
> -       /*
> -        * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
> -        */
> -       MEM_CGROUP_STAT_CACHE,     /* # of pages charged as cache */
> -       MEM_CGROUP_STAT_RSS,       /* # of pages charged as anon rss */
> -       MEM_CGROUP_STAT_FILE_MAPPED,  /* # of pages charged as file rss */
> -       MEM_CGROUP_STAT_PGPGIN_COUNT,   /* # of pages paged in */
> -       MEM_CGROUP_STAT_PGPGOUT_COUNT,  /* # of pages paged out */
> -       MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
> -       MEM_CGROUP_EVENTS,      /* incremented at every  pagein/pageout */
> -
> -       MEM_CGROUP_STAT_NSTATS,
> -};
> -
>  struct mem_cgroup_stat_cpu {
>        s64 count[MEM_CGROUP_STAT_NSTATS];
>  };
>
> +/* Per cgroup page statistics */
> +struct mem_cgroup_page_stat {
> +       enum mem_cgroup_page_stat_item item;
> +       s64 value;
> +};
> +
> +enum {
> +       MEM_CGROUP_DIRTY_RATIO,
> +       MEM_CGROUP_DIRTY_BYTES,
> +       MEM_CGROUP_DIRTY_BACKGROUND_RATIO,
> +       MEM_CGROUP_DIRTY_BACKGROUND_BYTES,
> +};
> +
>  /*
>  * per-zone information in memory controller.
>  */
> @@ -208,6 +203,9 @@ struct mem_cgroup {
>
>        unsigned int    swappiness;
>
> +       /* control memory cgroup dirty pages */
> +       struct dirty_param dirty_param;
> +
>        /* set when res.limit == memsw.limit */
>        bool            memsw_is_minimum;
>
> @@ -1033,6 +1031,156 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
>        return swappiness;
>  }
>
> +static bool dirty_param_is_valid(struct dirty_param *param)
> +{
> +       if (param->dirty_ratio && param->dirty_bytes)
> +               return false;
> +       if (param->dirty_background_ratio && param->dirty_background_bytes)
> +               return false;
> +       return true;
> +}
> +
> +static void
> +__mem_cgroup_get_dirty_param(struct dirty_param *param, struct mem_cgroup *mem)
> +{
> +       param->dirty_ratio = mem->dirty_param.dirty_ratio;
> +       param->dirty_bytes = mem->dirty_param.dirty_bytes;
> +       param->dirty_background_ratio = mem->dirty_param.dirty_background_ratio;
> +       param->dirty_background_bytes = mem->dirty_param.dirty_background_bytes;
> +}
> +
> +/*
> + * get_dirty_param() - get dirty memory parameters of the current memcg
> + * @param:     a structure is filled with the dirty memory settings
> + *
> + * The function fills @param with the current memcg dirty memory settings. If
> + * memory cgroup is disabled or in case of error the structure is filled with
> + * the global dirty memory settings.
> + */
> +void get_dirty_param(struct dirty_param *param)
> +{
> +       struct mem_cgroup *memcg;
> +
> +       if (mem_cgroup_disabled()) {
> +               get_global_dirty_param(param);
> +               return;
> +       }
> +       /*
> +        * It's possible that "current" may be moved to other cgroup while we
> +        * access cgroup. But precise check is meaningless because the task can
> +        * be moved after our access and writeback tends to take long time.
> +        * At least, "memcg" will not be freed under rcu_read_lock().
> +        */
> +       while (1) {
> +               rcu_read_lock();
> +               memcg = mem_cgroup_from_task(current);
> +               if (likely(memcg))
> +                       __mem_cgroup_get_dirty_param(param, memcg);
> +               else
> +                       get_global_dirty_param(param);
> +               rcu_read_unlock();
> +               /*
> +                * Since global and memcg dirty_param are not protected we try
> +                * to speculatively read them and retry if we get inconsistent
> +                * values.
> +                */
> +               if (likely(dirty_param_is_valid(param)))
> +                       break;
> +       }
> +}
> +
> +static inline bool mem_cgroup_can_swap(struct mem_cgroup *memcg)
> +{
> +       if (!do_swap_account)
> +               return nr_swap_pages > 0;
> +       return !memcg->memsw_is_minimum &&
> +               (res_counter_read_u64(&memcg->memsw, RES_LIMIT) > 0);
> +}
> +
> +static s64 mem_cgroup_get_local_page_stat(struct mem_cgroup *memcg,
> +                               enum mem_cgroup_page_stat_item item)
> +{
> +       s64 ret;
> +
> +       switch (item) {
> +       case MEMCG_NR_DIRTYABLE_PAGES:
> +               ret = res_counter_read_u64(&memcg->res, RES_LIMIT) -
> +                       res_counter_read_u64(&memcg->res, RES_USAGE);
> +               /* Translate free memory in pages */
> +               ret >>= PAGE_SHIFT;
> +               ret += mem_cgroup_read_stat(memcg, LRU_ACTIVE_FILE) +
> +                       mem_cgroup_read_stat(memcg, LRU_INACTIVE_FILE);
> +               if (mem_cgroup_can_swap(memcg))
> +                       ret += mem_cgroup_read_stat(memcg, LRU_ACTIVE_ANON) +
> +                               mem_cgroup_read_stat(memcg, LRU_INACTIVE_ANON);
> +               break;
> +       case MEMCG_NR_RECLAIM_PAGES:
> +               ret = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_FILE_DIRTY) +
> +                       mem_cgroup_read_stat(memcg,
> +                                       MEM_CGROUP_STAT_UNSTABLE_NFS);
> +               break;
> +       case MEMCG_NR_WRITEBACK:
> +               ret = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
> +               break;
> +       case MEMCG_NR_DIRTY_WRITEBACK_PAGES:
> +               ret = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK) +
> +                       mem_cgroup_read_stat(memcg,
> +                               MEM_CGROUP_STAT_UNSTABLE_NFS);
> +               break;
> +       default:
> +               BUG_ON(1);

Just BUG()?
And add 'break;', please.
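
That is, something like (a sketch of the suggested replacement for the
default case):

	default:
		BUG();
		break;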

> +       }
> +       return ret;
> +}
> +
> +static int mem_cgroup_page_stat_cb(struct mem_cgroup *mem, void *data)
> +{
> +       struct mem_cgroup_page_stat *stat = (struct mem_cgroup_page_stat *)data;
> +
> +       stat->value += mem_cgroup_get_local_page_stat(mem, stat->item);
> +       return 0;
> +}
> +
> +/*
> + * mem_cgroup_has_dirty_limit() - check if current memcg has local dirty limits
> + *
> + * Return true if the current memory cgroup has local dirty memory settings,
> + * false otherwise.
> + */
> +bool mem_cgroup_has_dirty_limit(void)
> +{
> +       if (mem_cgroup_disabled())
> +               return false;
> +       return mem_cgroup_from_task(current) != NULL;
> +}
> +
> +/*
> + * mem_cgroup_page_stat() - get memory cgroup file cache statistics
> + * @item:      memory statistic item exported to the kernel
> + *
> + * Return the accounted statistic value, or a negative value in case of error.
> + */
> +s64 mem_cgroup_page_stat(enum mem_cgroup_page_stat_item item)
> +{
> +       struct mem_cgroup_page_stat stat = {};
> +       struct mem_cgroup *memcg;
> +
> +       rcu_read_lock();
> +       memcg = mem_cgroup_from_task(current);
> +       if (memcg) {
> +               /*
> +                * Recursively evaulate page statistics against all cgroup
> +                * under hierarchy tree
> +                */
> +               stat.item = item;
> +               mem_cgroup_walk_tree(memcg, &stat, mem_cgroup_page_stat_cb);
> +       } else
> +               stat.value = -EINVAL;
> +       rcu_read_unlock();
> +
> +       return stat.value;
> +}
> +
>  static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
>  {
>        int *val = data;
> @@ -1275,34 +1423,70 @@ static void record_last_oom(struct mem_cgroup *mem)
>  }
>
>  /*
> - * Currently used to update mapped file statistics, but the routine can be
> - * generalized to update other statistics as well.
> + * Generalized routine to update file cache's status for memcg.
> + *
> + * Before calling this, mapping->tree_lock should be held and preemption is
> + * disabled.  Then, it's guarnteed that the page is not uncharged while we
> + * access page_cgroup. We can make use of that.
>  */
> -void mem_cgroup_update_file_mapped(struct page *page, int val)
> +void mem_cgroup_update_stat(struct page *page,
> +                       enum mem_cgroup_stat_index idx, int val)
>  {
>        struct mem_cgroup *mem;
>        struct page_cgroup *pc;
>
> +       if (mem_cgroup_disabled())
> +               return;
>        pc = lookup_page_cgroup(page);
> -       if (unlikely(!pc))
> +       if (unlikely(!pc) || !PageCgroupUsed(pc))
>                return;
>
> -       lock_page_cgroup(pc);
> -       mem = pc->mem_cgroup;
> -       if (!mem)
> -               goto done;
> -
> -       if (!PageCgroupUsed(pc))
> -               goto done;
> -
> +       lock_page_cgroup_migrate(pc);
>        /*
> -        * Preemption is already disabled. We can use __this_cpu_xxx
> -        */
> -       __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val);
> -
> -done:
> -       unlock_page_cgroup(pc);
> +       * It's guarnteed that this page is never uncharged.
> +       * The only racy problem is moving account among memcgs.
> +       */
> +       switch (idx) {
> +       case MEM_CGROUP_STAT_FILE_MAPPED:
> +               if (val > 0)
> +                       SetPageCgroupFileMapped(pc);
> +               else
> +                       ClearPageCgroupFileMapped(pc);
> +               break;
> +       case MEM_CGROUP_STAT_FILE_DIRTY:
> +               if (val > 0)
> +                       SetPageCgroupDirty(pc);
> +               else
> +                       ClearPageCgroupDirty(pc);
> +               break;
> +       case MEM_CGROUP_STAT_WRITEBACK:
> +               if (val > 0)
> +                       SetPageCgroupWriteback(pc);
> +               else
> +                       ClearPageCgroupWriteback(pc);
> +               break;
> +       case MEM_CGROUP_STAT_WRITEBACK_TEMP:
> +               if (val > 0)
> +                       SetPageCgroupWritebackTemp(pc);
> +               else
> +                       ClearPageCgroupWritebackTemp(pc);
> +               break;
> +       case MEM_CGROUP_STAT_UNSTABLE_NFS:
> +               if (val > 0)
> +                       SetPageCgroupUnstableNFS(pc);
> +               else
> +                       ClearPageCgroupUnstableNFS(pc);
> +               break;
> +       default:
> +               BUG();
> +               break;
> +       }
> +       mem = pc->mem_cgroup;
> +       if (likely(mem))
> +               __this_cpu_add(mem->stat->count[idx], val);
> +       unlock_page_cgroup_migrate(pc);
>  }
> +EXPORT_SYMBOL_GPL(mem_cgroup_update_stat);
>
>  /*
>  * size of first charge trial. "32" comes from vmscan.c's magic value.
> @@ -1701,6 +1885,45 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
>        memcg_check_events(mem, pc->page);
>  }
>
> +/*
> + * Update file cache accounted statistics on task migration.
> + *
> + * TODO: We don't move charges of file (including shmem/tmpfs) pages for now.
> + * So, at the moment this function simply returns without updating accounted
> + * statistics, because we deal only with anonymous pages here.
> + */
> +static void __mem_cgroup_update_file_stat(struct page_cgroup *pc,
> +       struct mem_cgroup *from, struct mem_cgroup *to)
> +{
> +       struct page *page = pc->page;
> +
> +       if (!page_mapped(page) || PageAnon(page))
> +               return;
> +
> +       if (PageCgroupFileMapped(pc)) {
> +               __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> +               __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> +       }
> +       if (PageCgroupDirty(pc)) {
> +               __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_DIRTY]);
> +               __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_DIRTY]);
> +       }
> +       if (PageCgroupWriteback(pc)) {
> +               __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_WRITEBACK]);
> +               __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_WRITEBACK]);
> +       }
> +       if (PageCgroupWritebackTemp(pc)) {
> +               __this_cpu_dec(
> +                       from->stat->count[MEM_CGROUP_STAT_WRITEBACK_TEMP]);
> +               __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_WRITEBACK_TEMP]);
> +       }
> +       if (PageCgroupUnstableNFS(pc)) {
> +               __this_cpu_dec(
> +                       from->stat->count[MEM_CGROUP_STAT_UNSTABLE_NFS]);
> +               __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_UNSTABLE_NFS]);
> +       }
> +}
> +
>  /**
>  * __mem_cgroup_move_account - move account of the page
>  * @pc:        page_cgroup of the page.
> @@ -1721,22 +1944,16 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
>  static void __mem_cgroup_move_account(struct page_cgroup *pc,
>        struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
>  {
> -       struct page *page;
> -
>        VM_BUG_ON(from == to);
>        VM_BUG_ON(PageLRU(pc->page));
>        VM_BUG_ON(!PageCgroupLocked(pc));
>        VM_BUG_ON(!PageCgroupUsed(pc));
>        VM_BUG_ON(pc->mem_cgroup != from);
>
> -       page = pc->page;
> -       if (page_mapped(page) && !PageAnon(page)) {
> -               /* Update mapped_file data for mem_cgroup */
> -               preempt_disable();
> -               __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> -               __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> -               preempt_enable();
> -       }
> +       preempt_disable();
> +       lock_page_cgroup_migrate(pc);
> +       __mem_cgroup_update_file_stat(pc, from, to);
> +
>        mem_cgroup_charge_statistics(from, pc, false);
>        if (uncharge)
>                /* This is not "cancel", but cancel_charge does all we need. */
> @@ -1745,6 +1962,8 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
>        /* caller should have done css_get */
>        pc->mem_cgroup = to;
>        mem_cgroup_charge_statistics(to, pc, true);
> +       unlock_page_cgroup_migrate(pc);
> +       preempt_enable();
>        /*
>         * We charges against "to" which may not have any tasks. Then, "to"
>         * can be under rmdir(). But in current implementation, caller of
> @@ -3042,6 +3261,10 @@ enum {
>        MCS_PGPGIN,
>        MCS_PGPGOUT,
>        MCS_SWAP,
> +       MCS_FILE_DIRTY,
> +       MCS_WRITEBACK,
> +       MCS_WRITEBACK_TEMP,
> +       MCS_UNSTABLE_NFS,
>        MCS_INACTIVE_ANON,
>        MCS_ACTIVE_ANON,
>        MCS_INACTIVE_FILE,
> @@ -3064,6 +3287,10 @@ struct {
>        {"pgpgin", "total_pgpgin"},
>        {"pgpgout", "total_pgpgout"},
>        {"swap", "total_swap"},
> +       {"filedirty", "dirty_pages"},
> +       {"writeback", "writeback_pages"},
> +       {"writeback_tmp", "writeback_temp_pages"},
> +       {"nfs", "nfs_unstable"},
>        {"inactive_anon", "total_inactive_anon"},
>        {"active_anon", "total_active_anon"},
>        {"inactive_file", "total_inactive_file"},
> @@ -3092,6 +3319,14 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
>                val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
>                s->stat[MCS_SWAP] += val * PAGE_SIZE;
>        }
> +       val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_DIRTY);
> +       s->stat[MCS_FILE_DIRTY] += val;
> +       val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_WRITEBACK);
> +       s->stat[MCS_WRITEBACK] += val;
> +       val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_WRITEBACK_TEMP);
> +       s->stat[MCS_WRITEBACK_TEMP] += val;
> +       val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_UNSTABLE_NFS);
> +       s->stat[MCS_UNSTABLE_NFS] += val;
>
>        /* per zone stat */
>        val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
> @@ -3453,6 +3688,60 @@ unlock:
>        return ret;
>  }
>
> +static u64 mem_cgroup_dirty_read(struct cgroup *cgrp, struct cftype *cft)
> +{
> +       struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
> +
> +       switch (cft->private) {
> +       case MEM_CGROUP_DIRTY_RATIO:
> +               return memcg->dirty_param.dirty_ratio;
> +       case MEM_CGROUP_DIRTY_BYTES:
> +               return memcg->dirty_param.dirty_bytes;
> +       case MEM_CGROUP_DIRTY_BACKGROUND_RATIO:
> +               return memcg->dirty_param.dirty_background_ratio;
> +       case MEM_CGROUP_DIRTY_BACKGROUND_BYTES:
> +               return memcg->dirty_param.dirty_background_bytes;
> +       default:
> +               BUG();
> +       }
> +}
> +
> +static int
> +mem_cgroup_dirty_write(struct cgroup *cgrp, struct cftype *cft, u64 val)
> +{
> +       struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
> +       int type = cft->private;
> +
> +       if (cgrp->parent == NULL)
> +               return -EINVAL;
> +       if ((type == MEM_CGROUP_DIRTY_RATIO ||
> +               type == MEM_CGROUP_DIRTY_BACKGROUND_RATIO) && val > 100)
> +               return -EINVAL;
> +       /*
> +        * TODO: provide a validation check routine. And retry if validation
> +        * fails.
> +        */
> +       switch (type) {
> +       case MEM_CGROUP_DIRTY_RATIO:
> +               memcg->dirty_param.dirty_ratio = val;
> +               memcg->dirty_param.dirty_bytes = 0;
> +               break;
> +       case MEM_CGROUP_DIRTY_BYTES:
> +               memcg->dirty_param.dirty_ratio  = 0;
> +               memcg->dirty_param.dirty_bytes = val;
> +               break;
> +       case MEM_CGROUP_DIRTY_BACKGROUND_RATIO:
> +               memcg->dirty_param.dirty_background_ratio = val;
> +               memcg->dirty_param.dirty_background_bytes = 0;
> +               break;
> +       case MEM_CGROUP_DIRTY_BACKGROUND_BYTES:
> +               memcg->dirty_param.dirty_background_ratio = 0;
> +               memcg->dirty_param.dirty_background_bytes = val;
> +               break;

default:
        BUG();
        break;

> +       }
> +       return 0;
> +}
> +
>  static struct cftype mem_cgroup_files[] = {
>        {
>                .name = "usage_in_bytes",
> @@ -3504,6 +3793,30 @@ static struct cftype mem_cgroup_files[] = {
>                .write_u64 = mem_cgroup_swappiness_write,
>        },
>        {
> +               .name = "dirty_ratio",
> +               .read_u64 = mem_cgroup_dirty_read,
> +               .write_u64 = mem_cgroup_dirty_write,
> +               .private = MEM_CGROUP_DIRTY_RATIO,
> +       },
> +       {
> +               .name = "dirty_bytes",
> +               .read_u64 = mem_cgroup_dirty_read,
> +               .write_u64 = mem_cgroup_dirty_write,
> +               .private = MEM_CGROUP_DIRTY_BYTES,
> +       },
> +       {
> +               .name = "dirty_background_ratio",
> +               .read_u64 = mem_cgroup_dirty_read,
> +               .write_u64 = mem_cgroup_dirty_write,
> +               .private = MEM_CGROUP_DIRTY_BACKGROUND_RATIO,
> +       },
> +       {
> +               .name = "dirty_background_bytes",
> +               .read_u64 = mem_cgroup_dirty_read,
> +               .write_u64 = mem_cgroup_dirty_write,
> +               .private = MEM_CGROUP_DIRTY_BACKGROUND_BYTES,
> +       },
> +       {
>                .name = "move_charge_at_immigrate",
>                .read_u64 = mem_cgroup_move_charge_read,
>                .write_u64 = mem_cgroup_move_charge_write,
> @@ -3762,8 +4075,21 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
>        mem->last_scanned_child = 0;
>        spin_lock_init(&mem->reclaim_param_lock);
>
> -       if (parent)
> +       if (parent) {
>                mem->swappiness = get_swappiness(parent);
> +               mem->dirty_param = parent->dirty_param;
> +       } else {
> +               while (1) {
> +                       get_global_dirty_param(&mem->dirty_param);
> +                       /*
> +                        * Since global dirty parameters are not protected we
> +                        * try to speculatively read them and retry if we get
> +                        * inconsistent values.
> +                        */
> +                       if (likely(dirty_param_is_valid(&mem->dirty_param)))
> +                               break;
> +               }
> +       }
>        atomic_set(&mem->refcnt, 1);
>        mem->move_charge_at_immigrate = 0;
>        mutex_init(&mem->thresholds_lock);
> --
> 1.6.3.3
>


* Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure
  2010-03-04 10:40 ` [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure Andrea Righi
  2010-03-04 11:54   ` Kirill A. Shutemov
@ 2010-03-05  1:12   ` Daisuke Nishimura
  2010-03-05  1:58     ` KAMEZAWA Hiroyuki
  2010-03-05 22:14     ` Andrea Righi
  1 sibling, 2 replies; 41+ messages in thread
From: Daisuke Nishimura @ 2010-03-05  1:12 UTC (permalink / raw)
  To: Andrea Righi
  Cc: KAMEZAWA Hiroyuki, Balbir Singh, Vivek Goyal, Peter Zijlstra,
	Trond Myklebust, Suleiman Souhlal, Greg Thelen,
	Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
	linux-mm, Daisuke Nishimura

On Thu,  4 Mar 2010 11:40:14 +0100, Andrea Righi <arighi@develer.com> wrote:
> Infrastructure to account dirty pages per cgroup and add dirty limit
> interfaces in the cgroupfs:
> 
>  - Direct write-out: memory.dirty_ratio, memory.dirty_bytes
> 
>  - Background write-out: memory.dirty_background_ratio, memory.dirty_background_bytes
> 
> Signed-off-by: Andrea Righi <arighi@develer.com>
> ---
>  include/linux/memcontrol.h |   80 ++++++++-
>  mm/memcontrol.c            |  420 +++++++++++++++++++++++++++++++++++++++-----
>  2 files changed, 450 insertions(+), 50 deletions(-)
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 1f9b119..cc3421b 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -19,12 +19,66 @@
>  
>  #ifndef _LINUX_MEMCONTROL_H
>  #define _LINUX_MEMCONTROL_H
> +
> +#include <linux/writeback.h>
>  #include <linux/cgroup.h>
> +
>  struct mem_cgroup;
>  struct page_cgroup;
>  struct page;
>  struct mm_struct;
>  
> +/* Cgroup memory statistics items exported to the kernel */
> +enum mem_cgroup_page_stat_item {
> +	MEMCG_NR_DIRTYABLE_PAGES,
> +	MEMCG_NR_RECLAIM_PAGES,
> +	MEMCG_NR_WRITEBACK,
> +	MEMCG_NR_DIRTY_WRITEBACK_PAGES,
> +};
> +
> +/* Dirty memory parameters */
> +struct dirty_param {
> +	int dirty_ratio;
> +	unsigned long dirty_bytes;
> +	int dirty_background_ratio;
> +	unsigned long dirty_background_bytes;
> +};
> +
> +/*
> + * Statistics for memory cgroup.
> + */
> +enum mem_cgroup_stat_index {
> +	/*
> +	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
> +	 */
> +	MEM_CGROUP_STAT_CACHE,	   /* # of pages charged as cache */
> +	MEM_CGROUP_STAT_RSS,	   /* # of pages charged as anon rss */
> +	MEM_CGROUP_STAT_FILE_MAPPED,  /* # of pages charged as file rss */
> +	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
> +	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
> +	MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
> +	MEM_CGROUP_EVENTS,	/* incremented at every  pagein/pageout */
> +	MEM_CGROUP_STAT_FILE_DIRTY,   /* # of dirty pages in page cache */
> +	MEM_CGROUP_STAT_WRITEBACK,   /* # of pages under writeback */
> +	MEM_CGROUP_STAT_WRITEBACK_TEMP,   /* # of pages under writeback using
> +						temporary buffers */
> +	MEM_CGROUP_STAT_UNSTABLE_NFS,   /* # of NFS unstable pages */
> +
> +	MEM_CGROUP_STAT_NSTATS,
> +};
> +
I must have said it earlier, but I don't think exporting all of these flags
is a good idea.
Can you export only mem_cgroup_page_stat_item (of course, we need to add MEMCG_NR_FILE_MAPPED)?
We can translate mem_cgroup_page_stat_item to mem_cgroup_stat_index by simple arithmetic
if you define MEM_CGROUP_STAT_FILE_MAPPED..MEM_CGROUP_STAT_UNSTABLE_NFS sequentially.
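
For instance, something along these lines could work (just a sketch, not
tested; it assumes the MEM_CGROUP_STAT_FILE_MAPPED..MEM_CGROUP_STAT_UNSTABLE_NFS
entries are reordered to be contiguous, that mem_cgroup_page_stat_item gains
the matching MEMCG_NR_* entries in the same order, and the helper name is
made up):

static inline enum mem_cgroup_stat_index
memcg_page_stat_to_stat_index(enum mem_cgroup_page_stat_item item)
{
	/* both enums must list the file-cache stats in the same order */
	return MEM_CGROUP_STAT_FILE_MAPPED + (item - MEMCG_NR_FILE_MAPPED);
}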

> +/*
> + * TODO: provide a validation check routine. And retry if validation
> + * fails.
> + */
> +static inline void get_global_dirty_param(struct dirty_param *param)
> +{
> +	param->dirty_ratio = vm_dirty_ratio;
> +	param->dirty_bytes = vm_dirty_bytes;
> +	param->dirty_background_ratio = dirty_background_ratio;
> +	param->dirty_background_bytes = dirty_background_bytes;
> +}
> +
>  #ifdef CONFIG_CGROUP_MEM_RES_CTLR
>  /*
>   * All "charge" functions with gfp_mask should use GFP_KERNEL or
> @@ -117,6 +171,10 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
>  extern int do_swap_account;
>  #endif
>  
> +extern bool mem_cgroup_has_dirty_limit(void);
> +extern void get_dirty_param(struct dirty_param *param);
> +extern s64 mem_cgroup_page_stat(enum mem_cgroup_page_stat_item item);
> +
>  static inline bool mem_cgroup_disabled(void)
>  {
>  	if (mem_cgroup_subsys.disabled)
> @@ -125,7 +183,8 @@ static inline bool mem_cgroup_disabled(void)
>  }
>  
>  extern bool mem_cgroup_oom_called(struct task_struct *task);
> -void mem_cgroup_update_file_mapped(struct page *page, int val);
> +void mem_cgroup_update_stat(struct page *page,
> +			enum mem_cgroup_stat_index idx, int val);
>  unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
>  						gfp_t gfp_mask, int nid,
>  						int zid);
> @@ -300,8 +359,8 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
>  {
>  }
>  
> -static inline void mem_cgroup_update_file_mapped(struct page *page,
> -							int val)
> +static inline void mem_cgroup_update_stat(struct page *page,
> +			enum mem_cgroup_stat_index idx, int val)
>  {
>  }
>  
> @@ -312,6 +371,21 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
>  	return 0;
>  }
>  
> +static inline bool mem_cgroup_has_dirty_limit(void)
> +{
> +	return false;
> +}
> +
> +static inline void get_dirty_param(struct dirty_param *param)
> +{
> +	get_global_dirty_param(param);
> +}
> +
> +static inline s64 mem_cgroup_page_stat(enum mem_cgroup_page_stat_item item)
> +{
> +	return -ENOSYS;
> +}
> +
>  #endif /* CONFIG_CGROUP_MEM_CONT */
>  
>  #endif /* _LINUX_MEMCONTROL_H */
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 497b6f7..9842e7b 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -73,28 +73,23 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/
>  #define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */
>  #define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */
>  
> -/*
> - * Statistics for memory cgroup.
> - */
> -enum mem_cgroup_stat_index {
> -	/*
> -	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
> -	 */
> -	MEM_CGROUP_STAT_CACHE, 	   /* # of pages charged as cache */
> -	MEM_CGROUP_STAT_RSS,	   /* # of pages charged as anon rss */
> -	MEM_CGROUP_STAT_FILE_MAPPED,  /* # of pages charged as file rss */
> -	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
> -	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
> -	MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
> -	MEM_CGROUP_EVENTS,	/* incremented at every  pagein/pageout */
> -
> -	MEM_CGROUP_STAT_NSTATS,
> -};
> -
>  struct mem_cgroup_stat_cpu {
>  	s64 count[MEM_CGROUP_STAT_NSTATS];
>  };
>  
> +/* Per cgroup page statistics */
> +struct mem_cgroup_page_stat {
> +	enum mem_cgroup_page_stat_item item;
> +	s64 value;
> +};
> +
> +enum {
> +	MEM_CGROUP_DIRTY_RATIO,
> +	MEM_CGROUP_DIRTY_BYTES,
> +	MEM_CGROUP_DIRTY_BACKGROUND_RATIO,
> +	MEM_CGROUP_DIRTY_BACKGROUND_BYTES,
> +};
> +
>  /*
>   * per-zone information in memory controller.
>   */
> @@ -208,6 +203,9 @@ struct mem_cgroup {
>  
>  	unsigned int	swappiness;
>  
> +	/* control memory cgroup dirty pages */
> +	struct dirty_param dirty_param;
> +
>  	/* set when res.limit == memsw.limit */
>  	bool		memsw_is_minimum;
>  
> @@ -1033,6 +1031,156 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
>  	return swappiness;
>  }
>  
> +static bool dirty_param_is_valid(struct dirty_param *param)
> +{
> +	if (param->dirty_ratio && param->dirty_bytes)
> +		return false;
> +	if (param->dirty_background_ratio && param->dirty_background_bytes)
> +		return false;
> +	return true;
> +}
> +
> +static void
> +__mem_cgroup_get_dirty_param(struct dirty_param *param, struct mem_cgroup *mem)
> +{
> +	param->dirty_ratio = mem->dirty_param.dirty_ratio;
> +	param->dirty_bytes = mem->dirty_param.dirty_bytes;
> +	param->dirty_background_ratio = mem->dirty_param.dirty_background_ratio;
> +	param->dirty_background_bytes = mem->dirty_param.dirty_background_bytes;
> +}
> +
> +/*
> + * get_dirty_param() - get dirty memory parameters of the current memcg
> + * @param:	a structure is filled with the dirty memory settings
> + *
> + * The function fills @param with the current memcg dirty memory settings. If
> + * memory cgroup is disabled or in case of error the structure is filled with
> + * the global dirty memory settings.
> + */
> +void get_dirty_param(struct dirty_param *param)
> +{
> +	struct mem_cgroup *memcg;
> +
> +	if (mem_cgroup_disabled()) {
> +		get_global_dirty_param(param);
> +		return;
> +	}
> +	/*
> +	 * It's possible that "current" may be moved to other cgroup while we
> +	 * access cgroup. But precise check is meaningless because the task can
> +	 * be moved after our access and writeback tends to take long time.
> +	 * At least, "memcg" will not be freed under rcu_read_lock().
> +	 */
> +	while (1) {
> +		rcu_read_lock();
> +		memcg = mem_cgroup_from_task(current);
> +		if (likely(memcg))
> +			__mem_cgroup_get_dirty_param(param, memcg);
> +		else
> +			get_global_dirty_param(param);
> +		rcu_read_unlock();
> +		/*
> +		 * Since global and memcg dirty_param are not protected we try
> +		 * to speculatively read them and retry if we get inconsistent
> +		 * values.
> +		 */
> +		if (likely(dirty_param_is_valid(param)))
> +			break;
> +	}
> +}
> +
> +static inline bool mem_cgroup_can_swap(struct mem_cgroup *memcg)
> +{
> +	if (!do_swap_account)
> +		return nr_swap_pages > 0;
> +	return !memcg->memsw_is_minimum &&
> +		(res_counter_read_u64(&memcg->memsw, RES_LIMIT) > 0);
> +}
> +
> +static s64 mem_cgroup_get_local_page_stat(struct mem_cgroup *memcg,
> +				enum mem_cgroup_page_stat_item item)
> +{
> +	s64 ret;
> +
> +	switch (item) {
> +	case MEMCG_NR_DIRTYABLE_PAGES:
> +		ret = res_counter_read_u64(&memcg->res, RES_LIMIT) -
> +			res_counter_read_u64(&memcg->res, RES_USAGE);
> +		/* Translate free memory in pages */
> +		ret >>= PAGE_SHIFT;
> +		ret += mem_cgroup_read_stat(memcg, LRU_ACTIVE_FILE) +
> +			mem_cgroup_read_stat(memcg, LRU_INACTIVE_FILE);
> +		if (mem_cgroup_can_swap(memcg))
> +			ret += mem_cgroup_read_stat(memcg, LRU_ACTIVE_ANON) +
> +				mem_cgroup_read_stat(memcg, LRU_INACTIVE_ANON);
> +		break;
> +	case MEMCG_NR_RECLAIM_PAGES:
> +		ret = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_FILE_DIRTY) +
> +			mem_cgroup_read_stat(memcg,
> +					MEM_CGROUP_STAT_UNSTABLE_NFS);
> +		break;
> +	case MEMCG_NR_WRITEBACK:
> +		ret = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
> +		break;
> +	case MEMCG_NR_DIRTY_WRITEBACK_PAGES:
> +		ret = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK) +
> +			mem_cgroup_read_stat(memcg,
> +				MEM_CGROUP_STAT_UNSTABLE_NFS);
> +		break;
> +	default:
> +		BUG_ON(1);
> +	}
> +	return ret;
> +}
> +
> +static int mem_cgroup_page_stat_cb(struct mem_cgroup *mem, void *data)
> +{
> +	struct mem_cgroup_page_stat *stat = (struct mem_cgroup_page_stat *)data;
> +
> +	stat->value += mem_cgroup_get_local_page_stat(mem, stat->item);
> +	return 0;
> +}
> +
> +/*
> + * mem_cgroup_has_dirty_limit() - check if current memcg has local dirty limits
> + *
> + * Return true if the current memory cgroup has local dirty memory settings,
> + * false otherwise.
> + */
> +bool mem_cgroup_has_dirty_limit(void)
> +{
> +	if (mem_cgroup_disabled())
> +		return false;
> +	return mem_cgroup_from_task(current) != NULL;
> +}
> +
> +/*
> + * mem_cgroup_page_stat() - get memory cgroup file cache statistics
> + * @item:	memory statistic item exported to the kernel
> + *
> + * Return the accounted statistic value, or a negative value in case of error.
> + */
> +s64 mem_cgroup_page_stat(enum mem_cgroup_page_stat_item item)
> +{
> +	struct mem_cgroup_page_stat stat = {};
> +	struct mem_cgroup *memcg;
> +
> +	rcu_read_lock();
> +	memcg = mem_cgroup_from_task(current);
> +	if (memcg) {
> +		/*
> +		 * Recursively evaluate page statistics against all cgroups
> +		 * under the hierarchy tree
> +		 */
> +		stat.item = item;
> +		mem_cgroup_walk_tree(memcg, &stat, mem_cgroup_page_stat_cb);
> +	} else
> +		stat.value = -EINVAL;
> +	rcu_read_unlock();
> +
> +	return stat.value;
> +}
> +
>  static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
>  {
>  	int *val = data;
> @@ -1275,34 +1423,70 @@ static void record_last_oom(struct mem_cgroup *mem)
>  }
>  
>  /*
> - * Currently used to update mapped file statistics, but the routine can be
> - * generalized to update other statistics as well.
> + * Generalized routine to update file cache's status for memcg.
> + *
> + * Before calling this, mapping->tree_lock should be held and preemption is
> + * disabled.  Then, it's guaranteed that the page is not uncharged while we
> + * access page_cgroup. We can make use of that.
>   */
IIUC, mapping->tree_lock is held with irq disabled, so I think "mapping->tree_lock
should be held with irq disabled" would be enough.
And, as far as I can see, callers of this function have not ensured this yet in [4/4].

how about:

	void mem_cgroup_update_stat_locked(...)
	{
		...
	}

	void mem_cgroup_update_stat_unlocked(mapping, ...)
	{
		spin_lock_irqsave(mapping->tree_lock, ...);
		mem_cgroup_update_stat_locked();
		spin_unlock_irqrestore(...);
	}

> -void mem_cgroup_update_file_mapped(struct page *page, int val)
> +void mem_cgroup_update_stat(struct page *page,
> +			enum mem_cgroup_stat_index idx, int val)
>  {
I prefer "void mem_cgroup_update_page_stat(struct page *, enum mem_cgroup_page_stat_item, ..)"
as I said above.

>  	struct mem_cgroup *mem;
>  	struct page_cgroup *pc;
>  
> +	if (mem_cgroup_disabled())
> +		return;
>  	pc = lookup_page_cgroup(page);
> -	if (unlikely(!pc))
> +	if (unlikely(!pc) || !PageCgroupUsed(pc))
>  		return;
>  
> -	lock_page_cgroup(pc);
> -	mem = pc->mem_cgroup;
> -	if (!mem)
> -		goto done;
> -
> -	if (!PageCgroupUsed(pc))
> -		goto done;
> -
> +	lock_page_cgroup_migrate(pc);
>  	/*
> -	 * Preemption is already disabled. We can use __this_cpu_xxx
> -	 */
> -	__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val);
> -
> -done:
> -	unlock_page_cgroup(pc);
> +	* It's guaranteed that this page is never uncharged.
> +	* The only racy problem is moving account among memcgs.
> +	*/
> +	switch (idx) {
> +	case MEM_CGROUP_STAT_FILE_MAPPED:
> +		if (val > 0)
> +			SetPageCgroupFileMapped(pc);
> +		else
> +			ClearPageCgroupFileMapped(pc);
> +		break;
> +	case MEM_CGROUP_STAT_FILE_DIRTY:
> +		if (val > 0)
> +			SetPageCgroupDirty(pc);
> +		else
> +			ClearPageCgroupDirty(pc);
> +		break;
> +	case MEM_CGROUP_STAT_WRITEBACK:
> +		if (val > 0)
> +			SetPageCgroupWriteback(pc);
> +		else
> +			ClearPageCgroupWriteback(pc);
> +		break;
> +	case MEM_CGROUP_STAT_WRITEBACK_TEMP:
> +		if (val > 0)
> +			SetPageCgroupWritebackTemp(pc);
> +		else
> +			ClearPageCgroupWritebackTemp(pc);
> +		break;
> +	case MEM_CGROUP_STAT_UNSTABLE_NFS:
> +		if (val > 0)
> +			SetPageCgroupUnstableNFS(pc);
> +		else
> +			ClearPageCgroupUnstableNFS(pc);
> +		break;
> +	default:
> +		BUG();
> +		break;
> +	}
> +	mem = pc->mem_cgroup;
> +	if (likely(mem))
> +		__this_cpu_add(mem->stat->count[idx], val);
> +	unlock_page_cgroup_migrate(pc);
>  }
> +EXPORT_SYMBOL_GPL(mem_cgroup_update_stat);
>  
>  /*
>   * size of first charge trial. "32" comes from vmscan.c's magic value.
> @@ -1701,6 +1885,45 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
>  	memcg_check_events(mem, pc->page);
>  }
>  
> +/*
> + * Update file cache accounted statistics on task migration.
> + *
> + * TODO: We don't move charges of file (including shmem/tmpfs) pages for now.
> + * So, at the moment this function simply returns without updating accounted
> + * statistics, because we deal only with anonymous pages here.
> + */
This function is not unique to task migration. It's called from rmdir() too.
So this comment isn't needed.

> +static void __mem_cgroup_update_file_stat(struct page_cgroup *pc,
> +	struct mem_cgroup *from, struct mem_cgroup *to)
> +{
> +	struct page *page = pc->page;
> +
> +	if (!page_mapped(page) || PageAnon(page))
> +		return;
> +
> +	if (PageCgroupFileMapped(pc)) {
> +		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> +		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> +	}
> +	if (PageCgroupDirty(pc)) {
> +		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_DIRTY]);
> +		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_DIRTY]);
> +	}
> +	if (PageCgroupWriteback(pc)) {
> +		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_WRITEBACK]);
> +		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_WRITEBACK]);
> +	}
> +	if (PageCgroupWritebackTemp(pc)) {
> +		__this_cpu_dec(
> +			from->stat->count[MEM_CGROUP_STAT_WRITEBACK_TEMP]);
> +		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_WRITEBACK_TEMP]);
> +	}
> +	if (PageCgroupUnstableNFS(pc)) {
> +		__this_cpu_dec(
> +			from->stat->count[MEM_CGROUP_STAT_UNSTABLE_NFS]);
> +		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_UNSTABLE_NFS]);
> +	}
> +}
> +
>  /**
>   * __mem_cgroup_move_account - move account of the page
>   * @pc:	page_cgroup of the page.
> @@ -1721,22 +1944,16 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
>  static void __mem_cgroup_move_account(struct page_cgroup *pc,
>  	struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
>  {
> -	struct page *page;
> -
>  	VM_BUG_ON(from == to);
>  	VM_BUG_ON(PageLRU(pc->page));
>  	VM_BUG_ON(!PageCgroupLocked(pc));
>  	VM_BUG_ON(!PageCgroupUsed(pc));
>  	VM_BUG_ON(pc->mem_cgroup != from);
>  
> -	page = pc->page;
> -	if (page_mapped(page) && !PageAnon(page)) {
> -		/* Update mapped_file data for mem_cgroup */
> -		preempt_disable();
> -		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> -		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> -		preempt_enable();
> -	}
> +	preempt_disable();
> +	lock_page_cgroup_migrate(pc);
> +	__mem_cgroup_update_file_stat(pc, from, to);
> +
>  	mem_cgroup_charge_statistics(from, pc, false);
>  	if (uncharge)
>  		/* This is not "cancel", but cancel_charge does all we need. */
> @@ -1745,6 +1962,8 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
>  	/* caller should have done css_get */
>  	pc->mem_cgroup = to;
>  	mem_cgroup_charge_statistics(to, pc, true);
> +	unlock_page_cgroup_migrate(pc);
> +	preempt_enable();
Glad to see this cleanup :)
But, hmm, I don't think preempt_disable/enable() is enough(and bit_spin_lock/unlock()
does it anyway). lock/unlock_page_cgroup_migrate() can be called under irq context
(e.g. end_page_writeback()), so I think we must local_irq_disable()/enable() here.
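
Roughly, the locking in __mem_cgroup_move_account() would then change along
these lines (an illustrative sketch only, not a tested patch; the statistics
and charge updates in between are unchanged from the hunk quoted above):

	local_irq_disable();		/* instead of preempt_disable() */
	lock_page_cgroup_migrate(pc);
	__mem_cgroup_update_file_stat(pc, from, to);
	/* ... charge statistics and pc->mem_cgroup update as above ... */
	unlock_page_cgroup_migrate(pc);
	local_irq_enable();		/* instead of preempt_enable() */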


Thanks,
Daisuke Nishimura.

>  	/*
>  	 * We charges against "to" which may not have any tasks. Then, "to"
>  	 * can be under rmdir(). But in current implementation, caller of
> @@ -3042,6 +3261,10 @@ enum {
>  	MCS_PGPGIN,
>  	MCS_PGPGOUT,
>  	MCS_SWAP,
> +	MCS_FILE_DIRTY,
> +	MCS_WRITEBACK,
> +	MCS_WRITEBACK_TEMP,
> +	MCS_UNSTABLE_NFS,
>  	MCS_INACTIVE_ANON,
>  	MCS_ACTIVE_ANON,
>  	MCS_INACTIVE_FILE,
> @@ -3064,6 +3287,10 @@ struct {
>  	{"pgpgin", "total_pgpgin"},
>  	{"pgpgout", "total_pgpgout"},
>  	{"swap", "total_swap"},
> +	{"filedirty", "dirty_pages"},
> +	{"writeback", "writeback_pages"},
> +	{"writeback_tmp", "writeback_temp_pages"},
> +	{"nfs", "nfs_unstable"},
>  	{"inactive_anon", "total_inactive_anon"},
>  	{"active_anon", "total_active_anon"},
>  	{"inactive_file", "total_inactive_file"},
> @@ -3092,6 +3319,14 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
>  		val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
>  		s->stat[MCS_SWAP] += val * PAGE_SIZE;
>  	}
> +	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_DIRTY);
> +	s->stat[MCS_FILE_DIRTY] += val;
> +	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_WRITEBACK);
> +	s->stat[MCS_WRITEBACK] += val;
> +	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_WRITEBACK_TEMP);
> +	s->stat[MCS_WRITEBACK_TEMP] += val;
> +	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_UNSTABLE_NFS);
> +	s->stat[MCS_UNSTABLE_NFS] += val;
>  
>  	/* per zone stat */
>  	val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
> @@ -3453,6 +3688,60 @@ unlock:
>  	return ret;
>  }
>  
> +static u64 mem_cgroup_dirty_read(struct cgroup *cgrp, struct cftype *cft)
> +{
> +	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
> +
> +	switch (cft->private) {
> +	case MEM_CGROUP_DIRTY_RATIO:
> +		return memcg->dirty_param.dirty_ratio;
> +	case MEM_CGROUP_DIRTY_BYTES:
> +		return memcg->dirty_param.dirty_bytes;
> +	case MEM_CGROUP_DIRTY_BACKGROUND_RATIO:
> +		return memcg->dirty_param.dirty_background_ratio;
> +	case MEM_CGROUP_DIRTY_BACKGROUND_BYTES:
> +		return memcg->dirty_param.dirty_background_bytes;
> +	default:
> +		BUG();
> +	}
> +}
> +
> +static int
> +mem_cgroup_dirty_write(struct cgroup *cgrp, struct cftype *cft, u64 val)
> +{
> +	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
> +	int type = cft->private;
> +
> +	if (cgrp->parent == NULL)
> +		return -EINVAL;
> +	if ((type == MEM_CGROUP_DIRTY_RATIO ||
> +		type == MEM_CGROUP_DIRTY_BACKGROUND_RATIO) && val > 100)
> +		return -EINVAL;
> +	/*
> +	 * TODO: provide a validation check routine. And retry if validation
> +	 * fails.
> +	 */
> +	switch (type) {
> +	case MEM_CGROUP_DIRTY_RATIO:
> +		memcg->dirty_param.dirty_ratio = val;
> +		memcg->dirty_param.dirty_bytes = 0;
> +		break;
> +	case MEM_CGROUP_DIRTY_BYTES:
> +		memcg->dirty_param.dirty_ratio  = 0;
> +		memcg->dirty_param.dirty_bytes = val;
> +		break;
> +	case MEM_CGROUP_DIRTY_BACKGROUND_RATIO:
> +		memcg->dirty_param.dirty_background_ratio = val;
> +		memcg->dirty_param.dirty_background_bytes = 0;
> +		break;
> +	case MEM_CGROUP_DIRTY_BACKGROUND_BYTES:
> +		memcg->dirty_param.dirty_background_ratio = 0;
> +		memcg->dirty_param.dirty_background_bytes = val;
> +		break;
> +	}
> +	return 0;
> +}
> +
>  static struct cftype mem_cgroup_files[] = {
>  	{
>  		.name = "usage_in_bytes",
> @@ -3504,6 +3793,30 @@ static struct cftype mem_cgroup_files[] = {
>  		.write_u64 = mem_cgroup_swappiness_write,
>  	},
>  	{
> +		.name = "dirty_ratio",
> +		.read_u64 = mem_cgroup_dirty_read,
> +		.write_u64 = mem_cgroup_dirty_write,
> +		.private = MEM_CGROUP_DIRTY_RATIO,
> +	},
> +	{
> +		.name = "dirty_bytes",
> +		.read_u64 = mem_cgroup_dirty_read,
> +		.write_u64 = mem_cgroup_dirty_write,
> +		.private = MEM_CGROUP_DIRTY_BYTES,
> +	},
> +	{
> +		.name = "dirty_background_ratio",
> +		.read_u64 = mem_cgroup_dirty_read,
> +		.write_u64 = mem_cgroup_dirty_write,
> +		.private = MEM_CGROUP_DIRTY_BACKGROUND_RATIO,
> +	},
> +	{
> +		.name = "dirty_background_bytes",
> +		.read_u64 = mem_cgroup_dirty_read,
> +		.write_u64 = mem_cgroup_dirty_write,
> +		.private = MEM_CGROUP_DIRTY_BACKGROUND_BYTES,
> +	},
> +	{
>  		.name = "move_charge_at_immigrate",
>  		.read_u64 = mem_cgroup_move_charge_read,
>  		.write_u64 = mem_cgroup_move_charge_write,
> @@ -3762,8 +4075,21 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
>  	mem->last_scanned_child = 0;
>  	spin_lock_init(&mem->reclaim_param_lock);
>  
> -	if (parent)
> +	if (parent) {
>  		mem->swappiness = get_swappiness(parent);
> +		mem->dirty_param = parent->dirty_param;
> +	} else {
> +		while (1) {
> +			get_global_dirty_param(&mem->dirty_param);
> +			/*
> +			 * Since global dirty parameters are not protected we
> +			 * try to speculatively read them and retry if we get
> +			 * inconsistent values.
> +			 */
> +			if (likely(dirty_param_is_valid(&mem->dirty_param)))
> +				break;
> +		}
> +	}
>  	atomic_set(&mem->refcnt, 1);
>  	mem->move_charge_at_immigrate = 0;
>  	mutex_init(&mem->thresholds_lock);
> -- 
> 1.6.3.3
> 


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure
  2010-03-05  1:12   ` Daisuke Nishimura
@ 2010-03-05  1:58     ` KAMEZAWA Hiroyuki
  2010-03-05  7:01       ` Balbir Singh
  2010-03-05 22:14       ` Andrea Righi
  2010-03-05 22:14     ` Andrea Righi
  1 sibling, 2 replies; 41+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-03-05  1:58 UTC (permalink / raw)
  To: Daisuke Nishimura
  Cc: Andrea Righi, Balbir Singh, Vivek Goyal, Peter Zijlstra,
	Trond Myklebust, Suleiman Souhlal, Greg Thelen,
	Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
	linux-mm

On Fri, 5 Mar 2010 10:12:34 +0900
Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:

> On Thu,  4 Mar 2010 11:40:14 +0100, Andrea Righi <arighi@develer.com> wrote:
> > Infrastructure to account dirty pages per cgroup and add dirty limit
> >  static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
> >  {
> >  	int *val = data;
> > @@ -1275,34 +1423,70 @@ static void record_last_oom(struct mem_cgroup *mem)
> >  }
> >  
> >  /*
> > - * Currently used to update mapped file statistics, but the routine can be
> > - * generalized to update other statistics as well.
> > + * Generalized routine to update file cache's status for memcg.
> > + *
> > + * Before calling this, mapping->tree_lock should be held and preemption is
> > > + * disabled.  Then, it's guaranteed that the page is not uncharged while we
> > + * access page_cgroup. We can make use of that.
> >   */
> IIUC, mapping->tree_lock is held with irq disabled, so I think "mapping->tree_lock
> should be held with irq disabled" would be enough.
> And, as far as I can see, callers of this function have not ensured this yet in [4/4].
> 
> how about:
> 
> 	void mem_cgroup_update_stat_locked(...)
> 	{
> 		...
> 	}
> 
> 	void mem_cgroup_update_stat_unlocked(mapping, ...)
> 	{
> 		spin_lock_irqsave(mapping->tree_lock, ...);
> 		mem_cgroup_update_stat_locked();
> 		spin_unlock_irqrestore(...);
> 	}
>
Rather than tree_lock, lock_page_cgroup() can be used if tree_lock is not held.

		lock_page_cgroup();
		mem_cgroup_update_stat_locked();
		unlock_page_cgroup();

Andrea-san, FILE_MAPPED is updated without tree_lock, at least. You can't depend
on the migrate lock for FILE_MAPPED.


 
> > -void mem_cgroup_update_file_mapped(struct page *page, int val)
> > +void mem_cgroup_update_stat(struct page *page,
> > +			enum mem_cgroup_stat_index idx, int val)
> >  {
> I prefer "void mem_cgroup_update_page_stat(struct page *, enum mem_cgroup_page_stat_item, ..)"
> as I said above.
> 
> >  	struct mem_cgroup *mem;
> >  	struct page_cgroup *pc;
> >  
> > +	if (mem_cgroup_disabled())
> > +		return;
> >  	pc = lookup_page_cgroup(page);
> > -	if (unlikely(!pc))
> > +	if (unlikely(!pc) || !PageCgroupUsed(pc))
> >  		return;
> >  
> > -	lock_page_cgroup(pc);
> > -	mem = pc->mem_cgroup;
> > -	if (!mem)
> > -		goto done;
> > -
> > -	if (!PageCgroupUsed(pc))
> > -		goto done;
> > -
> > +	lock_page_cgroup_migrate(pc);
> >  	/*
> > -	 * Preemption is already disabled. We can use __this_cpu_xxx
> > -	 */
> > -	__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val);
> > -
> > -done:
> > -	unlock_page_cgroup(pc);
> > +	* It's guaranteed that this page is never uncharged.
> > +	* The only racy problem is moving account among memcgs.
> > +	*/
> > +	switch (idx) {
> > +	case MEM_CGROUP_STAT_FILE_MAPPED:
> > +		if (val > 0)
> > +			SetPageCgroupFileMapped(pc);
> > +		else
> > +			ClearPageCgroupFileMapped(pc);
> > +		break;
> > +	case MEM_CGROUP_STAT_FILE_DIRTY:
> > +		if (val > 0)
> > +			SetPageCgroupDirty(pc);
> > +		else
> > +			ClearPageCgroupDirty(pc);
> > +		break;
> > +	case MEM_CGROUP_STAT_WRITEBACK:
> > +		if (val > 0)
> > +			SetPageCgroupWriteback(pc);
> > +		else
> > +			ClearPageCgroupWriteback(pc);
> > +		break;
> > +	case MEM_CGROUP_STAT_WRITEBACK_TEMP:
> > +		if (val > 0)
> > +			SetPageCgroupWritebackTemp(pc);
> > +		else
> > +			ClearPageCgroupWritebackTemp(pc);
> > +		break;
> > +	case MEM_CGROUP_STAT_UNSTABLE_NFS:
> > +		if (val > 0)
> > +			SetPageCgroupUnstableNFS(pc);
> > +		else
> > +			ClearPageCgroupUnstableNFS(pc);
> > +		break;
> > +	default:
> > +		BUG();
> > +		break;
> > +	}
> > +	mem = pc->mem_cgroup;
> > +	if (likely(mem))
> > +		__this_cpu_add(mem->stat->count[idx], val);
> > +	unlock_page_cgroup_migrate(pc);
> >  }
> > +EXPORT_SYMBOL_GPL(mem_cgroup_update_stat);
> >  
> >  /*
> >   * size of first charge trial. "32" comes from vmscan.c's magic value.
> > @@ -1701,6 +1885,45 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
> >  	memcg_check_events(mem, pc->page);
> >  }
> >  
> > +/*
> > + * Update file cache accounted statistics on task migration.
> > + *
> > + * TODO: We don't move charges of file (including shmem/tmpfs) pages for now.
> > + * So, at the moment this function simply returns without updating accounted
> > + * statistics, because we deal only with anonymous pages here.
> > + */
> This function is not unique to task migration. It's called from rmdir() too.
> So this comment isn't needed.
> 
> > +static void __mem_cgroup_update_file_stat(struct page_cgroup *pc,
> > +	struct mem_cgroup *from, struct mem_cgroup *to)
> > +{
> > +	struct page *page = pc->page;
> > +
> > +	if (!page_mapped(page) || PageAnon(page))
> > +		return;
> > +
> > +	if (PageCgroupFileMapped(pc)) {
> > +		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> > +		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> > +	}
> > +	if (PageCgroupDirty(pc)) {
> > +		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_DIRTY]);
> > +		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_DIRTY]);
> > +	}
> > +	if (PageCgroupWriteback(pc)) {
> > +		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_WRITEBACK]);
> > +		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_WRITEBACK]);
> > +	}
> > +	if (PageCgroupWritebackTemp(pc)) {
> > +		__this_cpu_dec(
> > +			from->stat->count[MEM_CGROUP_STAT_WRITEBACK_TEMP]);
> > +		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_WRITEBACK_TEMP]);
> > +	}
> > +	if (PageCgroupUnstableNFS(pc)) {
> > +		__this_cpu_dec(
> > +			from->stat->count[MEM_CGROUP_STAT_UNSTABLE_NFS]);
> > +		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_UNSTABLE_NFS]);
> > +	}
> > +}
> > +
> >  /**
> >   * __mem_cgroup_move_account - move account of the page
> >   * @pc:	page_cgroup of the page.
> > @@ -1721,22 +1944,16 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
> >  static void __mem_cgroup_move_account(struct page_cgroup *pc,
> >  	struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
> >  {
> > -	struct page *page;
> > -
> >  	VM_BUG_ON(from == to);
> >  	VM_BUG_ON(PageLRU(pc->page));
> >  	VM_BUG_ON(!PageCgroupLocked(pc));
> >  	VM_BUG_ON(!PageCgroupUsed(pc));
> >  	VM_BUG_ON(pc->mem_cgroup != from);
> >  
> > -	page = pc->page;
> > -	if (page_mapped(page) && !PageAnon(page)) {
> > -		/* Update mapped_file data for mem_cgroup */
> > -		preempt_disable();
> > -		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> > -		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> > -		preempt_enable();
> > -	}
> > +	preempt_disable();
> > +	lock_page_cgroup_migrate(pc);
> > +	__mem_cgroup_update_file_stat(pc, from, to);
> > +
> >  	mem_cgroup_charge_statistics(from, pc, false);
> >  	if (uncharge)
> >  		/* This is not "cancel", but cancel_charge does all we need. */
> > @@ -1745,6 +1962,8 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
> >  	/* caller should have done css_get */
> >  	pc->mem_cgroup = to;
> >  	mem_cgroup_charge_statistics(to, pc, true);
> > +	unlock_page_cgroup_migrate(pc);
> > +	preempt_enable();
> Glad to see this cleanup :)
> But, hmm, I don't think preempt_disable/enable() is enough(and bit_spin_lock/unlock()
> does it anyway). lock/unlock_page_cgroup_migrate() can be called under irq context
> (e.g. end_page_writeback()), so I think we must local_irq_disable()/enable() here.
> 
Ah, hmm, yes. irq-disable is required.

Thanks,
-Kame

>


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure
  2010-03-05  1:58     ` KAMEZAWA Hiroyuki
@ 2010-03-05  7:01       ` Balbir Singh
  2010-03-05 22:14       ` Andrea Righi
  1 sibling, 0 replies; 41+ messages in thread
From: Balbir Singh @ 2010-03-05  7:01 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Daisuke Nishimura, Andrea Righi, Vivek Goyal, Peter Zijlstra,
	Trond Myklebust, Suleiman Souhlal, Greg Thelen,
	Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
	linux-mm

* KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> [2010-03-05 10:58:55]:

> On Fri, 5 Mar 2010 10:12:34 +0900
> Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> 
> > On Thu,  4 Mar 2010 11:40:14 +0100, Andrea Righi <arighi@develer.com> wrote:
> > > Infrastructure to account dirty pages per cgroup and add dirty limit
> > >  static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
> > >  {
> > >  	int *val = data;
> > > @@ -1275,34 +1423,70 @@ static void record_last_oom(struct mem_cgroup *mem)
> > >  }
> > >  
> > >  /*
> > > - * Currently used to update mapped file statistics, but the routine can be
> > > - * generalized to update other statistics as well.
> > > + * Generalized routine to update file cache's status for memcg.
> > > + *
> > > + * Before calling this, mapping->tree_lock should be held and preemption is
> > > + * disabled.  Then, it's guaranteed that the page is not uncharged while we
> > > + * access page_cgroup. We can make use of that.
> > >   */
> > IIUC, mapping->tree_lock is held with irq disabled, so I think "mapping->tree_lock
> > should be held with irq disabled" would be enough.
> > And, as far as I can see, callers of this function have not ensured this yet in [4/4].
> > 
> > how about:
> > 
> > 	void mem_cgroup_update_stat_locked(...)
> > 	{
> > 		...
> > 	}
> > 
> > 	void mem_cgroup_update_stat_unlocked(mapping, ...)
> > 	{
> > 		spin_lock_irqsave(mapping->tree_lock, ...);
> > 		mem_cgroup_update_stat_locked();
> > 		spin_unlock_irqrestore(...);
> > 	}
> >
> Rather than tree_lock, lock_page_cgroup() can be used if tree_lock is not held.
> 
> 		lock_page_cgroup();
> 		mem_cgroup_update_stat_locked();
> 		unlock_page_cgroup();
> 
> Andrea-san, FILE_MAPPED is updated without tree_lock, at least. You can't depend
> on the migrate lock for FILE_MAPPED.
>

FILE_MAPPED is updated under pte lock in the rmap context and
page_cgroup lock within update_file_mapped.
 

-- 
	Three Cheers,
	Balbir


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure
  2010-03-05  1:12   ` Daisuke Nishimura
  2010-03-05  1:58     ` KAMEZAWA Hiroyuki
@ 2010-03-05 22:14     ` Andrea Righi
  1 sibling, 0 replies; 41+ messages in thread
From: Andrea Righi @ 2010-03-05 22:14 UTC (permalink / raw)
  To: Daisuke Nishimura
  Cc: KAMEZAWA Hiroyuki, Balbir Singh, Vivek Goyal, Peter Zijlstra,
	Trond Myklebust, Suleiman Souhlal, Greg Thelen,
	Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
	linux-mm

On Fri, Mar 05, 2010 at 10:12:34AM +0900, Daisuke Nishimura wrote:
> On Thu,  4 Mar 2010 11:40:14 +0100, Andrea Righi <arighi@develer.com> wrote:
> > Infrastructure to account dirty pages per cgroup and add dirty limit
> > interfaces in the cgroupfs:
> > 
> >  - Direct write-out: memory.dirty_ratio, memory.dirty_bytes
> > 
> >  - Background write-out: memory.dirty_background_ratio, memory.dirty_background_bytes
> > 
> > Signed-off-by: Andrea Righi <arighi@develer.com>
> > ---
> >  include/linux/memcontrol.h |   80 ++++++++-
> >  mm/memcontrol.c            |  420 +++++++++++++++++++++++++++++++++++++++-----
> >  2 files changed, 450 insertions(+), 50 deletions(-)
> > 
> > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> > index 1f9b119..cc3421b 100644
> > --- a/include/linux/memcontrol.h
> > +++ b/include/linux/memcontrol.h
> > @@ -19,12 +19,66 @@
> >  
> >  #ifndef _LINUX_MEMCONTROL_H
> >  #define _LINUX_MEMCONTROL_H
> > +
> > +#include <linux/writeback.h>
> >  #include <linux/cgroup.h>
> > +
> >  struct mem_cgroup;
> >  struct page_cgroup;
> >  struct page;
> >  struct mm_struct;
> >  
> > +/* Cgroup memory statistics items exported to the kernel */
> > +enum mem_cgroup_page_stat_item {
> > +	MEMCG_NR_DIRTYABLE_PAGES,
> > +	MEMCG_NR_RECLAIM_PAGES,
> > +	MEMCG_NR_WRITEBACK,
> > +	MEMCG_NR_DIRTY_WRITEBACK_PAGES,
> > +};
> > +
> > +/* Dirty memory parameters */
> > +struct dirty_param {
> > +	int dirty_ratio;
> > +	unsigned long dirty_bytes;
> > +	int dirty_background_ratio;
> > +	unsigned long dirty_background_bytes;
> > +};
> > +
> > +/*
> > + * Statistics for memory cgroup.
> > + */
> > +enum mem_cgroup_stat_index {
> > +	/*
> > +	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
> > +	 */
> > +	MEM_CGROUP_STAT_CACHE,	   /* # of pages charged as cache */
> > +	MEM_CGROUP_STAT_RSS,	   /* # of pages charged as anon rss */
> > +	MEM_CGROUP_STAT_FILE_MAPPED,  /* # of pages charged as file rss */
> > +	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
> > +	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
> > +	MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
> > +	MEM_CGROUP_EVENTS,	/* incremented at every  pagein/pageout */
> > +	MEM_CGROUP_STAT_FILE_DIRTY,   /* # of dirty pages in page cache */
> > +	MEM_CGROUP_STAT_WRITEBACK,   /* # of pages under writeback */
> > +	MEM_CGROUP_STAT_WRITEBACK_TEMP,   /* # of pages under writeback using
> > +						temporary buffers */
> > +	MEM_CGROUP_STAT_UNSTABLE_NFS,   /* # of NFS unstable pages */
> > +
> > +	MEM_CGROUP_STAT_NSTATS,
> > +};
> > +
> I must have said it earlier, but I don't think exporting all of these flags
> is a good idea.
> Can you export only mem_cgroup_page_stat_item(of course, need to add MEMCG_NR_FILE_MAPPED)?
> We can translate mem_cgroup_page_stat_item to mem_cgroup_stat_index by simple arithmetic
> if you define MEM_CGROUP_STAT_FILE_MAPPED..MEM_CGROUP_STAT_UNSTABLE_NFS sequentially.

Agreed.
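
Just to illustrate the idea (names are hypothetical, assuming a
MEMCG_NR_FILE_MAPPED item is added and both enums keep the file statistics
contiguous and in the same order), the translation could be as simple as:

	static inline enum mem_cgroup_stat_index
	mem_cgroup_page_stat_to_index(enum mem_cgroup_page_stat_item item)
	{
		return MEM_CGROUP_STAT_FILE_MAPPED +
				(item - MEMCG_NR_FILE_MAPPED);
	}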

> 
> > +/*
> > + * TODO: provide a validation check routine. And retry if validation
> > + * fails.
> > + */
> > +static inline void get_global_dirty_param(struct dirty_param *param)
> > +{
> > +	param->dirty_ratio = vm_dirty_ratio;
> > +	param->dirty_bytes = vm_dirty_bytes;
> > +	param->dirty_background_ratio = dirty_background_ratio;
> > +	param->dirty_background_bytes = dirty_background_bytes;
> > +}
> > +
> >  #ifdef CONFIG_CGROUP_MEM_RES_CTLR
> >  /*
> >   * All "charge" functions with gfp_mask should use GFP_KERNEL or
> > @@ -117,6 +171,10 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
> >  extern int do_swap_account;
> >  #endif
> >  
> > +extern bool mem_cgroup_has_dirty_limit(void);
> > +extern void get_dirty_param(struct dirty_param *param);
> > +extern s64 mem_cgroup_page_stat(enum mem_cgroup_page_stat_item item);
> > +
> >  static inline bool mem_cgroup_disabled(void)
> >  {
> >  	if (mem_cgroup_subsys.disabled)
> > @@ -125,7 +183,8 @@ static inline bool mem_cgroup_disabled(void)
> >  }
> >  
> >  extern bool mem_cgroup_oom_called(struct task_struct *task);
> > -void mem_cgroup_update_file_mapped(struct page *page, int val);
> > +void mem_cgroup_update_stat(struct page *page,
> > +			enum mem_cgroup_stat_index idx, int val);
> >  unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
> >  						gfp_t gfp_mask, int nid,
> >  						int zid);
> > @@ -300,8 +359,8 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
> >  {
> >  }
> >  
> > -static inline void mem_cgroup_update_file_mapped(struct page *page,
> > -							int val)
> > +static inline void mem_cgroup_update_stat(struct page *page,
> > +			enum mem_cgroup_stat_index idx, int val)
> >  {
> >  }
> >  
> > @@ -312,6 +371,21 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
> >  	return 0;
> >  }
> >  
> > +static inline bool mem_cgroup_has_dirty_limit(void)
> > +{
> > +	return false;
> > +}
> > +
> > +static inline void get_dirty_param(struct dirty_param *param)
> > +{
> > +	get_global_dirty_param(param);
> > +}
> > +
> > +static inline s64 mem_cgroup_page_stat(enum mem_cgroup_page_stat_item item)
> > +{
> > +	return -ENOSYS;
> > +}
> > +
> >  #endif /* CONFIG_CGROUP_MEM_CONT */
> >  
> >  #endif /* _LINUX_MEMCONTROL_H */
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index 497b6f7..9842e7b 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -73,28 +73,23 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/
> >  #define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */
> >  #define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */
> >  
> > -/*
> > - * Statistics for memory cgroup.
> > - */
> > -enum mem_cgroup_stat_index {
> > -	/*
> > -	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
> > -	 */
> > -	MEM_CGROUP_STAT_CACHE, 	   /* # of pages charged as cache */
> > -	MEM_CGROUP_STAT_RSS,	   /* # of pages charged as anon rss */
> > -	MEM_CGROUP_STAT_FILE_MAPPED,  /* # of pages charged as file rss */
> > -	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
> > -	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
> > -	MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
> > -	MEM_CGROUP_EVENTS,	/* incremented at every  pagein/pageout */
> > -
> > -	MEM_CGROUP_STAT_NSTATS,
> > -};
> > -
> >  struct mem_cgroup_stat_cpu {
> >  	s64 count[MEM_CGROUP_STAT_NSTATS];
> >  };
> >  
> > +/* Per cgroup page statistics */
> > +struct mem_cgroup_page_stat {
> > +	enum mem_cgroup_page_stat_item item;
> > +	s64 value;
> > +};
> > +
> > +enum {
> > +	MEM_CGROUP_DIRTY_RATIO,
> > +	MEM_CGROUP_DIRTY_BYTES,
> > +	MEM_CGROUP_DIRTY_BACKGROUND_RATIO,
> > +	MEM_CGROUP_DIRTY_BACKGROUND_BYTES,
> > +};
> > +
> >  /*
> >   * per-zone information in memory controller.
> >   */
> > @@ -208,6 +203,9 @@ struct mem_cgroup {
> >  
> >  	unsigned int	swappiness;
> >  
> > +	/* control memory cgroup dirty pages */
> > +	struct dirty_param dirty_param;
> > +
> >  	/* set when res.limit == memsw.limit */
> >  	bool		memsw_is_minimum;
> >  
> > @@ -1033,6 +1031,156 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
> >  	return swappiness;
> >  }
> >  
> > +static bool dirty_param_is_valid(struct dirty_param *param)
> > +{
> > +	if (param->dirty_ratio && param->dirty_bytes)
> > +		return false;
> > +	if (param->dirty_background_ratio && param->dirty_background_bytes)
> > +		return false;
> > +	return true;
> > +}
> > +
> > +static void
> > +__mem_cgroup_get_dirty_param(struct dirty_param *param, struct mem_cgroup *mem)
> > +{
> > +	param->dirty_ratio = mem->dirty_param.dirty_ratio;
> > +	param->dirty_bytes = mem->dirty_param.dirty_bytes;
> > +	param->dirty_background_ratio = mem->dirty_param.dirty_background_ratio;
> > +	param->dirty_background_bytes = mem->dirty_param.dirty_background_bytes;
> > +}
> > +
> > +/*
> > + * get_dirty_param() - get dirty memory parameters of the current memcg
> > + * @param:	a structure is filled with the dirty memory settings
> > + *
> > + * The function fills @param with the current memcg dirty memory settings. If
> > + * memory cgroup is disabled or in case of error the structure is filled with
> > + * the global dirty memory settings.
> > + */
> > +void get_dirty_param(struct dirty_param *param)
> > +{
> > +	struct mem_cgroup *memcg;
> > +
> > +	if (mem_cgroup_disabled()) {
> > +		get_global_dirty_param(param);
> > +		return;
> > +	}
> > +	/*
> > +	 * It's possible that "current" may be moved to other cgroup while we
> > +	 * access cgroup. But precise check is meaningless because the task can
> > +	 * be moved after our access and writeback tends to take long time.
> > +	 * At least, "memcg" will not be freed under rcu_read_lock().
> > +	 */
> > +	while (1) {
> > +		rcu_read_lock();
> > +		memcg = mem_cgroup_from_task(current);
> > +		if (likely(memcg))
> > +			__mem_cgroup_get_dirty_param(param, memcg);
> > +		else
> > +			get_global_dirty_param(param);
> > +		rcu_read_unlock();
> > +		/*
> > +		 * Since global and memcg dirty_param are not protected we try
> > +		 * to speculatively read them and retry if we get inconsistent
> > +		 * values.
> > +		 */
> > +		if (likely(dirty_param_is_valid(param)))
> > +			break;
> > +	}
> > +}
> > +
> > +static inline bool mem_cgroup_can_swap(struct mem_cgroup *memcg)
> > +{
> > +	if (!do_swap_account)
> > +		return nr_swap_pages > 0;
> > +	return !memcg->memsw_is_minimum &&
> > +		(res_counter_read_u64(&memcg->memsw, RES_LIMIT) > 0);
> > +}
> > +
> > +static s64 mem_cgroup_get_local_page_stat(struct mem_cgroup *memcg,
> > +				enum mem_cgroup_page_stat_item item)
> > +{
> > +	s64 ret;
> > +
> > +	switch (item) {
> > +	case MEMCG_NR_DIRTYABLE_PAGES:
> > +		ret = res_counter_read_u64(&memcg->res, RES_LIMIT) -
> > +			res_counter_read_u64(&memcg->res, RES_USAGE);
> > +		/* Translate free memory in pages */
> > +		ret >>= PAGE_SHIFT;
> > +		ret += mem_cgroup_read_stat(memcg, LRU_ACTIVE_FILE) +
> > +			mem_cgroup_read_stat(memcg, LRU_INACTIVE_FILE);
> > +		if (mem_cgroup_can_swap(memcg))
> > +			ret += mem_cgroup_read_stat(memcg, LRU_ACTIVE_ANON) +
> > +				mem_cgroup_read_stat(memcg, LRU_INACTIVE_ANON);
> > +		break;
> > +	case MEMCG_NR_RECLAIM_PAGES:
> > +		ret = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_FILE_DIRTY) +
> > +			mem_cgroup_read_stat(memcg,
> > +					MEM_CGROUP_STAT_UNSTABLE_NFS);
> > +		break;
> > +	case MEMCG_NR_WRITEBACK:
> > +		ret = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
> > +		break;
> > +	case MEMCG_NR_DIRTY_WRITEBACK_PAGES:
> > +		ret = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK) +
> > +			mem_cgroup_read_stat(memcg,
> > +				MEM_CGROUP_STAT_UNSTABLE_NFS);
> > +		break;
> > +	default:
> > +		BUG_ON(1);
> > +	}
> > +	return ret;
> > +}
> > +
> > +static int mem_cgroup_page_stat_cb(struct mem_cgroup *mem, void *data)
> > +{
> > +	struct mem_cgroup_page_stat *stat = (struct mem_cgroup_page_stat *)data;
> > +
> > +	stat->value += mem_cgroup_get_local_page_stat(mem, stat->item);
> > +	return 0;
> > +}
> > +
> > +/*
> > + * mem_cgroup_has_dirty_limit() - check if current memcg has local dirty limits
> > + *
> > + * Return true if the current memory cgroup has local dirty memory settings,
> > + * false otherwise.
> > + */
> > +bool mem_cgroup_has_dirty_limit(void)
> > +{
> > +	if (mem_cgroup_disabled())
> > +		return false;
> > +	return mem_cgroup_from_task(current) != NULL;
> > +}
> > +
> > +/*
> > + * mem_cgroup_page_stat() - get memory cgroup file cache statistics
> > + * @item:	memory statistic item exported to the kernel
> > + *
> > + * Return the accounted statistic value, or a negative value in case of error.
> > + */
> > +s64 mem_cgroup_page_stat(enum mem_cgroup_page_stat_item item)
> > +{
> > +	struct mem_cgroup_page_stat stat = {};
> > +	struct mem_cgroup *memcg;
> > +
> > +	rcu_read_lock();
> > +	memcg = mem_cgroup_from_task(current);
> > +	if (memcg) {
> > +		/*
> > +		 * Recursively evaulate page statistics against all cgroup
> > +		 * under hierarchy tree
> > +		 */
> > +		stat.item = item;
> > +		mem_cgroup_walk_tree(memcg, &stat, mem_cgroup_page_stat_cb);
> > +	} else
> > +		stat.value = -EINVAL;
> > +	rcu_read_unlock();
> > +
> > +	return stat.value;
> > +}
> > +
> >  static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
> >  {
> >  	int *val = data;
> > @@ -1275,34 +1423,70 @@ static void record_last_oom(struct mem_cgroup *mem)
> >  }
> >  
> >  /*
> > - * Currently used to update mapped file statistics, but the routine can be
> > - * generalized to update other statistics as well.
> > + * Generalized routine to update file cache's status for memcg.
> > + *
> > + * Before calling this, mapping->tree_lock should be held and preemption is
> > > + * disabled.  Then, it's guaranteed that the page is not uncharged while we
> > + * access page_cgroup. We can make use of that.
> >   */
> IIUC, mapping->tree_lock is held with irq disabled, so I think "mapping->tree_lock
> > should be held with irq disabled" would be enough.
> And, as far as I can see, callers of this function have not ensured this yet in [4/4].
> 
> how about:
> 
> 	void mem_cgroup_update_stat_locked(...)
> 	{
> 		...
> 	}
> 
> 	void mem_cgroup_update_stat_unlocked(mapping, ...)
> 	{
> 		spin_lock_irqsave(mapping->tree_lock, ...);
> 		mem_cgroup_update_stat_locked();
> 		spin_unlock_irqrestore(...);
> 	}

So, basically, lock_page_cgroup_migrate() should disable irqs and
unlock_page_cgroup_migrate() should re-enable them, except for updating
MEM_CGROUP_STAT_FILE_MAPPED, where just a lock/unlock_page_cgroup() is
needed. Right?

> 
> > -void mem_cgroup_update_file_mapped(struct page *page, int val)
> > +void mem_cgroup_update_stat(struct page *page,
> > +			enum mem_cgroup_stat_index idx, int val)
> >  {
> I prefer "void mem_cgroup_update_page_stat(struct page *, enum mem_cgroup_page_stat_item, ..)"
> as I said above.
> 
> >  	struct mem_cgroup *mem;
> >  	struct page_cgroup *pc;
> >  
> > +	if (mem_cgroup_disabled())
> > +		return;
> >  	pc = lookup_page_cgroup(page);
> > -	if (unlikely(!pc))
> > +	if (unlikely(!pc) || !PageCgroupUsed(pc))
> >  		return;
> >  
> > -	lock_page_cgroup(pc);
> > -	mem = pc->mem_cgroup;
> > -	if (!mem)
> > -		goto done;
> > -
> > -	if (!PageCgroupUsed(pc))
> > -		goto done;
> > -
> > +	lock_page_cgroup_migrate(pc);
> >  	/*
> > -	 * Preemption is already disabled. We can use __this_cpu_xxx
> > -	 */
> > -	__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val);
> > -
> > -done:
> > -	unlock_page_cgroup(pc);
> > +	* It's guaranteed that this page is never uncharged.
> > +	* The only racy problem is moving account among memcgs.
> > +	*/
> > +	switch (idx) {
> > +	case MEM_CGROUP_STAT_FILE_MAPPED:
> > +		if (val > 0)
> > +			SetPageCgroupFileMapped(pc);
> > +		else
> > +			ClearPageCgroupFileMapped(pc);
> > +		break;
> > +	case MEM_CGROUP_STAT_FILE_DIRTY:
> > +		if (val > 0)
> > +			SetPageCgroupDirty(pc);
> > +		else
> > +			ClearPageCgroupDirty(pc);
> > +		break;
> > +	case MEM_CGROUP_STAT_WRITEBACK:
> > +		if (val > 0)
> > +			SetPageCgroupWriteback(pc);
> > +		else
> > +			ClearPageCgroupWriteback(pc);
> > +		break;
> > +	case MEM_CGROUP_STAT_WRITEBACK_TEMP:
> > +		if (val > 0)
> > +			SetPageCgroupWritebackTemp(pc);
> > +		else
> > +			ClearPageCgroupWritebackTemp(pc);
> > +		break;
> > +	case MEM_CGROUP_STAT_UNSTABLE_NFS:
> > +		if (val > 0)
> > +			SetPageCgroupUnstableNFS(pc);
> > +		else
> > +			ClearPageCgroupUnstableNFS(pc);
> > +		break;
> > +	default:
> > +		BUG();
> > +		break;
> > +	}
> > +	mem = pc->mem_cgroup;
> > +	if (likely(mem))
> > +		__this_cpu_add(mem->stat->count[idx], val);
> > +	unlock_page_cgroup_migrate(pc);
> >  }
> > +EXPORT_SYMBOL_GPL(mem_cgroup_update_stat);
> >  
> >  /*
> >   * size of first charge trial. "32" comes from vmscan.c's magic value.
> > @@ -1701,6 +1885,45 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
> >  	memcg_check_events(mem, pc->page);
> >  }
> >  
> > +/*
> > + * Update file cache accounted statistics on task migration.
> > + *
> > + * TODO: We don't move charges of file (including shmem/tmpfs) pages for now.
> > + * So, at the moment this function simply returns without updating accounted
> > + * statistics, because we deal only with anonymous pages here.
> > + */
> This function is not unique to task migration. It's called from rmdir() too.
> So this comment isn't needed.

Agreed.

> 
> > +static void __mem_cgroup_update_file_stat(struct page_cgroup *pc,
> > +	struct mem_cgroup *from, struct mem_cgroup *to)
> > +{
> > +	struct page *page = pc->page;
> > +
> > +	if (!page_mapped(page) || PageAnon(page))
> > +		return;
> > +
> > +	if (PageCgroupFileMapped(pc)) {
> > +		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> > +		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> > +	}
> > +	if (PageCgroupDirty(pc)) {
> > +		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_DIRTY]);
> > +		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_DIRTY]);
> > +	}
> > +	if (PageCgroupWriteback(pc)) {
> > +		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_WRITEBACK]);
> > +		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_WRITEBACK]);
> > +	}
> > +	if (PageCgroupWritebackTemp(pc)) {
> > +		__this_cpu_dec(
> > +			from->stat->count[MEM_CGROUP_STAT_WRITEBACK_TEMP]);
> > +		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_WRITEBACK_TEMP]);
> > +	}
> > +	if (PageCgroupUnstableNFS(pc)) {
> > +		__this_cpu_dec(
> > +			from->stat->count[MEM_CGROUP_STAT_UNSTABLE_NFS]);
> > +		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_UNSTABLE_NFS]);
> > +	}
> > +}
> > +
> >  /**
> >   * __mem_cgroup_move_account - move account of the page
> >   * @pc:	page_cgroup of the page.
> > @@ -1721,22 +1944,16 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
> >  static void __mem_cgroup_move_account(struct page_cgroup *pc,
> >  	struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
> >  {
> > -	struct page *page;
> > -
> >  	VM_BUG_ON(from == to);
> >  	VM_BUG_ON(PageLRU(pc->page));
> >  	VM_BUG_ON(!PageCgroupLocked(pc));
> >  	VM_BUG_ON(!PageCgroupUsed(pc));
> >  	VM_BUG_ON(pc->mem_cgroup != from);
> >  
> > -	page = pc->page;
> > -	if (page_mapped(page) && !PageAnon(page)) {
> > -		/* Update mapped_file data for mem_cgroup */
> > -		preempt_disable();
> > -		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> > -		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> > -		preempt_enable();
> > -	}
> > +	preempt_disable();
> > +	lock_page_cgroup_migrate(pc);
> > +	__mem_cgroup_update_file_stat(pc, from, to);
> > +
> >  	mem_cgroup_charge_statistics(from, pc, false);
> >  	if (uncharge)
> >  		/* This is not "cancel", but cancel_charge does all we need. */
> > @@ -1745,6 +1962,8 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
> >  	/* caller should have done css_get */
> >  	pc->mem_cgroup = to;
> >  	mem_cgroup_charge_statistics(to, pc, true);
> > +	unlock_page_cgroup_migrate(pc);
> > +	preempt_enable();
> Glad to see this cleanup :)
> But, hmm, I don't think preempt_disable/enable() is enough(and bit_spin_lock/unlock()
> does it anyway). lock/unlock_page_cgroup_migrate() can be called under irq context
> (e.g. end_page_writeback()), so I think we must local_irq_disable()/enable() here.

You're right. So, also for this case irqs must be disabled/enabled by
lock/unlock_page_cgroup_migrate(). And again, FILE_MAPPED just needs
lock/unlock_page_cgroup().
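
Something along these lines, for example (a sketch only, not the final code;
note this variant unconditionally re-enables interrupts on unlock, so it
assumes no caller takes the lock with interrupts already disabled, otherwise
an irqsave/irqrestore form would be needed):

	static inline void lock_page_cgroup_migrate(struct page_cgroup *pc)
	{
		local_irq_disable();
		bit_spin_lock(PCG_MIGRATE_LOCK, &pc->flags);
	}

	static inline void unlock_page_cgroup_migrate(struct page_cgroup *pc)
	{
		bit_spin_unlock(PCG_MIGRATE_LOCK, &pc->flags);
		local_irq_enable();
	}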

Thanks,
-Andrea


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure
  2010-03-05  1:58     ` KAMEZAWA Hiroyuki
  2010-03-05  7:01       ` Balbir Singh
@ 2010-03-05 22:14       ` Andrea Righi
  1 sibling, 0 replies; 41+ messages in thread
From: Andrea Righi @ 2010-03-05 22:14 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Daisuke Nishimura, Balbir Singh, Vivek Goyal, Peter Zijlstra,
	Trond Myklebust, Suleiman Souhlal, Greg Thelen,
	Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
	linux-mm

On Fri, Mar 05, 2010 at 10:58:55AM +0900, KAMEZAWA Hiroyuki wrote:
> On Fri, 5 Mar 2010 10:12:34 +0900
> Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> 
> > On Thu,  4 Mar 2010 11:40:14 +0100, Andrea Righi <arighi@develer.com> wrote:
> > > Infrastructure to account dirty pages per cgroup and add dirty limit
> > >  static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
> > >  {
> > >  	int *val = data;
> > > @@ -1275,34 +1423,70 @@ static void record_last_oom(struct mem_cgroup *mem)
> > >  }
> > >  
> > >  /*
> > > - * Currently used to update mapped file statistics, but the routine can be
> > > - * generalized to update other statistics as well.
> > > + * Generalized routine to update file cache's status for memcg.
> > > + *
> > > + * Before calling this, mapping->tree_lock should be held and preemption is
> > > + * disabled.  Then, it's guaranteed that the page is not uncharged while we
> > > + * access page_cgroup. We can make use of that.
> > >   */
> > IIUC, mapping->tree_lock is held with irq disabled, so I think "mapping->tree_lock
> > should be held with irq disabled" would be enough.
> > And, as far as I can see, callers of this function have not ensured this yet in [4/4].
> > 
> > how about:
> > 
> > 	void mem_cgroup_update_stat_locked(...)
> > 	{
> > 		...
> > 	}
> > 
> > 	void mem_cgroup_update_stat_unlocked(mapping, ...)
> > 	{
> > 		spin_lock_irqsave(mapping->tree_lock, ...);
> > 		mem_cgroup_update_stat_locked();
> > 		spin_unlock_irqrestore(...);
> > 	}
> >
> Rather than tree_lock, lock_page_cgroup() can be used if tree_lock is not held.
> 
> 		lock_page_cgroup();
> 		mem_cgroup_update_stat_locked();
> 		unlock_page_cgroup();
> 
> Andrea-san, FILE_MAPPED is updated without tree_lock, at least. You can't depend
> on the migrate lock for FILE_MAPPED.

Right. I'll consider this in the next version of the patch.

Thanks,
-Andrea


^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH -mmotm 0/4] memcg: per cgroup dirty limit (v5)
@ 2010-03-07 20:57 Andrea Righi
  2010-03-07 20:57 ` [PATCH -mmotm 1/4] memcg: dirty memory documentation Andrea Righi
                   ` (3 more replies)
  0 siblings, 4 replies; 41+ messages in thread
From: Andrea Righi @ 2010-03-07 20:57 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki, Balbir Singh, Daisuke Nishimura
  Cc: Vivek Goyal, Peter Zijlstra, Trond Myklebust, Suleiman Souhlal,
	Greg Thelen, Kirill A. Shutemov, Andrew Morton, containers,
	linux-kernel, linux-mm

Control the maximum amount of dirty pages a cgroup can have at any given time.

Per cgroup dirty limit is like fixing the max amount of dirty (hard to reclaim)
page cache used by any cgroup. So, in case of multiple cgroup writers, they
will not be able to consume more than their designated share of dirty pages and
will be forced to perform write-out if they cross that limit.

The overall design is the following:

 - account dirty pages per cgroup
 - limit the number of dirty pages via memory.dirty_ratio / memory.dirty_bytes
   and memory.dirty_background_ratio / memory.dirty_background_bytes in
   cgroupfs
 - start to write-out (directly or background) when the cgroup limits are
   exceeded

This feature is supposed to be strictly connected to any underlying IO
controller implementation, so we can stop increasing dirty pages in the VM
layer and enforce a write-out before any cgroup consumes the global amount of
dirty pages defined by /proc/sys/vm/dirty_ratio|dirty_bytes and
/proc/sys/vm/dirty_background_ratio|dirty_background_bytes.
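
As a usage illustration (not part of the patch set), the limits are configured
by writing to the memory.dirty_* files listed above. A minimal userspace
sketch, assuming the memory cgroup hierarchy is mounted at /cgroup/memory and
a child group "foo" already exists:

	#include <stdio.h>

	/* Write a single value to a cgroupfs control file. */
	static int write_val(const char *path, const char *val)
	{
		FILE *f = fopen(path, "w");

		if (!f)
			return -1;
		if (fprintf(f, "%s\n", val) < 0) {
			fclose(f);
			return -1;
		}
		return fclose(f);
	}

	int main(void)
	{
		/* Cap "foo" at 10% dirty memory, start background
		 * writeback at 5%. */
		write_val("/cgroup/memory/foo/memory.dirty_ratio", "10");
		write_val("/cgroup/memory/foo/memory.dirty_background_ratio", "5");
		return 0;
	}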

Changelog (v4 -> v5)
~~~~~~~~~~~~~~~~~~~~~~
 * fixed a potential deadlock between lock_page_cgroup and mapping->tree_lock
   (I'm not sure I did the right thing for this point, so review and tests are
   very welcome)
 * introduce inc/dec functions to update file cache accounting
 * export only a restricted subset of mem_cgroup_stat_index flags
 * fixed a bug in determine_dirtyable_memory() to correctly return the local
   memcg dirtyable memory
 * always use global dirty memory settings in calc_period_shift()

-Andrea


^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH -mmotm 1/4] memcg: dirty memory documentation
  2010-03-07 20:57 [PATCH -mmotm 0/4] memcg: per cgroup dirty limit (v5) Andrea Righi
@ 2010-03-07 20:57 ` Andrea Righi
  2010-03-07 20:57 ` [PATCH -mmotm 2/4] page_cgroup: introduce file cache flags Andrea Righi
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 41+ messages in thread
From: Andrea Righi @ 2010-03-07 20:57 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki, Balbir Singh, Daisuke Nishimura
  Cc: Vivek Goyal, Peter Zijlstra, Trond Myklebust, Suleiman Souhlal,
	Greg Thelen, Kirill A. Shutemov, Andrew Morton, containers,
	linux-kernel, linux-mm, Andrea Righi

Document cgroup dirty memory interfaces and statistics.

Signed-off-by: Andrea Righi <arighi@develer.com>
---
 Documentation/cgroups/memory.txt |   36 ++++++++++++++++++++++++++++++++++++
 1 files changed, 36 insertions(+), 0 deletions(-)

diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 49f86f3..38ca499 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -310,6 +310,11 @@ cache		- # of bytes of page cache memory.
 rss		- # of bytes of anonymous and swap cache memory.
 pgpgin		- # of pages paged in (equivalent to # of charging events).
 pgpgout		- # of pages paged out (equivalent to # of uncharging events).
+filedirty	- # of pages that are waiting to get written back to the disk.
+writeback	- # of pages that are actively being written back to the disk.
+writeback_tmp	- # of pages used by FUSE for temporary writeback buffers.
+nfs		- # of NFS pages sent to the server, but not yet committed to
+		  the actual storage.
 active_anon	- # of bytes of anonymous and  swap cache memory on active
 		  lru list.
 inactive_anon	- # of bytes of anonymous memory and swap cache memory on
@@ -345,6 +350,37 @@ Note:
   - a cgroup which uses hierarchy and it has child cgroup.
   - a cgroup which uses hierarchy and not the root of hierarchy.
 
+5.4 dirty memory
+
+  Control the maximum amount of dirty pages a cgroup can have at any given time.
+
+  Limiting dirty memory is like fixing the max amount of dirty (hard to
+  reclaim) page cache used by any cgroup. So, in case of multiple cgroup writers,
+  they will not be able to consume more than their designated share of dirty
+  pages and will be forced to perform write-out if they cross that limit.
+
+  The interface is equivalent to the procfs interface: /proc/sys/vm/dirty_*.
+  It is possible to configure a limit to trigger either a direct writeback or a
+  background writeback performed by per-bdi flusher threads.
+
+  Per-cgroup dirty limits can be set using the following files in the cgroupfs:
+
+  - memory.dirty_ratio: contains, as a percentage of cgroup memory, the
+    amount of dirty memory at which a process which is generating disk writes
+    inside the cgroup will start itself writing out dirty data.
+
+  - memory.dirty_bytes: the amount of dirty memory of the cgroup (expressed in
+    bytes) at which a process generating disk writes will start itself writing
+    out dirty data.
+
+  - memory.dirty_background_ratio: contains, as a percentage of the cgroup
+    memory, the amount of dirty memory at which background writeback kernel
+    threads will start writing out dirty data.
+
+  - memory.dirty_background_bytes: the amount of dirty memory of the cgroup (in
+    bytes) at which background writeback kernel threads will start writing out
+    dirty data.
+
 
 6. Hierarchy support
 
-- 
1.6.3.3


^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [PATCH -mmotm 2/4] page_cgroup: introduce file cache flags
  2010-03-07 20:57 [PATCH -mmotm 0/4] memcg: per cgroup dirty limit (v5) Andrea Righi
  2010-03-07 20:57 ` [PATCH -mmotm 1/4] memcg: dirty memory documentation Andrea Righi
@ 2010-03-07 20:57 ` Andrea Righi
  2010-03-07 20:57 ` [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure Andrea Righi
  2010-03-07 20:57 ` [PATCH -mmotm 4/4] memcg: dirty pages instrumentation Andrea Righi
  3 siblings, 0 replies; 41+ messages in thread
From: Andrea Righi @ 2010-03-07 20:57 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki, Balbir Singh, Daisuke Nishimura
  Cc: Vivek Goyal, Peter Zijlstra, Trond Myklebust, Suleiman Souhlal,
	Greg Thelen, Kirill A. Shutemov, Andrew Morton, containers,
	linux-kernel, linux-mm, Andrea Righi

Introduce page_cgroup flags to keep track of file cache pages.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrea Righi <arighi@develer.com>
---
 include/linux/page_cgroup.h |   45 +++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 45 insertions(+), 0 deletions(-)

diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index 30b0813..dc66bee 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -39,6 +39,12 @@ enum {
 	PCG_CACHE, /* charged as cache */
 	PCG_USED, /* this object is in use. */
 	PCG_ACCT_LRU, /* page has been accounted for */
+	PCG_MIGRATE_LOCK, /* used for mutual exclusion of account migration */
+	PCG_ACCT_FILE_MAPPED, /* page is accounted as file rss */
+	PCG_ACCT_DIRTY, /* page is dirty */
+	PCG_ACCT_WRITEBACK, /* page is being written back to disk */
+	PCG_ACCT_WRITEBACK_TEMP, /* page is used as temporary buffer for FUSE */
+	PCG_ACCT_UNSTABLE_NFS, /* NFS page not yet committed to the server */
 };
 
 #define TESTPCGFLAG(uname, lname)			\
@@ -73,6 +79,27 @@ CLEARPCGFLAG(AcctLRU, ACCT_LRU)
 TESTPCGFLAG(AcctLRU, ACCT_LRU)
 TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU)
 
+/* File cache and dirty memory flags */
+TESTPCGFLAG(FileMapped, ACCT_FILE_MAPPED)
+SETPCGFLAG(FileMapped, ACCT_FILE_MAPPED)
+CLEARPCGFLAG(FileMapped, ACCT_FILE_MAPPED)
+
+TESTPCGFLAG(Dirty, ACCT_DIRTY)
+SETPCGFLAG(Dirty, ACCT_DIRTY)
+CLEARPCGFLAG(Dirty, ACCT_DIRTY)
+
+TESTPCGFLAG(Writeback, ACCT_WRITEBACK)
+SETPCGFLAG(Writeback, ACCT_WRITEBACK)
+CLEARPCGFLAG(Writeback, ACCT_WRITEBACK)
+
+TESTPCGFLAG(WritebackTemp, ACCT_WRITEBACK_TEMP)
+SETPCGFLAG(WritebackTemp, ACCT_WRITEBACK_TEMP)
+CLEARPCGFLAG(WritebackTemp, ACCT_WRITEBACK_TEMP)
+
+TESTPCGFLAG(UnstableNFS, ACCT_UNSTABLE_NFS)
+SETPCGFLAG(UnstableNFS, ACCT_UNSTABLE_NFS)
+CLEARPCGFLAG(UnstableNFS, ACCT_UNSTABLE_NFS)
+
 static inline int page_cgroup_nid(struct page_cgroup *pc)
 {
 	return page_to_nid(pc->page);
@@ -83,6 +110,9 @@ static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc)
 	return page_zonenum(pc->page);
 }
 
+/*
+ * lock_page_cgroup() should not be held under mapping->tree_lock
+ */
 static inline void lock_page_cgroup(struct page_cgroup *pc)
 {
 	bit_spin_lock(PCG_LOCK, &pc->flags);
@@ -93,6 +123,21 @@ static inline void unlock_page_cgroup(struct page_cgroup *pc)
 	bit_spin_unlock(PCG_LOCK, &pc->flags);
 }
 
+/*
+ * This lock is not for charge/uncharge but for account moving, i.e.
+ * overwriting pc->mem_cgroup. The lock owner must itself guarantee that
+ * the page is not uncharged while this lock is held.
+ */
+static inline void lock_page_cgroup_migrate(struct page_cgroup *pc)
+{
+	bit_spin_lock(PCG_MIGRATE_LOCK, &pc->flags);
+}
+
+static inline void unlock_page_cgroup_migrate(struct page_cgroup *pc)
+{
+	bit_spin_unlock(PCG_MIGRATE_LOCK, &pc->flags);
+}
+
 #else /* CONFIG_CGROUP_MEM_RES_CTLR */
 struct page_cgroup;
 
-- 
1.6.3.3


^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure
  2010-03-07 20:57 [PATCH -mmotm 0/4] memcg: per cgroup dirty limit (v5) Andrea Righi
  2010-03-07 20:57 ` [PATCH -mmotm 1/4] memcg: dirty memory documentation Andrea Righi
  2010-03-07 20:57 ` [PATCH -mmotm 2/4] page_cgroup: introduce file cache flags Andrea Righi
@ 2010-03-07 20:57 ` Andrea Righi
  2010-03-08  1:44   ` Daisuke Nishimura
  2010-03-07 20:57 ` [PATCH -mmotm 4/4] memcg: dirty pages instrumentation Andrea Righi
  3 siblings, 1 reply; 41+ messages in thread
From: Andrea Righi @ 2010-03-07 20:57 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki, Balbir Singh, Daisuke Nishimura
  Cc: Vivek Goyal, Peter Zijlstra, Trond Myklebust, Suleiman Souhlal,
	Greg Thelen, Kirill A. Shutemov, Andrew Morton, containers,
	linux-kernel, linux-mm, Andrea Righi

Infrastructure to account dirty pages per cgroup and to add the dirty limit
interfaces to the cgroupfs:

 - Direct write-out: memory.dirty_ratio, memory.dirty_bytes

 - Background write-out: memory.dirty_background_ratio, memory.dirty_background_bytes

Signed-off-by: Andrea Righi <arighi@develer.com>
---
 include/linux/memcontrol.h |  122 +++++++++++-
 mm/memcontrol.c            |  507 +++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 593 insertions(+), 36 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 44301c6..61fdca4 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -19,12 +19,55 @@
 
 #ifndef _LINUX_MEMCONTROL_H
 #define _LINUX_MEMCONTROL_H
+
+#include <linux/writeback.h>
 #include <linux/cgroup.h>
+
 struct mem_cgroup;
 struct page_cgroup;
 struct page;
 struct mm_struct;
 
+/* Cgroup memory statistics items exported to the kernel */
+enum mem_cgroup_read_page_stat_item {
+	MEMCG_NR_DIRTYABLE_PAGES,
+	MEMCG_NR_RECLAIM_PAGES,
+	MEMCG_NR_WRITEBACK,
+	MEMCG_NR_DIRTY_WRITEBACK_PAGES,
+};
+
+/* File cache pages accounting */
+enum mem_cgroup_write_page_stat_item {
+	MEMCG_NR_FILE_MAPPED,		/* # of pages charged as file rss */
+	MEMCG_NR_FILE_DIRTY,		/* # of dirty pages in page cache */
+	MEMCG_NR_FILE_WRITEBACK,	/* # of pages under writeback */
+	MEMCG_NR_FILE_WRITEBACK_TEMP,	/* # of pages under writeback using
+					   temporary buffers */
+	MEMCG_NR_FILE_UNSTABLE_NFS,	/* # of NFS unstable pages */
+
+	MEMCG_NR_FILE_NSTAT,
+};
+
+/* Dirty memory parameters */
+struct vm_dirty_param {
+	int dirty_ratio;
+	int dirty_background_ratio;
+	unsigned long dirty_bytes;
+	unsigned long dirty_background_bytes;
+};
+
+/*
+ * TODO: provide a validation check routine. And retry if validation
+ * fails.
+ */
+static inline void get_global_vm_dirty_param(struct vm_dirty_param *param)
+{
+	param->dirty_ratio = vm_dirty_ratio;
+	param->dirty_bytes = vm_dirty_bytes;
+	param->dirty_background_ratio = dirty_background_ratio;
+	param->dirty_background_bytes = dirty_background_bytes;
+}
+
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 /*
  * All "charge" functions with gfp_mask should use GFP_KERNEL or
@@ -117,6 +160,40 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
 extern int do_swap_account;
 #endif
 
+extern bool mem_cgroup_has_dirty_limit(void);
+extern void get_vm_dirty_param(struct vm_dirty_param *param);
+extern s64 mem_cgroup_page_stat(enum mem_cgroup_read_page_stat_item item);
+
+extern void mem_cgroup_update_page_stat_locked(struct page *page,
+			enum mem_cgroup_write_page_stat_item idx, bool charge);
+
+extern void mem_cgroup_update_page_stat_unlocked(struct page *page,
+			enum mem_cgroup_write_page_stat_item idx, bool charge);
+
+static inline void mem_cgroup_inc_page_stat_locked(struct page *page,
+		enum mem_cgroup_write_page_stat_item idx)
+{
+	mem_cgroup_update_page_stat_locked(page, idx, true);
+}
+
+static inline void mem_cgroup_dec_page_stat_locked(struct page *page,
+		enum mem_cgroup_write_page_stat_item idx)
+{
+	mem_cgroup_update_page_stat_locked(page, idx, false);
+}
+
+static inline void mem_cgroup_inc_page_stat_unlocked(struct page *page,
+		enum mem_cgroup_write_page_stat_item idx)
+{
+	mem_cgroup_update_page_stat_unlocked(page, idx, true);
+}
+
+static inline void mem_cgroup_dec_page_stat_unlocked(struct page *page,
+		enum mem_cgroup_write_page_stat_item idx)
+{
+	mem_cgroup_update_page_stat_unlocked(page, idx, false);
+}
+
 static inline bool mem_cgroup_disabled(void)
 {
 	if (mem_cgroup_subsys.disabled)
@@ -124,7 +201,6 @@ static inline bool mem_cgroup_disabled(void)
 	return false;
 }
 
-void mem_cgroup_update_file_mapped(struct page *page, int val);
 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 						gfp_t gfp_mask, int nid,
 						int zid);
@@ -294,8 +370,38 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 {
 }
 
-static inline void mem_cgroup_update_file_mapped(struct page *page,
-							int val)
+static inline s64 mem_cgroup_page_stat(enum mem_cgroup_read_page_stat_item item)
+{
+	return -ENOSYS;
+}
+
+static inline void mem_cgroup_update_page_stat_locked(struct page *page,
+			enum mem_cgroup_write_page_stat_item idx, bool charge)
+{
+}
+
+static inline void mem_cgroup_update_page_stat_unlocked(struct page *page,
+			enum mem_cgroup_write_page_stat_item idx, bool charge)
+{
+}
+
+static inline void mem_cgroup_inc_page_stat_locked(struct page *page,
+			enum mem_cgroup_write_page_stat_item idx)
+{
+}
+
+static inline void mem_cgroup_dec_page_stat_locked(struct page *page,
+			enum mem_cgroup_write_page_stat_item idx)
+{
+}
+
+static inline void mem_cgroup_inc_page_stat_unlocked(struct page *page,
+			enum mem_cgroup_write_page_stat_item idx)
+{
+}
+
+static inline void mem_cgroup_dec_page_stat_unlocked(struct page *page,
+			enum mem_cgroup_write_page_stat_item idx)
 {
 }
 
@@ -306,6 +412,16 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 	return 0;
 }
 
+static inline bool mem_cgroup_has_dirty_limit(void)
+{
+	return false;
+}
+
+static inline void get_vm_dirty_param(struct vm_dirty_param *param)
+{
+	get_global_vm_dirty_param(param);
+}
+
 #endif /* CONFIG_CGROUP_MEM_CONT */
 
 #endif /* _LINUX_MEMCONTROL_H */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7fab84e..ac38549 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -80,14 +80,21 @@ enum mem_cgroup_stat_index {
 	/*
 	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
 	 */
-	MEM_CGROUP_STAT_CACHE, 	   /* # of pages charged as cache */
+	MEM_CGROUP_STAT_CACHE,	   /* # of pages charged as cache */
 	MEM_CGROUP_STAT_RSS,	   /* # of pages charged as anon rss */
-	MEM_CGROUP_STAT_FILE_MAPPED,  /* # of pages charged as file rss */
 	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
 	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
 	MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
 	MEM_CGROUP_EVENTS,	/* incremented at every  pagein/pageout */
 
+	/* File cache pages accounting */
+	MEM_CGROUP_STAT_FILE_MAPPED,  /* # of pages charged as file rss */
+	MEM_CGROUP_STAT_FILE_DIRTY,   /* # of dirty pages in page cache */
+	MEM_CGROUP_STAT_WRITEBACK,   /* # of pages under writeback */
+	MEM_CGROUP_STAT_WRITEBACK_TEMP,   /* # of pages under writeback using
+						temporary buffers */
+	MEM_CGROUP_STAT_UNSTABLE_NFS,   /* # of NFS unstable pages */
+
 	MEM_CGROUP_STAT_NSTATS,
 };
 
@@ -95,6 +102,19 @@ struct mem_cgroup_stat_cpu {
 	s64 count[MEM_CGROUP_STAT_NSTATS];
 };
 
+/* Per cgroup page statistics */
+struct mem_cgroup_page_stat {
+	enum mem_cgroup_read_page_stat_item item;
+	s64 value;
+};
+
+enum {
+	MEM_CGROUP_DIRTY_RATIO,
+	MEM_CGROUP_DIRTY_BYTES,
+	MEM_CGROUP_DIRTY_BACKGROUND_RATIO,
+	MEM_CGROUP_DIRTY_BACKGROUND_BYTES,
+};
+
 /*
  * per-zone information in memory controller.
  */
@@ -208,6 +228,9 @@ struct mem_cgroup {
 
 	unsigned int	swappiness;
 
+	/* control memory cgroup dirty pages */
+	struct vm_dirty_param dirty_param;
+
 	/* set when res.limit == memsw.limit */
 	bool		memsw_is_minimum;
 
@@ -1033,6 +1056,157 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
 	return swappiness;
 }
 
+static bool dirty_param_is_valid(struct vm_dirty_param *param)
+{
+	if (param->dirty_ratio && param->dirty_bytes)
+		return false;
+	if (param->dirty_background_ratio && param->dirty_background_bytes)
+		return false;
+	return true;
+}
+
+static void __mem_cgroup_get_dirty_param(struct vm_dirty_param *param,
+				struct mem_cgroup *mem)
+{
+	param->dirty_ratio = mem->dirty_param.dirty_ratio;
+	param->dirty_bytes = mem->dirty_param.dirty_bytes;
+	param->dirty_background_ratio = mem->dirty_param.dirty_background_ratio;
+	param->dirty_background_bytes = mem->dirty_param.dirty_background_bytes;
+}
+
+/*
+ * get_vm_dirty_param() - get dirty memory parameters of the current memcg
+ * @param:	a structure that is filled with the dirty memory settings
+ *
+ * The function fills @param with the current memcg dirty memory settings. If
+ * the memory cgroup is disabled, or in case of error, the structure is filled
+ * with the global dirty memory settings.
+ */
+void get_vm_dirty_param(struct vm_dirty_param *param)
+{
+	struct mem_cgroup *memcg;
+
+	if (mem_cgroup_disabled()) {
+		get_global_vm_dirty_param(param);
+		return;
+	}
+	/*
+	 * It's possible that "current" may be moved to another cgroup while we
+	 * access its cgroup. But a precise check is meaningless, because the
+	 * task can be moved right after our access and writeback tends to take
+	 * a long time. At least, "memcg" will not be freed under rcu_read_lock().
+	 */
+	while (1) {
+		rcu_read_lock();
+		memcg = mem_cgroup_from_task(current);
+		if (likely(memcg))
+			__mem_cgroup_get_dirty_param(param, memcg);
+		else
+			get_global_vm_dirty_param(param);
+		rcu_read_unlock();
+		/*
+		 * Since global and memcg vm_dirty_param are not protected we
+		 * try to speculatively read them and retry if we get
+		 * inconsistent values.
+		 */
+		if (likely(dirty_param_is_valid(param)))
+			break;
+	}
+}
+
+static inline bool mem_cgroup_can_swap(struct mem_cgroup *memcg)
+{
+	if (!do_swap_account)
+		return nr_swap_pages > 0;
+	return !memcg->memsw_is_minimum &&
+		(res_counter_read_u64(&memcg->memsw, RES_LIMIT) > 0);
+}
+
+static s64 mem_cgroup_get_local_page_stat(struct mem_cgroup *memcg,
+				enum mem_cgroup_read_page_stat_item item)
+{
+	s64 ret;
+
+	switch (item) {
+	case MEMCG_NR_DIRTYABLE_PAGES:
+		ret = res_counter_read_u64(&memcg->res, RES_LIMIT) -
+			res_counter_read_u64(&memcg->res, RES_USAGE);
+		/* Translate free memory into pages */
+		ret >>= PAGE_SHIFT;
+		ret += mem_cgroup_read_stat(memcg, LRU_ACTIVE_FILE) +
+			mem_cgroup_read_stat(memcg, LRU_INACTIVE_FILE);
+		if (mem_cgroup_can_swap(memcg))
+			ret += mem_cgroup_read_stat(memcg, LRU_ACTIVE_ANON) +
+				mem_cgroup_read_stat(memcg, LRU_INACTIVE_ANON);
+		break;
+	case MEMCG_NR_RECLAIM_PAGES:
+		ret = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_FILE_DIRTY) +
+			mem_cgroup_read_stat(memcg,
+					MEM_CGROUP_STAT_UNSTABLE_NFS);
+		break;
+	case MEMCG_NR_WRITEBACK:
+		ret = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
+		break;
+	case MEMCG_NR_DIRTY_WRITEBACK_PAGES:
+		ret = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK) +
+			mem_cgroup_read_stat(memcg,
+				MEM_CGROUP_STAT_UNSTABLE_NFS);
+		break;
+	default:
+		BUG();
+		break;
+	}
+	return ret;
+}
+
+/*
+ * mem_cgroup_has_dirty_limit() - check if current memcg has local dirty limits
+ *
+ * Return true if the current memory cgroup has local dirty memory settings,
+ * false otherwise.
+ */
+bool mem_cgroup_has_dirty_limit(void)
+{
+	if (mem_cgroup_disabled())
+		return false;
+	return mem_cgroup_from_task(current) != NULL;
+}
+
+static int mem_cgroup_page_stat_cb(struct mem_cgroup *mem, void *data)
+{
+	struct mem_cgroup_page_stat *stat = (struct mem_cgroup_page_stat *)data;
+
+	stat->value += mem_cgroup_get_local_page_stat(mem, stat->item);
+	return 0;
+}
+
+/*
+ * mem_cgroup_page_stat() - get memory cgroup file cache statistics
+ * @item:	memory statistic item exported to the kernel
+ *
+ * Return the accounted statistic value, or a negative value in case of error.
+ */
+s64 mem_cgroup_page_stat(enum mem_cgroup_read_page_stat_item item)
+{
+	struct mem_cgroup_page_stat stat = {};
+	struct mem_cgroup *memcg;
+
+	rcu_read_lock();
+	memcg = mem_cgroup_from_task(current);
+	if (memcg) {
+		/*
+		 * Recursively evaluate page statistics for all cgroups
+		 * in the hierarchy tree.
+		 */
+		stat.item = item;
+		mem_cgroup_walk_tree(memcg, &stat, mem_cgroup_page_stat_cb);
+	} else
+		stat.value = -EINVAL;
+	rcu_read_unlock();
+
+	return stat.value;
+}
+
 static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
 {
 	int *val = data;
@@ -1345,34 +1519,160 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
 }
 
 /*
- * Currently used to update mapped file statistics, but the routine can be
- * generalized to update other statistics as well.
+ * Update memcg page cache statistics with lock_page_cgroup() or the
+ * migration lock held.
  */
-void mem_cgroup_update_file_mapped(struct page *page, int val)
+static void __mem_cgroup_update_page_stat(struct page_cgroup *pc,
+			enum mem_cgroup_write_page_stat_item idx, bool charge)
 {
-	struct mem_cgroup *mem;
-	struct page_cgroup *pc;
+	struct mem_cgroup *mem = pc->mem_cgroup;
 
-	pc = lookup_page_cgroup(page);
-	if (unlikely(!pc))
-		return;
+	/*
+	 * Set the appropriate page_cgroup flags and translate the public
+	 * mem_cgroup_write_page_stat_item into the local mem_cgroup_stat_index.
+	 *
+	 * In this way only a restricted subset of memcg flags is exposed to
+	 * the rest of the kernel.
+	 */
+	switch (idx) {
+	case MEMCG_NR_FILE_MAPPED:
+		if (charge)
+			SetPageCgroupFileMapped(pc);
+		else
+			ClearPageCgroupFileMapped(pc);
+		idx = MEM_CGROUP_STAT_FILE_MAPPED;
+		break;
+	case MEMCG_NR_FILE_DIRTY:
+		if (charge)
+			SetPageCgroupDirty(pc);
+		else
+			ClearPageCgroupDirty(pc);
+		idx = MEM_CGROUP_STAT_FILE_DIRTY;
+		break;
+	case MEMCG_NR_FILE_WRITEBACK:
+		if (charge)
+			SetPageCgroupWriteback(pc);
+		else
+			ClearPageCgroupWriteback(pc);
+		idx = MEM_CGROUP_STAT_WRITEBACK;
+		break;
+	case MEMCG_NR_FILE_WRITEBACK_TEMP:
+		if (charge)
+			SetPageCgroupWritebackTemp(pc);
+		else
+			ClearPageCgroupWritebackTemp(pc);
+		idx = MEM_CGROUP_STAT_WRITEBACK_TEMP;
+		break;
+	case MEMCG_NR_FILE_UNSTABLE_NFS:
+		if (charge)
+			SetPageCgroupUnstableNFS(pc);
+		else
+			ClearPageCgroupUnstableNFS(pc);
+		idx = MEM_CGROUP_STAT_UNSTABLE_NFS;
+		break;
+	default:
+		BUG();
+		break;
+	}
+	__this_cpu_add(mem->stat->count[idx], charge ? 1 : -1);
+}
 
-	lock_page_cgroup(pc);
-	mem = pc->mem_cgroup;
-	if (!mem)
-		goto done;
+/*
+ * Update memcg page cache statistics with mapping->tree_lock held.
+ */
+static void __mem_cgroup_update_page_stat_migration_lock(struct page_cgroup *pc,
+			enum mem_cgroup_write_page_stat_item idx, bool charge)
+{
+	unsigned long flags;
 
-	if (!PageCgroupUsed(pc))
-		goto done;
+	/*
+	 * This function can be called from IRQ context, so the page statistics
+	 * must be updated with IRQs disabled.
+	 */
+	local_irq_save(flags);
+	lock_page_cgroup_migrate(pc);
 
+	__mem_cgroup_update_page_stat(pc, idx, charge);
+
+	unlock_page_cgroup_migrate(pc);
+	local_irq_restore(flags);
+}
+
+static void mem_cgroup_update_page_stat(struct page_cgroup *pc,
+			enum mem_cgroup_write_page_stat_item idx, bool charge)
+{
+	struct mem_cgroup *mem = pc->mem_cgroup;
+
+	if (unlikely(mem == NULL))
+		return;
 	/*
-	 * Preemption is already disabled. We can use __this_cpu_xxx
+	 * Distinguish between statistics that are _never_ updated with
+	 * mem_cgroup_update_page_stat_locked() and statistics that can be
+	 * updated under mapping->tree_lock. In the first case there's no need
+	 * to acquire the migration lock.
+	 *
+	 * NOTE: at the moment, MEMCG_NR_FILE_DIRTY is the only statistic that
+	 * can be updated with mapping->tree_lock held.
 	 */
-	__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val);
+	if (idx == MEMCG_NR_FILE_DIRTY)
+		__mem_cgroup_update_page_stat_migration_lock(pc, idx, charge);
+	else
+		__mem_cgroup_update_page_stat(pc, idx, charge);
+}
 
-done:
+/*
+ * mem_cgroup_update_page_stat_locked() - update memcg file cache's accounting
+ * @page:	the page involved in a file cache operation.
+ * @idx:	the particular file cache statistic.
+ * @charge:	true to increment, false to decrement the statistic specified
+ *		by @idx.
+ *
+ * Update memory cgroup file cache's accounting from a locked context.
+ *
+ * NOTE: must be called with mapping->tree_lock held.
+ */
+void mem_cgroup_update_page_stat_locked(struct page *page,
+			enum mem_cgroup_write_page_stat_item idx, bool charge)
+{
+	struct address_space *mapping = page_mapping(page);
+	struct page_cgroup *pc;
+
+	if (mem_cgroup_disabled())
+		return;
+	WARN_ON_ONCE(!irqs_disabled());
+	WARN_ON_ONCE(mapping && !spin_is_locked(&mapping->tree_lock));
+
+	pc = lookup_page_cgroup(page);
+	if (unlikely(!pc) || !PageCgroupUsed(pc))
+		return;
+	mem_cgroup_update_page_stat(pc, idx, charge);
+}
+EXPORT_SYMBOL_GPL(mem_cgroup_update_page_stat_locked);
+
+/*
+ * mem_cgroup_update_page_stat_unlocked() - update memcg file cache's accounting
+ * @page:	the page involved in a file cache operation.
+ * @idx:	the particular file cache statistic.
+ * @charge:	true to increment, false to decrement the statistic specified
+ *		by @idx.
+ *
+ * Update memory cgroup file cache's accounting from an unlocked context.
+ */
+void mem_cgroup_update_page_stat_unlocked(struct page *page,
+			enum mem_cgroup_write_page_stat_item idx, bool charge)
+{
+	struct page_cgroup *pc;
+
+	if (mem_cgroup_disabled())
+		return;
+	pc = lookup_page_cgroup(page);
+	if (unlikely(!pc) || !PageCgroupUsed(pc))
+		return;
+	lock_page_cgroup(pc);
+	mem_cgroup_update_page_stat(pc, idx, charge);
 	unlock_page_cgroup(pc);
 }
+EXPORT_SYMBOL_GPL(mem_cgroup_update_page_stat_unlocked);
 
 /*
  * size of first charge trial. "32" comes from vmscan.c's magic value.
@@ -1781,6 +2081,39 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 	memcg_check_events(mem, pc->page);
 }
 
+/* Update file cache accounting statistics on task migration. */
+static void __mem_cgroup_update_file_stat(struct page_cgroup *pc,
+	struct mem_cgroup *from, struct mem_cgroup *to)
+{
+	struct page *page = pc->page;
+
+	if (!page_mapped(page) || PageAnon(page))
+		return;
+
+	if (PageCgroupFileMapped(pc)) {
+		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+	}
+	if (PageCgroupDirty(pc)) {
+		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_DIRTY]);
+		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_DIRTY]);
+	}
+	if (PageCgroupWriteback(pc)) {
+		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_WRITEBACK]);
+		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_WRITEBACK]);
+	}
+	if (PageCgroupWritebackTemp(pc)) {
+		__this_cpu_dec(
+			from->stat->count[MEM_CGROUP_STAT_WRITEBACK_TEMP]);
+		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_WRITEBACK_TEMP]);
+	}
+	if (PageCgroupUnstableNFS(pc)) {
+		__this_cpu_dec(
+			from->stat->count[MEM_CGROUP_STAT_UNSTABLE_NFS]);
+		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_UNSTABLE_NFS]);
+	}
+}
+
 /**
  * __mem_cgroup_move_account - move account of the page
  * @pc:	page_cgroup of the page.
@@ -1801,7 +2134,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 static void __mem_cgroup_move_account(struct page_cgroup *pc,
 	struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
 {
-	struct page *page;
+	unsigned long flags;
 
 	VM_BUG_ON(from == to);
 	VM_BUG_ON(PageLRU(pc->page));
@@ -1809,14 +2142,10 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
 	VM_BUG_ON(!PageCgroupUsed(pc));
 	VM_BUG_ON(pc->mem_cgroup != from);
 
-	page = pc->page;
-	if (page_mapped(page) && !PageAnon(page)) {
-		/* Update mapped_file data for mem_cgroup */
-		preempt_disable();
-		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
-		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
-		preempt_enable();
-	}
+	local_irq_save(flags);
+	lock_page_cgroup_migrate(pc);
+	__mem_cgroup_update_file_stat(pc, from, to);
+
 	mem_cgroup_charge_statistics(from, pc, false);
 	if (uncharge)
 		/* This is not "cancel", but cancel_charge does all we need. */
@@ -1825,6 +2154,8 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
 	/* caller should have done css_get */
 	pc->mem_cgroup = to;
 	mem_cgroup_charge_statistics(to, pc, true);
+	unlock_page_cgroup_migrate(pc);
+	local_irq_restore(flags);
 	/*
 	 * We charges against "to" which may not have any tasks. Then, "to"
 	 * can be under rmdir(). But in current implementation, caller of
@@ -3118,10 +3449,14 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
 enum {
 	MCS_CACHE,
 	MCS_RSS,
-	MCS_FILE_MAPPED,
 	MCS_PGPGIN,
 	MCS_PGPGOUT,
 	MCS_SWAP,
+	MCS_FILE_MAPPED,
+	MCS_FILE_DIRTY,
+	MCS_WRITEBACK,
+	MCS_WRITEBACK_TEMP,
+	MCS_UNSTABLE_NFS,
 	MCS_INACTIVE_ANON,
 	MCS_ACTIVE_ANON,
 	MCS_INACTIVE_FILE,
@@ -3140,10 +3475,14 @@ struct {
 } memcg_stat_strings[NR_MCS_STAT] = {
 	{"cache", "total_cache"},
 	{"rss", "total_rss"},
-	{"mapped_file", "total_mapped_file"},
 	{"pgpgin", "total_pgpgin"},
 	{"pgpgout", "total_pgpgout"},
 	{"swap", "total_swap"},
+	{"mapped_file", "total_mapped_file"},
+	{"filedirty", "dirty_pages"},
+	{"writeback", "writeback_pages"},
+	{"writeback_tmp", "writeback_temp_pages"},
+	{"nfs", "nfs_unstable"},
 	{"inactive_anon", "total_inactive_anon"},
 	{"active_anon", "total_active_anon"},
 	{"inactive_file", "total_inactive_file"},
@@ -3162,8 +3501,6 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
 	s->stat[MCS_CACHE] += val * PAGE_SIZE;
 	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
 	s->stat[MCS_RSS] += val * PAGE_SIZE;
-	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
-	s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
 	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT);
 	s->stat[MCS_PGPGIN] += val;
 	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT);
@@ -3172,6 +3509,16 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
 		val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
 		s->stat[MCS_SWAP] += val * PAGE_SIZE;
 	}
+	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
+	s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
+	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_DIRTY);
+	s->stat[MCS_FILE_DIRTY] += val;
+	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_WRITEBACK);
+	s->stat[MCS_WRITEBACK] += val;
+	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_WRITEBACK_TEMP);
+	s->stat[MCS_WRITEBACK_TEMP] += val;
+	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_UNSTABLE_NFS);
+	s->stat[MCS_UNSTABLE_NFS] += val;
 
 	/* per zone stat */
 	val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
@@ -3533,6 +3880,63 @@ unlock:
 	return ret;
 }
 
+static u64 mem_cgroup_dirty_read(struct cgroup *cgrp, struct cftype *cft)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+
+	switch (cft->private) {
+	case MEM_CGROUP_DIRTY_RATIO:
+		return memcg->dirty_param.dirty_ratio;
+	case MEM_CGROUP_DIRTY_BYTES:
+		return memcg->dirty_param.dirty_bytes;
+	case MEM_CGROUP_DIRTY_BACKGROUND_RATIO:
+		return memcg->dirty_param.dirty_background_ratio;
+	case MEM_CGROUP_DIRTY_BACKGROUND_BYTES:
+		return memcg->dirty_param.dirty_background_bytes;
+	default:
+		BUG();
+	}
+}
+
+static int
+mem_cgroup_dirty_write(struct cgroup *cgrp, struct cftype *cft, u64 val)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+	int type = cft->private;
+
+	if (cgrp->parent == NULL)
+		return -EINVAL;
+	if ((type == MEM_CGROUP_DIRTY_RATIO ||
+		type == MEM_CGROUP_DIRTY_BACKGROUND_RATIO) && val > 100)
+		return -EINVAL;
+	/*
+	 * TODO: provide a validation check routine. And retry if validation
+	 * fails.
+	 */
+	switch (type) {
+	case MEM_CGROUP_DIRTY_RATIO:
+		memcg->dirty_param.dirty_ratio = val;
+		memcg->dirty_param.dirty_bytes = 0;
+		break;
+	case MEM_CGROUP_DIRTY_BYTES:
+		memcg->dirty_param.dirty_ratio  = 0;
+		memcg->dirty_param.dirty_bytes = val;
+		break;
+	case MEM_CGROUP_DIRTY_BACKGROUND_RATIO:
+		memcg->dirty_param.dirty_background_ratio = val;
+		memcg->dirty_param.dirty_background_bytes = 0;
+		break;
+	case MEM_CGROUP_DIRTY_BACKGROUND_BYTES:
+		memcg->dirty_param.dirty_background_ratio = 0;
+		memcg->dirty_param.dirty_background_bytes = val;
+		break;
+	default:
+		BUG();
+		break;
+	}
+	return 0;
+}
+
 static struct cftype mem_cgroup_files[] = {
 	{
 		.name = "usage_in_bytes",
@@ -3584,6 +3988,30 @@ static struct cftype mem_cgroup_files[] = {
 		.write_u64 = mem_cgroup_swappiness_write,
 	},
 	{
+		.name = "dirty_ratio",
+		.read_u64 = mem_cgroup_dirty_read,
+		.write_u64 = mem_cgroup_dirty_write,
+		.private = MEM_CGROUP_DIRTY_RATIO,
+	},
+	{
+		.name = "dirty_bytes",
+		.read_u64 = mem_cgroup_dirty_read,
+		.write_u64 = mem_cgroup_dirty_write,
+		.private = MEM_CGROUP_DIRTY_BYTES,
+	},
+	{
+		.name = "dirty_background_ratio",
+		.read_u64 = mem_cgroup_dirty_read,
+		.write_u64 = mem_cgroup_dirty_write,
+		.private = MEM_CGROUP_DIRTY_BACKGROUND_RATIO,
+	},
+	{
+		.name = "dirty_background_bytes",
+		.read_u64 = mem_cgroup_dirty_read,
+		.write_u64 = mem_cgroup_dirty_write,
+		.private = MEM_CGROUP_DIRTY_BACKGROUND_BYTES,
+	},
+	{
 		.name = "move_charge_at_immigrate",
 		.read_u64 = mem_cgroup_move_charge_read,
 		.write_u64 = mem_cgroup_move_charge_write,
@@ -3842,8 +4270,21 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	mem->last_scanned_child = 0;
 	spin_lock_init(&mem->reclaim_param_lock);
 
-	if (parent)
+	if (parent) {
 		mem->swappiness = get_swappiness(parent);
+		mem->dirty_param = parent->dirty_param;
+	} else {
+		while (1) {
+			get_global_vm_dirty_param(&mem->dirty_param);
+			/*
+			 * Since global dirty parameters are not protected we
+			 * try to speculatively read them and retry if we get
+			 * inconsistent values.
+			 */
+			if (likely(dirty_param_is_valid(&mem->dirty_param)))
+				break;
+		}
+	}
 	atomic_set(&mem->refcnt, 1);
 	mem->move_charge_at_immigrate = 0;
 	mutex_init(&mem->thresholds_lock);
-- 
1.6.3.3


^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [PATCH -mmotm 4/4] memcg: dirty pages instrumentation
  2010-03-07 20:57 [PATCH -mmotm 0/4] memcg: per cgroup dirty limit (v5) Andrea Righi
                   ` (2 preceding siblings ...)
  2010-03-07 20:57 ` [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure Andrea Righi
@ 2010-03-07 20:57 ` Andrea Righi
  2010-03-08  2:31   ` KAMEZAWA Hiroyuki
  3 siblings, 1 reply; 41+ messages in thread
From: Andrea Righi @ 2010-03-07 20:57 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki, Balbir Singh, Daisuke Nishimura
  Cc: Vivek Goyal, Peter Zijlstra, Trond Myklebust, Suleiman Souhlal,
	Greg Thelen, Kirill A. Shutemov, Andrew Morton, containers,
	linux-kernel, linux-mm, Andrea Righi

Apply the cgroup dirty pages accounting and limiting infrastructure to
the relevant kernel functions.

As a bonus, make determine_dirtyable_memory() static again: this
function isn't used anymore outside page writeback.

Signed-off-by: Andrea Righi <arighi@develer.com>
---
 fs/fuse/file.c            |    5 +
 fs/nfs/write.c            |    6 +
 fs/nilfs2/segment.c       |   11 ++-
 include/linux/writeback.h |    2 -
 mm/filemap.c              |    1 +
 mm/page-writeback.c       |  224 ++++++++++++++++++++++++++++-----------------
 mm/rmap.c                 |    4 +-
 mm/truncate.c             |    2 +
 8 files changed, 165 insertions(+), 90 deletions(-)

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index a9f5e13..9a542e5 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -11,6 +11,7 @@
 #include <linux/pagemap.h>
 #include <linux/slab.h>
 #include <linux/kernel.h>
+#include <linux/memcontrol.h>
 #include <linux/sched.h>
 #include <linux/module.h>
 
@@ -1129,6 +1130,8 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
 
 	list_del(&req->writepages_entry);
 	dec_bdi_stat(bdi, BDI_WRITEBACK);
+	mem_cgroup_dec_page_stat_unlocked(req->pages[0],
+			MEMCG_NR_FILE_WRITEBACK_TEMP);
 	dec_zone_page_state(req->pages[0], NR_WRITEBACK_TEMP);
 	bdi_writeout_inc(bdi);
 	wake_up(&fi->page_waitq);
@@ -1240,6 +1243,8 @@ static int fuse_writepage_locked(struct page *page)
 	req->inode = inode;
 
 	inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK);
+	mem_cgroup_inc_page_stat_unlocked(tmp_page,
+			MEMCG_NR_FILE_WRITEBACK_TEMP);
 	inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
 	end_page_writeback(page);
 
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 53ff70e..a35e3c0 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -440,6 +440,8 @@ nfs_mark_request_commit(struct nfs_page *req)
 			NFS_PAGE_TAG_COMMIT);
 	nfsi->ncommit++;
 	spin_unlock(&inode->i_lock);
+	mem_cgroup_inc_page_stat_unlocked(req->wb_page,
+			MEMCG_NR_FILE_UNSTABLE_NFS);
 	inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
 	inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
 	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
@@ -451,6 +453,8 @@ nfs_clear_request_commit(struct nfs_page *req)
 	struct page *page = req->wb_page;
 
 	if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) {
+		mem_cgroup_dec_page_stat_unlocked(page,
+				MEMCG_NR_FILE_UNSTABLE_NFS);
 		dec_zone_page_state(page, NR_UNSTABLE_NFS);
 		dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE);
 		return 1;
@@ -1277,6 +1281,8 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how)
 		req = nfs_list_entry(head->next);
 		nfs_list_remove_request(req);
 		nfs_mark_request_commit(req);
+		mem_cgroup_dec_page_stat_unlocked(req->wb_page,
+				MEMCG_NR_FILE_UNSTABLE_NFS);
 		dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
 		dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
 				BDI_RECLAIMABLE);
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index ada2f1b..fb79558 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -24,6 +24,7 @@
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
+#include <linux/memcontrol.h>
 #include <linux/bio.h>
 #include <linux/completion.h>
 #include <linux/blkdev.h>
@@ -1660,8 +1661,11 @@ nilfs_copy_replace_page_buffers(struct page *page, struct list_head *out)
 	} while (bh = bh->b_this_page, bh2 = bh2->b_this_page, bh != head);
 	kunmap_atomic(kaddr, KM_USER0);
 
-	if (!TestSetPageWriteback(clone_page))
+	if (!TestSetPageWriteback(clone_page)) {
+		mem_cgroup_inc_page_stat_unlocked(clone_page,
+				MEMCG_NR_FILE_WRITEBACK);
 		inc_zone_page_state(clone_page, NR_WRITEBACK);
+	}
 	unlock_page(clone_page);
 
 	return 0;
@@ -1783,8 +1787,11 @@ static void __nilfs_end_page_io(struct page *page, int err)
 	}
 
 	if (buffer_nilfs_allocated(page_buffers(page))) {
-		if (TestClearPageWriteback(page))
+		if (TestClearPageWriteback(page)) {
+			mem_cgroup_dec_page_stat_unlocked(page,
+					MEMCG_NR_FILE_WRITEBACK);
 			dec_zone_page_state(page, NR_WRITEBACK);
+		}
 	} else
 		end_page_writeback(page);
 }
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index dd9512d..39e4cb2 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -117,8 +117,6 @@ extern int vm_highmem_is_dirtyable;
 extern int block_dump;
 extern int laptop_mode;
 
-extern unsigned long determine_dirtyable_memory(void);
-
 extern int dirty_background_ratio_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos);
diff --git a/mm/filemap.c b/mm/filemap.c
index 62cbac0..37f89d1 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -135,6 +135,7 @@ void __remove_from_page_cache(struct page *page)
 	 * having removed the page entirely.
 	 */
 	if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
+		mem_cgroup_dec_page_stat_locked(page, MEMCG_NR_FILE_DIRTY);
 		dec_zone_page_state(page, NR_FILE_DIRTY);
 		dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
 	}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index ab84693..9d4503a 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -131,6 +131,111 @@ static struct prop_descriptor vm_completions;
 static struct prop_descriptor vm_dirties;
 
 /*
+ * Work out the current dirty-memory clamping and background writeout
+ * thresholds.
+ *
+ * The main aim here is to lower them aggressively if there is a lot of mapped
+ * memory around.  To avoid stressing page reclaim with lots of unreclaimable
+ * pages.  It is better to clamp down on writers than to start swapping, and
+ * performing lots of scanning.
+ *
+ * We only allow 1/2 of the currently-unmapped memory to be dirtied.
+ *
+ * We don't permit the clamping level to fall below 5% - that is getting rather
+ * excessive.
+ *
+ * We make sure that the background writeout level is below the adjusted
+ * clamping level.
+ */
+
+static unsigned long highmem_dirtyable_memory(unsigned long total)
+{
+#ifdef CONFIG_HIGHMEM
+	int node;
+	unsigned long x = 0;
+
+	for_each_node_state(node, N_HIGH_MEMORY) {
+		struct zone *z =
+			&NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
+
+		x += zone_page_state(z, NR_FREE_PAGES) +
+		     zone_reclaimable_pages(z);
+	}
+	/*
+	 * Make sure that the number of highmem pages is never larger
+	 * than the number of the total dirtyable memory. This can only
+	 * occur in very strange VM situations but we want to make sure
+	 * that this does not occur.
+	 */
+	return min(x, total);
+#else
+	return 0;
+#endif
+}
+
+static unsigned long get_global_dirtyable_memory(void)
+{
+	unsigned long memory;
+
+	memory = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
+	if (!vm_highmem_is_dirtyable)
+		memory -= highmem_dirtyable_memory(memory);
+	return memory + 1;
+}
+
+static unsigned long get_dirtyable_memory(void)
+{
+	unsigned long memory;
+	s64 memcg_memory;
+
+	memory = get_global_dirtyable_memory();
+	if (!mem_cgroup_has_dirty_limit())
+		return memory;
+	memcg_memory = mem_cgroup_page_stat(MEMCG_NR_DIRTYABLE_PAGES);
+	BUG_ON(memcg_memory < 0);
+
+	return min((unsigned long)memcg_memory, memory);
+}
+
+static long get_reclaimable_pages(void)
+{
+	s64 ret;
+
+	if (!mem_cgroup_has_dirty_limit())
+		return global_page_state(NR_FILE_DIRTY) +
+			global_page_state(NR_UNSTABLE_NFS);
+	ret = mem_cgroup_page_stat(MEMCG_NR_RECLAIM_PAGES);
+	BUG_ON(ret < 0);
+
+	return ret;
+}
+
+static long get_writeback_pages(void)
+{
+	s64 ret;
+
+	if (!mem_cgroup_has_dirty_limit())
+		return global_page_state(NR_WRITEBACK);
+	ret = mem_cgroup_page_stat(MEMCG_NR_WRITEBACK);
+	BUG_ON(ret < 0);
+
+	return ret;
+}
+
+static unsigned long get_dirty_writeback_pages(void)
+{
+	s64 ret;
+
+	if (!mem_cgroup_has_dirty_limit())
+		return global_page_state(NR_UNSTABLE_NFS) +
+			global_page_state(NR_WRITEBACK);
+	ret = mem_cgroup_page_stat(MEMCG_NR_DIRTY_WRITEBACK_PAGES);
+	BUG_ON(ret < 0);
+
+	return ret;
+}
+
+/*
  * couple the period to the dirty_ratio:
  *
  *   period/2 ~ roundup_pow_of_two(dirty limit)
@@ -142,7 +247,7 @@ static int calc_period_shift(void)
 	if (vm_dirty_bytes)
 		dirty_total = vm_dirty_bytes / PAGE_SIZE;
 	else
-		dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
+		dirty_total = (vm_dirty_ratio * get_global_dirtyable_memory()) /
 				100;
 	return 2 + ilog2(dirty_total - 1);
 }
@@ -355,92 +460,34 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
 }
 EXPORT_SYMBOL(bdi_set_max_ratio);
 
-/*
- * Work out the current dirty-memory clamping and background writeout
- * thresholds.
- *
- * The main aim here is to lower them aggressively if there is a lot of mapped
- * memory around.  To avoid stressing page reclaim with lots of unreclaimable
- * pages.  It is better to clamp down on writers than to start swapping, and
- * performing lots of scanning.
- *
- * We only allow 1/2 of the currently-unmapped memory to be dirtied.
- *
- * We don't permit the clamping level to fall below 5% - that is getting rather
- * excessive.
- *
- * We make sure that the background writeout level is below the adjusted
- * clamping level.
- */
-
-static unsigned long highmem_dirtyable_memory(unsigned long total)
-{
-#ifdef CONFIG_HIGHMEM
-	int node;
-	unsigned long x = 0;
-
-	for_each_node_state(node, N_HIGH_MEMORY) {
-		struct zone *z =
-			&NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
-
-		x += zone_page_state(z, NR_FREE_PAGES) +
-		     zone_reclaimable_pages(z);
-	}
-	/*
-	 * Make sure that the number of highmem pages is never larger
-	 * than the number of the total dirtyable memory. This can only
-	 * occur in very strange VM situations but we want to make sure
-	 * that this does not occur.
-	 */
-	return min(x, total);
-#else
-	return 0;
-#endif
-}
-
-/**
- * determine_dirtyable_memory - amount of memory that may be used
- *
- * Returns the numebr of pages that can currently be freed and used
- * by the kernel for direct mappings.
- */
-unsigned long determine_dirtyable_memory(void)
-{
-	unsigned long x;
-
-	x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
-
-	if (!vm_highmem_is_dirtyable)
-		x -= highmem_dirtyable_memory(x);
-
-	return x + 1;	/* Ensure that we never return 0 */
-}
-
 void
 get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
 		 unsigned long *pbdi_dirty, struct backing_dev_info *bdi)
 {
-	unsigned long background;
-	unsigned long dirty;
-	unsigned long available_memory = determine_dirtyable_memory();
+	unsigned long dirty, background;
+	unsigned long available_memory = get_dirtyable_memory();
 	struct task_struct *tsk;
+	struct vm_dirty_param dirty_param;
 
-	if (vm_dirty_bytes)
-		dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
+	get_vm_dirty_param(&dirty_param);
+
+	if (dirty_param.dirty_bytes)
+		dirty = DIV_ROUND_UP(dirty_param.dirty_bytes, PAGE_SIZE);
 	else {
 		int dirty_ratio;
 
-		dirty_ratio = vm_dirty_ratio;
+		dirty_ratio = dirty_param.dirty_ratio;
 		if (dirty_ratio < 5)
 			dirty_ratio = 5;
 		dirty = (dirty_ratio * available_memory) / 100;
 	}
 
-	if (dirty_background_bytes)
-		background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
+	if (dirty_param.dirty_background_bytes)
+		background = DIV_ROUND_UP(dirty_param.dirty_background_bytes,
+						PAGE_SIZE);
 	else
-		background = (dirty_background_ratio * available_memory) / 100;
-
+		background = (dirty_param.dirty_background_ratio *
+						available_memory) / 100;
 	if (background >= dirty)
 		background = dirty / 2;
 	tsk = current;
@@ -505,9 +552,8 @@ static void balance_dirty_pages(struct address_space *mapping,
 		get_dirty_limits(&background_thresh, &dirty_thresh,
 				&bdi_thresh, bdi);
 
-		nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
-					global_page_state(NR_UNSTABLE_NFS);
-		nr_writeback = global_page_state(NR_WRITEBACK);
+		nr_reclaimable = get_reclaimable_pages();
+		nr_writeback = get_writeback_pages();
 
 		bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
 		bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
@@ -593,10 +639,9 @@ static void balance_dirty_pages(struct address_space *mapping,
 	 * In normal mode, we start background writeout at the lower
 	 * background_thresh, to keep the amount of dirty memory low.
 	 */
+	nr_reclaimable = get_reclaimable_pages();
 	if ((laptop_mode && pages_written) ||
-	    (!laptop_mode && ((global_page_state(NR_FILE_DIRTY)
-			       + global_page_state(NR_UNSTABLE_NFS))
-					  > background_thresh)))
+	    (!laptop_mode && (nr_reclaimable > background_thresh)))
 		bdi_start_writeback(bdi, NULL, 0);
 }
 
@@ -660,6 +705,8 @@ void throttle_vm_writeout(gfp_t gfp_mask)
 	unsigned long dirty_thresh;
 
         for ( ; ; ) {
+		unsigned long dirty;
+
 		get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
 
                 /*
@@ -668,10 +715,10 @@ void throttle_vm_writeout(gfp_t gfp_mask)
                  */
                 dirty_thresh += dirty_thresh / 10;      /* wheeee... */
 
-                if (global_page_state(NR_UNSTABLE_NFS) +
-			global_page_state(NR_WRITEBACK) <= dirty_thresh)
-                        	break;
-                congestion_wait(BLK_RW_ASYNC, HZ/10);
+		dirty = get_dirty_writeback_pages();
+		if (dirty <= dirty_thresh)
+			break;
+		congestion_wait(BLK_RW_ASYNC, HZ/10);
 
 		/*
 		 * The caller might hold locks which can prevent IO completion
@@ -1078,6 +1125,7 @@ int __set_page_dirty_no_writeback(struct page *page)
 void account_page_dirtied(struct page *page, struct address_space *mapping)
 {
 	if (mapping_cap_account_dirty(mapping)) {
+		mem_cgroup_inc_page_stat_locked(page, MEMCG_NR_FILE_DIRTY);
 		__inc_zone_page_state(page, NR_FILE_DIRTY);
 		__inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
 		task_dirty_inc(current);
@@ -1279,6 +1327,8 @@ int clear_page_dirty_for_io(struct page *page)
 		 * for more comments.
 		 */
 		if (TestClearPageDirty(page)) {
+			mem_cgroup_dec_page_stat_unlocked(page,
+					MEMCG_NR_FILE_DIRTY);
 			dec_zone_page_state(page, NR_FILE_DIRTY);
 			dec_bdi_stat(mapping->backing_dev_info,
 					BDI_RECLAIMABLE);
@@ -1314,8 +1364,11 @@ int test_clear_page_writeback(struct page *page)
 	} else {
 		ret = TestClearPageWriteback(page);
 	}
-	if (ret)
+	if (ret) {
+		mem_cgroup_dec_page_stat_unlocked(page,
+				MEMCG_NR_FILE_WRITEBACK);
 		dec_zone_page_state(page, NR_WRITEBACK);
+	}
 	return ret;
 }
 
@@ -1345,8 +1398,11 @@ int test_set_page_writeback(struct page *page)
 	} else {
 		ret = TestSetPageWriteback(page);
 	}
-	if (!ret)
+	if (!ret) {
+		mem_cgroup_inc_page_stat_unlocked(page,
+				MEMCG_NR_FILE_WRITEBACK);
 		inc_zone_page_state(page, NR_WRITEBACK);
+	}
 	return ret;
 
 }
diff --git a/mm/rmap.c b/mm/rmap.c
index fcd593c..61f07cc 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -828,8 +828,8 @@ void page_add_new_anon_rmap(struct page *page,
 void page_add_file_rmap(struct page *page)
 {
 	if (atomic_inc_and_test(&page->_mapcount)) {
+		mem_cgroup_inc_page_stat_unlocked(page, MEMCG_NR_FILE_MAPPED);
 		__inc_zone_page_state(page, NR_FILE_MAPPED);
-		mem_cgroup_update_file_mapped(page, 1);
 	}
 }
 
@@ -860,8 +860,8 @@ void page_remove_rmap(struct page *page)
 		mem_cgroup_uncharge_page(page);
 		__dec_zone_page_state(page, NR_ANON_PAGES);
 	} else {
+		mem_cgroup_dec_page_stat_unlocked(page, MEMCG_NR_FILE_MAPPED);
 		__dec_zone_page_state(page, NR_FILE_MAPPED);
-		mem_cgroup_update_file_mapped(page, -1);
 	}
 	/*
 	 * It would be tidy to reset the PageAnon mapping here,
diff --git a/mm/truncate.c b/mm/truncate.c
index e87e372..1613632 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -73,6 +73,8 @@ void cancel_dirty_page(struct page *page, unsigned int account_size)
 	if (TestClearPageDirty(page)) {
 		struct address_space *mapping = page->mapping;
 		if (mapping && mapping_cap_account_dirty(mapping)) {
+			mem_cgroup_dec_page_stat_unlocked(page,
+					MEMCG_NR_FILE_DIRTY);
 			dec_zone_page_state(page, NR_FILE_DIRTY);
 			dec_bdi_stat(mapping->backing_dev_info,
 					BDI_RECLAIMABLE);
-- 
1.6.3.3


^ permalink raw reply related	[flat|nested] 41+ messages in thread

* Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure
  2010-03-07 20:57 ` [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure Andrea Righi
@ 2010-03-08  1:44   ` Daisuke Nishimura
  2010-03-08  1:56     ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 41+ messages in thread
From: Daisuke Nishimura @ 2010-03-08  1:44 UTC (permalink / raw)
  To: Andrea Righi
  Cc: KAMEZAWA Hiroyuki, Balbir Singh, Vivek Goyal, Peter Zijlstra,
	Trond Myklebust, Suleiman Souhlal, Greg Thelen,
	Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
	linux-mm, Daisuke Nishimura

> +/*
> + * mem_cgroup_update_page_stat_locked() - update memcg file cache's accounting
> + * @page:	the page involved in a file cache operation.
> + * @idx:	the particular file cache statistic.
> + * @charge:	true to increment, false to decrement the statistic specified
> + *		by @idx.
> + *
> + * Update memory cgroup file cache's accounting from a locked context.
> + *
> + * NOTE: must be called with mapping->tree_lock held.
> + */
> +void mem_cgroup_update_page_stat_locked(struct page *page,
> +			enum mem_cgroup_write_page_stat_item idx, bool charge)
> +{
> +	struct address_space *mapping = page_mapping(page);
> +	struct page_cgroup *pc;
> +
> +	if (mem_cgroup_disabled())
> +		return;
> +	WARN_ON_ONCE(!irqs_disabled());
> +	WARN_ON_ONCE(mapping && !spin_is_locked(&mapping->tree_lock));
> +
I think this is the wrong place to insert the assertion.
The problem with the page cgroup lock is that it can be interrupted in the current implementation.
So,

a) it must not be acquired under another lock which can be acquired in interrupt context,
   such as mapping->tree_lock, to avoid:

		context1			context2
					lock_page_cgroup(pcA)
	spin_lock_irq(&tree_lock)
		lock_page_cgroup(pcA)		<interrupted>
		=>fail				spin_lock_irqsave(&tree_lock)
						=>fail

b) it must not be acquired in interrupt context to avoid:

	lock_page_cgroup(pcA)
		<interrupted>
		lock_page_cgroup(pcA)
		=>fail

I think something like this would be better:

@@ -83,8 +83,14 @@ static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc)
        return page_zonenum(pc->page);
 }

+#include <linux/irqflags.h>
+#include <linux/hardirq.h>
 static inline void lock_page_cgroup(struct page_cgroup *pc)
 {
+#ifdef CONFIG_DEBUG_VM
+       WARN_ON_ONCE(irqs_disabled());
+       WARN_ON_ONCE(in_interrupt());
+#endif
        bit_spin_lock(PCG_LOCK, &pc->flags);
 }

> +	pc = lookup_page_cgroup(page);
> +	if (unlikely(!pc) || !PageCgroupUsed(pc))
> +		return;
> +	mem_cgroup_update_page_stat(pc, idx, charge);
> +}
> +EXPORT_SYMBOL_GPL(mem_cgroup_update_page_stat_locked);
> +
> +/*
> + * mem_cgroup_update_page_stat_unlocked() - update memcg file cache's accounting
> + * @page:	the page involved in a file cache operation.
> + * @idx:	the particular file cache statistic.
> + * @charge:	true to increment, false to decrement the statistic specified
> + *		by @idx.
> + *
> + * Update memory cgroup file cache's accounting from an unlocked context.
> + */
> +void mem_cgroup_update_page_stat_unlocked(struct page *page,
> +			enum mem_cgroup_write_page_stat_item idx, bool charge)
> +{
> +	struct page_cgroup *pc;
> +
> +	if (mem_cgroup_disabled())
> +		return;
> +	pc = lookup_page_cgroup(page);
> +	if (unlikely(!pc) || !PageCgroupUsed(pc))
> +		return;
> +	lock_page_cgroup(pc);
> +	mem_cgroup_update_page_stat(pc, idx, charge);
>  	unlock_page_cgroup(pc);
>  }
> +EXPORT_SYMBOL_GPL(mem_cgroup_update_page_stat_unlocked);
>  
IIUC, test_clear_page_writeback() (at least) can be called from interrupt context.
This means lock_page_cgroup() is called from interrupt context, that is,
the case b) above can happen.
hmm... I don't have any good idea for now except disabling irqs around the
page cgroup lock to avoid all of this mess.
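
Just to illustrate what I mean (a rough sketch only, the *_irqsave/irqrestore
wrapper names below are made up and not part of this patch set):

	/* always take the page cgroup bit spinlock with irqs disabled */
	static inline void lock_page_cgroup_irqsave(struct page_cgroup *pc,
						    unsigned long *flags)
	{
		local_irq_save(*flags);
		bit_spin_lock(PCG_LOCK, &pc->flags);
	}

	static inline void unlock_page_cgroup_irqrestore(struct page_cgroup *pc,
							 unsigned long flags)
	{
		bit_spin_unlock(PCG_LOCK, &pc->flags);
		local_irq_restore(flags);
	}

With something like this the page cgroup lock could also be taken from
interrupt context and under mapping->tree_lock without hitting the deadlocks
a) and b) above, at the cost of keeping irqs disabled for the whole locked
section.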


Thanks,
Daisuke Nishimura.


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure
  2010-03-08  1:44   ` Daisuke Nishimura
@ 2010-03-08  1:56     ` KAMEZAWA Hiroyuki
  2010-03-08  2:17       ` Daisuke Nishimura
  0 siblings, 1 reply; 41+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-03-08  1:56 UTC (permalink / raw)
  To: Daisuke Nishimura
  Cc: Andrea Righi, Balbir Singh, Vivek Goyal, Peter Zijlstra,
	Trond Myklebust, Suleiman Souhlal, Greg Thelen,
	Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
	linux-mm

On Mon, 8 Mar 2010 10:44:47 +0900
Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:

> > +/*
> > + * mem_cgroup_update_page_stat_locked() - update memcg file cache's accounting
> > + * @page:	the page involved in a file cache operation.
> > + * @idx:	the particular file cache statistic.
> > + * @charge:	true to increment, false to decrement the statistic specified
> > + *		by @idx.
> > + *
> > + * Update memory cgroup file cache's accounting from a locked context.
> > + *
> > + * NOTE: must be called with mapping->tree_lock held.
> > + */
> > +void mem_cgroup_update_page_stat_locked(struct page *page,
> > +			enum mem_cgroup_write_page_stat_item idx, bool charge)
> > +{
> > +	struct address_space *mapping = page_mapping(page);
> > +	struct page_cgroup *pc;
> > +
> > +	if (mem_cgroup_disabled())
> > +		return;
> > +	WARN_ON_ONCE(!irqs_disabled());
> > +	WARN_ON_ONCE(mapping && !spin_is_locked(&mapping->tree_lock));
> > +
> I think this is a wrong place to insert assertion.
> The problem about page cgroup lock is that it can be interrupted in current implementation.
> So,
> 
> a) it must not be acquired under another lock which can be acquired in interrupt context,
>    such as mapping->tree_lock, to avoid:
> 
> 		context1			context2
> 					lock_page_cgroup(pcA)
> 	spin_lock_irq(&tree_lock)
> 		lock_page_cgroup(pcA)		<interrupted>
> 		=>fail				spin_lock_irqsave(&tree_lock)
> 						=>fail
> 
> b) it must not be acquired in interrupt context to avoid:
> 
> 	lock_page_cgroup(pcA)
> 		<interrupted>
> 		lock_page_cgroup(pcA)
> 		=>fail
> 
> I think something like this would be better:
> 
> @@ -83,8 +83,14 @@ static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc)
>         return page_zonenum(pc->page);
>  }
> 
> +#include <linux/irqflags.h>
> +#include <linux/hardirq.h>
>  static inline void lock_page_cgroup(struct page_cgroup *pc)
>  {
> +#ifdef CONFIG_DEBUG_VM
> +       WARN_ON_ONCE(irqs_disabled());
> +       WARN_ON_ONCE(in_interrupt());
> +#endif
>         bit_spin_lock(PCG_LOCK, &pc->flags);
>  }
> 
> > +	pc = lookup_page_cgroup(page);
> > +	if (unlikely(!pc) || !PageCgroupUsed(pc))
> > +		return;
> > +	mem_cgroup_update_page_stat(pc, idx, charge);
> > +}
> > +EXPORT_SYMBOL_GPL(mem_cgroup_update_page_stat_locked);
> > +
> > +/*
> > + * mem_cgroup_update_page_stat_unlocked() - update memcg file cache's accounting
> > + * @page:	the page involved in a file cache operation.
> > + * @idx:	the particular file cache statistic.
> > + * @charge:	true to increment, false to decrement the statistic specified
> > + *		by @idx.
> > + *
> > + * Update memory cgroup file cache's accounting from an unlocked context.
> > + */
> > +void mem_cgroup_update_page_stat_unlocked(struct page *page,
> > +			enum mem_cgroup_write_page_stat_item idx, bool charge)
> > +{
> > +	struct page_cgroup *pc;
> > +
> > +	if (mem_cgroup_disabled())
> > +		return;
> > +	pc = lookup_page_cgroup(page);
> > +	if (unlikely(!pc) || !PageCgroupUsed(pc))
> > +		return;
> > +	lock_page_cgroup(pc);
> > +	mem_cgroup_update_page_stat(pc, idx, charge);
> >  	unlock_page_cgroup(pc);
> >  }
> > +EXPORT_SYMBOL_GPL(mem_cgroup_update_page_stat_unlocked);
> >  
> IIUC, test_clear_page_writeback(at least) can be called under interrupt context.
> This means lock_page_cgroup() is called under interrupt context, that is,
> the case b) above can happen.
> hmm... I don't have any good idea for now except disabling irq around page cgroup lock
> to avoid all of these mess things.
> 

Hmm...simply IRQ-off for all updates ?
But IIRC, clear_writeback is done under treelock.... No ?

Thanks,
-Kame

> 
> Thanks,
> Daisuke Nishimura.
> 


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure
  2010-03-08  1:56     ` KAMEZAWA Hiroyuki
@ 2010-03-08  2:17       ` Daisuke Nishimura
  2010-03-08  2:37         ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 41+ messages in thread
From: Daisuke Nishimura @ 2010-03-08  2:17 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Andrea Righi, Balbir Singh, Vivek Goyal, Peter Zijlstra,
	Trond Myklebust, Suleiman Souhlal, Greg Thelen,
	Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
	linux-mm, Daisuke Nishimura

On Mon, 8 Mar 2010 10:56:41 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> On Mon, 8 Mar 2010 10:44:47 +0900
> Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> 
> > > +/*
> > > + * mem_cgroup_update_page_stat_locked() - update memcg file cache's accounting
> > > + * @page:	the page involved in a file cache operation.
> > > + * @idx:	the particular file cache statistic.
> > > + * @charge:	true to increment, false to decrement the statistic specified
> > > + *		by @idx.
> > > + *
> > > + * Update memory cgroup file cache's accounting from a locked context.
> > > + *
> > > + * NOTE: must be called with mapping->tree_lock held.
> > > + */
> > > +void mem_cgroup_update_page_stat_locked(struct page *page,
> > > +			enum mem_cgroup_write_page_stat_item idx, bool charge)
> > > +{
> > > +	struct address_space *mapping = page_mapping(page);
> > > +	struct page_cgroup *pc;
> > > +
> > > +	if (mem_cgroup_disabled())
> > > +		return;
> > > +	WARN_ON_ONCE(!irqs_disabled());
> > > +	WARN_ON_ONCE(mapping && !spin_is_locked(&mapping->tree_lock));
> > > +
> > I think this is a wrong place to insert assertion.
> > The problem about page cgroup lock is that it can be interrupted in current implementation.
> > So,
> > 
> > a) it must not be acquired under another lock which can be acquired in interrupt context,
> >    such as mapping->tree_lock, to avoid:
> > 
> > 		context1			context2
> > 					lock_page_cgroup(pcA)
> > 	spin_lock_irq(&tree_lock)
> > 		lock_page_cgroup(pcA)		<interrupted>
> > 		=>fail				spin_lock_irqsave(&tree_lock)
> > 						=>fail
> > 
> > b) it must not be acquired in interrupt context to avoid:
> > 
> > 	lock_page_cgroup(pcA)
> > 		<interrupted>
> > 		lock_page_cgroup(pcA)
> > 		=>fail
> > 
> > I think something like this would be better:
> > 
> > @@ -83,8 +83,14 @@ static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc)
> >         return page_zonenum(pc->page);
> >  }
> > 
> > +#include <linux/irqflags.h>
> > +#include <linux/hardirq.h>
> >  static inline void lock_page_cgroup(struct page_cgroup *pc)
> >  {
> > +#ifdef CONFIG_DEBUG_VM
> > +       WARN_ON_ONCE(irqs_disabled());
> > +       WARN_ON_ONCE(in_interrupt());
> > +#endif
> >         bit_spin_lock(PCG_LOCK, &pc->flags);
> >  }
> > 
> > > +	pc = lookup_page_cgroup(page);
> > > +	if (unlikely(!pc) || !PageCgroupUsed(pc))
> > > +		return;
> > > +	mem_cgroup_update_page_stat(pc, idx, charge);
> > > +}
> > > +EXPORT_SYMBOL_GPL(mem_cgroup_update_page_stat_locked);
> > > +
> > > +/*
> > > + * mem_cgroup_update_page_stat_unlocked() - update memcg file cache's accounting
> > > + * @page:	the page involved in a file cache operation.
> > > + * @idx:	the particular file cache statistic.
> > > + * @charge:	true to increment, false to decrement the statistic specified
> > > + *		by @idx.
> > > + *
> > > + * Update memory cgroup file cache's accounting from an unlocked context.
> > > + */
> > > +void mem_cgroup_update_page_stat_unlocked(struct page *page,
> > > +			enum mem_cgroup_write_page_stat_item idx, bool charge)
> > > +{
> > > +	struct page_cgroup *pc;
> > > +
> > > +	if (mem_cgroup_disabled())
> > > +		return;
> > > +	pc = lookup_page_cgroup(page);
> > > +	if (unlikely(!pc) || !PageCgroupUsed(pc))
> > > +		return;
> > > +	lock_page_cgroup(pc);
> > > +	mem_cgroup_update_page_stat(pc, idx, charge);
> > >  	unlock_page_cgroup(pc);
> > >  }
> > > +EXPORT_SYMBOL_GPL(mem_cgroup_update_page_stat_unlocked);
> > >  
> > IIUC, test_clear_page_writeback(at least) can be called under interrupt context.
> > This means lock_page_cgroup() is called under interrupt context, that is,
> > the case b) above can happen.
> > hmm... I don't have any good idea for now except disabling irq around page cgroup lock
> > to avoid all of these mess things.
> > 
> 
> Hmm...simply IRQ-off for all updates ?
I think so, for the current code.
But after these changes, we must use local_irq_save()/restore()
instead of local_irq_disable()/enable() in mem_cgroup_update_page_stat().
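
For illustration, applying that to the _unlocked helper quoted above would look
something like this (untested sketch on top of this series; the same
save/restore placement is what would be needed inside
mem_cgroup_update_page_stat() itself):

void mem_cgroup_update_page_stat_unlocked(struct page *page,
			enum mem_cgroup_write_page_stat_item idx, bool charge)
{
	struct page_cgroup *pc;
	unsigned long flags;

	if (mem_cgroup_disabled())
		return;
	pc = lookup_page_cgroup(page);
	if (unlikely(!pc) || !PageCgroupUsed(pc))
		return;
	/*
	 * Save and restore the previous irq state instead of a plain
	 * disable/enable pair: this path can also be reached with irqs
	 * already disabled, and must not re-enable them behind the caller.
	 */
	local_irq_save(flags);
	lock_page_cgroup(pc);
	mem_cgroup_update_page_stat(pc, idx, charge);
	unlock_page_cgroup(pc);
	local_irq_restore(flags);
}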

> But IIRC, clear_writeback is done under treelock.... No ?
> 
The place where NR_WRITEBACK is updated is outside the tree_lock.

   1311 int test_clear_page_writeback(struct page *page)
   1312 {
   1313         struct address_space *mapping = page_mapping(page);
   1314         int ret;
   1315
   1316         if (mapping) {
   1317                 struct backing_dev_info *bdi = mapping->backing_dev_info;
   1318                 unsigned long flags;
   1319
   1320                 spin_lock_irqsave(&mapping->tree_lock, flags);
   1321                 ret = TestClearPageWriteback(page);
   1322                 if (ret) {
   1323                         radix_tree_tag_clear(&mapping->page_tree,
   1324                                                 page_index(page),
   1325                                                 PAGECACHE_TAG_WRITEBACK);
   1326                         if (bdi_cap_account_writeback(bdi)) {
   1327                                 __dec_bdi_stat(bdi, BDI_WRITEBACK);
   1328                                 __bdi_writeout_inc(bdi);
   1329                         }
   1330                 }
   1331                 spin_unlock_irqrestore(&mapping->tree_lock, flags);
   1332         } else {
   1333                 ret = TestClearPageWriteback(page);
   1334         }
   1335         if (ret)
   1336                 dec_zone_page_state(page, NR_WRITEBACK);
   1337         return ret;
   1338 }
   1339


Thanks,
Daisuke Nishimura.



* Re: [PATCH -mmotm 4/4] memcg: dirty pages instrumentation
  2010-03-07 20:57 ` [PATCH -mmotm 4/4] memcg: dirty pages instrumentation Andrea Righi
@ 2010-03-08  2:31   ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 41+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-03-08  2:31 UTC (permalink / raw)
  To: Andrea Righi
  Cc: Balbir Singh, Daisuke Nishimura, Vivek Goyal, Peter Zijlstra,
	Trond Myklebust, Suleiman Souhlal, Greg Thelen,
	Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
	linux-mm

On Sun,  7 Mar 2010 21:57:54 +0100
Andrea Righi <arighi@develer.com> wrote:

> Apply the cgroup dirty pages accounting and limiting infrastructure to
> the opportune kernel functions.
> 
> As a bonus, make determine_dirtyable_memory() static again: this
> function isn't used anymore outside page writeback.
> 
> Signed-off-by: Andrea Righi <arighi@develer.com>

I'm sorry if I misunderstand... almost all of this kind of accounting is done
under lock_page()... then...


> ---
>  fs/fuse/file.c            |    5 +
>  fs/nfs/write.c            |    6 +
>  fs/nilfs2/segment.c       |   11 ++-
>  include/linux/writeback.h |    2 -
>  mm/filemap.c              |    1 +
>  mm/page-writeback.c       |  224 ++++++++++++++++++++++++++++-----------------
>  mm/rmap.c                 |    4 +-
>  mm/truncate.c             |    2 +
>  8 files changed, 165 insertions(+), 90 deletions(-)
> 
> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
> index a9f5e13..9a542e5 100644
> --- a/fs/fuse/file.c
> +++ b/fs/fuse/file.c
> @@ -11,6 +11,7 @@
>  #include <linux/pagemap.h>
>  #include <linux/slab.h>
>  #include <linux/kernel.h>
> +#include <linux/memcontrol.h>
>  #include <linux/sched.h>
>  #include <linux/module.h>
>  
> @@ -1129,6 +1130,8 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
>  
>  	list_del(&req->writepages_entry);
>  	dec_bdi_stat(bdi, BDI_WRITEBACK);
> +	mem_cgroup_dec_page_stat_unlocked(req->pages[0],
> +			MEMCG_NR_FILE_WRITEBACK_TEMP);
>  	dec_zone_page_state(req->pages[0], NR_WRITEBACK_TEMP);

Hmm. IIUC, this req->pages[0] is "tmp_page", which works as a bounce buffer for FUSE.
Then, this req->pages[] is not under any memcg.
So, this accounting never works.


>  	bdi_writeout_inc(bdi);
>  	wake_up(&fi->page_waitq);
> @@ -1240,6 +1243,8 @@ static int fuse_writepage_locked(struct page *page)
>  	req->inode = inode;
>  
>  	inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK);
> +	mem_cgroup_inc_page_stat_unlocked(tmp_page,
> +			MEMCG_NR_FILE_WRITEBACK_TEMP);
>  	inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
>  	end_page_writeback(page);
ditto.


>  
> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
> index 53ff70e..a35e3c0 100644
> --- a/fs/nfs/write.c
> +++ b/fs/nfs/write.c
> @@ -440,6 +440,8 @@ nfs_mark_request_commit(struct nfs_page *req)
>  			NFS_PAGE_TAG_COMMIT);
>  	nfsi->ncommit++;
>  	spin_unlock(&inode->i_lock);
> +	mem_cgroup_inc_page_stat_unlocked(req->wb_page,
> +			MEMCG_NR_FILE_UNSTABLE_NFS);
>  	inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);

Here, if the page is locked (by lock_page()), it will never be uncharged.
Then, the _locked() version of the stat accounting can be used.
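
I.e., something like this (just a sketch; it assumes the _locked helper's debug
checks are relaxed so that a page held by lock_page() is also accepted, since
as posted they expect irqs off and mapping->tree_lock held):

	spin_unlock(&inode->i_lock);
	/*
	 * If req->wb_page is locked (lock_page()) at this point,
	 * pc->mem_cgroup is stable under us, so the lighter _locked
	 * accounting variant is sufficient.
	 */
	mem_cgroup_inc_page_stat_locked(req->wb_page,
			MEMCG_NR_FILE_UNSTABLE_NFS);
	inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);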


>  	inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
>  	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
> @@ -451,6 +453,8 @@ nfs_clear_request_commit(struct nfs_page *req)
>  	struct page *page = req->wb_page;
>  
>  	if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) {
> +		mem_cgroup_dec_page_stat_unlocked(page,
> +				MEMCG_NR_FILE_UNSTABLE_NFS);
ditto.


>  		dec_zone_page_state(page, NR_UNSTABLE_NFS);
>  		dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE);
>  		return 1;
> @@ -1277,6 +1281,8 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how)
>  		req = nfs_list_entry(head->next);
>  		nfs_list_remove_request(req);
>  		nfs_mark_request_commit(req);
> +		mem_cgroup_dec_page_stat_unlocked(req->wb_page,
> +				MEMCG_NR_FILE_UNSTABLE_NFS);

ditto.

>  		dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
>  		dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
>  				BDI_RECLAIMABLE);
> diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
> index ada2f1b..fb79558 100644
> --- a/fs/nilfs2/segment.c
> +++ b/fs/nilfs2/segment.c
> @@ -24,6 +24,7 @@
>  #include <linux/pagemap.h>
>  #include <linux/buffer_head.h>
>  #include <linux/writeback.h>
> +#include <linux/memcontrol.h>
>  #include <linux/bio.h>
>  #include <linux/completion.h>
>  #include <linux/blkdev.h>
> @@ -1660,8 +1661,11 @@ nilfs_copy_replace_page_buffers(struct page *page, struct list_head *out)
>  	} while (bh = bh->b_this_page, bh2 = bh2->b_this_page, bh != head);
>  	kunmap_atomic(kaddr, KM_USER0);
>  
> -	if (!TestSetPageWriteback(clone_page))
> +	if (!TestSetPageWriteback(clone_page)) {
> +		mem_cgroup_inc_page_stat_unlocked(clone_page,
> +				MEMCG_NR_FILE_WRITEBACK);
>  		inc_zone_page_state(clone_page, NR_WRITEBACK);
> +	}
>  	unlock_page(clone_page);
>  
IIUC, this clone_page is not under a memcg either. Then, it can't be handled (for now).




>  	return 0;
> @@ -1783,8 +1787,11 @@ static void __nilfs_end_page_io(struct page *page, int err)
>  	}
>  
>  	if (buffer_nilfs_allocated(page_buffers(page))) {
> -		if (TestClearPageWriteback(page))
> +		if (TestClearPageWriteback(page)) {
> +			mem_cgroup_dec_page_stat_unlocked(page,
> +					MEMCG_NR_FILE_WRITEBACK);
>  			dec_zone_page_state(page, NR_WRITEBACK);
> +		}

Hmm... isn't this the clone_page from above? If so, this should be avoided.

IMHO, in a first version, NILFS's and FUSE's bounce pages should be skipped.
If we want to limit them, we would have to charge against the bounce page.
I'm not sure whether that is difficult or not... but...
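
In other words, "skip" just means not adding the memcg hook at those call
sites for now; the existing

	if (unlikely(!pc) || !PageCgroupUsed(pc))
		return;

check in the update helpers already turns the call into a no-op for pages that
were never charged, so leaving the hook out of the bounce-buffer paths only
saves the useless lookup.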




>  	} else
>  		end_page_writeback(page);
>  }
> diff --git a/include/linux/writeback.h b/include/linux/writeback.h
> index dd9512d..39e4cb2 100644
> --- a/include/linux/writeback.h
> +++ b/include/linux/writeback.h
> @@ -117,8 +117,6 @@ extern int vm_highmem_is_dirtyable;
>  extern int block_dump;
>  extern int laptop_mode;
>  
> -extern unsigned long determine_dirtyable_memory(void);
> -
>  extern int dirty_background_ratio_handler(struct ctl_table *table, int write,
>  		void __user *buffer, size_t *lenp,
>  		loff_t *ppos);
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 62cbac0..37f89d1 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -135,6 +135,7 @@ void __remove_from_page_cache(struct page *page)
>  	 * having removed the page entirely.
>  	 */
>  	if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
> +		mem_cgroup_dec_page_stat_locked(page, MEMCG_NR_FILE_DIRTY);
>  		dec_zone_page_state(page, NR_FILE_DIRTY);
>  		dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
>  	}
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index ab84693..9d4503a 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -131,6 +131,111 @@ static struct prop_descriptor vm_completions;
>  static struct prop_descriptor vm_dirties;
>  
>  /*
> + * Work out the current dirty-memory clamping and background writeout
> + * thresholds.
> + *
> + * The main aim here is to lower them aggressively if there is a lot of mapped
> + * memory around.  To avoid stressing page reclaim with lots of unreclaimable
> + * pages.  It is better to clamp down on writers than to start swapping, and
> + * performing lots of scanning.
> + *
> + * We only allow 1/2 of the currently-unmapped memory to be dirtied.
> + *
> + * We don't permit the clamping level to fall below 5% - that is getting rather
> + * excessive.
> + *
> + * We make sure that the background writeout level is below the adjusted
> + * clamping level.
> + */
> +
> +static unsigned long highmem_dirtyable_memory(unsigned long total)
> +{
> +#ifdef CONFIG_HIGHMEM
> +	int node;
> +	unsigned long x = 0;
> +
> +	for_each_node_state(node, N_HIGH_MEMORY) {
> +		struct zone *z =
> +			&NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
> +
> +		x += zone_page_state(z, NR_FREE_PAGES) +
> +		     zone_reclaimable_pages(z);
> +	}
> +	/*
> +	 * Make sure that the number of highmem pages is never larger
> +	 * than the number of the total dirtyable memory. This can only
> +	 * occur in very strange VM situations but we want to make sure
> +	 * that this does not occur.
> +	 */
> +	return min(x, total);
> +#else
> +	return 0;
> +#endif
> +}
> +
> +static unsigned long get_global_dirtyable_memory(void)
> +{
> +	unsigned long memory;
> +
> +	memory = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
> +	if (!vm_highmem_is_dirtyable)
> +		memory -= highmem_dirtyable_memory(memory);
> +	return memory + 1;
> +}
> +
> +static unsigned long get_dirtyable_memory(void)
> +{
> +	unsigned long memory;
> +	s64 memcg_memory;
> +
> +	memory = get_global_dirtyable_memory();
> +	if (!mem_cgroup_has_dirty_limit())
> +		return memory;
> +	memcg_memory = mem_cgroup_page_stat(MEMCG_NR_DIRTYABLE_PAGES);
> +	BUG_ON(memcg_memory < 0);
> +
> +	return min((unsigned long)memcg_memory, memory);
> +}
> +
> +static long get_reclaimable_pages(void)
> +{
> +	s64 ret;
> +
> +	if (!mem_cgroup_has_dirty_limit())
> +		return global_page_state(NR_FILE_DIRTY) +
> +			global_page_state(NR_UNSTABLE_NFS);
> +	ret = mem_cgroup_page_stat(MEMCG_NR_RECLAIM_PAGES);
> +	BUG_ON(ret < 0);
> +
> +	return ret;
> +}
> +
> +static long get_writeback_pages(void)
> +{
> +	s64 ret;
> +
> +	if (!mem_cgroup_has_dirty_limit())
> +		return global_page_state(NR_WRITEBACK);
> +	ret = mem_cgroup_page_stat(MEMCG_NR_WRITEBACK);
> +	BUG_ON(ret < 0);
> +
> +	return ret;
> +}
> +
> +static unsigned long get_dirty_writeback_pages(void)
> +{
> +	s64 ret;
> +
> +	if (!mem_cgroup_has_dirty_limit())
> +		return global_page_state(NR_UNSTABLE_NFS) +
> +			global_page_state(NR_WRITEBACK);
> +	ret = mem_cgroup_page_stat(MEMCG_NR_DIRTY_WRITEBACK_PAGES);
> +	BUG_ON(ret < 0);
> +
> +	return ret;
> +}
> +
> +/*
>   * couple the period to the dirty_ratio:
>   *
>   *   period/2 ~ roundup_pow_of_two(dirty limit)
> @@ -142,7 +247,7 @@ static int calc_period_shift(void)
>  	if (vm_dirty_bytes)
>  		dirty_total = vm_dirty_bytes / PAGE_SIZE;
>  	else
> -		dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
> +		dirty_total = (vm_dirty_ratio * get_global_dirtyable_memory()) /
>  				100;
>  	return 2 + ilog2(dirty_total - 1);
>  }
> @@ -355,92 +460,34 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
>  }
>  EXPORT_SYMBOL(bdi_set_max_ratio);
>  
> -/*
> - * Work out the current dirty-memory clamping and background writeout
> - * thresholds.
> - *
> - * The main aim here is to lower them aggressively if there is a lot of mapped
> - * memory around.  To avoid stressing page reclaim with lots of unreclaimable
> - * pages.  It is better to clamp down on writers than to start swapping, and
> - * performing lots of scanning.
> - *
> - * We only allow 1/2 of the currently-unmapped memory to be dirtied.
> - *
> - * We don't permit the clamping level to fall below 5% - that is getting rather
> - * excessive.
> - *
> - * We make sure that the background writeout level is below the adjusted
> - * clamping level.
> - */
> -
> -static unsigned long highmem_dirtyable_memory(unsigned long total)
> -{
> -#ifdef CONFIG_HIGHMEM
> -	int node;
> -	unsigned long x = 0;
> -
> -	for_each_node_state(node, N_HIGH_MEMORY) {
> -		struct zone *z =
> -			&NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
> -
> -		x += zone_page_state(z, NR_FREE_PAGES) +
> -		     zone_reclaimable_pages(z);
> -	}
> -	/*
> -	 * Make sure that the number of highmem pages is never larger
> -	 * than the number of the total dirtyable memory. This can only
> -	 * occur in very strange VM situations but we want to make sure
> -	 * that this does not occur.
> -	 */
> -	return min(x, total);
> -#else
> -	return 0;
> -#endif
> -}
> -
> -/**
> - * determine_dirtyable_memory - amount of memory that may be used
> - *
> - * Returns the numebr of pages that can currently be freed and used
> - * by the kernel for direct mappings.
> - */
> -unsigned long determine_dirtyable_memory(void)
> -{
> -	unsigned long x;
> -
> -	x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
> -
> -	if (!vm_highmem_is_dirtyable)
> -		x -= highmem_dirtyable_memory(x);
> -
> -	return x + 1;	/* Ensure that we never return 0 */
> -}
> -
>  void
>  get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
>  		 unsigned long *pbdi_dirty, struct backing_dev_info *bdi)
>  {
> -	unsigned long background;
> -	unsigned long dirty;
> -	unsigned long available_memory = determine_dirtyable_memory();
> +	unsigned long dirty, background;
> +	unsigned long available_memory = get_dirtyable_memory();
>  	struct task_struct *tsk;
> +	struct vm_dirty_param dirty_param;
>  
> -	if (vm_dirty_bytes)
> -		dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
> +	get_vm_dirty_param(&dirty_param);
> +
> +	if (dirty_param.dirty_bytes)
> +		dirty = DIV_ROUND_UP(dirty_param.dirty_bytes, PAGE_SIZE);
>  	else {
>  		int dirty_ratio;
>  
> -		dirty_ratio = vm_dirty_ratio;
> +		dirty_ratio = dirty_param.dirty_ratio;
>  		if (dirty_ratio < 5)
>  			dirty_ratio = 5;
>  		dirty = (dirty_ratio * available_memory) / 100;
>  	}
>  
> -	if (dirty_background_bytes)
> -		background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
> +	if (dirty_param.dirty_background_bytes)
> +		background = DIV_ROUND_UP(dirty_param.dirty_background_bytes,
> +						PAGE_SIZE);
>  	else
> -		background = (dirty_background_ratio * available_memory) / 100;
> -
> +		background = (dirty_param.dirty_background_ratio *
> +						available_memory) / 100;
>  	if (background >= dirty)
>  		background = dirty / 2;
>  	tsk = current;
> @@ -505,9 +552,8 @@ static void balance_dirty_pages(struct address_space *mapping,
>  		get_dirty_limits(&background_thresh, &dirty_thresh,
>  				&bdi_thresh, bdi);
>  
> -		nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
> -					global_page_state(NR_UNSTABLE_NFS);
> -		nr_writeback = global_page_state(NR_WRITEBACK);
> +		nr_reclaimable = get_reclaimable_pages();
> +		nr_writeback = get_writeback_pages();
>  
>  		bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
>  		bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
> @@ -593,10 +639,9 @@ static void balance_dirty_pages(struct address_space *mapping,
>  	 * In normal mode, we start background writeout at the lower
>  	 * background_thresh, to keep the amount of dirty memory low.
>  	 */
> +	nr_reclaimable = get_reclaimable_pages();
>  	if ((laptop_mode && pages_written) ||
> -	    (!laptop_mode && ((global_page_state(NR_FILE_DIRTY)
> -			       + global_page_state(NR_UNSTABLE_NFS))
> -					  > background_thresh)))
> +	    (!laptop_mode && (nr_reclaimable > background_thresh)))
>  		bdi_start_writeback(bdi, NULL, 0);
>  }
>  
> @@ -660,6 +705,8 @@ void throttle_vm_writeout(gfp_t gfp_mask)
>  	unsigned long dirty_thresh;
>  
>          for ( ; ; ) {
> +		unsigned long dirty;
> +
>  		get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
>  
>                  /*
> @@ -668,10 +715,10 @@ void throttle_vm_writeout(gfp_t gfp_mask)
>                   */
>                  dirty_thresh += dirty_thresh / 10;      /* wheeee... */
>  
> -                if (global_page_state(NR_UNSTABLE_NFS) +
> -			global_page_state(NR_WRITEBACK) <= dirty_thresh)
> -                        	break;
> -                congestion_wait(BLK_RW_ASYNC, HZ/10);
> +		dirty = get_dirty_writeback_pages();
> +		if (dirty <= dirty_thresh)
> +			break;
> +		congestion_wait(BLK_RW_ASYNC, HZ/10);
>  
>  		/*
>  		 * The caller might hold locks which can prevent IO completion
> @@ -1078,6 +1125,7 @@ int __set_page_dirty_no_writeback(struct page *page)
>  void account_page_dirtied(struct page *page, struct address_space *mapping)
>  {
>  	if (mapping_cap_account_dirty(mapping)) {
> +		mem_cgroup_inc_page_stat_locked(page, MEMCG_NR_FILE_DIRTY);
>  		__inc_zone_page_state(page, NR_FILE_DIRTY);
>  		__inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
>  		task_dirty_inc(current);
> @@ -1279,6 +1327,8 @@ int clear_page_dirty_for_io(struct page *page)
>  		 * for more comments.
>  		 */
>  		if (TestClearPageDirty(page)) {
> +			mem_cgroup_dec_page_stat_unlocked(page,
> +					MEMCG_NR_FILE_DIRTY);
>  			dec_zone_page_state(page, NR_FILE_DIRTY);
>  			dec_bdi_stat(mapping->backing_dev_info,
>  					BDI_RECLAIMABLE);

This is called under lock_page(). Then, the page is stable under us.
The locked version can be used.


> @@ -1314,8 +1364,11 @@ int test_clear_page_writeback(struct page *page)
>  	} else {
>  		ret = TestClearPageWriteback(page);
>  	}
> -	if (ret)
> +	if (ret) {
> +		mem_cgroup_dec_page_stat_unlocked(page,
> +				MEMCG_NR_FILE_WRITEBACK);
>  		dec_zone_page_state(page, NR_WRITEBACK);
> +	}
Can this be moved up under the tree_lock?
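
Something like this, for the mapping != NULL branch (rough sketch; for memcg
all targets should have a mapping, so the !mapping branch is left alone here):

	spin_lock_irqsave(&mapping->tree_lock, flags);
	ret = TestClearPageWriteback(page);
	if (ret) {
		radix_tree_tag_clear(&mapping->page_tree,
					page_index(page),
					PAGECACHE_TAG_WRITEBACK);
		if (bdi_cap_account_writeback(bdi)) {
			__dec_bdi_stat(bdi, BDI_WRITEBACK);
			__bdi_writeout_inc(bdi);
		}
		/*
		 * irqs are off and mapping->tree_lock is held here, so the
		 * _locked memcg variant fits without extra locking.
		 */
		mem_cgroup_dec_page_stat_locked(page, MEMCG_NR_FILE_WRITEBACK);
	}
	spin_unlock_irqrestore(&mapping->tree_lock, flags);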


>  	return ret;
>  }
>  
> @@ -1345,8 +1398,11 @@ int test_set_page_writeback(struct page *page)
>  	} else {
>  		ret = TestSetPageWriteback(page);
>  	}
> -	if (!ret)
> +	if (!ret) {
> +		mem_cgroup_inc_page_stat_unlocked(page,
> +				MEMCG_NR_FILE_WRITEBACK);
>  		inc_zone_page_state(page, NR_WRITEBACK);
> +	}
>  	return ret;
>  
Maybe moving this under the tree_lock and using the locked version is better.



>  }
> diff --git a/mm/rmap.c b/mm/rmap.c
> index fcd593c..61f07cc 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -828,8 +828,8 @@ void page_add_new_anon_rmap(struct page *page,
>  void page_add_file_rmap(struct page *page)
>  {
>  	if (atomic_inc_and_test(&page->_mapcount)) {
> +		mem_cgroup_inc_page_stat_unlocked(page, MEMCG_NR_FILE_MAPPED);
>  		__inc_zone_page_state(page, NR_FILE_MAPPED);
> -		mem_cgroup_update_file_mapped(page, 1);
>  	}
>  }
>  
> @@ -860,8 +860,8 @@ void page_remove_rmap(struct page *page)
>  		mem_cgroup_uncharge_page(page);
>  		__dec_zone_page_state(page, NR_ANON_PAGES);
>  	} else {
> +		mem_cgroup_dec_page_stat_unlocked(page, MEMCG_NR_FILE_MAPPED);
>  		__dec_zone_page_state(page, NR_FILE_MAPPED);
> -		mem_cgroup_update_file_mapped(page, -1);
>  	}
>  	/*
>  	 * It would be tidy to reset the PageAnon mapping here,
> diff --git a/mm/truncate.c b/mm/truncate.c
> index e87e372..1613632 100644
> --- a/mm/truncate.c
> +++ b/mm/truncate.c
> @@ -73,6 +73,8 @@ void cancel_dirty_page(struct page *page, unsigned int account_size)
>  	if (TestClearPageDirty(page)) {
>  		struct address_space *mapping = page->mapping;
>  		if (mapping && mapping_cap_account_dirty(mapping)) {
> +			mem_cgroup_dec_page_stat_unlocked(page,
> +					MEMCG_NR_FILE_DIRTY);
>  			dec_zone_page_state(page, NR_FILE_DIRTY);
>  			dec_bdi_stat(mapping->backing_dev_info,
>  					BDI_RECLAIMABLE);

cancel_dirty_page() is called after do_invalidatepage() but before
remove_from_page_cache(); it's all done under lock_page().

Then, we can use the "locked" accounting here.

If you feel the locked/unlocked accounting is too complex, simply adding
irq disable/enable around lock_page_cgroup() is a choice.
But please measure performance before doing that.
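
That choice would look roughly like the sketch below (the helper names are
made up here; whether the extra irq toggling is acceptable is exactly what
should be measured first):

/* hypothetical irq-safe wrappers around the page_cgroup bit-spinlock */
static inline void lock_page_cgroup_irqsave(struct page_cgroup *pc,
					    unsigned long *flags)
{
	local_irq_save(*flags);
	bit_spin_lock(PCG_LOCK, &pc->flags);
}

static inline void unlock_page_cgroup_irqrestore(struct page_cgroup *pc,
						 unsigned long flags)
{
	bit_spin_unlock(PCG_LOCK, &pc->flags);
	local_irq_restore(flags);
}

With these, a single update helper would be usable from any context and the
locked/unlocked split would not be needed.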


Thanks,
-Kame




* Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure
  2010-03-08  2:17       ` Daisuke Nishimura
@ 2010-03-08  2:37         ` KAMEZAWA Hiroyuki
  2010-03-08  8:07           ` Daisuke Nishimura
  0 siblings, 1 reply; 41+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-03-08  2:37 UTC (permalink / raw)
  To: Daisuke Nishimura
  Cc: Andrea Righi, Balbir Singh, Vivek Goyal, Peter Zijlstra,
	Trond Myklebust, Suleiman Souhlal, Greg Thelen,
	Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
	linux-mm

On Mon, 8 Mar 2010 11:17:24 +0900
Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:

> > But IIRC, clear_writeback is done under treelock.... No ?
> > 
> The place where NR_WRITEBACK is updated is out of tree_lock.
> 
>    1311 int test_clear_page_writeback(struct page *page)
>    1312 {
>    1313         struct address_space *mapping = page_mapping(page);
>    1314         int ret;
>    1315
>    1316         if (mapping) {
>    1317                 struct backing_dev_info *bdi = mapping->backing_dev_info;
>    1318                 unsigned long flags;
>    1319
>    1320                 spin_lock_irqsave(&mapping->tree_lock, flags);
>    1321                 ret = TestClearPageWriteback(page);
>    1322                 if (ret) {
>    1323                         radix_tree_tag_clear(&mapping->page_tree,
>    1324                                                 page_index(page),
>    1325                                                 PAGECACHE_TAG_WRITEBACK);
>    1326                         if (bdi_cap_account_writeback(bdi)) {
>    1327                                 __dec_bdi_stat(bdi, BDI_WRITEBACK);
>    1328                                 __bdi_writeout_inc(bdi);
>    1329                         }
>    1330                 }
>    1331                 spin_unlock_irqrestore(&mapping->tree_lock, flags);
>    1332         } else {
>    1333                 ret = TestClearPageWriteback(page);
>    1334         }
>    1335         if (ret)
>    1336                 dec_zone_page_state(page, NR_WRITEBACK);
>    1337         return ret;
>    1338 }

We can move this up under the tree_lock. Considering memcg, all of our targets have a "mapping".

If we newly account bounce buffers (for NILFS, FUSE, etc.), which have no ->mapping,
we need a much more complex new charge/uncharge theory.

But yes, adding a new lock scheme seems complicated. (Sorry Andrea.)
My concern is performance. We may need some new re-implementation of
locks/migrate/charge/uncharge.

Thanks,
-Kame





* Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure
  2010-03-08  2:37         ` KAMEZAWA Hiroyuki
@ 2010-03-08  8:07           ` Daisuke Nishimura
  2010-03-08  8:31             ` KAMEZAWA Hiroyuki
  2010-03-09  0:03             ` Andrea Righi
  0 siblings, 2 replies; 41+ messages in thread
From: Daisuke Nishimura @ 2010-03-08  8:07 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Andrea Righi, Balbir Singh, Vivek Goyal, Peter Zijlstra,
	Trond Myklebust, Suleiman Souhlal, Greg Thelen,
	Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
	linux-mm, Daisuke Nishimura

On Mon, 8 Mar 2010 11:37:11 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> On Mon, 8 Mar 2010 11:17:24 +0900
> Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> 
> > > But IIRC, clear_writeback is done under treelock.... No ?
> > > 
> > The place where NR_WRITEBACK is updated is out of tree_lock.
> > 
> >    1311 int test_clear_page_writeback(struct page *page)
> >    1312 {
> >    1313         struct address_space *mapping = page_mapping(page);
> >    1314         int ret;
> >    1315
> >    1316         if (mapping) {
> >    1317                 struct backing_dev_info *bdi = mapping->backing_dev_info;
> >    1318                 unsigned long flags;
> >    1319
> >    1320                 spin_lock_irqsave(&mapping->tree_lock, flags);
> >    1321                 ret = TestClearPageWriteback(page);
> >    1322                 if (ret) {
> >    1323                         radix_tree_tag_clear(&mapping->page_tree,
> >    1324                                                 page_index(page),
> >    1325                                                 PAGECACHE_TAG_WRITEBACK);
> >    1326                         if (bdi_cap_account_writeback(bdi)) {
> >    1327                                 __dec_bdi_stat(bdi, BDI_WRITEBACK);
> >    1328                                 __bdi_writeout_inc(bdi);
> >    1329                         }
> >    1330                 }
> >    1331                 spin_unlock_irqrestore(&mapping->tree_lock, flags);
> >    1332         } else {
> >    1333                 ret = TestClearPageWriteback(page);
> >    1334         }
> >    1335         if (ret)
> >    1336                 dec_zone_page_state(page, NR_WRITEBACK);
> >    1337         return ret;
> >    1338 }
> 
> We can move this up to under tree_lock. Considering memcg, all our target has "mapping".
> 
> If we newly account bounce-buffers (for NILFS, FUSE, etc..), which has no ->mapping,
> we need much more complex new charge/uncharge theory.
> 
> But yes, adding new lock scheme seems complicated. (Sorry Andrea.)
> My concerns is performance. We may need somehing new re-implementation of
> locks/migrate/charge/uncharge.
> 
I agree. Performance is my concern too.

I made the patch below and measured the time (average of 10 runs) of a kernel build
on tmpfs (make -j8 on an 8-CPU machine with the 2.6.33 defconfig).

<before>
- root cgroup: 190.47 sec
- child cgroup: 192.81 sec

<after>
- root cgroup: 191.06 sec
- child cgroup: 193.06 sec

Hmm... about 0.3% slower for root, 0.1% slower for child.

===
From: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>

In the current implementation, we don't have to disable irqs at lock_page_cgroup()
because the lock is never acquired in interrupt context.
But we are going to do so in a later patch, so this patch encloses all
lock_page_cgroup()/unlock_page_cgroup() pairs with local_irq_disable()/local_irq_enable().

Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
---
 mm/memcontrol.c |   17 +++++++++++++++++
 1 files changed, 17 insertions(+), 0 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 02ea959..e5ae1a1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1359,6 +1359,7 @@ void mem_cgroup_update_file_mapped(struct page *page, int val)
 	if (unlikely(!pc))
 		return;
 
+	local_irq_disable();
 	lock_page_cgroup(pc);
 	mem = pc->mem_cgroup;
 	if (!mem)
@@ -1374,6 +1375,7 @@ void mem_cgroup_update_file_mapped(struct page *page, int val)
 
 done:
 	unlock_page_cgroup(pc);
+	local_irq_enable();
 }
 
 /*
@@ -1711,6 +1713,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
 	VM_BUG_ON(!PageLocked(page));
 
 	pc = lookup_page_cgroup(page);
+	local_irq_disable();
 	lock_page_cgroup(pc);
 	if (PageCgroupUsed(pc)) {
 		mem = pc->mem_cgroup;
@@ -1726,6 +1729,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
 		rcu_read_unlock();
 	}
 	unlock_page_cgroup(pc);
+	local_irq_enable();
 	return mem;
 }
 
@@ -1742,9 +1746,11 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 	if (!mem)
 		return;
 
+	local_irq_disable();
 	lock_page_cgroup(pc);
 	if (unlikely(PageCgroupUsed(pc))) {
 		unlock_page_cgroup(pc);
+		local_irq_enable();
 		mem_cgroup_cancel_charge(mem);
 		return;
 	}
@@ -1775,6 +1781,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 	mem_cgroup_charge_statistics(mem, pc, true);
 
 	unlock_page_cgroup(pc);
+	local_irq_enable();
 	/*
 	 * "charge_statistics" updated event counter. Then, check it.
 	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
@@ -1844,12 +1851,14 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
 		struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
 {
 	int ret = -EINVAL;
+	local_irq_disable();
 	lock_page_cgroup(pc);
 	if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
 		__mem_cgroup_move_account(pc, from, to, uncharge);
 		ret = 0;
 	}
 	unlock_page_cgroup(pc);
+	local_irq_enable();
 	/*
 	 * check events
 	 */
@@ -1981,12 +1990,15 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 		pc = lookup_page_cgroup(page);
 		if (!pc)
 			return 0;
+		local_irq_disable();
 		lock_page_cgroup(pc);
 		if (PageCgroupUsed(pc)) {
 			unlock_page_cgroup(pc);
+			local_irq_enable();
 			return 0;
 		}
 		unlock_page_cgroup(pc);
+		local_irq_enable();
 	}
 
 	if (unlikely(!mm && !mem))
@@ -2182,6 +2194,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	if (unlikely(!pc || !PageCgroupUsed(pc)))
 		return NULL;
 
+	local_irq_disable();
 	lock_page_cgroup(pc);
 
 	mem = pc->mem_cgroup;
@@ -2222,6 +2235,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 
 	mz = page_cgroup_zoneinfo(pc);
 	unlock_page_cgroup(pc);
+	local_irq_enable();
 
 	memcg_check_events(mem, page);
 	/* at swapout, this memcg will be accessed to record to swap */
@@ -2232,6 +2246,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 
 unlock_out:
 	unlock_page_cgroup(pc);
+	local_irq_enable();
 	return NULL;
 }
 
@@ -2424,12 +2439,14 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
 		return 0;
 
 	pc = lookup_page_cgroup(page);
+	local_irq_disable();
 	lock_page_cgroup(pc);
 	if (PageCgroupUsed(pc)) {
 		mem = pc->mem_cgroup;
 		css_get(&mem->css);
 	}
 	unlock_page_cgroup(pc);
+	local_irq_enable();
 
 	if (mem) {
 		ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
-- 
1.6.4



* Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure
  2010-03-08  8:07           ` Daisuke Nishimura
@ 2010-03-08  8:31             ` KAMEZAWA Hiroyuki
  2010-03-09  0:12               ` Andrea Righi
  2010-03-09  0:18               ` [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure Daisuke Nishimura
  2010-03-09  0:03             ` Andrea Righi
  1 sibling, 2 replies; 41+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-03-08  8:31 UTC (permalink / raw)
  To: Daisuke Nishimura
  Cc: Andrea Righi, Balbir Singh, Vivek Goyal, Peter Zijlstra,
	Trond Myklebust, Suleiman Souhlal, Greg Thelen,
	Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
	linux-mm

On Mon, 8 Mar 2010 17:07:11 +0900
Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:

> On Mon, 8 Mar 2010 11:37:11 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > On Mon, 8 Mar 2010 11:17:24 +0900
> > Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> > 
> > > > But IIRC, clear_writeback is done under treelock.... No ?
> > > > 
> > > The place where NR_WRITEBACK is updated is out of tree_lock.
> > > 
> > >    1311 int test_clear_page_writeback(struct page *page)
> > >    1312 {
> > >    1313         struct address_space *mapping = page_mapping(page);
> > >    1314         int ret;
> > >    1315
> > >    1316         if (mapping) {
> > >    1317                 struct backing_dev_info *bdi = mapping->backing_dev_info;
> > >    1318                 unsigned long flags;
> > >    1319
> > >    1320                 spin_lock_irqsave(&mapping->tree_lock, flags);
> > >    1321                 ret = TestClearPageWriteback(page);
> > >    1322                 if (ret) {
> > >    1323                         radix_tree_tag_clear(&mapping->page_tree,
> > >    1324                                                 page_index(page),
> > >    1325                                                 PAGECACHE_TAG_WRITEBACK);
> > >    1326                         if (bdi_cap_account_writeback(bdi)) {
> > >    1327                                 __dec_bdi_stat(bdi, BDI_WRITEBACK);
> > >    1328                                 __bdi_writeout_inc(bdi);
> > >    1329                         }
> > >    1330                 }
> > >    1331                 spin_unlock_irqrestore(&mapping->tree_lock, flags);
> > >    1332         } else {
> > >    1333                 ret = TestClearPageWriteback(page);
> > >    1334         }
> > >    1335         if (ret)
> > >    1336                 dec_zone_page_state(page, NR_WRITEBACK);
> > >    1337         return ret;
> > >    1338 }
> > 
> > We can move this up to under tree_lock. Considering memcg, all our target has "mapping".
> > 
> > If we newly account bounce-buffers (for NILFS, FUSE, etc..), which has no ->mapping,
> > we need much more complex new charge/uncharge theory.
> > 
> > But yes, adding new lock scheme seems complicated. (Sorry Andrea.)
> > My concerns is performance. We may need somehing new re-implementation of
> > locks/migrate/charge/uncharge.
> > 
> I agree. Performance is my concern too.
> 
> I made a patch below and measured the time(average of 10 times) of kernel build
> on tmpfs(make -j8 on 8 CPU machine with 2.6.33 defconfig).
> 
> <before>
> - root cgroup: 190.47 sec
> - child cgroup: 192.81 sec
> 
> <after>
> - root cgroup: 191.06 sec
> - child cgroup: 193.06 sec
> 
> Hmm... about 0.3% slower for root, 0.1% slower for child.
> 

Hmm... acceptable? (it sounds like it's within the error range)

BTW, why local_irq_disable()?
Isn't local_irq_save()/restore() better?
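
I mean something like this for each hunk (sketch, taking the move_account hunk
from your patch as an example):

	int ret = -EINVAL;
	unsigned long flags;

	local_irq_save(flags);		/* instead of local_irq_disable() */
	lock_page_cgroup(pc);
	if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
		__mem_cgroup_move_account(pc, from, to, uncharge);
		ret = 0;
	}
	unlock_page_cgroup(pc);
	local_irq_restore(flags);	/* instead of local_irq_enable() */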

Thanks,
-Kame

> ===
> From: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
> 
> In current implementation, we don't have to disable irq at lock_page_cgroup()
> because the lock is never acquired in interrupt context.
> But we are going to do it in later patch, so this patch encloses all of
> lock_page_cgroup()/unlock_page_cgroup() with irq_disabled()/irq_enabled().
> 
> Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
> ---
>  mm/memcontrol.c |   17 +++++++++++++++++
>  1 files changed, 17 insertions(+), 0 deletions(-)
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 02ea959..e5ae1a1 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1359,6 +1359,7 @@ void mem_cgroup_update_file_mapped(struct page *page, int val)
>  	if (unlikely(!pc))
>  		return;
>  
> +	local_irq_disable();
>  	lock_page_cgroup(pc);
>  	mem = pc->mem_cgroup;
>  	if (!mem)
> @@ -1374,6 +1375,7 @@ void mem_cgroup_update_file_mapped(struct page *page, int val)
>  
>  done:
>  	unlock_page_cgroup(pc);
> +	local_irq_enable();
>  }
>  
>  /*
> @@ -1711,6 +1713,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
>  	VM_BUG_ON(!PageLocked(page));
>  
>  	pc = lookup_page_cgroup(page);
> +	local_irq_disable();
>  	lock_page_cgroup(pc);
>  	if (PageCgroupUsed(pc)) {
>  		mem = pc->mem_cgroup;
> @@ -1726,6 +1729,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
>  		rcu_read_unlock();
>  	}
>  	unlock_page_cgroup(pc);
> +	local_irq_enable();
>  	return mem;
>  }
>  
> @@ -1742,9 +1746,11 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
>  	if (!mem)
>  		return;
>  
> +	local_irq_disable();
>  	lock_page_cgroup(pc);
>  	if (unlikely(PageCgroupUsed(pc))) {
>  		unlock_page_cgroup(pc);
> +		local_irq_enable();
>  		mem_cgroup_cancel_charge(mem);
>  		return;
>  	}
> @@ -1775,6 +1781,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
>  	mem_cgroup_charge_statistics(mem, pc, true);
>  
>  	unlock_page_cgroup(pc);
> +	local_irq_enable();
>  	/*
>  	 * "charge_statistics" updated event counter. Then, check it.
>  	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
> @@ -1844,12 +1851,14 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
>  		struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
>  {
>  	int ret = -EINVAL;
> +	local_irq_disable();
>  	lock_page_cgroup(pc);
>  	if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
>  		__mem_cgroup_move_account(pc, from, to, uncharge);
>  		ret = 0;
>  	}
>  	unlock_page_cgroup(pc);
> +	local_irq_enable();
>  	/*
>  	 * check events
>  	 */
> @@ -1981,12 +1990,15 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
>  		pc = lookup_page_cgroup(page);
>  		if (!pc)
>  			return 0;
> +		local_irq_disable();
>  		lock_page_cgroup(pc);
>  		if (PageCgroupUsed(pc)) {
>  			unlock_page_cgroup(pc);
> +			local_irq_enable();
>  			return 0;
>  		}
>  		unlock_page_cgroup(pc);
> +		local_irq_enable();
>  	}
>  
>  	if (unlikely(!mm && !mem))
> @@ -2182,6 +2194,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
>  	if (unlikely(!pc || !PageCgroupUsed(pc)))
>  		return NULL;
>  
> +	local_irq_disable();
>  	lock_page_cgroup(pc);
>  
>  	mem = pc->mem_cgroup;
> @@ -2222,6 +2235,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
>  
>  	mz = page_cgroup_zoneinfo(pc);
>  	unlock_page_cgroup(pc);
> +	local_irq_enable();
>  
>  	memcg_check_events(mem, page);
>  	/* at swapout, this memcg will be accessed to record to swap */
> @@ -2232,6 +2246,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
>  
>  unlock_out:
>  	unlock_page_cgroup(pc);
> +	local_irq_enable();
>  	return NULL;
>  }
>  
> @@ -2424,12 +2439,14 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
>  		return 0;
>  
>  	pc = lookup_page_cgroup(page);
> +	local_irq_disable();
>  	lock_page_cgroup(pc);
>  	if (PageCgroupUsed(pc)) {
>  		mem = pc->mem_cgroup;
>  		css_get(&mem->css);
>  	}
>  	unlock_page_cgroup(pc);
> +	local_irq_enable();
>  
>  	if (mem) {
>  		ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
> -- 
> 1.6.4
> 
> 



* Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure
  2010-03-08  8:07           ` Daisuke Nishimura
  2010-03-08  8:31             ` KAMEZAWA Hiroyuki
@ 2010-03-09  0:03             ` Andrea Righi
  1 sibling, 0 replies; 41+ messages in thread
From: Andrea Righi @ 2010-03-09  0:03 UTC (permalink / raw)
  To: Daisuke Nishimura
  Cc: KAMEZAWA Hiroyuki, Balbir Singh, Vivek Goyal, Peter Zijlstra,
	Trond Myklebust, Suleiman Souhlal, Greg Thelen,
	Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
	linux-mm

On Mon, Mar 08, 2010 at 05:07:11PM +0900, Daisuke Nishimura wrote:
> On Mon, 8 Mar 2010 11:37:11 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > On Mon, 8 Mar 2010 11:17:24 +0900
> > Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> > 
> > > > But IIRC, clear_writeback is done under treelock.... No ?
> > > > 
> > > The place where NR_WRITEBACK is updated is out of tree_lock.
> > > 
> > >    1311 int test_clear_page_writeback(struct page *page)
> > >    1312 {
> > >    1313         struct address_space *mapping = page_mapping(page);
> > >    1314         int ret;
> > >    1315
> > >    1316         if (mapping) {
> > >    1317                 struct backing_dev_info *bdi = mapping->backing_dev_info;
> > >    1318                 unsigned long flags;
> > >    1319
> > >    1320                 spin_lock_irqsave(&mapping->tree_lock, flags);
> > >    1321                 ret = TestClearPageWriteback(page);
> > >    1322                 if (ret) {
> > >    1323                         radix_tree_tag_clear(&mapping->page_tree,
> > >    1324                                                 page_index(page),
> > >    1325                                                 PAGECACHE_TAG_WRITEBACK);
> > >    1326                         if (bdi_cap_account_writeback(bdi)) {
> > >    1327                                 __dec_bdi_stat(bdi, BDI_WRITEBACK);
> > >    1328                                 __bdi_writeout_inc(bdi);
> > >    1329                         }
> > >    1330                 }
> > >    1331                 spin_unlock_irqrestore(&mapping->tree_lock, flags);
> > >    1332         } else {
> > >    1333                 ret = TestClearPageWriteback(page);
> > >    1334         }
> > >    1335         if (ret)
> > >    1336                 dec_zone_page_state(page, NR_WRITEBACK);
> > >    1337         return ret;
> > >    1338 }
> > 
> > We can move this up to under tree_lock. Considering memcg, all our target has "mapping".
> > 
> > If we newly account bounce-buffers (for NILFS, FUSE, etc..), which has no ->mapping,
> > we need much more complex new charge/uncharge theory.
> > 
> > But yes, adding new lock scheme seems complicated. (Sorry Andrea.)
> > My concerns is performance. We may need somehing new re-implementation of
> > locks/migrate/charge/uncharge.
> > 
> I agree. Performance is my concern too.
> 
> I made a patch below and measured the time(average of 10 times) of kernel build
> on tmpfs(make -j8 on 8 CPU machine with 2.6.33 defconfig).
> 
> <before>
> - root cgroup: 190.47 sec
> - child cgroup: 192.81 sec
> 
> <after>
> - root cgroup: 191.06 sec
> - child cgroup: 193.06 sec
> 
> Hmm... about 0.3% slower for root, 0.1% slower for child.

Thanks Daisuke-san. This doesn't seem too bad. I'll repeat the test on
my machine and compare performance with my patch plus your and Kame-san's
fixes. Even though I agree that adding multiple locked/unlocked versions of
mem_cgroup_update_page_stat() is just too bug-prone...

Thanks,
-Andrea

> 
> ===
> From: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
> 
> In current implementation, we don't have to disable irq at lock_page_cgroup()
> because the lock is never acquired in interrupt context.
> But we are going to do it in later patch, so this patch encloses all of
> lock_page_cgroup()/unlock_page_cgroup() with irq_disabled()/irq_enabled().
> 
> Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
> ---
>  mm/memcontrol.c |   17 +++++++++++++++++
>  1 files changed, 17 insertions(+), 0 deletions(-)
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 02ea959..e5ae1a1 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1359,6 +1359,7 @@ void mem_cgroup_update_file_mapped(struct page *page, int val)
>  	if (unlikely(!pc))
>  		return;
>  
> +	local_irq_disable();
>  	lock_page_cgroup(pc);
>  	mem = pc->mem_cgroup;
>  	if (!mem)
> @@ -1374,6 +1375,7 @@ void mem_cgroup_update_file_mapped(struct page *page, int val)
>  
>  done:
>  	unlock_page_cgroup(pc);
> +	local_irq_enable();
>  }
>  
>  /*
> @@ -1711,6 +1713,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
>  	VM_BUG_ON(!PageLocked(page));
>  
>  	pc = lookup_page_cgroup(page);
> +	local_irq_disable();
>  	lock_page_cgroup(pc);
>  	if (PageCgroupUsed(pc)) {
>  		mem = pc->mem_cgroup;
> @@ -1726,6 +1729,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
>  		rcu_read_unlock();
>  	}
>  	unlock_page_cgroup(pc);
> +	local_irq_enable();
>  	return mem;
>  }
>  
> @@ -1742,9 +1746,11 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
>  	if (!mem)
>  		return;
>  
> +	local_irq_disable();
>  	lock_page_cgroup(pc);
>  	if (unlikely(PageCgroupUsed(pc))) {
>  		unlock_page_cgroup(pc);
> +		local_irq_enable();
>  		mem_cgroup_cancel_charge(mem);
>  		return;
>  	}
> @@ -1775,6 +1781,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
>  	mem_cgroup_charge_statistics(mem, pc, true);
>  
>  	unlock_page_cgroup(pc);
> +	local_irq_enable();
>  	/*
>  	 * "charge_statistics" updated event counter. Then, check it.
>  	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
> @@ -1844,12 +1851,14 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
>  		struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
>  {
>  	int ret = -EINVAL;
> +	local_irq_disable();
>  	lock_page_cgroup(pc);
>  	if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
>  		__mem_cgroup_move_account(pc, from, to, uncharge);
>  		ret = 0;
>  	}
>  	unlock_page_cgroup(pc);
> +	local_irq_enable();
>  	/*
>  	 * check events
>  	 */
> @@ -1981,12 +1990,15 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
>  		pc = lookup_page_cgroup(page);
>  		if (!pc)
>  			return 0;
> +		local_irq_disable();
>  		lock_page_cgroup(pc);
>  		if (PageCgroupUsed(pc)) {
>  			unlock_page_cgroup(pc);
> +			local_irq_enable();
>  			return 0;
>  		}
>  		unlock_page_cgroup(pc);
> +		local_irq_enable();
>  	}
>  
>  	if (unlikely(!mm && !mem))
> @@ -2182,6 +2194,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
>  	if (unlikely(!pc || !PageCgroupUsed(pc)))
>  		return NULL;
>  
> +	local_irq_disable();
>  	lock_page_cgroup(pc);
>  
>  	mem = pc->mem_cgroup;
> @@ -2222,6 +2235,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
>  
>  	mz = page_cgroup_zoneinfo(pc);
>  	unlock_page_cgroup(pc);
> +	local_irq_enable();
>  
>  	memcg_check_events(mem, page);
>  	/* at swapout, this memcg will be accessed to record to swap */
> @@ -2232,6 +2246,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
>  
>  unlock_out:
>  	unlock_page_cgroup(pc);
> +	local_irq_enable();
>  	return NULL;
>  }
>  
> @@ -2424,12 +2439,14 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
>  		return 0;
>  
>  	pc = lookup_page_cgroup(page);
> +	local_irq_disable();
>  	lock_page_cgroup(pc);
>  	if (PageCgroupUsed(pc)) {
>  		mem = pc->mem_cgroup;
>  		css_get(&mem->css);
>  	}
>  	unlock_page_cgroup(pc);
> +	local_irq_enable();
>  
>  	if (mem) {
>  		ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
> -- 
> 1.6.4
> 



* Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure
  2010-03-08  8:31             ` KAMEZAWA Hiroyuki
@ 2010-03-09  0:12               ` Andrea Righi
  2010-03-09  0:19                 ` KAMEZAWA Hiroyuki
  2010-03-09  0:18               ` [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure Daisuke Nishimura
  1 sibling, 1 reply; 41+ messages in thread
From: Andrea Righi @ 2010-03-09  0:12 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Daisuke Nishimura, Balbir Singh, Vivek Goyal, Peter Zijlstra,
	Trond Myklebust, Suleiman Souhlal, Greg Thelen,
	Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
	linux-mm

On Mon, Mar 08, 2010 at 05:31:00PM +0900, KAMEZAWA Hiroyuki wrote:
> On Mon, 8 Mar 2010 17:07:11 +0900
> Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> 
> > On Mon, 8 Mar 2010 11:37:11 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > > On Mon, 8 Mar 2010 11:17:24 +0900
> > > Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> > > 
> > > > > But IIRC, clear_writeback is done under treelock.... No ?
> > > > > 
> > > > The place where NR_WRITEBACK is updated is out of tree_lock.
> > > > 
> > > >    1311 int test_clear_page_writeback(struct page *page)
> > > >    1312 {
> > > >    1313         struct address_space *mapping = page_mapping(page);
> > > >    1314         int ret;
> > > >    1315
> > > >    1316         if (mapping) {
> > > >    1317                 struct backing_dev_info *bdi = mapping->backing_dev_info;
> > > >    1318                 unsigned long flags;
> > > >    1319
> > > >    1320                 spin_lock_irqsave(&mapping->tree_lock, flags);
> > > >    1321                 ret = TestClearPageWriteback(page);
> > > >    1322                 if (ret) {
> > > >    1323                         radix_tree_tag_clear(&mapping->page_tree,
> > > >    1324                                                 page_index(page),
> > > >    1325                                                 PAGECACHE_TAG_WRITEBACK);
> > > >    1326                         if (bdi_cap_account_writeback(bdi)) {
> > > >    1327                                 __dec_bdi_stat(bdi, BDI_WRITEBACK);
> > > >    1328                                 __bdi_writeout_inc(bdi);
> > > >    1329                         }
> > > >    1330                 }
> > > >    1331                 spin_unlock_irqrestore(&mapping->tree_lock, flags);
> > > >    1332         } else {
> > > >    1333                 ret = TestClearPageWriteback(page);
> > > >    1334         }
> > > >    1335         if (ret)
> > > >    1336                 dec_zone_page_state(page, NR_WRITEBACK);
> > > >    1337         return ret;
> > > >    1338 }
> > > 
> > > We can move this up to under tree_lock. Considering memcg, all our target has "mapping".
> > > 
> > > If we newly account bounce-buffers (for NILFS, FUSE, etc..), which has no ->mapping,
> > > we need much more complex new charge/uncharge theory.
> > > 
> > > But yes, adding new lock scheme seems complicated. (Sorry Andrea.)
> > > My concerns is performance. We may need somehing new re-implementation of
> > > locks/migrate/charge/uncharge.
> > > 
> > I agree. Performance is my concern too.
> > 
> > I made a patch below and measured the time(average of 10 times) of kernel build
> > on tmpfs(make -j8 on 8 CPU machine with 2.6.33 defconfig).
> > 
> > <before>
> > - root cgroup: 190.47 sec
> > - child cgroup: 192.81 sec
> > 
> > <after>
> > - root cgroup: 191.06 sec
> > - child cgroup: 193.06 sec
> > 
> > Hmm... about 0.3% slower for root, 0.1% slower for child.
> > 
> 
> Hmm...accepatable ? (sounds it's in error-range)
> 
> BTW, why local_irq_disable() ? 
> local_irq_save()/restore() isn't better ?

Probably to avoid the overhead of saving the flags? Anyway, using
local_irq_save()/restore() would make the code much more readable...

Thanks,
-Andrea


> 
> Thanks,
> -Kame
> 
> > ===
> > From: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
> > 
> > In current implementation, we don't have to disable irq at lock_page_cgroup()
> > because the lock is never acquired in interrupt context.
> > But we are going to do it in later patch, so this patch encloses all of
> > lock_page_cgroup()/unlock_page_cgroup() with irq_disabled()/irq_enabled().
> > 
> > Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
> > ---
> >  mm/memcontrol.c |   17 +++++++++++++++++
> >  1 files changed, 17 insertions(+), 0 deletions(-)
> > 
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index 02ea959..e5ae1a1 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -1359,6 +1359,7 @@ void mem_cgroup_update_file_mapped(struct page *page, int val)
> >  	if (unlikely(!pc))
> >  		return;
> >  
> > +	local_irq_disable();
> >  	lock_page_cgroup(pc);
> >  	mem = pc->mem_cgroup;
> >  	if (!mem)
> > @@ -1374,6 +1375,7 @@ void mem_cgroup_update_file_mapped(struct page *page, int val)
> >  
> >  done:
> >  	unlock_page_cgroup(pc);
> > +	local_irq_enable();
> >  }
> >  
> >  /*
> > @@ -1711,6 +1713,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
> >  	VM_BUG_ON(!PageLocked(page));
> >  
> >  	pc = lookup_page_cgroup(page);
> > +	local_irq_disable();
> >  	lock_page_cgroup(pc);
> >  	if (PageCgroupUsed(pc)) {
> >  		mem = pc->mem_cgroup;
> > @@ -1726,6 +1729,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
> >  		rcu_read_unlock();
> >  	}
> >  	unlock_page_cgroup(pc);
> > +	local_irq_enable();
> >  	return mem;
> >  }
> >  
> > @@ -1742,9 +1746,11 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
> >  	if (!mem)
> >  		return;
> >  
> > +	local_irq_disable();
> >  	lock_page_cgroup(pc);
> >  	if (unlikely(PageCgroupUsed(pc))) {
> >  		unlock_page_cgroup(pc);
> > +		local_irq_enable();
> >  		mem_cgroup_cancel_charge(mem);
> >  		return;
> >  	}
> > @@ -1775,6 +1781,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
> >  	mem_cgroup_charge_statistics(mem, pc, true);
> >  
> >  	unlock_page_cgroup(pc);
> > +	local_irq_enable();
> >  	/*
> >  	 * "charge_statistics" updated event counter. Then, check it.
> >  	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
> > @@ -1844,12 +1851,14 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
> >  		struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
> >  {
> >  	int ret = -EINVAL;
> > +	local_irq_disable();
> >  	lock_page_cgroup(pc);
> >  	if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
> >  		__mem_cgroup_move_account(pc, from, to, uncharge);
> >  		ret = 0;
> >  	}
> >  	unlock_page_cgroup(pc);
> > +	local_irq_enable();
> >  	/*
> >  	 * check events
> >  	 */
> > @@ -1981,12 +1990,15 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
> >  		pc = lookup_page_cgroup(page);
> >  		if (!pc)
> >  			return 0;
> > +		local_irq_disable();
> >  		lock_page_cgroup(pc);
> >  		if (PageCgroupUsed(pc)) {
> >  			unlock_page_cgroup(pc);
> > +			local_irq_enable();
> >  			return 0;
> >  		}
> >  		unlock_page_cgroup(pc);
> > +		local_irq_enable();
> >  	}
> >  
> >  	if (unlikely(!mm && !mem))
> > @@ -2182,6 +2194,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
> >  	if (unlikely(!pc || !PageCgroupUsed(pc)))
> >  		return NULL;
> >  
> > +	local_irq_disable();
> >  	lock_page_cgroup(pc);
> >  
> >  	mem = pc->mem_cgroup;
> > @@ -2222,6 +2235,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
> >  
> >  	mz = page_cgroup_zoneinfo(pc);
> >  	unlock_page_cgroup(pc);
> > +	local_irq_enable();
> >  
> >  	memcg_check_events(mem, page);
> >  	/* at swapout, this memcg will be accessed to record to swap */
> > @@ -2232,6 +2246,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
> >  
> >  unlock_out:
> >  	unlock_page_cgroup(pc);
> > +	local_irq_enable();
> >  	return NULL;
> >  }
> >  
> > @@ -2424,12 +2439,14 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
> >  		return 0;
> >  
> >  	pc = lookup_page_cgroup(page);
> > +	local_irq_disable();
> >  	lock_page_cgroup(pc);
> >  	if (PageCgroupUsed(pc)) {
> >  		mem = pc->mem_cgroup;
> >  		css_get(&mem->css);
> >  	}
> >  	unlock_page_cgroup(pc);
> > +	local_irq_enable();
> >  
> >  	if (mem) {
> >  		ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
> > -- 
> > 1.6.4
> > 
> > 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <dont@kvack.org>

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure
  2010-03-08  8:31             ` KAMEZAWA Hiroyuki
  2010-03-09  0:12               ` Andrea Righi
@ 2010-03-09  0:18               ` Daisuke Nishimura
  2010-03-09  0:20                 ` KAMEZAWA Hiroyuki
  1 sibling, 1 reply; 41+ messages in thread
From: Daisuke Nishimura @ 2010-03-09  0:18 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Andrea Righi, Balbir Singh, Vivek Goyal, Peter Zijlstra,
	Trond Myklebust, Suleiman Souhlal, Greg Thelen,
	Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
	linux-mm, Daisuke Nishimura

On Mon, 8 Mar 2010 17:31:00 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> On Mon, 8 Mar 2010 17:07:11 +0900
> Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> 
> > On Mon, 8 Mar 2010 11:37:11 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > > On Mon, 8 Mar 2010 11:17:24 +0900
> > > Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> > > 
> > > > > But IIRC, clear_writeback is done under treelock.... No ?
> > > > > 
> > > > The place where NR_WRITEBACK is updated is out of tree_lock.
> > > > 
> > > >    1311 int test_clear_page_writeback(struct page *page)
> > > >    1312 {
> > > >    1313         struct address_space *mapping = page_mapping(page);
> > > >    1314         int ret;
> > > >    1315
> > > >    1316         if (mapping) {
> > > >    1317                 struct backing_dev_info *bdi = mapping->backing_dev_info;
> > > >    1318                 unsigned long flags;
> > > >    1319
> > > >    1320                 spin_lock_irqsave(&mapping->tree_lock, flags);
> > > >    1321                 ret = TestClearPageWriteback(page);
> > > >    1322                 if (ret) {
> > > >    1323                         radix_tree_tag_clear(&mapping->page_tree,
> > > >    1324                                                 page_index(page),
> > > >    1325                                                 PAGECACHE_TAG_WRITEBACK);
> > > >    1326                         if (bdi_cap_account_writeback(bdi)) {
> > > >    1327                                 __dec_bdi_stat(bdi, BDI_WRITEBACK);
> > > >    1328                                 __bdi_writeout_inc(bdi);
> > > >    1329                         }
> > > >    1330                 }
> > > >    1331                 spin_unlock_irqrestore(&mapping->tree_lock, flags);
> > > >    1332         } else {
> > > >    1333                 ret = TestClearPageWriteback(page);
> > > >    1334         }
> > > >    1335         if (ret)
> > > >    1336                 dec_zone_page_state(page, NR_WRITEBACK);
> > > >    1337         return ret;
> > > >    1338 }
> > > 
> > > We can move this up to under tree_lock. Considering memcg, all our target has "mapping".
> > > 
> > > If we newly account bounce-buffers (for NILFS, FUSE, etc..), which has no ->mapping,
> > > we need much more complex new charge/uncharge theory.
> > > 
> > > But yes, adding new lock scheme seems complicated. (Sorry Andrea.)
> > > My concerns is performance. We may need somehing new re-implementation of
> > > locks/migrate/charge/uncharge.
> > > 
> > I agree. Performance is my concern too.
> > 
> > I made a patch below and measured the time(average of 10 times) of kernel build
> > on tmpfs(make -j8 on 8 CPU machine with 2.6.33 defconfig).
> > 
> > <before>
> > - root cgroup: 190.47 sec
> > - child cgroup: 192.81 sec
> > 
> > <after>
> > - root cgroup: 191.06 sec
> > - child cgroup: 193.06 sec
> > 
> > Hmm... about 0.3% slower for root, 0.1% slower for child.
> > 
> 
> Hmm...accepatable ? (sounds it's in error-range)
> 
> BTW, why local_irq_disable() ? 
> local_irq_save()/restore() isn't better ?
> 
I don't have any strong reason. All lock_page_cgroup() call sites are *now* called with
irqs enabled, so I just used disable()/enable() instead of save()/restore().
I think disable()/enable() is slightly better in those cases because we don't need to
save/restore the eflags register with pushf/popf, but I don't have any numbers, and there
probably wouldn't be a big difference in performance.


Thanks,
Daisuke Nishimura.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <dont@kvack.org>

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure
  2010-03-09  0:12               ` Andrea Righi
@ 2010-03-09  0:19                 ` KAMEZAWA Hiroyuki
  2010-03-09  1:29                   ` [PATCH mmotm 2.5/4] memcg: disable irq at page cgroup lock (Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure) Daisuke Nishimura
  0 siblings, 1 reply; 41+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-03-09  0:19 UTC (permalink / raw)
  To: Andrea Righi
  Cc: Daisuke Nishimura, Balbir Singh, Vivek Goyal, Peter Zijlstra,
	Trond Myklebust, Suleiman Souhlal, Greg Thelen,
	Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
	linux-mm

On Tue, 9 Mar 2010 01:12:52 +0100
Andrea Righi <arighi@develer.com> wrote:

> On Mon, Mar 08, 2010 at 05:31:00PM +0900, KAMEZAWA Hiroyuki wrote:
> > On Mon, 8 Mar 2010 17:07:11 +0900
> > Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> > 
> > > On Mon, 8 Mar 2010 11:37:11 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > > > On Mon, 8 Mar 2010 11:17:24 +0900
> > > > Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> > > > 
> > > > > > But IIRC, clear_writeback is done under treelock.... No ?
> > > > > > 
> > > > > The place where NR_WRITEBACK is updated is out of tree_lock.
> > > > > 
> > > > >    1311 int test_clear_page_writeback(struct page *page)
> > > > >    1312 {
> > > > >    1313         struct address_space *mapping = page_mapping(page);
> > > > >    1314         int ret;
> > > > >    1315
> > > > >    1316         if (mapping) {
> > > > >    1317                 struct backing_dev_info *bdi = mapping->backing_dev_info;
> > > > >    1318                 unsigned long flags;
> > > > >    1319
> > > > >    1320                 spin_lock_irqsave(&mapping->tree_lock, flags);
> > > > >    1321                 ret = TestClearPageWriteback(page);
> > > > >    1322                 if (ret) {
> > > > >    1323                         radix_tree_tag_clear(&mapping->page_tree,
> > > > >    1324                                                 page_index(page),
> > > > >    1325                                                 PAGECACHE_TAG_WRITEBACK);
> > > > >    1326                         if (bdi_cap_account_writeback(bdi)) {
> > > > >    1327                                 __dec_bdi_stat(bdi, BDI_WRITEBACK);
> > > > >    1328                                 __bdi_writeout_inc(bdi);
> > > > >    1329                         }
> > > > >    1330                 }
> > > > >    1331                 spin_unlock_irqrestore(&mapping->tree_lock, flags);
> > > > >    1332         } else {
> > > > >    1333                 ret = TestClearPageWriteback(page);
> > > > >    1334         }
> > > > >    1335         if (ret)
> > > > >    1336                 dec_zone_page_state(page, NR_WRITEBACK);
> > > > >    1337         return ret;
> > > > >    1338 }
> > > > 
> > > > We can move this up to under tree_lock. Considering memcg, all our target has "mapping".
> > > > 
> > > > If we newly account bounce-buffers (for NILFS, FUSE, etc..), which has no ->mapping,
> > > > we need much more complex new charge/uncharge theory.
> > > > 
> > > > But yes, adding new lock scheme seems complicated. (Sorry Andrea.)
> > > > My concerns is performance. We may need somehing new re-implementation of
> > > > locks/migrate/charge/uncharge.
> > > > 
> > > I agree. Performance is my concern too.
> > > 
> > > I made a patch below and measured the time(average of 10 times) of kernel build
> > > on tmpfs(make -j8 on 8 CPU machine with 2.6.33 defconfig).
> > > 
> > > <before>
> > > - root cgroup: 190.47 sec
> > > - child cgroup: 192.81 sec
> > > 
> > > <after>
> > > - root cgroup: 191.06 sec
> > > - child cgroup: 193.06 sec
> > > 
> > > Hmm... about 0.3% slower for root, 0.1% slower for child.
> > > 
> > 
> > Hmm...accepatable ? (sounds it's in error-range)
> > 
> > BTW, why local_irq_disable() ? 
> > local_irq_save()/restore() isn't better ?
> 
> Probably there's not the overhead of saving flags? 
maybe.

> Anyway, it would make the code much more readable...
> 
ok.

Please go ahead in this direction. Nishimura-san, would you post an
independent patch? If not, Andrea-san, please do.

Thanks,
-Kame

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <dont@kvack.org>

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure
  2010-03-09  0:18               ` [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure Daisuke Nishimura
@ 2010-03-09  0:20                 ` KAMEZAWA Hiroyuki
  2010-03-09  0:52                   ` Daisuke Nishimura
  0 siblings, 1 reply; 41+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-03-09  0:20 UTC (permalink / raw)
  To: Daisuke Nishimura
  Cc: Andrea Righi, Balbir Singh, Vivek Goyal, Peter Zijlstra,
	Trond Myklebust, Suleiman Souhlal, Greg Thelen,
	Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
	linux-mm

On Tue, 9 Mar 2010 09:18:45 +0900
Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:

> On Mon, 8 Mar 2010 17:31:00 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > On Mon, 8 Mar 2010 17:07:11 +0900
> > Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:

> > Hmm...accepatable ? (sounds it's in error-range)
> > 
> > BTW, why local_irq_disable() ? 
> > local_irq_save()/restore() isn't better ?
> > 
> I don't have any strong reason. All of lock_page_cgroup() is *now* called w/o irq disabled,
> so I used just disable()/enable() instead of save()/restore().

My point is that this will soon be called under tree_lock.
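
A hypothetical example of what I mean (not real code, just a sketch): if the
plain disable()/enable() pair were used for a memcg update done under
mapping->tree_lock, the enable side would turn IRQs back on while tree_lock
is still held:

static void example_under_tree_lock(struct address_space *mapping,
				    struct page_cgroup *pc)
{
	unsigned long flags;

	spin_lock_irqsave(&mapping->tree_lock, flags);	/* IRQs now disabled */

	local_irq_disable();	/* harmless: IRQs are already off */
	lock_page_cgroup(pc);
	/* ... update the per-memcg writeback counter ... */
	unlock_page_cgroup(pc);
	local_irq_enable();	/* BUG: re-enables IRQs while tree_lock is held */

	spin_unlock_irqrestore(&mapping->tree_lock, flags);
}

With save()/restore() the inner section would leave the IRQ state as it found
it, and nesting under tree_lock becomes safe.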

Thanks,
-Kame

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <dont@kvack.org>

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure
  2010-03-09  0:20                 ` KAMEZAWA Hiroyuki
@ 2010-03-09  0:52                   ` Daisuke Nishimura
  0 siblings, 0 replies; 41+ messages in thread
From: Daisuke Nishimura @ 2010-03-09  0:52 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Andrea Righi, Balbir Singh, Vivek Goyal, Peter Zijlstra,
	Trond Myklebust, Suleiman Souhlal, Greg Thelen,
	Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
	linux-mm, Daisuke Nishimura

On Tue, 9 Mar 2010 09:20:54 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> On Tue, 9 Mar 2010 09:18:45 +0900
> Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> 
> > On Mon, 8 Mar 2010 17:31:00 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > > On Mon, 8 Mar 2010 17:07:11 +0900
> > > Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> 
> > > Hmm...accepatable ? (sounds it's in error-range)
> > > 
> > > BTW, why local_irq_disable() ? 
> > > local_irq_save()/restore() isn't better ?
> > > 
> > I don't have any strong reason. All of lock_page_cgroup() is *now* called w/o irq disabled,
> > so I used just disable()/enable() instead of save()/restore().
> 
> My point is, this will be used under treelock soon.
> 
I agree.

I'll update the patch using save()/restore(), and repost later.


Thanks,
Daisuke Nishimura.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <dont@kvack.org>

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH mmotm 2.5/4] memcg: disable irq at page cgroup lock (Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure)
  2010-03-09  0:19                 ` KAMEZAWA Hiroyuki
@ 2010-03-09  1:29                   ` Daisuke Nishimura
  2010-03-09  2:07                     ` KAMEZAWA Hiroyuki
                                       ` (2 more replies)
  0 siblings, 3 replies; 41+ messages in thread
From: Daisuke Nishimura @ 2010-03-09  1:29 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Andrea Righi, Balbir Singh, Vivek Goyal, Peter Zijlstra,
	Trond Myklebust, Suleiman Souhlal, Greg Thelen,
	Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
	linux-mm, Daisuke Nishimura

On Tue, 9 Mar 2010 09:19:14 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> On Tue, 9 Mar 2010 01:12:52 +0100
> Andrea Righi <arighi@develer.com> wrote:
> 
> > On Mon, Mar 08, 2010 at 05:31:00PM +0900, KAMEZAWA Hiroyuki wrote:
> > > On Mon, 8 Mar 2010 17:07:11 +0900
> > > Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> > > 
> > > > On Mon, 8 Mar 2010 11:37:11 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > > > > On Mon, 8 Mar 2010 11:17:24 +0900
> > > > > Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> > > > > 
> > > > > > > But IIRC, clear_writeback is done under treelock.... No ?
> > > > > > > 
> > > > > > The place where NR_WRITEBACK is updated is out of tree_lock.
> > > > > > 
> > > > > >    1311 int test_clear_page_writeback(struct page *page)
> > > > > >    1312 {
> > > > > >    1313         struct address_space *mapping = page_mapping(page);
> > > > > >    1314         int ret;
> > > > > >    1315
> > > > > >    1316         if (mapping) {
> > > > > >    1317                 struct backing_dev_info *bdi = mapping->backing_dev_info;
> > > > > >    1318                 unsigned long flags;
> > > > > >    1319
> > > > > >    1320                 spin_lock_irqsave(&mapping->tree_lock, flags);
> > > > > >    1321                 ret = TestClearPageWriteback(page);
> > > > > >    1322                 if (ret) {
> > > > > >    1323                         radix_tree_tag_clear(&mapping->page_tree,
> > > > > >    1324                                                 page_index(page),
> > > > > >    1325                                                 PAGECACHE_TAG_WRITEBACK);
> > > > > >    1326                         if (bdi_cap_account_writeback(bdi)) {
> > > > > >    1327                                 __dec_bdi_stat(bdi, BDI_WRITEBACK);
> > > > > >    1328                                 __bdi_writeout_inc(bdi);
> > > > > >    1329                         }
> > > > > >    1330                 }
> > > > > >    1331                 spin_unlock_irqrestore(&mapping->tree_lock, flags);
> > > > > >    1332         } else {
> > > > > >    1333                 ret = TestClearPageWriteback(page);
> > > > > >    1334         }
> > > > > >    1335         if (ret)
> > > > > >    1336                 dec_zone_page_state(page, NR_WRITEBACK);
> > > > > >    1337         return ret;
> > > > > >    1338 }
> > > > > 
> > > > > We can move this up to under tree_lock. Considering memcg, all our target has "mapping".
> > > > > 
> > > > > If we newly account bounce-buffers (for NILFS, FUSE, etc..), which has no ->mapping,
> > > > > we need much more complex new charge/uncharge theory.
> > > > > 
> > > > > But yes, adding new lock scheme seems complicated. (Sorry Andrea.)
> > > > > My concerns is performance. We may need somehing new re-implementation of
> > > > > locks/migrate/charge/uncharge.
> > > > > 
> > > > I agree. Performance is my concern too.
> > > > 
> > > > I made a patch below and measured the time(average of 10 times) of kernel build
> > > > on tmpfs(make -j8 on 8 CPU machine with 2.6.33 defconfig).
> > > > 
> > > > <before>
> > > > - root cgroup: 190.47 sec
> > > > - child cgroup: 192.81 sec
> > > > 
> > > > <after>
> > > > - root cgroup: 191.06 sec
> > > > - child cgroup: 193.06 sec
> > > > 
> > > > Hmm... about 0.3% slower for root, 0.1% slower for child.
> > > > 
> > > 
> > > Hmm...accepatable ? (sounds it's in error-range)
> > > 
> > > BTW, why local_irq_disable() ? 
> > > local_irq_save()/restore() isn't better ?
> > 
> > Probably there's not the overhead of saving flags? 
> maybe.
> 
> > Anyway, it would make the code much more readable...
> > 
> ok.
> 
> please go ahead in this direction. Nishimura-san, would you post an
> independent patch ? If no, Andrea-san, please.
> 
This is the updated version.

Andrea-san, can you merge this into your patch set ?

===
From: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>

In the current implementation, we don't have to disable irqs at lock_page_cgroup()
because the lock is never acquired in interrupt context.
But a later patch is going to call it in interrupt context or with irqs already
disabled, so this patch saves and disables irqs at lock_page_cgroup() and restores
them at unlock_page_cgroup().

Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
---
 include/linux/page_cgroup.h |   16 ++++++++++++++--
 mm/memcontrol.c             |   43 +++++++++++++++++++++++++------------------
 2 files changed, 39 insertions(+), 20 deletions(-)

diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index 30b0813..0d2f92c 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -83,16 +83,28 @@ static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc)
 	return page_zonenum(pc->page);
 }
 
-static inline void lock_page_cgroup(struct page_cgroup *pc)
+static inline void __lock_page_cgroup(struct page_cgroup *pc)
 {
 	bit_spin_lock(PCG_LOCK, &pc->flags);
 }
 
-static inline void unlock_page_cgroup(struct page_cgroup *pc)
+static inline void __unlock_page_cgroup(struct page_cgroup *pc)
 {
 	bit_spin_unlock(PCG_LOCK, &pc->flags);
 }
 
+#define lock_page_cgroup(pc, flags)		\
+	do {					\
+		local_irq_save(flags);		\
+		__lock_page_cgroup(pc);		\
+	} while (0)
+
+#define unlock_page_cgroup(pc, flags)		\
+	do {					\
+		__unlock_page_cgroup(pc);	\
+		local_irq_restore(flags);	\
+	} while (0)
+
 #else /* CONFIG_CGROUP_MEM_RES_CTLR */
 struct page_cgroup;
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7fab84e..a9fd736 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1352,12 +1352,13 @@ void mem_cgroup_update_file_mapped(struct page *page, int val)
 {
 	struct mem_cgroup *mem;
 	struct page_cgroup *pc;
+	unsigned long flags;
 
 	pc = lookup_page_cgroup(page);
 	if (unlikely(!pc))
 		return;
 
-	lock_page_cgroup(pc);
+	lock_page_cgroup(pc, flags);
 	mem = pc->mem_cgroup;
 	if (!mem)
 		goto done;
@@ -1371,7 +1372,7 @@ void mem_cgroup_update_file_mapped(struct page *page, int val)
 	__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val);
 
 done:
-	unlock_page_cgroup(pc);
+	unlock_page_cgroup(pc, flags);
 }
 
 /*
@@ -1705,11 +1706,12 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
 	struct page_cgroup *pc;
 	unsigned short id;
 	swp_entry_t ent;
+	unsigned long flags;
 
 	VM_BUG_ON(!PageLocked(page));
 
 	pc = lookup_page_cgroup(page);
-	lock_page_cgroup(pc);
+	lock_page_cgroup(pc, flags);
 	if (PageCgroupUsed(pc)) {
 		mem = pc->mem_cgroup;
 		if (mem && !css_tryget(&mem->css))
@@ -1723,7 +1725,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
 			mem = NULL;
 		rcu_read_unlock();
 	}
-	unlock_page_cgroup(pc);
+	unlock_page_cgroup(pc, flags);
 	return mem;
 }
 
@@ -1736,13 +1738,15 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 				     struct page_cgroup *pc,
 				     enum charge_type ctype)
 {
+	unsigned long flags;
+
 	/* try_charge() can return NULL to *memcg, taking care of it. */
 	if (!mem)
 		return;
 
-	lock_page_cgroup(pc);
+	lock_page_cgroup(pc, flags);
 	if (unlikely(PageCgroupUsed(pc))) {
-		unlock_page_cgroup(pc);
+		unlock_page_cgroup(pc, flags);
 		mem_cgroup_cancel_charge(mem);
 		return;
 	}
@@ -1772,7 +1776,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 
 	mem_cgroup_charge_statistics(mem, pc, true);
 
-	unlock_page_cgroup(pc);
+	unlock_page_cgroup(pc, flags);
 	/*
 	 * "charge_statistics" updated event counter. Then, check it.
 	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
@@ -1842,12 +1846,13 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
 		struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
 {
 	int ret = -EINVAL;
-	lock_page_cgroup(pc);
+	unsigned long flags;
+	lock_page_cgroup(pc, flags);
 	if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
 		__mem_cgroup_move_account(pc, from, to, uncharge);
 		ret = 0;
 	}
-	unlock_page_cgroup(pc);
+	unlock_page_cgroup(pc, flags);
 	/*
 	 * check events
 	 */
@@ -1974,17 +1979,17 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 	 */
 	if (!(gfp_mask & __GFP_WAIT)) {
 		struct page_cgroup *pc;
-
+		unsigned long flags;
 
 		pc = lookup_page_cgroup(page);
 		if (!pc)
 			return 0;
-		lock_page_cgroup(pc);
+		lock_page_cgroup(pc, flags);
 		if (PageCgroupUsed(pc)) {
-			unlock_page_cgroup(pc);
+			unlock_page_cgroup(pc, flags);
 			return 0;
 		}
-		unlock_page_cgroup(pc);
+		unlock_page_cgroup(pc, flags);
 	}
 
 	if (unlikely(!mm && !mem))
@@ -2166,6 +2171,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	struct page_cgroup *pc;
 	struct mem_cgroup *mem = NULL;
 	struct mem_cgroup_per_zone *mz;
+	unsigned long flags;
 
 	if (mem_cgroup_disabled())
 		return NULL;
@@ -2180,7 +2186,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	if (unlikely(!pc || !PageCgroupUsed(pc)))
 		return NULL;
 
-	lock_page_cgroup(pc);
+	lock_page_cgroup(pc, flags);
 
 	mem = pc->mem_cgroup;
 
@@ -2219,7 +2225,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	 */
 
 	mz = page_cgroup_zoneinfo(pc);
-	unlock_page_cgroup(pc);
+	unlock_page_cgroup(pc, flags);
 
 	memcg_check_events(mem, page);
 	/* at swapout, this memcg will be accessed to record to swap */
@@ -2229,7 +2235,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	return mem;
 
 unlock_out:
-	unlock_page_cgroup(pc);
+	unlock_page_cgroup(pc, flags);
 	return NULL;
 }
 
@@ -2417,17 +2423,18 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
 	struct page_cgroup *pc;
 	struct mem_cgroup *mem = NULL;
 	int ret = 0;
+	unsigned long flags;
 
 	if (mem_cgroup_disabled())
 		return 0;
 
 	pc = lookup_page_cgroup(page);
-	lock_page_cgroup(pc);
+	lock_page_cgroup(pc, flags);
 	if (PageCgroupUsed(pc)) {
 		mem = pc->mem_cgroup;
 		css_get(&mem->css);
 	}
-	unlock_page_cgroup(pc);
+	unlock_page_cgroup(pc, flags);
 
 	if (mem) {
 		ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
-- 
1.6.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <dont@kvack.org>

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* Re: [PATCH mmotm 2.5/4] memcg: disable irq at page cgroup lock (Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure)
  2010-03-09  1:29                   ` [PATCH mmotm 2.5/4] memcg: disable irq at page cgroup lock (Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure) Daisuke Nishimura
@ 2010-03-09  2:07                     ` KAMEZAWA Hiroyuki
  2010-03-09  4:50                     ` Balbir Singh
  2010-03-09  9:07                     ` Andrea Righi
  2 siblings, 0 replies; 41+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-03-09  2:07 UTC (permalink / raw)
  To: Daisuke Nishimura
  Cc: Andrea Righi, Balbir Singh, Vivek Goyal, Peter Zijlstra,
	Trond Myklebust, Suleiman Souhlal, Greg Thelen,
	Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
	linux-mm

On Tue, 9 Mar 2010 10:29:28 +0900
Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> > please go ahead in this direction. Nishimura-san, would you post an
> > independent patch ? If no, Andrea-san, please.
> > 
> This is the updated version.
> 
> Andrea-san, can you merge this into your patch set ?
> 
> ===
> From: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
> 
> In current implementation, we don't have to disable irq at lock_page_cgroup()
> because the lock is never acquired in interrupt context.
> But we are going to call it in later patch in an interrupt context or with
> irq disabled, so this patch disables irq at lock_page_cgroup() and enables it
> at unlock_page_cgroup().
> 
> Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>

Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>



> ---
>  include/linux/page_cgroup.h |   16 ++++++++++++++--
>  mm/memcontrol.c             |   43 +++++++++++++++++++++++++------------------
>  2 files changed, 39 insertions(+), 20 deletions(-)
> 
> diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
> index 30b0813..0d2f92c 100644
> --- a/include/linux/page_cgroup.h
> +++ b/include/linux/page_cgroup.h
> @@ -83,16 +83,28 @@ static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc)
>  	return page_zonenum(pc->page);
>  }
>  
> -static inline void lock_page_cgroup(struct page_cgroup *pc)
> +static inline void __lock_page_cgroup(struct page_cgroup *pc)
>  {
>  	bit_spin_lock(PCG_LOCK, &pc->flags);
>  }
>  
> -static inline void unlock_page_cgroup(struct page_cgroup *pc)
> +static inline void __unlock_page_cgroup(struct page_cgroup *pc)
>  {
>  	bit_spin_unlock(PCG_LOCK, &pc->flags);
>  }
>  
> +#define lock_page_cgroup(pc, flags)		\
> +	do {					\
> +		local_irq_save(flags);		\
> +		__lock_page_cgroup(pc);		\
> +	} while (0)
> +
> +#define unlock_page_cgroup(pc, flags)		\
> +	do {					\
> +		__unlock_page_cgroup(pc);	\
> +		local_irq_restore(flags);	\
> +	} while (0)
> +
>  #else /* CONFIG_CGROUP_MEM_RES_CTLR */
>  struct page_cgroup;
>  
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 7fab84e..a9fd736 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1352,12 +1352,13 @@ void mem_cgroup_update_file_mapped(struct page *page, int val)
>  {
>  	struct mem_cgroup *mem;
>  	struct page_cgroup *pc;
> +	unsigned long flags;
>  
>  	pc = lookup_page_cgroup(page);
>  	if (unlikely(!pc))
>  		return;
>  
> -	lock_page_cgroup(pc);
> +	lock_page_cgroup(pc, flags);
>  	mem = pc->mem_cgroup;
>  	if (!mem)
>  		goto done;
> @@ -1371,7 +1372,7 @@ void mem_cgroup_update_file_mapped(struct page *page, int val)
>  	__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val);
>  
>  done:
> -	unlock_page_cgroup(pc);
> +	unlock_page_cgroup(pc, flags);
>  }
>  
>  /*
> @@ -1705,11 +1706,12 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
>  	struct page_cgroup *pc;
>  	unsigned short id;
>  	swp_entry_t ent;
> +	unsigned long flags;
>  
>  	VM_BUG_ON(!PageLocked(page));
>  
>  	pc = lookup_page_cgroup(page);
> -	lock_page_cgroup(pc);
> +	lock_page_cgroup(pc, flags);
>  	if (PageCgroupUsed(pc)) {
>  		mem = pc->mem_cgroup;
>  		if (mem && !css_tryget(&mem->css))
> @@ -1723,7 +1725,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
>  			mem = NULL;
>  		rcu_read_unlock();
>  	}
> -	unlock_page_cgroup(pc);
> +	unlock_page_cgroup(pc, flags);
>  	return mem;
>  }
>  
> @@ -1736,13 +1738,15 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
>  				     struct page_cgroup *pc,
>  				     enum charge_type ctype)
>  {
> +	unsigned long flags;
> +
>  	/* try_charge() can return NULL to *memcg, taking care of it. */
>  	if (!mem)
>  		return;
>  
> -	lock_page_cgroup(pc);
> +	lock_page_cgroup(pc, flags);
>  	if (unlikely(PageCgroupUsed(pc))) {
> -		unlock_page_cgroup(pc);
> +		unlock_page_cgroup(pc, flags);
>  		mem_cgroup_cancel_charge(mem);
>  		return;
>  	}
> @@ -1772,7 +1776,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
>  
>  	mem_cgroup_charge_statistics(mem, pc, true);
>  
> -	unlock_page_cgroup(pc);
> +	unlock_page_cgroup(pc, flags);
>  	/*
>  	 * "charge_statistics" updated event counter. Then, check it.
>  	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
> @@ -1842,12 +1846,13 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
>  		struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
>  {
>  	int ret = -EINVAL;
> -	lock_page_cgroup(pc);
> +	unsigned long flags;
> +	lock_page_cgroup(pc, flags);
>  	if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
>  		__mem_cgroup_move_account(pc, from, to, uncharge);
>  		ret = 0;
>  	}
> -	unlock_page_cgroup(pc);
> +	unlock_page_cgroup(pc, flags);
>  	/*
>  	 * check events
>  	 */
> @@ -1974,17 +1979,17 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
>  	 */
>  	if (!(gfp_mask & __GFP_WAIT)) {
>  		struct page_cgroup *pc;
> -
> +		unsigned long flags;
>  
>  		pc = lookup_page_cgroup(page);
>  		if (!pc)
>  			return 0;
> -		lock_page_cgroup(pc);
> +		lock_page_cgroup(pc, flags);
>  		if (PageCgroupUsed(pc)) {
> -			unlock_page_cgroup(pc);
> +			unlock_page_cgroup(pc, flags);
>  			return 0;
>  		}
> -		unlock_page_cgroup(pc);
> +		unlock_page_cgroup(pc, flags);
>  	}
>  
>  	if (unlikely(!mm && !mem))
> @@ -2166,6 +2171,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
>  	struct page_cgroup *pc;
>  	struct mem_cgroup *mem = NULL;
>  	struct mem_cgroup_per_zone *mz;
> +	unsigned long flags;
>  
>  	if (mem_cgroup_disabled())
>  		return NULL;
> @@ -2180,7 +2186,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
>  	if (unlikely(!pc || !PageCgroupUsed(pc)))
>  		return NULL;
>  
> -	lock_page_cgroup(pc);
> +	lock_page_cgroup(pc, flags);
>  
>  	mem = pc->mem_cgroup;
>  
> @@ -2219,7 +2225,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
>  	 */
>  
>  	mz = page_cgroup_zoneinfo(pc);
> -	unlock_page_cgroup(pc);
> +	unlock_page_cgroup(pc, flags);
>  
>  	memcg_check_events(mem, page);
>  	/* at swapout, this memcg will be accessed to record to swap */
> @@ -2229,7 +2235,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
>  	return mem;
>  
>  unlock_out:
> -	unlock_page_cgroup(pc);
> +	unlock_page_cgroup(pc, flags);
>  	return NULL;
>  }
>  
> @@ -2417,17 +2423,18 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
>  	struct page_cgroup *pc;
>  	struct mem_cgroup *mem = NULL;
>  	int ret = 0;
> +	unsigned long flags;
>  
>  	if (mem_cgroup_disabled())
>  		return 0;
>  
>  	pc = lookup_page_cgroup(page);
> -	lock_page_cgroup(pc);
> +	lock_page_cgroup(pc, flags);
>  	if (PageCgroupUsed(pc)) {
>  		mem = pc->mem_cgroup;
>  		css_get(&mem->css);
>  	}
> -	unlock_page_cgroup(pc);
> +	unlock_page_cgroup(pc, flags);
>  
>  	if (mem) {
>  		ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
> -- 
> 1.6.4
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <dont@kvack.org>
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <dont@kvack.org>

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH mmotm 2.5/4] memcg: disable irq at page cgroup lock (Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure)
  2010-03-09  1:29                   ` [PATCH mmotm 2.5/4] memcg: disable irq at page cgroup lock (Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure) Daisuke Nishimura
  2010-03-09  2:07                     ` KAMEZAWA Hiroyuki
@ 2010-03-09  4:50                     ` Balbir Singh
  2010-03-10  1:43                       ` Daisuke Nishimura
  2010-03-09  9:07                     ` Andrea Righi
  2 siblings, 1 reply; 41+ messages in thread
From: Balbir Singh @ 2010-03-09  4:50 UTC (permalink / raw)
  To: Daisuke Nishimura
  Cc: KAMEZAWA Hiroyuki, Andrea Righi, Vivek Goyal, Peter Zijlstra,
	Trond Myklebust, Suleiman Souhlal, Greg Thelen,
	Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
	linux-mm

* nishimura@mxp.nes.nec.co.jp <nishimura@mxp.nes.nec.co.jp> [2010-03-09 10:29:28]:

> On Tue, 9 Mar 2010 09:19:14 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > On Tue, 9 Mar 2010 01:12:52 +0100
> > Andrea Righi <arighi@develer.com> wrote:
> > 
> > > On Mon, Mar 08, 2010 at 05:31:00PM +0900, KAMEZAWA Hiroyuki wrote:
> > > > On Mon, 8 Mar 2010 17:07:11 +0900
> > > > Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> > > > 
> > > > > On Mon, 8 Mar 2010 11:37:11 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > > > > > On Mon, 8 Mar 2010 11:17:24 +0900
> > > > > > Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> > > > > > 
> > > > > > > > But IIRC, clear_writeback is done under treelock.... No ?
> > > > > > > > 
> > > > > > > The place where NR_WRITEBACK is updated is out of tree_lock.
> > > > > > > 
> > > > > > >    1311 int test_clear_page_writeback(struct page *page)
> > > > > > >    1312 {
> > > > > > >    1313         struct address_space *mapping = page_mapping(page);
> > > > > > >    1314         int ret;
> > > > > > >    1315
> > > > > > >    1316         if (mapping) {
> > > > > > >    1317                 struct backing_dev_info *bdi = mapping->backing_dev_info;
> > > > > > >    1318                 unsigned long flags;
> > > > > > >    1319
> > > > > > >    1320                 spin_lock_irqsave(&mapping->tree_lock, flags);
> > > > > > >    1321                 ret = TestClearPageWriteback(page);
> > > > > > >    1322                 if (ret) {
> > > > > > >    1323                         radix_tree_tag_clear(&mapping->page_tree,
> > > > > > >    1324                                                 page_index(page),
> > > > > > >    1325                                                 PAGECACHE_TAG_WRITEBACK);
> > > > > > >    1326                         if (bdi_cap_account_writeback(bdi)) {
> > > > > > >    1327                                 __dec_bdi_stat(bdi, BDI_WRITEBACK);
> > > > > > >    1328                                 __bdi_writeout_inc(bdi);
> > > > > > >    1329                         }
> > > > > > >    1330                 }
> > > > > > >    1331                 spin_unlock_irqrestore(&mapping->tree_lock, flags);
> > > > > > >    1332         } else {
> > > > > > >    1333                 ret = TestClearPageWriteback(page);
> > > > > > >    1334         }
> > > > > > >    1335         if (ret)
> > > > > > >    1336                 dec_zone_page_state(page, NR_WRITEBACK);
> > > > > > >    1337         return ret;
> > > > > > >    1338 }
> > > > > > 
> > > > > > We can move this up to under tree_lock. Considering memcg, all our target has "mapping".
> > > > > > 
> > > > > > If we newly account bounce-buffers (for NILFS, FUSE, etc..), which has no ->mapping,
> > > > > > we need much more complex new charge/uncharge theory.
> > > > > > 
> > > > > > But yes, adding new lock scheme seems complicated. (Sorry Andrea.)
> > > > > > My concerns is performance. We may need somehing new re-implementation of
> > > > > > locks/migrate/charge/uncharge.
> > > > > > 
> > > > > I agree. Performance is my concern too.
> > > > > 
> > > > > I made a patch below and measured the time(average of 10 times) of kernel build
> > > > > on tmpfs(make -j8 on 8 CPU machine with 2.6.33 defconfig).
> > > > > 
> > > > > <before>
> > > > > - root cgroup: 190.47 sec
> > > > > - child cgroup: 192.81 sec
> > > > > 
> > > > > <after>
> > > > > - root cgroup: 191.06 sec
> > > > > - child cgroup: 193.06 sec
> > > > > 
> > > > > Hmm... about 0.3% slower for root, 0.1% slower for child.
> > > > > 
> > > > 
> > > > Hmm...accepatable ? (sounds it's in error-range)
> > > > 
> > > > BTW, why local_irq_disable() ? 
> > > > local_irq_save()/restore() isn't better ?
> > > 
> > > Probably there's not the overhead of saving flags? 
> > maybe.
> > 
> > > Anyway, it would make the code much more readable...
> > > 
> > ok.
> > 
> > please go ahead in this direction. Nishimura-san, would you post an
> > independent patch ? If no, Andrea-san, please.
> > 
> This is the updated version.
> 
> Andrea-san, can you merge this into your patch set ?
> 

Please please measure the performance overhead of this change.

-- 
	Three Cheers,
	Balbir

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <dont@kvack.org>

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH mmotm 2.5/4] memcg: disable irq at page cgroup lock (Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure)
  2010-03-09  1:29                   ` [PATCH mmotm 2.5/4] memcg: disable irq at page cgroup lock (Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure) Daisuke Nishimura
  2010-03-09  2:07                     ` KAMEZAWA Hiroyuki
  2010-03-09  4:50                     ` Balbir Singh
@ 2010-03-09  9:07                     ` Andrea Righi
  2 siblings, 0 replies; 41+ messages in thread
From: Andrea Righi @ 2010-03-09  9:07 UTC (permalink / raw)
  To: Daisuke Nishimura
  Cc: KAMEZAWA Hiroyuki, Balbir Singh, Vivek Goyal, Peter Zijlstra,
	Trond Myklebust, Suleiman Souhlal, Greg Thelen,
	Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
	linux-mm

On Tue, Mar 09, 2010 at 10:29:28AM +0900, Daisuke Nishimura wrote:
> On Tue, 9 Mar 2010 09:19:14 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > On Tue, 9 Mar 2010 01:12:52 +0100
> > Andrea Righi <arighi@develer.com> wrote:
> > 
> > > On Mon, Mar 08, 2010 at 05:31:00PM +0900, KAMEZAWA Hiroyuki wrote:
> > > > On Mon, 8 Mar 2010 17:07:11 +0900
> > > > Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> > > > 
> > > > > On Mon, 8 Mar 2010 11:37:11 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > > > > > On Mon, 8 Mar 2010 11:17:24 +0900
> > > > > > Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> > > > > > 
> > > > > > > > But IIRC, clear_writeback is done under treelock.... No ?
> > > > > > > > 
> > > > > > > The place where NR_WRITEBACK is updated is out of tree_lock.
> > > > > > > 
> > > > > > >    1311 int test_clear_page_writeback(struct page *page)
> > > > > > >    1312 {
> > > > > > >    1313         struct address_space *mapping = page_mapping(page);
> > > > > > >    1314         int ret;
> > > > > > >    1315
> > > > > > >    1316         if (mapping) {
> > > > > > >    1317                 struct backing_dev_info *bdi = mapping->backing_dev_info;
> > > > > > >    1318                 unsigned long flags;
> > > > > > >    1319
> > > > > > >    1320                 spin_lock_irqsave(&mapping->tree_lock, flags);
> > > > > > >    1321                 ret = TestClearPageWriteback(page);
> > > > > > >    1322                 if (ret) {
> > > > > > >    1323                         radix_tree_tag_clear(&mapping->page_tree,
> > > > > > >    1324                                                 page_index(page),
> > > > > > >    1325                                                 PAGECACHE_TAG_WRITEBACK);
> > > > > > >    1326                         if (bdi_cap_account_writeback(bdi)) {
> > > > > > >    1327                                 __dec_bdi_stat(bdi, BDI_WRITEBACK);
> > > > > > >    1328                                 __bdi_writeout_inc(bdi);
> > > > > > >    1329                         }
> > > > > > >    1330                 }
> > > > > > >    1331                 spin_unlock_irqrestore(&mapping->tree_lock, flags);
> > > > > > >    1332         } else {
> > > > > > >    1333                 ret = TestClearPageWriteback(page);
> > > > > > >    1334         }
> > > > > > >    1335         if (ret)
> > > > > > >    1336                 dec_zone_page_state(page, NR_WRITEBACK);
> > > > > > >    1337         return ret;
> > > > > > >    1338 }
> > > > > > 
> > > > > > We can move this up to under tree_lock. Considering memcg, all our target has "mapping".
> > > > > > 
> > > > > > If we newly account bounce-buffers (for NILFS, FUSE, etc..), which has no ->mapping,
> > > > > > we need much more complex new charge/uncharge theory.
> > > > > > 
> > > > > > But yes, adding new lock scheme seems complicated. (Sorry Andrea.)
> > > > > > My concerns is performance. We may need somehing new re-implementation of
> > > > > > locks/migrate/charge/uncharge.
> > > > > > 
> > > > > I agree. Performance is my concern too.
> > > > > 
> > > > > I made a patch below and measured the time(average of 10 times) of kernel build
> > > > > on tmpfs(make -j8 on 8 CPU machine with 2.6.33 defconfig).
> > > > > 
> > > > > <before>
> > > > > - root cgroup: 190.47 sec
> > > > > - child cgroup: 192.81 sec
> > > > > 
> > > > > <after>
> > > > > - root cgroup: 191.06 sec
> > > > > - child cgroup: 193.06 sec
> > > > > 
> > > > > Hmm... about 0.3% slower for root, 0.1% slower for child.
> > > > > 
> > > > 
> > > > Hmm...accepatable ? (sounds it's in error-range)
> > > > 
> > > > BTW, why local_irq_disable() ? 
> > > > local_irq_save()/restore() isn't better ?
> > > 
> > > Probably there's not the overhead of saving flags? 
> > maybe.
> > 
> > > Anyway, it would make the code much more readable...
> > > 
> > ok.
> > 
> > please go ahead in this direction. Nishimura-san, would you post an
> > independent patch ? If no, Andrea-san, please.
> > 
> This is the updated version.
> 
> Andrea-san, can you merge this into your patch set ?

OK, I'll merge, do some tests and post a new version.

Thanks!
-Andrea

> 
> ===
> From: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
> 
> In current implementation, we don't have to disable irq at lock_page_cgroup()
> because the lock is never acquired in interrupt context.
> But we are going to call it in later patch in an interrupt context or with
> irq disabled, so this patch disables irq at lock_page_cgroup() and enables it
> at unlock_page_cgroup().
> 
> Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
> ---
>  include/linux/page_cgroup.h |   16 ++++++++++++++--
>  mm/memcontrol.c             |   43 +++++++++++++++++++++++++------------------
>  2 files changed, 39 insertions(+), 20 deletions(-)
> 
> diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
> index 30b0813..0d2f92c 100644
> --- a/include/linux/page_cgroup.h
> +++ b/include/linux/page_cgroup.h
> @@ -83,16 +83,28 @@ static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc)
>  	return page_zonenum(pc->page);
>  }
>  
> -static inline void lock_page_cgroup(struct page_cgroup *pc)
> +static inline void __lock_page_cgroup(struct page_cgroup *pc)
>  {
>  	bit_spin_lock(PCG_LOCK, &pc->flags);
>  }
>  
> -static inline void unlock_page_cgroup(struct page_cgroup *pc)
> +static inline void __unlock_page_cgroup(struct page_cgroup *pc)
>  {
>  	bit_spin_unlock(PCG_LOCK, &pc->flags);
>  }
>  
> +#define lock_page_cgroup(pc, flags)		\
> +	do {					\
> +		local_irq_save(flags);		\
> +		__lock_page_cgroup(pc);		\
> +	} while (0)
> +
> +#define unlock_page_cgroup(pc, flags)		\
> +	do {					\
> +		__unlock_page_cgroup(pc);	\
> +		local_irq_restore(flags);	\
> +	} while (0)
> +
>  #else /* CONFIG_CGROUP_MEM_RES_CTLR */
>  struct page_cgroup;
>  
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 7fab84e..a9fd736 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1352,12 +1352,13 @@ void mem_cgroup_update_file_mapped(struct page *page, int val)
>  {
>  	struct mem_cgroup *mem;
>  	struct page_cgroup *pc;
> +	unsigned long flags;
>  
>  	pc = lookup_page_cgroup(page);
>  	if (unlikely(!pc))
>  		return;
>  
> -	lock_page_cgroup(pc);
> +	lock_page_cgroup(pc, flags);
>  	mem = pc->mem_cgroup;
>  	if (!mem)
>  		goto done;
> @@ -1371,7 +1372,7 @@ void mem_cgroup_update_file_mapped(struct page *page, int val)
>  	__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val);
>  
>  done:
> -	unlock_page_cgroup(pc);
> +	unlock_page_cgroup(pc, flags);
>  }
>  
>  /*
> @@ -1705,11 +1706,12 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
>  	struct page_cgroup *pc;
>  	unsigned short id;
>  	swp_entry_t ent;
> +	unsigned long flags;
>  
>  	VM_BUG_ON(!PageLocked(page));
>  
>  	pc = lookup_page_cgroup(page);
> -	lock_page_cgroup(pc);
> +	lock_page_cgroup(pc, flags);
>  	if (PageCgroupUsed(pc)) {
>  		mem = pc->mem_cgroup;
>  		if (mem && !css_tryget(&mem->css))
> @@ -1723,7 +1725,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
>  			mem = NULL;
>  		rcu_read_unlock();
>  	}
> -	unlock_page_cgroup(pc);
> +	unlock_page_cgroup(pc, flags);
>  	return mem;
>  }
>  
> @@ -1736,13 +1738,15 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
>  				     struct page_cgroup *pc,
>  				     enum charge_type ctype)
>  {
> +	unsigned long flags;
> +
>  	/* try_charge() can return NULL to *memcg, taking care of it. */
>  	if (!mem)
>  		return;
>  
> -	lock_page_cgroup(pc);
> +	lock_page_cgroup(pc, flags);
>  	if (unlikely(PageCgroupUsed(pc))) {
> -		unlock_page_cgroup(pc);
> +		unlock_page_cgroup(pc, flags);
>  		mem_cgroup_cancel_charge(mem);
>  		return;
>  	}
> @@ -1772,7 +1776,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
>  
>  	mem_cgroup_charge_statistics(mem, pc, true);
>  
> -	unlock_page_cgroup(pc);
> +	unlock_page_cgroup(pc, flags);
>  	/*
>  	 * "charge_statistics" updated event counter. Then, check it.
>  	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
> @@ -1842,12 +1846,13 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
>  		struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
>  {
>  	int ret = -EINVAL;
> -	lock_page_cgroup(pc);
> +	unsigned long flags;
> +	lock_page_cgroup(pc, flags);
>  	if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
>  		__mem_cgroup_move_account(pc, from, to, uncharge);
>  		ret = 0;
>  	}
> -	unlock_page_cgroup(pc);
> +	unlock_page_cgroup(pc, flags);
>  	/*
>  	 * check events
>  	 */
> @@ -1974,17 +1979,17 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
>  	 */
>  	if (!(gfp_mask & __GFP_WAIT)) {
>  		struct page_cgroup *pc;
> -
> +		unsigned long flags;
>  
>  		pc = lookup_page_cgroup(page);
>  		if (!pc)
>  			return 0;
> -		lock_page_cgroup(pc);
> +		lock_page_cgroup(pc, flags);
>  		if (PageCgroupUsed(pc)) {
> -			unlock_page_cgroup(pc);
> +			unlock_page_cgroup(pc, flags);
>  			return 0;
>  		}
> -		unlock_page_cgroup(pc);
> +		unlock_page_cgroup(pc, flags);
>  	}
>  
>  	if (unlikely(!mm && !mem))
> @@ -2166,6 +2171,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
>  	struct page_cgroup *pc;
>  	struct mem_cgroup *mem = NULL;
>  	struct mem_cgroup_per_zone *mz;
> +	unsigned long flags;
>  
>  	if (mem_cgroup_disabled())
>  		return NULL;
> @@ -2180,7 +2186,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
>  	if (unlikely(!pc || !PageCgroupUsed(pc)))
>  		return NULL;
>  
> -	lock_page_cgroup(pc);
> +	lock_page_cgroup(pc, flags);
>  
>  	mem = pc->mem_cgroup;
>  
> @@ -2219,7 +2225,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
>  	 */
>  
>  	mz = page_cgroup_zoneinfo(pc);
> -	unlock_page_cgroup(pc);
> +	unlock_page_cgroup(pc, flags);
>  
>  	memcg_check_events(mem, page);
>  	/* at swapout, this memcg will be accessed to record to swap */
> @@ -2229,7 +2235,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
>  	return mem;
>  
>  unlock_out:
> -	unlock_page_cgroup(pc);
> +	unlock_page_cgroup(pc, flags);
>  	return NULL;
>  }
>  
> @@ -2417,17 +2423,18 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
>  	struct page_cgroup *pc;
>  	struct mem_cgroup *mem = NULL;
>  	int ret = 0;
> +	unsigned long flags;
>  
>  	if (mem_cgroup_disabled())
>  		return 0;
>  
>  	pc = lookup_page_cgroup(page);
> -	lock_page_cgroup(pc);
> +	lock_page_cgroup(pc, flags);
>  	if (PageCgroupUsed(pc)) {
>  		mem = pc->mem_cgroup;
>  		css_get(&mem->css);
>  	}
> -	unlock_page_cgroup(pc);
> +	unlock_page_cgroup(pc, flags);
>  
>  	if (mem) {
>  		ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
> -- 
> 1.6.4
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <dont@kvack.org>

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH mmotm 2.5/4] memcg: disable irq at page cgroup lock (Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure)
  2010-03-09  4:50                     ` Balbir Singh
@ 2010-03-10  1:43                       ` Daisuke Nishimura
  2010-03-10  3:56                         ` Balbir Singh
  0 siblings, 1 reply; 41+ messages in thread
From: Daisuke Nishimura @ 2010-03-10  1:43 UTC (permalink / raw)
  To: balbir
  Cc: KAMEZAWA Hiroyuki, Andrea Righi, Vivek Goyal, Peter Zijlstra,
	Trond Myklebust, Suleiman Souhlal, Greg Thelen,
	Kirill A. Shutemov, Andrew Morton, containers, linux-kernel,
	linux-mm, Daisuke Nishimura

> Please please measure the performance overhead of this change.
> 

here.

> > > > > > I made a patch below and measured the time(average of 10 times) of kernel build
> > > > > > on tmpfs(make -j8 on 8 CPU machine with 2.6.33 defconfig).
> > > > > > 
> > > > > > <before>
> > > > > > - root cgroup: 190.47 sec
> > > > > > - child cgroup: 192.81 sec
> > > > > > 
> > > > > > <after>
> > > > > > - root cgroup: 191.06 sec
> > > > > > - child cgroup: 193.06 sec
> > > > > > 

<after2(local_irq_save/restore)>
- root cgroup: 191.42 sec
- child cgroup: 193.55 sec

Hmm, I think it's within the error range, but testing several times shows a tendency
for it to get slower as I add more code. Using local_irq_disable()/enable() everywhere
except in mem_cgroup_update_file_mapped() (the only likely candidate to be called with
irqs disabled in the future) might be the better choice.
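
Something like the sketch below is what I have in mind (names such as
lock_page_cgroup_irqsave() are made up and this is not a tested patch):

/* default: call sites known to run with IRQs enabled */
static inline void lock_page_cgroup(struct page_cgroup *pc)
{
	local_irq_disable();		/* cheap: no eflags save needed */
	__lock_page_cgroup(pc);
}

static inline void unlock_page_cgroup(struct page_cgroup *pc)
{
	__unlock_page_cgroup(pc);
	local_irq_enable();
}

/*
 * irqsave variant, used only where the IRQ state is unknown,
 * e.g. mem_cgroup_update_file_mapped()
 */
#define lock_page_cgroup_irqsave(pc, flags)		\
	do {						\
		local_irq_save(flags);			\
		__lock_page_cgroup(pc);			\
	} while (0)

#define unlock_page_cgroup_irqrestore(pc, flags)	\
	do {						\
		__unlock_page_cgroup(pc);		\
		local_irq_restore(flags);		\
	} while (0)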


Thanks,
Daisuke Nishimura.

On Tue, 9 Mar 2010 10:20:58 +0530, Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> * nishimura@mxp.nes.nec.co.jp <nishimura@mxp.nes.nec.co.jp> [2010-03-09 10:29:28]:
> 
> > On Tue, 9 Mar 2010 09:19:14 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > > On Tue, 9 Mar 2010 01:12:52 +0100
> > > Andrea Righi <arighi@develer.com> wrote:
> > > 
> > > > On Mon, Mar 08, 2010 at 05:31:00PM +0900, KAMEZAWA Hiroyuki wrote:
> > > > > On Mon, 8 Mar 2010 17:07:11 +0900
> > > > > Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> > > > > 
> > > > > > On Mon, 8 Mar 2010 11:37:11 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > > > > > > On Mon, 8 Mar 2010 11:17:24 +0900
> > > > > > > Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> > > > > > > 
> > > > > > > > > But IIRC, clear_writeback is done under treelock.... No ?
> > > > > > > > > 
> > > > > > > > The place where NR_WRITEBACK is updated is out of tree_lock.
> > > > > > > > 
> > > > > > > >    1311 int test_clear_page_writeback(struct page *page)
> > > > > > > >    1312 {
> > > > > > > >    1313         struct address_space *mapping = page_mapping(page);
> > > > > > > >    1314         int ret;
> > > > > > > >    1315
> > > > > > > >    1316         if (mapping) {
> > > > > > > >    1317                 struct backing_dev_info *bdi = mapping->backing_dev_info;
> > > > > > > >    1318                 unsigned long flags;
> > > > > > > >    1319
> > > > > > > >    1320                 spin_lock_irqsave(&mapping->tree_lock, flags);
> > > > > > > >    1321                 ret = TestClearPageWriteback(page);
> > > > > > > >    1322                 if (ret) {
> > > > > > > >    1323                         radix_tree_tag_clear(&mapping->page_tree,
> > > > > > > >    1324                                                 page_index(page),
> > > > > > > >    1325                                                 PAGECACHE_TAG_WRITEBACK);
> > > > > > > >    1326                         if (bdi_cap_account_writeback(bdi)) {
> > > > > > > >    1327                                 __dec_bdi_stat(bdi, BDI_WRITEBACK);
> > > > > > > >    1328                                 __bdi_writeout_inc(bdi);
> > > > > > > >    1329                         }
> > > > > > > >    1330                 }
> > > > > > > >    1331                 spin_unlock_irqrestore(&mapping->tree_lock, flags);
> > > > > > > >    1332         } else {
> > > > > > > >    1333                 ret = TestClearPageWriteback(page);
> > > > > > > >    1334         }
> > > > > > > >    1335         if (ret)
> > > > > > > >    1336                 dec_zone_page_state(page, NR_WRITEBACK);
> > > > > > > >    1337         return ret;
> > > > > > > >    1338 }
> > > > > > > 
> > > > > > > We can move this up to under tree_lock. Considering memcg, all our target has "mapping".
> > > > > > > 
> > > > > > > If we newly account bounce-buffers (for NILFS, FUSE, etc..), which has no ->mapping,
> > > > > > > we need much more complex new charge/uncharge theory.
> > > > > > > 
> > > > > > > But yes, adding new lock scheme seems complicated. (Sorry Andrea.)
> > > > > > > My concerns is performance. We may need somehing new re-implementation of
> > > > > > > locks/migrate/charge/uncharge.
> > > > > > > 
> > > > > > I agree. Performance is my concern too.
> > > > > > 
> > > > > > I made a patch below and measured the time(average of 10 times) of kernel build
> > > > > > on tmpfs(make -j8 on 8 CPU machine with 2.6.33 defconfig).
> > > > > > 
> > > > > > <before>
> > > > > > - root cgroup: 190.47 sec
> > > > > > - child cgroup: 192.81 sec
> > > > > > 
> > > > > > <after>
> > > > > > - root cgroup: 191.06 sec
> > > > > > - child cgroup: 193.06 sec
> > > > > > 
> > > > > > Hmm... about 0.3% slower for root, 0.1% slower for child.
> > > > > > 
> > > > > 
> > > > > Hmm...accepatable ? (sounds it's in error-range)
> > > > > 
> > > > > BTW, why local_irq_disable() ? 
> > > > > local_irq_save()/restore() isn't better ?
> > > > 
> > > > Probably there's not the overhead of saving flags? 
> > > maybe.
> > > 
> > > > Anyway, it would make the code much more readable...
> > > > 
> > > ok.
> > > 
> > > please go ahead in this direction. Nishimura-san, would you post an
> > > independent patch ? If no, Andrea-san, please.
> > > 
> > This is the updated version.
> > 
> > Andrea-san, can you merge this into your patch set ?
> > 
> 
> Please please measure the performance overhead of this change.
> 
> -- 
> 	Three Cheers,
> 	Balbir


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH mmotm 2.5/4] memcg: disable irq at page cgroup lock (Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure)
  2010-03-10  1:43                       ` Daisuke Nishimura
@ 2010-03-10  3:56                         ` Balbir Singh
  2010-03-11  4:31                           ` Daisuke Nishimura
  0 siblings, 1 reply; 41+ messages in thread
From: Balbir Singh @ 2010-03-10  3:56 UTC (permalink / raw)
  To: Daisuke Nishimura
  Cc: linux-mm, Andrea Righi, linux-kernel, Trond Myklebust,
	Suleiman Souhlal, Andrew Morton, containers, Vivek Goyal

* nishimura@mxp.nes.nec.co.jp <nishimura@mxp.nes.nec.co.jp> [2010-03-10 10:43:09]:

> > Please please measure the performance overhead of this change.
> > 
> 
> here.
> 
> > > > > > > I made a patch below and measured the time(average of 10 times) of kernel build
> > > > > > > on tmpfs(make -j8 on 8 CPU machine with 2.6.33 defconfig).
> > > > > > > 
> > > > > > > <before>
> > > > > > > - root cgroup: 190.47 sec
> > > > > > > - child cgroup: 192.81 sec
> > > > > > > 
> > > > > > > <after>
> > > > > > > - root cgroup: 191.06 sec
> > > > > > > - child cgroup: 193.06 sec
> > > > > > > 
> 
> <after2(local_irq_save/restore)>
> - root cgroup: 191.42 sec
> - child cgroup: 193.55 sec
> 
> hmm, I think it's within the error range, but by testing several times I can see a tendency
> that it gets slower as I add additional code. Using local_irq_disable()/enable()
> except in mem_cgroup_update_file_mapped() (it may be the only candidate to be called
> with irq disabled in the future) might be the choice.
>

The error range would depend on things like the standard deviation and
the number of repetitions. It might be good to keep update_file_mapped
and see the impact. My concern is that with large systems, the
difference might be larger.
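
For a rough sense of scale (the per-run scatter here is a hypothetical
number, just to illustrate the point): if the standard deviation of a
single ~191 sec build were about 1 sec, then

	SE(mean of 10 runs) ~= 1.0 / sqrt(10) ~= 0.32 sec
	2 * SE              ~= 0.63 sec, i.e. ~0.3% of the build time

so deltas of the size reported above are not clearly outside the noise.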
 
-- 
	Three Cheers,
	Balbir


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH mmotm 2.5/4] memcg: disable irq at page cgroup lock (Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure)
  2010-03-10  3:56                         ` Balbir Singh
@ 2010-03-11  4:31                           ` Daisuke Nishimura
  2010-03-11  4:49                             ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 41+ messages in thread
From: Daisuke Nishimura @ 2010-03-11  4:31 UTC (permalink / raw)
  To: balbir
  Cc: linux-mm, Andrea Righi, linux-kernel, Trond Myklebust,
	Suleiman Souhlal, Andrew Morton, containers, Vivek Goyal,
	Daisuke Nishimura

[-- Attachment #1: Type: text/plain, Size: 7323 bytes --]

On Wed, 10 Mar 2010 09:26:24 +0530, Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> * nishimura@mxp.nes.nec.co.jp <nishimura@mxp.nes.nec.co.jp> [2010-03-10 10:43:09]:
> 
> > > Please please measure the performance overhead of this change.
> > > 
> > 
> > here.
> > 
> > > > > > > > I made a patch below and measured the time(average of 10 times) of kernel build
> > > > > > > > on tmpfs(make -j8 on 8 CPU machine with 2.6.33 defconfig).
> > > > > > > > 
> > > > > > > > <before>
> > > > > > > > - root cgroup: 190.47 sec
> > > > > > > > - child cgroup: 192.81 sec
> > > > > > > > 
> > > > > > > > <after>
> > > > > > > > - root cgroup: 191.06 sec
> > > > > > > > - child cgroup: 193.06 sec
> > > > > > > > 
> > 
> > <after2(local_irq_save/restore)>
> > - root cgroup: 191.42 sec
> > - child cgroup: 193.55 sec
> > 
> > hmm, I think it's in error range, but I can see a tendency by testing several times
> > that it's getting slower as I add additional codes. Using local_irq_disable()/enable()
> > except in mem_cgroup_update_file_mapped(it can be the only candidate to be called
> > with irq disabled in future) might be the choice.
> >
> 
> Error range would depend on things like standard deviation and
> repetition. It might be good to keep update_file_mapped and see the
> impact. My concern is with large systems, the difference might be
> larger.
>  
> -- 
> 	Three Cheers,
> 	Balbir
I made a patch (attached) using both local_irq_disable/enable and local_irq_save/restore.
local_irq_save/restore is used only in mem_cgroup_update_file_mapped.

And I attached a histogram of 30 kernel builds in the root cgroup for each case.

  before_root: no irq operation(original)
  after_root: local_irq_disable/enable for all
  after2_root: local_irq_save/restore for all
  after3_root: mixed version(attached)

hmm, there seems to be a tendency that before < after < after3 < after2?
Should I replace the save/restore version with the mixed version?


Thanks,
Daisuke Nishimura.
===
 include/linux/page_cgroup.h |   28 ++++++++++++++++++++++++++--
 mm/memcontrol.c             |   36 ++++++++++++++++++------------------
 2 files changed, 44 insertions(+), 20 deletions(-)

diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index 30b0813..c0aca62 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -83,16 +83,40 @@ static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc)
 	return page_zonenum(pc->page);
 }
 
-static inline void lock_page_cgroup(struct page_cgroup *pc)
+static inline void __lock_page_cgroup(struct page_cgroup *pc)
 {
 	bit_spin_lock(PCG_LOCK, &pc->flags);
 }
 
-static inline void unlock_page_cgroup(struct page_cgroup *pc)
+static inline void __unlock_page_cgroup(struct page_cgroup *pc)
 {
 	bit_spin_unlock(PCG_LOCK, &pc->flags);
 }
 
+#define lock_page_cgroup_irq(pc)			\
+	do {						\
+		local_irq_disable();			\
+		__lock_page_cgroup(pc);			\
+	} while (0)
+
+#define unlock_page_cgroup_irq(pc)			\
+	do {						\
+		__unlock_page_cgroup(pc);		\
+		local_irq_enable();			\
+	} while (0)
+
+#define lock_page_cgroup_irqsave(pc, flags)		\
+	do {						\
+		local_irq_save(flags);			\
+		__lock_page_cgroup(pc);			\
+	} while (0)
+
+#define unlock_page_cgroup_irqrestore(pc, flags)	\
+	do {						\
+		__unlock_page_cgroup(pc);		\
+		local_irq_restore(flags);		\
+	} while (0)
+
 #else /* CONFIG_CGROUP_MEM_RES_CTLR */
 struct page_cgroup;
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 02ea959..11d483e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1354,12 +1354,13 @@ void mem_cgroup_update_file_mapped(struct page *page, int val)
 {
 	struct mem_cgroup *mem;
 	struct page_cgroup *pc;
+	unsigned long flags;
 
 	pc = lookup_page_cgroup(page);
 	if (unlikely(!pc))
 		return;
 
-	lock_page_cgroup(pc);
+	lock_page_cgroup_irqsave(pc, flags);
 	mem = pc->mem_cgroup;
 	if (!mem)
 		goto done;
@@ -1373,7 +1374,7 @@ void mem_cgroup_update_file_mapped(struct page *page, int val)
 	__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val);
 
 done:
-	unlock_page_cgroup(pc);
+	unlock_page_cgroup_irqrestore(pc, flags);
 }
 
 /*
@@ -1711,7 +1712,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
 	VM_BUG_ON(!PageLocked(page));
 
 	pc = lookup_page_cgroup(page);
-	lock_page_cgroup(pc);
+	lock_page_cgroup_irq(pc);
 	if (PageCgroupUsed(pc)) {
 		mem = pc->mem_cgroup;
 		if (mem && !css_tryget(&mem->css))
@@ -1725,7 +1726,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
 			mem = NULL;
 		rcu_read_unlock();
 	}
-	unlock_page_cgroup(pc);
+	unlock_page_cgroup_irq(pc);
 	return mem;
 }
 
@@ -1742,9 +1743,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 	if (!mem)
 		return;
 
-	lock_page_cgroup(pc);
+	lock_page_cgroup_irq(pc);
 	if (unlikely(PageCgroupUsed(pc))) {
-		unlock_page_cgroup(pc);
+		unlock_page_cgroup_irq(pc);
 		mem_cgroup_cancel_charge(mem);
 		return;
 	}
@@ -1774,7 +1775,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 
 	mem_cgroup_charge_statistics(mem, pc, true);
 
-	unlock_page_cgroup(pc);
+	unlock_page_cgroup_irq(pc);
 	/*
 	 * "charge_statistics" updated event counter. Then, check it.
 	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
@@ -1844,12 +1845,12 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
 		struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
 {
 	int ret = -EINVAL;
-	lock_page_cgroup(pc);
+	lock_page_cgroup_irq(pc);
 	if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
 		__mem_cgroup_move_account(pc, from, to, uncharge);
 		ret = 0;
 	}
-	unlock_page_cgroup(pc);
+	unlock_page_cgroup_irq(pc);
 	/*
 	 * check events
 	 */
@@ -1977,16 +1978,15 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 	if (!(gfp_mask & __GFP_WAIT)) {
 		struct page_cgroup *pc;
 
-
 		pc = lookup_page_cgroup(page);
 		if (!pc)
 			return 0;
-		lock_page_cgroup(pc);
+		lock_page_cgroup_irq(pc);
 		if (PageCgroupUsed(pc)) {
-			unlock_page_cgroup(pc);
+			unlock_page_cgroup_irq(pc);
 			return 0;
 		}
-		unlock_page_cgroup(pc);
+		unlock_page_cgroup_irq(pc);
 	}
 
 	if (unlikely(!mm && !mem))
@@ -2182,7 +2182,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	if (unlikely(!pc || !PageCgroupUsed(pc)))
 		return NULL;
 
-	lock_page_cgroup(pc);
+	lock_page_cgroup_irq(pc);
 
 	mem = pc->mem_cgroup;
 
@@ -2221,7 +2221,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	 */
 
 	mz = page_cgroup_zoneinfo(pc);
-	unlock_page_cgroup(pc);
+	unlock_page_cgroup_irq(pc);
 
 	memcg_check_events(mem, page);
 	/* at swapout, this memcg will be accessed to record to swap */
@@ -2231,7 +2231,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	return mem;
 
 unlock_out:
-	unlock_page_cgroup(pc);
+	unlock_page_cgroup_irq(pc);
 	return NULL;
 }
 
@@ -2424,12 +2424,12 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
 		return 0;
 
 	pc = lookup_page_cgroup(page);
-	lock_page_cgroup(pc);
+	lock_page_cgroup_irq(pc);
 	if (PageCgroupUsed(pc)) {
 		mem = pc->mem_cgroup;
 		css_get(&mem->css);
 	}
-	unlock_page_cgroup(pc);
+	unlock_page_cgroup_irq(pc);
 
 	if (mem) {
 		ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);



[-- Attachment #2: root_cgroup.bmp.gz --]
[-- Type: application/octet-stream, Size: 9426 bytes --]

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* Re: [PATCH mmotm 2.5/4] memcg: disable irq at page cgroup lock (Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure)
  2010-03-11  4:31                           ` Daisuke Nishimura
@ 2010-03-11  4:49                             ` KAMEZAWA Hiroyuki
  2010-03-11  4:58                               ` Daisuke Nishimura
  2010-03-11 16:54                               ` Vivek Goyal
  0 siblings, 2 replies; 41+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-03-11  4:49 UTC (permalink / raw)
  To: Daisuke Nishimura
  Cc: balbir, linux-mm, Andrea Righi, linux-kernel, Trond Myklebust,
	Suleiman Souhlal, Andrew Morton, containers, Vivek Goyal

On Thu, 11 Mar 2010 13:31:23 +0900
Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:

> On Wed, 10 Mar 2010 09:26:24 +0530, Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> > * nishimura@mxp.nes.nec.co.jp <nishimura@mxp.nes.nec.co.jp> [2010-03-10 10:43:09]:

> I made a patch(attached) using both local_irq_disable/enable and local_irq_save/restore.
> local_irq_save/restore is used only in mem_cgroup_update_file_mapped.
> 
> And I attached a histogram graph of 30 times kernel build in root cgroup for each.
> 
>   before_root: no irq operation(original)
>   after_root: local_irq_disable/enable for all
>   after2_root: local_irq_save/restore for all
>   after3_root: mixed version(attached)
> 
> hmm, there seems to be a tendency that before < after < after3 < after2 ?
> Should I replace save/restore version to mixed version ?
> 

IMHO, starting from the after2_root version is the easiest.
If there is a chance that lock/unlock page_cgroup can be called in
interrupt context, we _have to_ disable IRQs anyway.
And if we have to do this, I prefer a migration_lock rather than this mixture.

BTW, how big is your system? Balbir-san's concern is about bigger machines.
But I'm not sure this change is affected by the size of the machine.
I'm sorry, I have no big machine now.

I'll consider yet another fix for race in account migration if I can.

Thanks,
-Kame



^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH mmotm 2.5/4] memcg: disable irq at page cgroup lock (Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure)
  2010-03-11  4:49                             ` KAMEZAWA Hiroyuki
@ 2010-03-11  4:58                               ` Daisuke Nishimura
  2010-03-11  5:13                                 ` KAMEZAWA Hiroyuki
  2010-03-11 16:54                               ` Vivek Goyal
  1 sibling, 1 reply; 41+ messages in thread
From: Daisuke Nishimura @ 2010-03-11  4:58 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: balbir, linux-mm, Andrea Righi, linux-kernel, Trond Myklebust,
	Suleiman Souhlal, Andrew Morton, containers, Vivek Goyal,
	Daisuke Nishimura

On Thu, 11 Mar 2010 13:49:08 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> On Thu, 11 Mar 2010 13:31:23 +0900
> Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> 
> > On Wed, 10 Mar 2010 09:26:24 +0530, Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> > > * nishimura@mxp.nes.nec.co.jp <nishimura@mxp.nes.nec.co.jp> [2010-03-10 10:43:09]:
> 
> > I made a patch(attached) using both local_irq_disable/enable and local_irq_save/restore.
> > local_irq_save/restore is used only in mem_cgroup_update_file_mapped.
> > 
> > And I attached a histogram graph of 30 times kernel build in root cgroup for each.
> > 
> >   before_root: no irq operation(original)
> >   after_root: local_irq_disable/enable for all
> >   after2_root: local_irq_save/restore for all
> >   after3_root: mixed version(attached)
> > 
> > hmm, there seems to be a tendency that before < after < after3 < after2 ?
> > Should I replace save/restore version to mixed version ?
> > 
> 
> IMHO, starting from after2_root version is the easist.
> If there is a chance to call lock/unlock page_cgroup can be called in
> interrupt context, we _have to_ disable IRQ, anyway.
> And if we have to do this, I prefer migration_lock rather than this mixture.
> 
I see.

> BTW, how big your system is ? Balbir-san's concern is for bigger machines.
> But I'm not sure this change is affecte by the size of machines.
> I'm sorry I have no big machine, now.
> 
My test machine has 8 CPUs, and I run all the tests with "make -j8".
Sorry, I don't have easy access to a huge machine either.

> I'll consider yet another fix for race in account migration if I can.
> 
me too.


Thanks,
Daisuke Nishimura.


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH mmotm 2.5/4] memcg: disable irq at page cgroup lock (Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure)
  2010-03-11  4:58                               ` Daisuke Nishimura
@ 2010-03-11  5:13                                 ` KAMEZAWA Hiroyuki
  2010-03-11  6:15                                   ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 41+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-03-11  5:13 UTC (permalink / raw)
  To: Daisuke Nishimura
  Cc: balbir, linux-mm, Andrea Righi, linux-kernel, Trond Myklebust,
	Suleiman Souhlal, Andrew Morton, containers, Vivek Goyal

On Thu, 11 Mar 2010 13:58:47 +0900
Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> > I'll consider yet another fix for race in account migration if I can.
> > 
> me too.
> 

How about this ? Assume that the race is very rare.

	1. use trylock when updating statistics.
	   If trylock fails, don't account it.

	2. add PCG_FLAG for all status as

+	PCG_ACCT_FILE_MAPPED, /* page is accounted as file rss*/
+	PCG_ACCT_DIRTY, /* page is dirty */
+	PCG_ACCT_WRITEBACK, /* page is being written back to disk */
+	PCG_ACCT_WRITEBACK_TEMP, /* page is used as temporary buffer for FUSE */
+	PCG_ACCT_UNSTABLE_NFS, /* NFS page not yet committed to the server */

	3. At reducing counter, check PCG_xxx flags by
	TESTCLEARPCGFLAG()

This is similar to the method already _used_ for LRU accounting. And we can expect that this
method's error range never grows to a bad number.

I think this kind of fuzzy accounting is enough for writeback status.
Does anyone need strict accounting ?
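
As a rough sketch of how (1)-(3) could look for one of the new counters
(illustration only, not a real patch: it assumes a trylock_page_cgroup()
helper and a TestClearPageCgroupAcctDirty() wrapper generated by
TESTCLEARPCGFLAG(AcctDirty, ACCT_DIRTY)):

/* called where the page stops being accounted as dirty */
void mem_cgroup_dec_dirty(struct page *page)
{
	struct page_cgroup *pc = lookup_page_cgroup(page);
	struct mem_cgroup *mem;

	if (unlikely(!pc))
		return;
	/* (1) lost the race against charge/uncharge/move_account: skip it */
	if (!trylock_page_cgroup(pc))
		return;
	mem = pc->mem_cgroup;
	/* (3) only decrement if this page was really accounted as dirty */
	if (mem && PageCgroupUsed(pc) && TestClearPageCgroupAcctDirty(pc))
		__this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_DIRTY]);
	unlock_page_cgroup(pc);
}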

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH mmotm 2.5/4] memcg: disable irq at page cgroup lock (Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure)
  2010-03-11  5:13                                 ` KAMEZAWA Hiroyuki
@ 2010-03-11  6:15                                   ` KAMEZAWA Hiroyuki
  2010-03-11  7:50                                     ` Daisuke Nishimura
  0 siblings, 1 reply; 41+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-03-11  6:15 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Daisuke Nishimura, balbir, linux-mm, Andrea Righi, linux-kernel,
	Trond Myklebust, Suleiman Souhlal, Andrew Morton, containers,
	Vivek Goyal

On Thu, 11 Mar 2010 14:13:00 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:

> On Thu, 11 Mar 2010 13:58:47 +0900
> Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> > > I'll consider yet another fix for race in account migration if I can.
> > > 
> > me too.
> > 
> 
> How about this ? Assume that the race is very rare.
> 
> 	1. use trylock when updating statistics.
> 	   If trylock fails, don't account it.
> 
> 	2. add PCG_FLAG for all status as
> 
> +	PCG_ACCT_FILE_MAPPED, /* page is accounted as file rss*/
> +	PCG_ACCT_DIRTY, /* page is dirty */
> +	PCG_ACCT_WRITEBACK, /* page is being written back to disk */
> +	PCG_ACCT_WRITEBACK_TEMP, /* page is used as temporary buffer for FUSE */
> +	PCG_ACCT_UNSTABLE_NFS, /* NFS page not yet committed to the server */
> 
> 	3. At reducing counter, check PCG_xxx flags by
> 	TESTCLEARPCGFLAG()
> 
> This is similar to an _used_ method of LRU accounting. And We can think this
> method's error-range never go too bad number. 
> 
> I think this kind of fuzzy accounting is enough for writeback status.
> Does anyone need strict accounting ?
> 

How does this look?
==
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

Now, only file-mapped is maintained. But a more generic update function
will be needed for dirty page accounting.

For accounting page status, we have to guarantee that lock_page_cgroup()
will never be called with tree_lock held.
To guarantee that, we use trylock when updating status.
By this, we do fuzzy accounting, but in almost all cases it's correct.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 include/linux/memcontrol.h  |    7 +++
 include/linux/page_cgroup.h |   15 +++++++
 mm/memcontrol.c             |   88 +++++++++++++++++++++++++++++++++-----------
 mm/rmap.c                   |    4 +-
 4 files changed, 90 insertions(+), 24 deletions(-)

Index: mmotm-2.6.34-Mar9/mm/memcontrol.c
===================================================================
--- mmotm-2.6.34-Mar9.orig/mm/memcontrol.c
+++ mmotm-2.6.34-Mar9/mm/memcontrol.c
@@ -1348,30 +1348,79 @@ bool mem_cgroup_handle_oom(struct mem_cg
  * Currently used to update mapped file statistics, but the routine can be
  * generalized to update other statistics as well.
  */
-void mem_cgroup_update_file_mapped(struct page *page, int val)
+void __mem_cgroup_update_stat(struct page_cgroup *pc, int idx, bool charge)
 {
 	struct mem_cgroup *mem;
-	struct page_cgroup *pc;
-
-	pc = lookup_page_cgroup(page);
-	if (unlikely(!pc))
-		return;
+	int val;
 
-	lock_page_cgroup(pc);
 	mem = pc->mem_cgroup;
-	if (!mem)
-		goto done;
+	if (!mem || !PageCgroupUsed(pc))
+		return;
 
-	if (!PageCgroupUsed(pc))
-		goto done;
+	if (charge)
+		val = 1;
+	else
+		val = -1;
 
+	switch (idx) {
+	case MEMCG_NR_FILE_MAPPED:
+		if (charge) {
+			if (!PageCgroupFileMapped(pc))
+				SetPageCgroupFileMapped(pc);
+			else
+				val = 0;
+		} else {
+			if (PageCgroupFileMapped(pc))
+				ClearPageCgroupFileMapped(pc);
+			else
+				val = 0;
+		}
+		idx = MEM_CGROUP_STAT_FILE_MAPPED;
+		break;
+	default:
+		BUG();
+		break;
+	}
 	/*
 	 * Preemption is already disabled. We can use __this_cpu_xxx
 	 */
-	__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val);
+	__this_cpu_add(mem->stat->count[idx], val);
+}
 
-done:
-	unlock_page_cgroup(pc);
+void mem_cgroup_update_stat(struct page *page, int idx, bool charge)
+{
+	struct page_cgroup *pc;
+
+	pc = lookup_page_cgroup(page);
+	if (unlikely(!pc))
+		return;
+
+	if (trylock_page_cgroup(pc)) {
+		__mem_cgroup_update_stat(pc, idx, charge);
+		unlock_page_cgroup(pc);
+	}
+	return;
+}
+
+static void mem_cgroup_migrate_stat(struct page_cgroup *pc,
+	struct mem_cgroup *from, struct mem_cgroup *to)
+{
+	preempt_disable();
+	if (PageCgroupFileMapped(pc)) {
+		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+	}
+	preempt_enable();
+}
+
+static void
+__mem_cgroup_stat_fixup(struct page_cgroup *pc, struct mem_cgroup *mem)
+{
+	/* We'are in uncharge() and lock_page_cgroup */
+	if (PageCgroupFileMapped(pc)) {
+		__this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+		ClearPageCgroupFileMapped(pc);
+	}
 }
 
 /*
@@ -1810,13 +1859,7 @@ static void __mem_cgroup_move_account(st
 	VM_BUG_ON(pc->mem_cgroup != from);
 
 	page = pc->page;
-	if (page_mapped(page) && !PageAnon(page)) {
-		/* Update mapped_file data for mem_cgroup */
-		preempt_disable();
-		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
-		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
-		preempt_enable();
-	}
+	mem_cgroup_migrate_stat(pc, from, to);
 	mem_cgroup_charge_statistics(from, pc, false);
 	if (uncharge)
 		/* This is not "cancel", but cancel_charge does all we need. */
@@ -2208,6 +2251,9 @@ __mem_cgroup_uncharge_common(struct page
 		__do_uncharge(mem, ctype);
 	if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
 		mem_cgroup_swap_statistics(mem, true);
+	if (unlikely(PCG_PageStatMask & pc->flags))
+		__mem_cgroup_stat_fixup(pc, mem);
+
 	mem_cgroup_charge_statistics(mem, pc, false);
 
 	ClearPageCgroupUsed(pc);
Index: mmotm-2.6.34-Mar9/include/linux/page_cgroup.h
===================================================================
--- mmotm-2.6.34-Mar9.orig/include/linux/page_cgroup.h
+++ mmotm-2.6.34-Mar9/include/linux/page_cgroup.h
@@ -39,6 +39,8 @@ enum {
 	PCG_CACHE, /* charged as cache */
 	PCG_USED, /* this object is in use. */
 	PCG_ACCT_LRU, /* page has been accounted for */
+	/* for cache-status accounting */
+	PCG_FILE_MAPPED,
 };
 
 #define TESTPCGFLAG(uname, lname)			\
@@ -57,6 +59,10 @@ static inline void ClearPageCgroup##unam
 static inline int TestClearPageCgroup##uname(struct page_cgroup *pc)	\
 	{ return test_and_clear_bit(PCG_##lname, &pc->flags);  }
 
+/* Page/File stat flag mask */
+#define PCG_PageStatMask	((1 << PCG_FILE_MAPPED))
+
+
 TESTPCGFLAG(Locked, LOCK)
 
 /* Cache flag is set only once (at allocation) */
@@ -73,6 +79,10 @@ CLEARPCGFLAG(AcctLRU, ACCT_LRU)
 TESTPCGFLAG(AcctLRU, ACCT_LRU)
 TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU)
 
+TESTPCGFLAG(FileMapped, FILE_MAPPED)
+SETPCGFLAG(FileMapped, FILE_MAPPED)
+CLEARPCGFLAG(FileMapped, FILE_MAPPED)
+
 static inline int page_cgroup_nid(struct page_cgroup *pc)
 {
 	return page_to_nid(pc->page);
@@ -93,6 +103,11 @@ static inline void unlock_page_cgroup(st
 	bit_spin_unlock(PCG_LOCK, &pc->flags);
 }
 
+static inline int trylock_page_cgroup(struct page_cgroup *pc)
+{
+	return bit_spin_trylock(PCG_LOCK, &pc->flags);
+}
+
 #else /* CONFIG_CGROUP_MEM_RES_CTLR */
 struct page_cgroup;
 
Index: mmotm-2.6.34-Mar9/include/linux/memcontrol.h
===================================================================
--- mmotm-2.6.34-Mar9.orig/include/linux/memcontrol.h
+++ mmotm-2.6.34-Mar9/include/linux/memcontrol.h
@@ -124,7 +124,12 @@ static inline bool mem_cgroup_disabled(v
 	return false;
 }
 
-void mem_cgroup_update_file_mapped(struct page *page, int val);
+enum mem_cgroup_page_stat_item {
+	MEMCG_NR_FILE_MAPPED,
+	MEMCG_NR_FILE_NSTAT,
+};
+
+void mem_cgroup_update_stat(struct page *page, int idx, bool charge);
 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 						gfp_t gfp_mask, int nid,
 						int zid);
Index: mmotm-2.6.34-Mar9/mm/rmap.c
===================================================================
--- mmotm-2.6.34-Mar9.orig/mm/rmap.c
+++ mmotm-2.6.34-Mar9/mm/rmap.c
@@ -829,7 +829,7 @@ void page_add_file_rmap(struct page *pag
 {
 	if (atomic_inc_and_test(&page->_mapcount)) {
 		__inc_zone_page_state(page, NR_FILE_MAPPED);
-		mem_cgroup_update_file_mapped(page, 1);
+		mem_cgroup_update_stat(page, MEMCG_NR_FILE_MAPPED, true);
 	}
 }
 
@@ -861,7 +861,7 @@ void page_remove_rmap(struct page *page)
 		__dec_zone_page_state(page, NR_ANON_PAGES);
 	} else {
 		__dec_zone_page_state(page, NR_FILE_MAPPED);
-		mem_cgroup_update_file_mapped(page, -1);
+		mem_cgroup_update_stat(page, MEMCG_NR_FILE_MAPPED, false);
 	}
 	/*
 	 * It would be tidy to reset the PageAnon mapping here,







^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH mmotm 2.5/4] memcg: disable irq at page cgroup lock (Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure)
  2010-03-11  6:15                                   ` KAMEZAWA Hiroyuki
@ 2010-03-11  7:50                                     ` Daisuke Nishimura
  2010-03-11  8:06                                       ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 41+ messages in thread
From: Daisuke Nishimura @ 2010-03-11  7:50 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: balbir, linux-mm, Andrea Righi, linux-kernel, Trond Myklebust,
	Suleiman Souhlal, Andrew Morton, containers, Vivek Goyal,
	Daisuke Nishimura

On Thu, 11 Mar 2010 15:15:11 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> On Thu, 11 Mar 2010 14:13:00 +0900
> KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> 
> > On Thu, 11 Mar 2010 13:58:47 +0900
> > Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> > > > I'll consider yet another fix for race in account migration if I can.
> > > > 
> > > me too.
> > > 
> > 
> > How about this ? Assume that the race is very rare.
> > 
> > 	1. use trylock when updating statistics.
> > 	   If trylock fails, don't account it.
> > 
> > 	2. add PCG_FLAG for all status as
> > 
> > +	PCG_ACCT_FILE_MAPPED, /* page is accounted as file rss*/
> > +	PCG_ACCT_DIRTY, /* page is dirty */
> > +	PCG_ACCT_WRITEBACK, /* page is being written back to disk */
> > +	PCG_ACCT_WRITEBACK_TEMP, /* page is used as temporary buffer for FUSE */
> > +	PCG_ACCT_UNSTABLE_NFS, /* NFS page not yet committed to the server */
> > 
> > 	3. At reducing counter, check PCG_xxx flags by
> > 	TESTCLEARPCGFLAG()
> > 
> > This is similar to an _used_ method of LRU accounting. And We can think this
> > method's error-range never go too bad number. 
> > 
I agree with you. I've been thinking about whether we can remove the page cgroup lock
in update_stat, as we do in the lru handling code.

> > I think this kind of fuzzy accounting is enough for writeback status.
> > Does anyone need strict accounting ?
> > 
> 
IMHO, we don't need strict accounting.

> How this looks ?
I agree with this direction. One concern is that we re-introduce "trylock" again...

Some comments are inlined.

> ==
> From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> 
> Now, file-mapped is maintaiend. But more generic update function
> will be needed for dirty page accounting.
> 
> For accountig page status, we have to guarantee lock_page_cgroup()
> will be never called under tree_lock held.
> To guarantee that, we use trylock at updating status.
> By this, we do fuzyy accounting, but in almost all case, it's correct.
> 
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> ---
>  include/linux/memcontrol.h  |    7 +++
>  include/linux/page_cgroup.h |   15 +++++++
>  mm/memcontrol.c             |   88 +++++++++++++++++++++++++++++++++-----------
>  mm/rmap.c                   |    4 +-
>  4 files changed, 90 insertions(+), 24 deletions(-)
> 
> Index: mmotm-2.6.34-Mar9/mm/memcontrol.c
> ===================================================================
> --- mmotm-2.6.34-Mar9.orig/mm/memcontrol.c
> +++ mmotm-2.6.34-Mar9/mm/memcontrol.c
> @@ -1348,30 +1348,79 @@ bool mem_cgroup_handle_oom(struct mem_cg
>   * Currently used to update mapped file statistics, but the routine can be
>   * generalized to update other statistics as well.
>   */
> -void mem_cgroup_update_file_mapped(struct page *page, int val)
> +void __mem_cgroup_update_stat(struct page_cgroup *pc, int idx, bool charge)
>  {
>  	struct mem_cgroup *mem;
> -	struct page_cgroup *pc;
> -
> -	pc = lookup_page_cgroup(page);
> -	if (unlikely(!pc))
> -		return;
> +	int val;
>  
> -	lock_page_cgroup(pc);
>  	mem = pc->mem_cgroup;
> -	if (!mem)
> -		goto done;
> +	if (!mem || !PageCgroupUsed(pc))
> +		return;
>  
> -	if (!PageCgroupUsed(pc))
> -		goto done;
> +	if (charge)
> +		val = 1;
> +	else
> +		val = -1;
>  
> +	switch (idx) {
> +	case MEMCG_NR_FILE_MAPPED:
> +		if (charge) {
> +			if (!PageCgroupFileMapped(pc))
> +				SetPageCgroupFileMapped(pc);
> +			else
> +				val = 0;
> +		} else {
> +			if (PageCgroupFileMapped(pc))
> +				ClearPageCgroupFileMapped(pc);
> +			else
> +				val = 0;
> +		}
Would using !TestSetPageCgroupFileMapped(pc) or TestClearPageCgroupFileMapped(pc) be better?
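
e.g., just to illustrate the suggestion (untested, and assuming a
TESTSETPCGFLAG(FileMapped, FILE_MAPPED) helper is added next to the
TESTCLEAR one):

	case MEMCG_NR_FILE_MAPPED:
		if (charge) {
			if (TestSetPageCgroupFileMapped(pc))
				val = 0;	/* was already accounted */
		} else {
			if (!TestClearPageCgroupFileMapped(pc))
				val = 0;	/* was not accounted */
		}
		idx = MEM_CGROUP_STAT_FILE_MAPPED;
		break;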

> +		idx = MEM_CGROUP_STAT_FILE_MAPPED;
> +		break;
> +	default:
> +		BUG();
> +		break;
> +	}
>  	/*
>  	 * Preemption is already disabled. We can use __this_cpu_xxx
>  	 */
> -	__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val);
> +	__this_cpu_add(mem->stat->count[idx], val);
> +}
>  
> -done:
> -	unlock_page_cgroup(pc);
> +void mem_cgroup_update_stat(struct page *page, int idx, bool charge)
> +{
> +	struct page_cgroup *pc;
> +
> +	pc = lookup_page_cgroup(page);
> +	if (unlikely(!pc))
> +		return;
> +
> +	if (trylock_page_cgroup(pc)) {
> +		__mem_cgroup_update_stat(pc, idx, charge);
> +		unlock_page_cgroup(pc);
> +	}
> +	return;
> +}
> +
> +static void mem_cgroup_migrate_stat(struct page_cgroup *pc,
> +	struct mem_cgroup *from, struct mem_cgroup *to)
> +{
> +	preempt_disable();
> +	if (PageCgroupFileMapped(pc)) {
> +		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> +		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> +	}
> +	preempt_enable();
> +}
> +
I think preemption is already disabled here too (by lock_page_cgroup()).
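
(For reference: bit_spin_lock() begins with preempt_disable(), roughly like
this -- paraphrased from include/linux/bit_spinlock.h:)

static inline void bit_spin_lock(int bitnum, unsigned long *addr)
{
	preempt_disable();		/* preemption is off from here on */
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
	while (unlikely(test_and_set_bit_lock(bitnum, addr))) {
		while (test_bit(bitnum, addr)) {
			preempt_enable();
			cpu_relax();
			preempt_disable();
		}
	}
#endif
	__acquire(bitlock);
}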

> +static void
> +__mem_cgroup_stat_fixup(struct page_cgroup *pc, struct mem_cgroup *mem)
> +{
> +	/* We'are in uncharge() and lock_page_cgroup */
> +	if (PageCgroupFileMapped(pc)) {
> +		__this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> +		ClearPageCgroupFileMapped(pc);
> +	}
>  }
>  
ditto.

>  /*
> @@ -1810,13 +1859,7 @@ static void __mem_cgroup_move_account(st
>  	VM_BUG_ON(pc->mem_cgroup != from);
>  
>  	page = pc->page;
> -	if (page_mapped(page) && !PageAnon(page)) {
> -		/* Update mapped_file data for mem_cgroup */
> -		preempt_disable();
> -		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> -		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> -		preempt_enable();
> -	}
> +	mem_cgroup_migrate_stat(pc, from, to);
>  	mem_cgroup_charge_statistics(from, pc, false);
>  	if (uncharge)
>  		/* This is not "cancel", but cancel_charge does all we need. */
I welcome this fixup. IIUC, we have a stat leak in the current implementation.


Thanks,
Daisuke Nishimura.

> @@ -2208,6 +2251,9 @@ __mem_cgroup_uncharge_common(struct page
>  		__do_uncharge(mem, ctype);
>  	if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
>  		mem_cgroup_swap_statistics(mem, true);
> +	if (unlikely(PCG_PageStatMask & pc->flags))
> +		__mem_cgroup_stat_fixup(pc, mem);
> +
>  	mem_cgroup_charge_statistics(mem, pc, false);
>  
>  	ClearPageCgroupUsed(pc);
> Index: mmotm-2.6.34-Mar9/include/linux/page_cgroup.h
> ===================================================================
> --- mmotm-2.6.34-Mar9.orig/include/linux/page_cgroup.h
> +++ mmotm-2.6.34-Mar9/include/linux/page_cgroup.h
> @@ -39,6 +39,8 @@ enum {
>  	PCG_CACHE, /* charged as cache */
>  	PCG_USED, /* this object is in use. */
>  	PCG_ACCT_LRU, /* page has been accounted for */
> +	/* for cache-status accounting */
> +	PCG_FILE_MAPPED,
>  };
>  
>  #define TESTPCGFLAG(uname, lname)			\
> @@ -57,6 +59,10 @@ static inline void ClearPageCgroup##unam
>  static inline int TestClearPageCgroup##uname(struct page_cgroup *pc)	\
>  	{ return test_and_clear_bit(PCG_##lname, &pc->flags);  }
>  
> +/* Page/File stat flag mask */
> +#define PCG_PageStatMask	((1 << PCG_FILE_MAPPED))
> +
> +
>  TESTPCGFLAG(Locked, LOCK)
>  
>  /* Cache flag is set only once (at allocation) */
> @@ -73,6 +79,10 @@ CLEARPCGFLAG(AcctLRU, ACCT_LRU)
>  TESTPCGFLAG(AcctLRU, ACCT_LRU)
>  TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU)
>  
> +TESTPCGFLAG(FileMapped, FILE_MAPPED)
> +SETPCGFLAG(FileMapped, FILE_MAPPED)
> +CLEARPCGFLAG(FileMapped, FILE_MAPPED)
> +
>  static inline int page_cgroup_nid(struct page_cgroup *pc)
>  {
>  	return page_to_nid(pc->page);
> @@ -93,6 +103,11 @@ static inline void unlock_page_cgroup(st
>  	bit_spin_unlock(PCG_LOCK, &pc->flags);
>  }
>  
> +static inline int trylock_page_cgroup(struct page_cgroup *pc)
> +{
> +	return bit_spin_trylock(PCG_LOCK, &pc->flags);
> +}
> +
>  #else /* CONFIG_CGROUP_MEM_RES_CTLR */
>  struct page_cgroup;
>  
> Index: mmotm-2.6.34-Mar9/include/linux/memcontrol.h
> ===================================================================
> --- mmotm-2.6.34-Mar9.orig/include/linux/memcontrol.h
> +++ mmotm-2.6.34-Mar9/include/linux/memcontrol.h
> @@ -124,7 +124,12 @@ static inline bool mem_cgroup_disabled(v
>  	return false;
>  }
>  
> -void mem_cgroup_update_file_mapped(struct page *page, int val);
> +enum mem_cgroup_page_stat_item {
> +	MEMCG_NR_FILE_MAPPED,
> +	MEMCG_NR_FILE_NSTAT,
> +};
> +
> +void mem_cgroup_update_stat(struct page *page, int idx, bool charge);
>  unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
>  						gfp_t gfp_mask, int nid,
>  						int zid);
> Index: mmotm-2.6.34-Mar9/mm/rmap.c
> ===================================================================
> --- mmotm-2.6.34-Mar9.orig/mm/rmap.c
> +++ mmotm-2.6.34-Mar9/mm/rmap.c
> @@ -829,7 +829,7 @@ void page_add_file_rmap(struct page *pag
>  {
>  	if (atomic_inc_and_test(&page->_mapcount)) {
>  		__inc_zone_page_state(page, NR_FILE_MAPPED);
> -		mem_cgroup_update_file_mapped(page, 1);
> +		mem_cgroup_update_stat(page, MEMCG_NR_FILE_MAPPED, true);
>  	}
>  }
>  
> @@ -861,7 +861,7 @@ void page_remove_rmap(struct page *page)
>  		__dec_zone_page_state(page, NR_ANON_PAGES);
>  	} else {
>  		__dec_zone_page_state(page, NR_FILE_MAPPED);
> -		mem_cgroup_update_file_mapped(page, -1);
> +		mem_cgroup_update_stat(page, MEMCG_NR_FILE_MAPPED, false);
>  	}
>  	/*
>  	 * It would be tidy to reset the PageAnon mapping here,
> 


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH mmotm 2.5/4] memcg: disable irq at page cgroup lock (Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure)
  2010-03-11  7:50                                     ` Daisuke Nishimura
@ 2010-03-11  8:06                                       ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 41+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-03-11  8:06 UTC (permalink / raw)
  To: Daisuke Nishimura
  Cc: balbir, linux-mm, Andrea Righi, linux-kernel, Trond Myklebust,
	Suleiman Souhlal, Andrew Morton, containers, Vivek Goyal

On Thu, 11 Mar 2010 16:50:20 +0900
Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:

> On Thu, 11 Mar 2010 15:15:11 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > On Thu, 11 Mar 2010 14:13:00 +0900
> > KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > 
> > > On Thu, 11 Mar 2010 13:58:47 +0900
> > > Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> > > > > I'll consider yet another fix for race in account migration if I can.
> > > > > 
> > > > me too.
> > > > 
> > > 
> > > How about this ? Assume that the race is very rare.
> > > 
> > > 	1. use trylock when updating statistics.
> > > 	   If trylock fails, don't account it.
> > > 
> > > 	2. add PCG_FLAG for all status as
> > > 
> > > +	PCG_ACCT_FILE_MAPPED, /* page is accounted as file rss*/
> > > +	PCG_ACCT_DIRTY, /* page is dirty */
> > > +	PCG_ACCT_WRITEBACK, /* page is being written back to disk */
> > > +	PCG_ACCT_WRITEBACK_TEMP, /* page is used as temporary buffer for FUSE */
> > > +	PCG_ACCT_UNSTABLE_NFS, /* NFS page not yet committed to the server */
> > > 
> > > 	3. At reducing counter, check PCG_xxx flags by
> > > 	TESTCLEARPCGFLAG()
> > > 
> > > This is similar to an _used_ method of LRU accounting. And We can think this
> > > method's error-range never go too bad number. 
> > > 
> I agree with you. I've been thinking whether we can remove page cgroup lock
> in update_stat as we do in lru handling codes.
> 
> > > I think this kind of fuzzy accounting is enough for writeback status.
> > > Does anyone need strict accounting ?
> > > 
> > 
> IMHO, we don't need strict accounting.
> 
> > How this looks ?
> I agree to this direction. One concern is we re-introduce "trylock" again..
> 
Yes, it's my concern, too.


> Some comments are inlined.

> > +	switch (idx) {
> > +	case MEMCG_NR_FILE_MAPPED:
> > +		if (charge) {
> > +			if (!PageCgroupFileMapped(pc))
> > +				SetPageCgroupFileMapped(pc);
> > +			else
> > +				val = 0;
> > +		} else {
> > +			if (PageCgroupFileMapped(pc))
> > +				ClearPageCgroupFileMapped(pc);
> > +			else
> > +				val = 0;
> > +		}
> Using !TestSetPageCgroupFileMapped(pc) or TestClearPageCgroupFileMapped(pc) is better ?
> 

I used this style because we're under the lock. (IOW, to show we're guarded by the lock.)


> > +		idx = MEM_CGROUP_STAT_FILE_MAPPED;
> > +		break;
> > +	default:
> > +		BUG();
> > +		break;
> > +	}
> >  	/*
> >  	 * Preemption is already disabled. We can use __this_cpu_xxx
> >  	 */
> > -	__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val);
> > +	__this_cpu_add(mem->stat->count[idx], val);
> > +}
> >  
> > -done:
> > -	unlock_page_cgroup(pc);
> > +void mem_cgroup_update_stat(struct page *page, int idx, bool charge)
> > +{
> > +	struct page_cgroup *pc;
> > +
> > +	pc = lookup_page_cgroup(page);
> > +	if (unlikely(!pc))
> > +		return;
> > +
> > +	if (trylock_page_cgroup(pc)) {
> > +		__mem_cgroup_update_stat(pc, idx, charge);
> > +		unlock_page_cgroup(pc);
> > +	}
> > +	return;
> > +}
> > +
> > +static void mem_cgroup_migrate_stat(struct page_cgroup *pc,
> > +	struct mem_cgroup *from, struct mem_cgroup *to)
> > +{
> > +	preempt_disable();
> > +	if (PageCgroupFileMapped(pc)) {
> > +		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> > +		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> > +	}
> > +	preempt_enable();
> > +}
> > +
> I think preemption is already disabled here too(by lock_page_cgroup()).
> 
Ah, yes. 


> > +static void
> > +__mem_cgroup_stat_fixup(struct page_cgroup *pc, struct mem_cgroup *mem)
> > +{
> > +	/* We'are in uncharge() and lock_page_cgroup */
> > +	if (PageCgroupFileMapped(pc)) {
> > +		__this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> > +		ClearPageCgroupFileMapped(pc);
> > +	}
> >  }
> >  
> ditto.
> 
ok.

> >  /*
> > @@ -1810,13 +1859,7 @@ static void __mem_cgroup_move_account(st
> >  	VM_BUG_ON(pc->mem_cgroup != from);
> >  
> >  	page = pc->page;
> > -	if (page_mapped(page) && !PageAnon(page)) {
> > -		/* Update mapped_file data for mem_cgroup */
> > -		preempt_disable();
> > -		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> > -		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> > -		preempt_enable();
> > -	}
> > +	mem_cgroup_migrate_stat(pc, from, to);
> >  	mem_cgroup_charge_statistics(from, pc, false);
> >  	if (uncharge)
> >  		/* This is not "cancel", but cancel_charge does all we need. */
> I welcome this fixup. IIUC, we have stat leak in current implementation.
> 

If necessary, I'd like to prepare a fixed one as an independent patch for mmotm.

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH mmotm 2.5/4] memcg: disable irq at page cgroup lock (Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure)
  2010-03-11  4:49                             ` KAMEZAWA Hiroyuki
  2010-03-11  4:58                               ` Daisuke Nishimura
@ 2010-03-11 16:54                               ` Vivek Goyal
  2010-03-11 22:34                                 ` Andrea Righi
  2010-03-11 23:46                                 ` KAMEZAWA Hiroyuki
  1 sibling, 2 replies; 41+ messages in thread
From: Vivek Goyal @ 2010-03-11 16:54 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Daisuke Nishimura, balbir, linux-mm, Andrea Righi, linux-kernel,
	Trond Myklebust, Suleiman Souhlal, Andrew Morton, containers

On Thu, Mar 11, 2010 at 01:49:08PM +0900, KAMEZAWA Hiroyuki wrote:
> On Thu, 11 Mar 2010 13:31:23 +0900
> Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> 
> > On Wed, 10 Mar 2010 09:26:24 +0530, Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> > > * nishimura@mxp.nes.nec.co.jp <nishimura@mxp.nes.nec.co.jp> [2010-03-10 10:43:09]:
> 
> > I made a patch(attached) using both local_irq_disable/enable and local_irq_save/restore.
> > local_irq_save/restore is used only in mem_cgroup_update_file_mapped.
> > 
> > And I attached a histogram graph of 30 times kernel build in root cgroup for each.
> > 
> >   before_root: no irq operation(original)
> >   after_root: local_irq_disable/enable for all
> >   after2_root: local_irq_save/restore for all
> >   after3_root: mixed version(attached)
> > 
> > hmm, there seems to be a tendency that before < after < after3 < after2 ?
> > Should I replace save/restore version to mixed version ?
> > 
> 
> IMHO, starting from after2_root version is the easist.
> If there is a chance to call lock/unlock page_cgroup can be called in
> interrupt context, we _have to_ disable IRQ, anyway.
> And if we have to do this, I prefer migration_lock rather than this mixture.
> 
> BTW, how big your system is ? Balbir-san's concern is for bigger machines.
> But I'm not sure this change is affecte by the size of machines.
> I'm sorry I have no big machine, now.

FWIW, I took Andrea's patches (the local_irq_save/restore solution) and
compiled the kernel on 32 hyperthreaded cores (64 cpus) with make -j32
in /dev/shm/. On this system, I can't see much difference.

I compiled the kernel 10 times and took the average.

Without andrea's patches: 28.698 (seconds)
With andrea's patches: 28.711 (seconds).
Diff is .04%
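(For reference, that is (28.711 - 28.698) / 28.698 ~= 0.00045, i.e. roughly 0.04-0.05%.)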

This should all be in the root cgroup. Note, I have not mounted the memory cgroup
controller but it is compiled in, so I am assuming that root group
accounting is still taking place. I am also assuming that actual IO to disk
is not required and that /dev/shm is enough to see the effect
of local_irq_save()/restore().

Thanks
Vivek


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH mmotm 2.5/4] memcg: disable irq at page cgroup lock (Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure)
  2010-03-11 16:54                               ` Vivek Goyal
@ 2010-03-11 22:34                                 ` Andrea Righi
  2010-03-11 23:46                                 ` KAMEZAWA Hiroyuki
  1 sibling, 0 replies; 41+ messages in thread
From: Andrea Righi @ 2010-03-11 22:34 UTC (permalink / raw)
  To: Vivek Goyal
  Cc: KAMEZAWA Hiroyuki, Daisuke Nishimura, balbir, linux-mm,
	linux-kernel, Trond Myklebust, Suleiman Souhlal, Andrew Morton,
	containers

On Thu, Mar 11, 2010 at 11:54:13AM -0500, Vivek Goyal wrote:
> On Thu, Mar 11, 2010 at 01:49:08PM +0900, KAMEZAWA Hiroyuki wrote:
> > On Thu, 11 Mar 2010 13:31:23 +0900
> > Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> > 
> > > On Wed, 10 Mar 2010 09:26:24 +0530, Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> > > > * nishimura@mxp.nes.nec.co.jp <nishimura@mxp.nes.nec.co.jp> [2010-03-10 10:43:09]:
> > 
> > > I made a patch(attached) using both local_irq_disable/enable and local_irq_save/restore.
> > > local_irq_save/restore is used only in mem_cgroup_update_file_mapped.
> > > 
> > > And I attached a histogram graph of 30 times kernel build in root cgroup for each.
> > > 
> > >   before_root: no irq operation(original)
> > >   after_root: local_irq_disable/enable for all
> > >   after2_root: local_irq_save/restore for all
> > >   after3_root: mixed version(attached)
> > > 
> > > hmm, there seems to be a tendency that before < after < after3 < after2 ?
> > > Should I replace save/restore version to mixed version ?
> > > 
> > 
> > IMHO, starting from after2_root version is the easist.
> > If there is a chance to call lock/unlock page_cgroup can be called in
> > interrupt context, we _have to_ disable IRQ, anyway.
> > And if we have to do this, I prefer migration_lock rather than this mixture.
> > 
> > BTW, how big your system is ? Balbir-san's concern is for bigger machines.
> > But I'm not sure this change is affecte by the size of machines.
> > I'm sorry I have no big machine, now.
> 
> FWIW, I took andrea's patches (local_irq_save/restore solution) and
> compiled the kernel on 32 cores hyperthreaded (64 cpus) with make -j32
> in /dev/shm/. On this system, I can't see much difference.
> 
> I compiled the kernel 10 times and took average.
> 
> Without andrea's patches: 28.698 (seconds)
> With andrea's patches: 28.711 (seconds).
> Diff is .04%
> 
> This is all should be in root cgroup. Note, I have not mounted memory cgroup
> controller but it is compiled in. So I am assuming that root group
> accounting will still be taking place. Also assuming that it is not
> required to do actual IO to disk and /dev/shm is enough to see the results
> of local_irq_save()/restore.

cgroups can only be disabled at boot time with "cgroup_disable=...", so root cgroup
accounting should be enabled.

The same goes for the local_irq_save()/restore() overhead: lock/unlock_page_cgroup()
is called during each charge.

Many thanks for testing!
-Andrea


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH mmotm 2.5/4] memcg: disable irq at page cgroup lock (Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure)
  2010-03-11 16:54                               ` Vivek Goyal
  2010-03-11 22:34                                 ` Andrea Righi
@ 2010-03-11 23:46                                 ` KAMEZAWA Hiroyuki
  1 sibling, 0 replies; 41+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-03-11 23:46 UTC (permalink / raw)
  To: Vivek Goyal
  Cc: Daisuke Nishimura, balbir, linux-mm, Andrea Righi, linux-kernel,
	Trond Myklebust, Suleiman Souhlal, Andrew Morton, containers

On Thu, 11 Mar 2010 11:54:13 -0500
Vivek Goyal <vgoyal@redhat.com> wrote:

> On Thu, Mar 11, 2010 at 01:49:08PM +0900, KAMEZAWA Hiroyuki wrote:
> > On Thu, 11 Mar 2010 13:31:23 +0900
> > Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> > 
> > > On Wed, 10 Mar 2010 09:26:24 +0530, Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> > > > * nishimura@mxp.nes.nec.co.jp <nishimura@mxp.nes.nec.co.jp> [2010-03-10 10:43:09]:
> > 
> > > I made a patch(attached) using both local_irq_disable/enable and local_irq_save/restore.
> > > local_irq_save/restore is used only in mem_cgroup_update_file_mapped.
> > > 
> > > And I attached a histogram graph of 30 times kernel build in root cgroup for each.
> > > 
> > >   before_root: no irq operation(original)
> > >   after_root: local_irq_disable/enable for all
> > >   after2_root: local_irq_save/restore for all
> > >   after3_root: mixed version(attached)
> > > 
> > > hmm, there seems to be a tendency that before < after < after3 < after2 ?
> > > Should I replace save/restore version to mixed version ?
> > > 
> > 
> > IMHO, starting from after2_root version is the easist.
> > If there is a chance to call lock/unlock page_cgroup can be called in
> > interrupt context, we _have to_ disable IRQ, anyway.
> > And if we have to do this, I prefer migration_lock rather than this mixture.
> > 
> > BTW, how big your system is ? Balbir-san's concern is for bigger machines.
> > But I'm not sure this change is affecte by the size of machines.
> > I'm sorry I have no big machine, now.
> 
> FWIW, I took andrea's patches (local_irq_save/restore solution) and
> compiled the kernel on 32 cores hyperthreaded (64 cpus) with make -j32
> in /dev/shm/. On this system, I can't see much difference.
> 
> I compiled the kernel 10 times and took average.
> 
> Without andrea's patches: 28.698 (seconds)
> With andrea's patches: 28.711 (seconds).
> Diff is .04%
> 
> This is all should be in root cgroup. Note, I have not mounted memory cgroup
> controller but it is compiled in. So I am assuming that root group
> accounting will still be taking place. Also assuming that it is not
> required to do actual IO to disk and /dev/shm is enough to see the results
> of local_irq_save()/restore.
> 

Thank you! Hmm, then irq_xxxx is not the core of the problem. The overhead problem
is whether we use a spinlock or not...

Regards,
-Kame



^ permalink raw reply	[flat|nested] 41+ messages in thread

end of thread, other threads:[~2010-03-11 23:50 UTC | newest]

Thread overview: 41+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-03-07 20:57 [PATCH -mmotm 0/4] memcg: per cgroup dirty limit (v5) Andrea Righi
2010-03-07 20:57 ` [PATCH -mmotm 1/4] memcg: dirty memory documentation Andrea Righi
2010-03-07 20:57 ` [PATCH -mmotm 2/4] page_cgroup: introduce file cache flags Andrea Righi
2010-03-07 20:57 ` [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure Andrea Righi
2010-03-08  1:44   ` Daisuke Nishimura
2010-03-08  1:56     ` KAMEZAWA Hiroyuki
2010-03-08  2:17       ` Daisuke Nishimura
2010-03-08  2:37         ` KAMEZAWA Hiroyuki
2010-03-08  8:07           ` Daisuke Nishimura
2010-03-08  8:31             ` KAMEZAWA Hiroyuki
2010-03-09  0:12               ` Andrea Righi
2010-03-09  0:19                 ` KAMEZAWA Hiroyuki
2010-03-09  1:29                   ` [PATCH mmotm 2.5/4] memcg: disable irq at page cgroup lock (Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure) Daisuke Nishimura
2010-03-09  2:07                     ` KAMEZAWA Hiroyuki
2010-03-09  4:50                     ` Balbir Singh
2010-03-10  1:43                       ` Daisuke Nishimura
2010-03-10  3:56                         ` Balbir Singh
2010-03-11  4:31                           ` Daisuke Nishimura
2010-03-11  4:49                             ` KAMEZAWA Hiroyuki
2010-03-11  4:58                               ` Daisuke Nishimura
2010-03-11  5:13                                 ` KAMEZAWA Hiroyuki
2010-03-11  6:15                                   ` KAMEZAWA Hiroyuki
2010-03-11  7:50                                     ` Daisuke Nishimura
2010-03-11  8:06                                       ` KAMEZAWA Hiroyuki
2010-03-11 16:54                               ` Vivek Goyal
2010-03-11 22:34                                 ` Andrea Righi
2010-03-11 23:46                                 ` KAMEZAWA Hiroyuki
2010-03-09  9:07                     ` Andrea Righi
2010-03-09  0:18               ` [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure Daisuke Nishimura
2010-03-09  0:20                 ` KAMEZAWA Hiroyuki
2010-03-09  0:52                   ` Daisuke Nishimura
2010-03-09  0:03             ` Andrea Righi
2010-03-07 20:57 ` [PATCH -mmotm 4/4] memcg: dirty pages instrumentation Andrea Righi
2010-03-08  2:31   ` KAMEZAWA Hiroyuki
  -- strict thread matches above, loose matches on Subject: below --
2010-03-04 10:40 [PATCH -mmotm 0/4] memcg: per cgroup dirty limit (v4) Andrea Righi
2010-03-04 10:40 ` [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure Andrea Righi
2010-03-04 11:54   ` Kirill A. Shutemov
2010-03-05  1:12   ` Daisuke Nishimura
2010-03-05  1:58     ` KAMEZAWA Hiroyuki
2010-03-05  7:01       ` Balbir Singh
2010-03-05 22:14       ` Andrea Righi
2010-03-05 22:14     ` Andrea Righi

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).