linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Greg Thelen <gthelen@google.com>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-kernel@vger.kernel.org, linux-mm@kvack.org,
	containers@lists.osdl.org, linux-fsdevel@vger.kernel.org,
	KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>,
	Balbir Singh <bsingharora@gmail.com>,
	Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>,
	Minchan Kim <minchan.kim@gmail.com>,
	Johannes Weiner <hannes@cmpxchg.org>,
	Wu Fengguang <fengguang.wu@intel.com>,
	Dave Chinner <david@fromorbit.com>,
	Vivek Goyal <vgoyal@redhat.com>,
	Andrea Righi <andrea@betterlinux.com>,
	Ciju Rajan K <ciju@linux.vnet.ibm.com>,
	David Rientjes <rientjes@google.com>,
	Greg Thelen <gthelen@google.com>
Subject: [PATCH v9 12/13] memcg: create support routines for page writeback
Date: Wed, 17 Aug 2011 09:15:04 -0700	[thread overview]
Message-ID: <1313597705-6093-13-git-send-email-gthelen@google.com> (raw)
In-Reply-To: <1313597705-6093-1-git-send-email-gthelen@google.com>

Introduce memcg routines to assist in per-memcg dirty page management:

- mem_cgroup_balance_dirty_pages() walks a memcg hierarchy comparing
  dirty memory usage against memcg foreground and background thresholds.
  If an over-background-threshold memcg is found, then per-memcg
  background writeback is queued.  Per-memcg writeback differs from
  classic, non-memcg, per bdi writeback by setting the new
  writeback_control.for_cgroup bit.

  If an over-foreground-threshold memcg is found, then foreground
  writeout occurs.  When performing foreground writeout, first consider
  inodes exclusive to the memcg.  If unable to make enough progress,
  then consider inodes shared between memcg.  Such cross-memcg inode
  sharing is likely to be rare in situations that use per-cgroup memory
  isolation.  So the approach tries to handle the common case well
  without falling over in cases where such sharing exists.  This routine
  is used by balance_dirty_pages() in a later change.

- mem_cgroup_hierarchical_dirty_info() returns the dirty memory usage
  and limits of the memcg closest to (or over) its dirty limit.  This
  will be used by throttle_vm_writeout() in a later change.

Signed-off-by: Greg Thelen <gthelen@google.com>
---
Changelog since v8:

- Use 'memcg' rather than 'mem' for local variables and parameters.
  This is consistent with other memory controller code.

- No more field additions to struct writeback_control.

- Added more comments to mem_cgroup_balance_dirty_pages().

- Adapted to changes in writeback_inodes_wb().

- Improved mem_cgroup_hierarchical_dirty_info() comment.

 include/linux/memcontrol.h        |   18 ++++
 include/trace/events/memcontrol.h |   88 ++++++++++++++++++++
 mm/memcontrol.c                   |  165 +++++++++++++++++++++++++++++++++++++
 3 files changed, 271 insertions(+), 0 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 103d297..f49bd2d 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -186,6 +186,11 @@ bool should_writeback_mem_cgroup_inode(struct inode *inode,
 				       bool shared_inodes);
 bool mem_cgroups_over_bground_dirty_thresh(void);
 void mem_cgroup_writeback_done(void);
+bool mem_cgroup_hierarchical_dirty_info(unsigned long sys_available_mem,
+					struct mem_cgroup *memcg,
+					struct dirty_info *info);
+void mem_cgroup_balance_dirty_pages(struct address_space *mapping,
+				    unsigned long write_chunk);
 
 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 						gfp_t gfp_mask,
@@ -402,6 +407,19 @@ static inline void mem_cgroup_writeback_done(void)
 {
 }
 
+static inline void mem_cgroup_balance_dirty_pages(struct address_space *mapping,
+						  unsigned long write_chunk)
+{
+}
+
+static inline bool
+mem_cgroup_hierarchical_dirty_info(unsigned long sys_available_mem,
+				   struct mem_cgroup *memcg,
+				   struct dirty_info *info)
+{
+	return false;
+}
+
 static inline
 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 					    gfp_t gfp_mask,
diff --git a/include/trace/events/memcontrol.h b/include/trace/events/memcontrol.h
index 966aac0..20bbb85 100644
--- a/include/trace/events/memcontrol.h
+++ b/include/trace/events/memcontrol.h
@@ -113,6 +113,94 @@ TRACE_EVENT(mem_cgroups_over_bground_dirty_thresh,
 		  __entry->first_id)
 )
 
+DECLARE_EVENT_CLASS(mem_cgroup_consider_writeback,
+	TP_PROTO(unsigned short css_id,
+		 struct backing_dev_info *bdi,
+		 unsigned long nr_reclaimable,
+		 unsigned long thresh,
+		 bool over_limit),
+
+	TP_ARGS(css_id, bdi, nr_reclaimable, thresh, over_limit),
+
+	TP_STRUCT__entry(
+		__field(unsigned short, css_id)
+		__field(struct backing_dev_info *, bdi)
+		__field(unsigned long, nr_reclaimable)
+		__field(unsigned long, thresh)
+		__field(bool, over_limit)
+	),
+
+	TP_fast_assign(
+		__entry->css_id = css_id;
+		__entry->bdi = bdi;
+		__entry->nr_reclaimable = nr_reclaimable;
+		__entry->thresh = thresh;
+		__entry->over_limit = over_limit;
+	),
+
+	TP_printk("css_id=%d bdi=%p nr_reclaimable=%ld thresh=%ld "
+		  "over_limit=%d", __entry->css_id, __entry->bdi,
+		  __entry->nr_reclaimable, __entry->thresh, __entry->over_limit)
+)
+
+#define DEFINE_MEM_CGROUP_CONSIDER_WRITEBACK_EVENT(name) \
+DEFINE_EVENT(mem_cgroup_consider_writeback, name, \
+	TP_PROTO(unsigned short id, \
+		 struct backing_dev_info *bdi, \
+		 unsigned long nr_reclaimable, \
+		 unsigned long thresh, \
+		 bool over_limit), \
+	TP_ARGS(id, bdi, nr_reclaimable, thresh, over_limit) \
+)
+
+DEFINE_MEM_CGROUP_CONSIDER_WRITEBACK_EVENT(mem_cgroup_consider_bg_writeback);
+DEFINE_MEM_CGROUP_CONSIDER_WRITEBACK_EVENT(mem_cgroup_consider_fg_writeback);
+
+TRACE_EVENT(mem_cgroup_fg_writeback,
+	TP_PROTO(unsigned long write_chunk,
+		 long nr_written,
+		 unsigned short css_id,
+		 bool shared_inodes),
+
+	TP_ARGS(write_chunk, nr_written, css_id, shared_inodes),
+
+	TP_STRUCT__entry(
+		__field(unsigned long, write_chunk)
+		__field(long, nr_written)
+		__field(unsigned short, css_id)
+		__field(bool, shared_inodes)
+	),
+
+	TP_fast_assign(
+		__entry->write_chunk = write_chunk;
+		__entry->nr_written = nr_written;
+		__entry->css_id = css_id;
+		__entry->shared_inodes = shared_inodes;
+	),
+
+	TP_printk("css_id=%d write_chunk=%ld nr_written=%ld shared_inodes=%d",
+		  __entry->css_id,
+		  __entry->write_chunk,
+		  __entry->nr_written,
+		  __entry->shared_inodes)
+)
+
+TRACE_EVENT(mem_cgroup_enable_shared_writeback,
+	TP_PROTO(unsigned short css_id),
+
+	TP_ARGS(css_id),
+
+	TP_STRUCT__entry(
+		__field(unsigned short, css_id)
+		),
+
+	TP_fast_assign(
+		__entry->css_id = css_id;
+		),
+
+	TP_printk("enabling shared writeback for memcg %d", __entry->css_id)
+)
+
 #endif /* _TRACE_MEMCONTROL_H */
 
 /* This part must be outside protection */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5092a68..9d0b559 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1696,6 +1696,171 @@ void mem_cgroup_writeback_done(void)
 	}
 }
 
+/*
+ * This routine must be called periodically by processes which generate dirty
+ * pages.  It considers the dirty page usage and thresholds of the current
+ * cgroup and (if hierarchical accounting is enabled) its ancestral memcg.
+ * If any of the considered memcg are over their background dirty limit, then
+ * background writeback is queued.  If any are over the foreground dirty limit
+ * then the dirtying task is throttled while writing dirty data.  The per-memcg
+ * dirty limits checked by this routine are distinct from either the per-system,
+ * per-bdi, or per-task limits considered by balance_dirty_pages().
+ *
+ *   Example hierarchy:
+ *                 root
+ *            A            B
+ *        A1      A2         B1
+ *     A11 A12  A21 A22
+ *
+ * Assume that mem_cgroup_balance_dirty_pages() is called on A11.  This routine
+ * starts at A11 walking upwards towards the root.  If A11 is over dirty limit,
+ * then writeback A11 inodes until under limit.  Next check A1, if over limit
+ * then write A1,A11,A12.  Then check A.  If A is over A limit, then invoke
+ * writeback on A* until A is under A limit.
+ */
+void mem_cgroup_balance_dirty_pages(struct address_space *mapping,
+				    unsigned long write_chunk)
+{
+	struct backing_dev_info *bdi = mapping->backing_dev_info;
+	struct mem_cgroup *memcg;
+	struct mem_cgroup *ref_memcg;
+	struct dirty_info info;
+	unsigned long nr_reclaimable;
+	unsigned long nr_written;
+	unsigned long sys_available_mem;
+	unsigned long pause = 1;
+	unsigned short id;
+	bool over;
+	bool shared_inodes;
+
+	if (mem_cgroup_disabled())
+		return;
+
+	sys_available_mem = determine_dirtyable_memory();
+
+	/* reference the memcg so it is not deleted during this routine */
+	rcu_read_lock();
+	memcg = mem_cgroup_from_task(current);
+	if (memcg && mem_cgroup_is_root(memcg))
+		memcg = NULL;
+	if (memcg)
+		css_get(&memcg->css);
+	rcu_read_unlock();
+	ref_memcg = memcg;
+
+	/* balance entire ancestry of current's memcg. */
+	for (; mem_cgroup_has_dirty_limit(memcg);
+	     memcg = parent_mem_cgroup(memcg)) {
+		id = css_id(&memcg->css);
+
+		/*
+		 * Keep throttling and writing inode data so long as memcg is
+		 * over its dirty limit.  Inodes written by multiple memcg (aka
+		 * shared_inodes) cannot easily be attributed to a particular
+		 * memcg.  Shared inodes are thought to be much rarer than
+		 * non-shared inodes.  First try to satisfy this memcg's dirty
+		 * limits using non-shared inodes.
+		 */
+		for (shared_inodes = false; ; ) {
+			/*
+			 * if memcg is under dirty limit, then break from
+			 * throttling loop.
+			 */
+			mem_cgroup_dirty_info(sys_available_mem, memcg, &info);
+			nr_reclaimable = dirty_info_reclaimable(&info);
+			over = nr_reclaimable > info.dirty_thresh;
+			trace_mem_cgroup_consider_fg_writeback(
+				id, bdi, nr_reclaimable, info.dirty_thresh,
+				over);
+			if (!over)
+				break;
+
+			nr_written = writeback_inodes_wb(&bdi->wb, write_chunk,
+							 memcg, shared_inodes);
+			trace_mem_cgroup_fg_writeback(write_chunk, nr_written,
+						      id, shared_inodes);
+			/* if no progress, then consider shared inodes */
+			if ((nr_written == 0) && !shared_inodes) {
+				trace_mem_cgroup_enable_shared_writeback(id);
+				shared_inodes = true;
+			}
+
+			__set_current_state(TASK_UNINTERRUPTIBLE);
+			io_schedule_timeout(pause);
+
+			/*
+			 * Increase the delay for each loop, up to our previous
+			 * default of taking a 100ms nap.
+			 */
+			pause <<= 1;
+			if (pause > HZ / 10)
+				pause = HZ / 10;
+		}
+
+		/* if memcg is over background limit, then queue bg writeback */
+		over = nr_reclaimable >= info.background_thresh;
+		trace_mem_cgroup_consider_bg_writeback(
+			id, bdi, nr_reclaimable, info.background_thresh,
+			over);
+		if (over)
+			mem_cgroup_queue_bg_writeback(memcg, bdi);
+	}
+
+	if (ref_memcg)
+		css_put(&ref_memcg->css);
+}
+
+/*
+ * Set @info to the dirty thresholds and usage of the memcg (within the
+ * ancestral chain of @memcg) closest to its dirty limit or the first memcg over
+ * its limit.
+ *
+ * The check is not stable because the usage and limits can change
+ * asynchronously to this routine.
+ *
+ * If @memcg has no per-cgroup dirty limits, then returns false.
+ * Otherwise @info is set and returns true.
+ */
+bool mem_cgroup_hierarchical_dirty_info(unsigned long sys_available_mem,
+					struct mem_cgroup *memcg,
+					struct dirty_info *info)
+{
+	unsigned long usage;
+	struct dirty_info uninitialized_var(cur_info);
+
+	if (mem_cgroup_disabled())
+		return false;
+
+	info->nr_writeback = ULONG_MAX;  /* invalid initial value */
+
+	/* walk up hierarchy enabled parents */
+	for (; mem_cgroup_has_dirty_limit(memcg);
+	     memcg = parent_mem_cgroup(memcg)) {
+		mem_cgroup_dirty_info(sys_available_mem, memcg, &cur_info);
+		usage = dirty_info_reclaimable(&cur_info) +
+			cur_info.nr_writeback;
+
+		/* if over limit, stop searching */
+		if (usage >= cur_info.dirty_thresh) {
+			*info = cur_info;
+			break;
+		}
+
+		/*
+		 * Save dirty usage of memcg closest to its limit if either:
+		 *     - memcg is the first memcg considered
+		 *     - memcg dirty margin is smaller than last recorded one
+		 */
+		if ((info->nr_writeback == ULONG_MAX) ||
+		    (cur_info.dirty_thresh - usage) <
+		    (info->dirty_thresh -
+		     (dirty_info_reclaimable(info) + info->nr_writeback)))
+			*info = cur_info;
+	}
+
+	return info->nr_writeback != ULONG_MAX;
+}
+
 static void mem_cgroup_start_move(struct mem_cgroup *mem)
 {
 	int cpu;
-- 
1.7.3.1

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  parent reply	other threads:[~2011-08-17 16:15 UTC|newest]

Thread overview: 36+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2011-08-17 16:14 [PATCH v9 00/13] memcg: per cgroup dirty page limiting Greg Thelen
2011-08-17 16:14 ` [PATCH v9 01/13] memcg: document cgroup dirty memory interfaces Greg Thelen
2011-08-17 16:14 ` [PATCH v9 02/13] memcg: add page_cgroup flags for dirty page tracking Greg Thelen
2011-08-17 16:14 ` [PATCH v9 03/13] memcg: add dirty page accounting infrastructure Greg Thelen
2011-08-18  0:39   ` KAMEZAWA Hiroyuki
2011-08-18  6:07     ` Greg Thelen
2011-08-17 16:14 ` [PATCH v9 04/13] memcg: add kernel calls for memcg dirty page stats Greg Thelen
2011-08-17 16:14 ` [PATCH v9 05/13] memcg: add mem_cgroup_mark_inode_dirty() Greg Thelen
2011-08-18  0:51   ` KAMEZAWA Hiroyuki
2011-08-17 16:14 ` [PATCH v9 06/13] memcg: add dirty limits to mem_cgroup Greg Thelen
2011-08-18  0:53   ` KAMEZAWA Hiroyuki
2011-08-17 16:14 ` [PATCH v9 07/13] memcg: add cgroupfs interface to memcg dirty limits Greg Thelen
2011-08-18  0:55   ` KAMEZAWA Hiroyuki
2011-08-17 16:15 ` [PATCH v9 08/13] memcg: dirty page accounting support routines Greg Thelen
2011-08-18  1:05   ` KAMEZAWA Hiroyuki
2011-08-18  7:04     ` Greg Thelen
2011-08-17 16:15 ` [PATCH v9 09/13] memcg: create support routines for writeback Greg Thelen
2011-08-18  1:13   ` KAMEZAWA Hiroyuki
2011-08-17 16:15 ` [PATCH v9 10/13] writeback: pass wb_writeback_work into move_expired_inodes() Greg Thelen
2011-08-18  1:15   ` KAMEZAWA Hiroyuki
2011-08-17 16:15 ` [PATCH v9 11/13] writeback: make background writeback cgroup aware Greg Thelen
2011-08-18  1:23   ` KAMEZAWA Hiroyuki
2011-08-18  7:10     ` Greg Thelen
2011-08-18  7:17       ` KAMEZAWA Hiroyuki
2011-08-18  7:38         ` Greg Thelen
2011-08-18  7:35           ` KAMEZAWA Hiroyuki
2011-08-17 16:15 ` Greg Thelen [this message]
2011-08-18  1:38   ` [PATCH v9 12/13] memcg: create support routines for page writeback KAMEZAWA Hiroyuki
2011-08-18  2:36     ` Wu Fengguang
2011-08-18 10:12       ` Jan Kara
2011-08-18 12:17         ` Wu Fengguang
2011-08-18 20:08           ` Jan Kara
2011-08-19  1:36             ` Wu Fengguang
2011-08-17 16:15 ` [PATCH v9 13/13] memcg: check memcg dirty limits in " Greg Thelen
2011-08-18  1:40   ` KAMEZAWA Hiroyuki
2011-08-18  0:35 ` [PATCH v9 00/13] memcg: per cgroup dirty page limiting KAMEZAWA Hiroyuki

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1313597705-6093-13-git-send-email-gthelen@google.com \
    --to=gthelen@google.com \
    --cc=akpm@linux-foundation.org \
    --cc=andrea@betterlinux.com \
    --cc=bsingharora@gmail.com \
    --cc=ciju@linux.vnet.ibm.com \
    --cc=containers@lists.osdl.org \
    --cc=david@fromorbit.com \
    --cc=fengguang.wu@intel.com \
    --cc=hannes@cmpxchg.org \
    --cc=kamezawa.hiroyu@jp.fujitsu.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=minchan.kim@gmail.com \
    --cc=nishimura@mxp.nes.nec.co.jp \
    --cc=rientjes@google.com \
    --cc=vgoyal@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).