From: Glauber Costa <glommer@openvz.org>
To: <linux-mm@kvack.org>
Cc: <cgroups@vger.kernel.org>,
	Andrew Morton <akpm@linux-foundation.org>,
	Greg Thelen <gthelen@google.com>,
	<kamezawa.hiroyu@jp.fujitsu.com>, Michal Hocko <mhocko@suse.cz>,
	Johannes Weiner <hannes@cmpxchg.org>,
	<linux-fsdevel@vger.kernel.org>,
	Dave Chinner <david@fromorbit.com>,
	hughd@google.com, Glauber Costa <glommer@openvz.org>,
	Dave Chinner <dchinner@redhat.com>, Mel Gorman <mgorman@suse.de>
Subject: [PATCH v7 13/34] vmscan: per-node deferred work
Date: Mon, 20 May 2013 00:07:06 +0400
Message-ID: <1368994047-5997-14-git-send-email-glommer@openvz.org>
In-Reply-To: <1368994047-5997-1-git-send-email-glommer@openvz.org>

We already keep per-node LRU lists for the objects being shrunk, but the
work that is deferred from one run to the next is kept global. This
creates an impedance mismatch: under pressure on one node, deferred work
accumulates and ends up being flushed on other nodes.

On large machines, many nodes can accumulate deferred work at the same
time, all adding to the same global counter. As the counter grows, we
ask the caches to flush ever bigger numbers of objects. The result is
that the caches are depleted and never stabilize. To achieve stable
steady-state behavior, we need to handle the deferral differently.

With this patch the deferred count is kept per-node and is never added
to other nodes' counts.
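
To put invented numbers on it: every call to shrink_slab() computes,
per shrinker,

	delta = (4 * nr_pages_scanned / shrinker->seeks) * max_pass
						/ (lru_pages + 1)

so with seeks == DEFAULT_SEEKS (2), nr_pages_scanned == 1000 and
lru_pages == 100000 each call asks for roughly max_pass / 50 objects.
When GFP_NOFS allocations keep the shrinker from doing that work, the
leftover target is pushed into a single global counter that every node
under pressure feeds at the same time. The next caller that can make
progress inherits the whole sum on whatever node it happens to run on;
once pressure is high enough that delta reaches max_pass / 4, the only
remaining cap is 2 * max_pass, i.e. twice the freeable objects, which
is enough to empty the cache there. Keeping the deferred count per-node
bounds the inherited work by what was actually deferred on that node.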

Signed-off-by: Glauber Costa <glommer@openvz.org>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
---
 include/linux/shrinker.h |  30 +++++-
 mm/vmscan.c              | 245 ++++++++++++++++++++++++++++-------------------
 2 files changed, 175 insertions(+), 100 deletions(-)
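
Not part of the commit, just an illustration of how a NUMA aware
shrinker is expected to use the interface below. The callback
prototypes are inferred from how they are invoked in vmscan.c in this
patch; the my_cache_* helpers are hypothetical.

	#include <linux/shrinker.h>

	/* hypothetical per-node object accounting for some cache */
	extern long my_cache_count_node(int nid);
	extern long my_cache_free_node(int nid, unsigned long nr);

	static long my_count(struct shrinker *s, struct shrink_control *sc)
	{
		/* report freeable objects on the node currently being shrunk */
		return my_cache_count_node(sc->nid);
	}

	static long my_scan(struct shrinker *s, struct shrink_control *sc)
	{
		/* free up to sc->nr_to_scan objects from sc->nid's LRU */
		return my_cache_free_node(sc->nid, sc->nr_to_scan);
	}

	static struct shrinker my_shrinker = {
		.count_objects	= my_count,
		.scan_objects	= my_scan,
		.seeks		= DEFAULT_SEEKS,
		.flags		= SHRINKER_NUMA_AWARE,
	};

	static int my_cache_init(void)
	{
		/*
		 * register_shrinker() can now fail: with SHRINKER_NUMA_AWARE
		 * set it allocates the nr_deferred_node array, one
		 * atomic_long_t per possible node.
		 */
		return register_shrinker(&my_shrinker);
	}

Shrinkers that do not set SHRINKER_NUMA_AWARE see no change besides the
new return value of register_shrinker(): their deferred count stays in
the single atomic_long_t of the union. The per-node array costs
nr_node_ids * sizeof(atomic_long_t) per registered shrinker (e.g. 512
bytes with 64 possible nodes), which is why only NUMA aware shrinkers
pay for it.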

diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 98be3ab..d70b123 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -19,6 +19,8 @@ struct shrink_control {
 
 	/* shrink from these nodes */
 	nodemask_t nodes_to_scan;
+	/* current node being shrunk (for NUMA aware shrinkers) */
+	int nid;
 };
 
 /*
@@ -42,6 +44,8 @@ struct shrink_control {
  * objects freed during the scan, or -1 if progress cannot be made due to
  * potential deadlocks. If -1 is returned, then no further attempts to call the
  * @scan_objects will be made from the current reclaim context.
+ *
+ * @flags determines the shrinker's abilities, e.g. NUMA awareness
  */
 struct shrinker {
 	int (*shrink)(struct shrinker *, struct shrink_control *sc);
@@ -50,12 +54,34 @@ struct shrinker {
 
 	int seeks;	/* seeks to recreate an obj */
 	long batch;	/* reclaim batch size, 0 = default */
+	unsigned long flags;
 
 	/* These are for internal use */
 	struct list_head list;
-	atomic_long_t nr_in_batch; /* objs pending delete */
+	/*
+	 * We would like to avoid allocating memory when registering a new
+	 * shrinker. All shrinkers will need to keep track of deferred objects,
+	 * and we need a counter for this. If the shrinkers are not NUMA aware,
+	 * this is a small and bounded space that fits into an atomic_long_t.
+	 * This is because the deferral decisions are global in that case,
+	 * and we do not need to allocate anything.
+	 *
+	 * When the shrinker is NUMA aware, we will need this to be a per-node
+	 * array. Only a minority of shrinkers are NUMA aware, so this
+	 * saves quite a bit of memory.
+	 */
+	union {
+		/* objs pending delete */
+		atomic_long_t nr_deferred;
+		/* objs pending delete, per node */
+		atomic_long_t *nr_deferred_node;
+	};
 };
 #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
-extern void register_shrinker(struct shrinker *);
+
+/* Flags */
+#define SHRINKER_NUMA_AWARE (1 << 0)
+
+extern int register_shrinker(struct shrinker *);
 extern void unregister_shrinker(struct shrinker *);
 #endif
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 35a6a9b..374d2b6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -155,14 +155,36 @@ static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
 }
 
 /*
- * Add a shrinker callback to be called from the vm
+ * Add a shrinker callback to be called from the vm.
+ *
+ * Registration cannot fail unless SHRINKER_NUMA_AWARE is set: in that
+ * case this function allocates memory and may return -ENOMEM.
  */
-void register_shrinker(struct shrinker *shrinker)
+int register_shrinker(struct shrinker *shrinker)
 {
-	atomic_long_set(&shrinker->nr_in_batch, 0);
+	/*
+	 * If the system has only one possible node anyway, save ourselves
+	 * the trouble and disable the NUMA aware behavior. This way we
+	 * allocate nothing, saving memory and the small per-node loop
+	 * later on.
+	 */
+	if (nr_node_ids == 1)
+		shrinker->flags &= ~SHRINKER_NUMA_AWARE;
+
+	if (shrinker->flags & SHRINKER_NUMA_AWARE) {
+		size_t size;
+
+		size = sizeof(*shrinker->nr_deferred_node) * nr_node_ids;
+		shrinker->nr_deferred_node = kzalloc(size, GFP_KERNEL);
+		if (!shrinker->nr_deferred_node)
+			return -ENOMEM;
+	} else
+		atomic_long_set(&shrinker->nr_deferred, 0);
+
 	down_write(&shrinker_rwsem);
 	list_add_tail(&shrinker->list, &shrinker_list);
 	up_write(&shrinker_rwsem);
+	return 0;
 }
 EXPORT_SYMBOL(register_shrinker);
 
@@ -186,6 +208,116 @@ static inline int do_shrinker_shrink(struct shrinker *shrinker,
 }
 
 #define SHRINK_BATCH 128
+
+static unsigned long
+shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
+		 unsigned long nr_pages_scanned, unsigned long lru_pages,
+		 atomic_long_t *deferred)
+{
+	unsigned long freed = 0;
+	unsigned long long delta;
+	long total_scan;
+	long max_pass;
+	long nr;
+	long new_nr;
+	long batch_size = shrinker->batch ? shrinker->batch
+					  : SHRINK_BATCH;
+
+	if (shrinker->scan_objects) {
+		max_pass = shrinker->count_objects(shrinker, shrinkctl);
+		WARN_ON(max_pass < 0);
+	} else
+		max_pass = do_shrinker_shrink(shrinker, shrinkctl, 0);
+	if (max_pass <= 0)
+		return 0;
+
+	/*
+	 * copy the current shrinker scan count into a local variable
+	 * and zero it so that other concurrent shrinker invocations
+	 * don't also do this scanning work.
+	 */
+	nr = atomic_long_xchg(deferred, 0);
+
+	total_scan = nr;
+	delta = (4 * nr_pages_scanned) / shrinker->seeks;
+	delta *= max_pass;
+	do_div(delta, lru_pages + 1);
+	total_scan += delta;
+	if (total_scan < 0) {
+		printk(KERN_ERR
+		"shrink_slab: %pF negative objects to delete nr=%ld\n",
+		       shrinker->shrink, total_scan);
+		total_scan = max_pass;
+	}
+
+	/*
+	 * We need to avoid excessive windup on filesystem shrinkers
+	 * due to large numbers of GFP_NOFS allocations causing the
+	 * shrinkers to return -1 all the time. This results in a large
+	 * nr being built up so when a shrink that can do some work
+	 * comes along it empties the entire cache due to nr >>>
+	 * max_pass.  This is bad for sustaining a working set in
+	 * memory.
+	 *
+	 * Hence only allow the shrinker to scan the entire cache when
+	 * a large delta change is calculated directly.
+	 */
+	if (delta < max_pass / 4)
+		total_scan = min(total_scan, max_pass / 2);
+
+	/*
+	 * Avoid risking looping forever due to too large nr value:
+	 * never try to free more than twice the estimated number of
+	 * freeable entries.
+	 */
+	if (total_scan > max_pass * 2)
+		total_scan = max_pass * 2;
+
+	trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
+				nr_pages_scanned, lru_pages,
+				max_pass, delta, total_scan);
+
+	while (total_scan >= batch_size) {
+		long ret;
+
+		if (shrinker->scan_objects) {
+			shrinkctl->nr_to_scan = batch_size;
+			ret = shrinker->scan_objects(shrinker, shrinkctl);
+
+			if (ret == -1)
+				break;
+			freed += ret;
+		} else {
+			int nr_before;
+			nr_before = do_shrinker_shrink(shrinker, shrinkctl, 0);
+			ret = do_shrinker_shrink(shrinker, shrinkctl,
+							batch_size);
+			if (ret == -1)
+				break;
+			if (ret < nr_before)
+				freed += nr_before - ret;
+		}
+
+		count_vm_events(SLABS_SCANNED, batch_size);
+		total_scan -= batch_size;
+
+		cond_resched();
+	}
+
+	/*
+	 * move the unused scan count back into the shrinker in a
+	 * manner that handles concurrent updates. If we exhausted the
+	 * scan, there is no need to do an update.
+	 */
+	if (total_scan > 0)
+		new_nr = atomic_long_add_return(total_scan, deferred);
+	else
+		new_nr = atomic_long_read(deferred);
+
+	trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
+	return freed;
+}
+
 /*
  * Call the shrink functions to age shrinkable caches
  *
@@ -222,107 +354,24 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl,
 	}
 
 	list_for_each_entry(shrinker, &shrinker_list, list) {
-		unsigned long long delta;
-		long total_scan;
-		long max_pass;
-		long nr;
-		long new_nr;
-		long batch_size = shrinker->batch ? shrinker->batch
-						  : SHRINK_BATCH;
 
-		if (shrinker->scan_objects) {
-			max_pass = shrinker->count_objects(shrinker, shrinkctl);
-			WARN_ON(max_pass < 0);
-		} else
-			max_pass = do_shrinker_shrink(shrinker, shrinkctl, 0);
-		if (max_pass <= 0)
-			continue;
+		if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) {
+			shrinkctl->nid = 0;
 
-		/*
-		 * copy the current shrinker scan count into a local variable
-		 * and zero it so that other concurrent shrinker invocations
-		 * don't also do this scanning work.
-		 */
-		nr = atomic_long_xchg(&shrinker->nr_in_batch, 0);
-
-		total_scan = nr;
-		delta = (4 * nr_pages_scanned) / shrinker->seeks;
-		delta *= max_pass;
-		do_div(delta, lru_pages + 1);
-		total_scan += delta;
-		if (total_scan < 0) {
-			printk(KERN_ERR
-			"shrink_slab: %pF negative objects to delete nr=%ld\n",
-			       shrinker->shrink, total_scan);
-			total_scan = max_pass;
+			freed += shrink_slab_node(shrinkctl, shrinker,
+				 nr_pages_scanned, lru_pages,
+				 &shrinker->nr_deferred);
+			continue;
 		}
 
-		/*
-		 * We need to avoid excessive windup on filesystem shrinkers
-		 * due to large numbers of GFP_NOFS allocations causing the
-		 * shrinkers to return -1 all the time. This results in a large
-		 * nr being built up so when a shrink that can do some work
-		 * comes along it empties the entire cache due to nr >>>
-		 * max_pass.  This is bad for sustaining a working set in
-		 * memory.
-		 *
-		 * Hence only allow the shrinker to scan the entire cache when
-		 * a large delta change is calculated directly.
-		 */
-		if (delta < max_pass / 4)
-			total_scan = min(total_scan, max_pass / 2);
-
-		/*
-		 * Avoid risking looping forever due to too large nr value:
-		 * never try to free more than twice the estimate number of
-		 * freeable entries.
-		 */
-		if (total_scan > max_pass * 2)
-			total_scan = max_pass * 2;
-
-		trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
-					nr_pages_scanned, lru_pages,
-					max_pass, delta, total_scan);
-
-		while (total_scan >= batch_size) {
-			long ret;
-
-			if (shrinker->scan_objects) {
-				shrinkctl->nr_to_scan = batch_size;
-				ret = shrinker->scan_objects(shrinker, shrinkctl);
-
-				if (ret == -1)
-					break;
-				freed += ret;
-			} else {
-				int nr_before;
-				nr_before = do_shrinker_shrink(shrinker, shrinkctl, 0);
-				ret = do_shrinker_shrink(shrinker, shrinkctl,
-								batch_size);
-				if (ret == -1)
-					break;
-				if (ret < nr_before)
-					freed += nr_before - ret;
-			}
-
-			count_vm_events(SLABS_SCANNED, batch_size);
-			total_scan -= batch_size;
+		for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
+			if (!node_online(shrinkctl->nid))
+				continue;
 
-			cond_resched();
+			freed += shrink_slab_node(shrinkctl, shrinker,
+				 nr_pages_scanned, lru_pages,
+				 &shrinker->nr_deferred_node[shrinkctl->nid]);
 		}
-
-		/*
-		 * move the unused scan count back into the shrinker in a
-		 * manner that handles concurrent updates. If we exhausted the
-		 * scan, there is no need to do an update.
-		 */
-		if (total_scan > 0)
-			new_nr = atomic_long_add_return(total_scan,
-					&shrinker->nr_in_batch);
-		else
-			new_nr = atomic_long_read(&shrinker->nr_in_batch);
-
-		trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
 	}
 	up_read(&shrinker_rwsem);
 out:
-- 
1.8.1.4

