[PATCH 01/10] Revert "slub: move synchronize_sched out of slab_mutex on shrink"

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Tejun Heo <tj@kernel.org>
To: vdavydov.dev@gmail.com, cl@linux.com, penberg@kernel.org,
	rientjes@google.com, iamjoonsoo.kim@lge.com,
	akpm@linux-foundation.org
Cc: jsvana@fb.com, hannes@cmpxchg.org, linux-kernel@vger.kernel.org,
	linux-mm@kvack.org, cgroups@vger.kernel.org, kernel-team@fb.com,
	Tejun Heo <tj@kernel.org>
Subject: [PATCH 01/10] Revert "slub: move synchronize_sched out of slab_mutex on shrink"
Date: Tue, 17 Jan 2017 15:54:02 -0800	[thread overview]
Message-ID: <20170117235411.9408-2-tj@kernel.org> (raw)
In-Reply-To: <20170117235411.9408-1-tj@kernel.org>

This reverts commit 89e364db71fb5e7fc8d93228152abfa67daf35fa.

With kmem cgroup support enabled, kmem_caches can be created and
destroyed frequently, and a great number of near-empty kmem_caches can
accumulate if there are a lot of transient cgroups and the system is
not under memory pressure.  When memory reclaim starts under such
conditions, it can lead to consecutive deactivation and destruction of
many kmem_caches, easily hundreds of thousands on moderately large
systems, exposing scalability issues in the current slab management
code.  This is one of the patches to address the issue.

Moving synchronize_sched() out of slab_mutex isn't enough, as it is
still called with cgroup_mutex held.  The whole deactivation / release
path will be updated to avoid all synchronous RCU operations.  Revert
this insufficient optimization to make those upcoming changes easier.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Jay Vana <jsvana@fb.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
---
 mm/slab.c        |  4 ++--
 mm/slab.h        |  2 +-
 mm/slab_common.c | 27 ++-------------------------
 mm/slob.c        |  2 +-
 mm/slub.c        | 19 +++++++++++++++++--
 5 files changed, 23 insertions(+), 31 deletions(-)

diff --git a/mm/slab.c b/mm/slab.c
index 29bc6c0..767e8e4 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2314,7 +2314,7 @@ static int drain_freelist(struct kmem_cache *cache,
 	return nr_freed;
 }
 
-int __kmem_cache_shrink(struct kmem_cache *cachep)
+int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate)
 {
 	int ret = 0;
 	int node;
@@ -2334,7 +2334,7 @@ int __kmem_cache_shrink(struct kmem_cache *cachep)
 
 int __kmem_cache_shutdown(struct kmem_cache *cachep)
 {
-	return __kmem_cache_shrink(cachep);
+	return __kmem_cache_shrink(cachep, false);
 }
 
 void __kmem_cache_release(struct kmem_cache *cachep)
diff --git a/mm/slab.h b/mm/slab.h
index de6579d..4acc644 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -161,7 +161,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size,
 
 int __kmem_cache_shutdown(struct kmem_cache *);
 void __kmem_cache_release(struct kmem_cache *);
-int __kmem_cache_shrink(struct kmem_cache *);
+int __kmem_cache_shrink(struct kmem_cache *, bool);
 void slab_kmem_cache_release(struct kmem_cache *);
 
 struct seq_file;
diff --git a/mm/slab_common.c b/mm/slab_common.c
index ae32384..46ff746 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -579,29 +579,6 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
 	get_online_cpus();
 	get_online_mems();
 
-#ifdef CONFIG_SLUB
-	/*
-	 * In case of SLUB, we need to disable empty slab caching to
-	 * avoid pinning the offline memory cgroup by freeable kmem
-	 * pages charged to it. SLAB doesn't need this, as it
-	 * periodically purges unused slabs.
-	 */
-	mutex_lock(&slab_mutex);
-	list_for_each_entry(s, &slab_caches, list) {
-		c = is_root_cache(s) ? cache_from_memcg_idx(s, idx) : NULL;
-		if (c) {
-			c->cpu_partial = 0;
-			c->min_partial = 0;
-		}
-	}
-	mutex_unlock(&slab_mutex);
-	/*
-	 * kmem_cache->cpu_partial is checked locklessly (see
-	 * put_cpu_partial()). Make sure the change is visible.
-	 */
-	synchronize_sched();
-#endif
-
 	mutex_lock(&slab_mutex);
 	list_for_each_entry(s, &slab_caches, list) {
 		if (!is_root_cache(s))
@@ -613,7 +590,7 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
 		if (!c)
 			continue;
 
-		__kmem_cache_shrink(c);
+		__kmem_cache_shrink(c, true);
 		arr->entries[idx] = NULL;
 	}
 	mutex_unlock(&slab_mutex);
@@ -784,7 +761,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
 	get_online_cpus();
 	get_online_mems();
 	kasan_cache_shrink(cachep);
-	ret = __kmem_cache_shrink(cachep);
+	ret = __kmem_cache_shrink(cachep, false);
 	put_online_mems();
 	put_online_cpus();
 	return ret;
diff --git a/mm/slob.c b/mm/slob.c
index eac04d4..5ec1580 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -634,7 +634,7 @@ void __kmem_cache_release(struct kmem_cache *c)
 {
 }
 
-int __kmem_cache_shrink(struct kmem_cache *d)
+int __kmem_cache_shrink(struct kmem_cache *d, bool deactivate)
 {
 	return 0;
 }
diff --git a/mm/slub.c b/mm/slub.c
index 067598a..68b84f9 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3883,7 +3883,7 @@ EXPORT_SYMBOL(kfree);
  * being allocated from last increasing the chance that the last objects
  * are freed in them.
  */
-int __kmem_cache_shrink(struct kmem_cache *s)
+int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
 {
 	int node;
 	int i;
@@ -3895,6 +3895,21 @@ int __kmem_cache_shrink(struct kmem_cache *s)
 	unsigned long flags;
 	int ret = 0;
 
+	if (deactivate) {
+		/*
+		 * Disable empty slabs caching. Used to avoid pinning offline
+		 * memory cgroups by kmem pages that can be freed.
+		 */
+		s->cpu_partial = 0;
+		s->min_partial = 0;
+
+		/*
+		 * s->cpu_partial is checked locklessly (see put_cpu_partial),
+		 * so we have to make sure the change is visible.
+		 */
+		synchronize_sched();
+	}
+
 	flush_all(s);
 	for_each_kmem_cache_node(s, node, n) {
 		INIT_LIST_HEAD(&discard);
@@ -3951,7 +3966,7 @@ static int slab_mem_going_offline_callback(void *arg)
 
 	mutex_lock(&slab_mutex);
 	list_for_each_entry(s, &slab_caches, list)
-		__kmem_cache_shrink(s);
+		__kmem_cache_shrink(s, false);
 	mutex_unlock(&slab_mutex);
 
 	return 0;
-- 
2.9.3

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

WARNING: multiple messages have this Message-ID (diff)

From: Tejun Heo <tj@kernel.org>
To: vdavydov.dev@gmail.com, cl@linux.com, penberg@kernel.org,
	rientjes@google.com, iamjoonsoo.kim@lge.com,
	akpm@linux-foundation.org
Cc: jsvana@fb.com, hannes@cmpxchg.org, linux-kernel@vger.kernel.org,
	linux-mm@kvack.org, cgroups@vger.kernel.org, kernel-team@fb.com,
	Tejun Heo <tj@kernel.org>
Subject: [PATCH 01/10] Revert "slub: move synchronize_sched out of slab_mutex on shrink"
Date: Tue, 17 Jan 2017 15:54:02 -0800	[thread overview]
Message-ID: <20170117235411.9408-2-tj@kernel.org> (raw)
In-Reply-To: <20170117235411.9408-1-tj@kernel.org>

This reverts commit 89e364db71fb5e7fc8d93228152abfa67daf35fa.

With kmem cgroup support enabled, kmem_caches can be created and
destroyed frequently, and a great number of near-empty kmem_caches can
accumulate if there are a lot of transient cgroups and the system is
not under memory pressure.  When memory reclaim starts under such
conditions, it can lead to consecutive deactivation and destruction of
many kmem_caches, easily hundreds of thousands on moderately large
systems, exposing scalability issues in the current slab management
code.  This is one of the patches to address the issue.

Moving synchronize_sched() out of slab_mutex isn't enough, as it is
still called with cgroup_mutex held.  The whole deactivation / release
path will be updated to avoid all synchronous RCU operations.  Revert
this insufficient optimization to make those upcoming changes easier.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Jay Vana <jsvana@fb.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
---
 mm/slab.c        |  4 ++--
 mm/slab.h        |  2 +-
 mm/slab_common.c | 27 ++-------------------------
 mm/slob.c        |  2 +-
 mm/slub.c        | 19 +++++++++++++++++--
 5 files changed, 23 insertions(+), 31 deletions(-)

diff --git a/mm/slab.c b/mm/slab.c
index 29bc6c0..767e8e4 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2314,7 +2314,7 @@ static int drain_freelist(struct kmem_cache *cache,
 	return nr_freed;
 }
 
-int __kmem_cache_shrink(struct kmem_cache *cachep)
+int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate)
 {
 	int ret = 0;
 	int node;
@@ -2334,7 +2334,7 @@ int __kmem_cache_shrink(struct kmem_cache *cachep)
 
 int __kmem_cache_shutdown(struct kmem_cache *cachep)
 {
-	return __kmem_cache_shrink(cachep);
+	return __kmem_cache_shrink(cachep, false);
 }
 
 void __kmem_cache_release(struct kmem_cache *cachep)
diff --git a/mm/slab.h b/mm/slab.h
index de6579d..4acc644 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -161,7 +161,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size,
 
 int __kmem_cache_shutdown(struct kmem_cache *);
 void __kmem_cache_release(struct kmem_cache *);
-int __kmem_cache_shrink(struct kmem_cache *);
+int __kmem_cache_shrink(struct kmem_cache *, bool);
 void slab_kmem_cache_release(struct kmem_cache *);
 
 struct seq_file;
diff --git a/mm/slab_common.c b/mm/slab_common.c
index ae32384..46ff746 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -579,29 +579,6 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
 	get_online_cpus();
 	get_online_mems();
 
-#ifdef CONFIG_SLUB
-	/*
-	 * In case of SLUB, we need to disable empty slab caching to
-	 * avoid pinning the offline memory cgroup by freeable kmem
-	 * pages charged to it. SLAB doesn't need this, as it
-	 * periodically purges unused slabs.
-	 */
-	mutex_lock(&slab_mutex);
-	list_for_each_entry(s, &slab_caches, list) {
-		c = is_root_cache(s) ? cache_from_memcg_idx(s, idx) : NULL;
-		if (c) {
-			c->cpu_partial = 0;
-			c->min_partial = 0;
-		}
-	}
-	mutex_unlock(&slab_mutex);
-	/*
-	 * kmem_cache->cpu_partial is checked locklessly (see
-	 * put_cpu_partial()). Make sure the change is visible.
-	 */
-	synchronize_sched();
-#endif
-
 	mutex_lock(&slab_mutex);
 	list_for_each_entry(s, &slab_caches, list) {
 		if (!is_root_cache(s))
@@ -613,7 +590,7 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
 		if (!c)
 			continue;
 
-		__kmem_cache_shrink(c);
+		__kmem_cache_shrink(c, true);
 		arr->entries[idx] = NULL;
 	}
 	mutex_unlock(&slab_mutex);
@@ -784,7 +761,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
 	get_online_cpus();
 	get_online_mems();
 	kasan_cache_shrink(cachep);
-	ret = __kmem_cache_shrink(cachep);
+	ret = __kmem_cache_shrink(cachep, false);
 	put_online_mems();
 	put_online_cpus();
 	return ret;
diff --git a/mm/slob.c b/mm/slob.c
index eac04d4..5ec1580 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -634,7 +634,7 @@ void __kmem_cache_release(struct kmem_cache *c)
 {
 }
 
-int __kmem_cache_shrink(struct kmem_cache *d)
+int __kmem_cache_shrink(struct kmem_cache *d, bool deactivate)
 {
 	return 0;
 }
diff --git a/mm/slub.c b/mm/slub.c
index 067598a..68b84f9 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3883,7 +3883,7 @@ EXPORT_SYMBOL(kfree);
  * being allocated from last increasing the chance that the last objects
  * are freed in them.
  */
-int __kmem_cache_shrink(struct kmem_cache *s)
+int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
 {
 	int node;
 	int i;
@@ -3895,6 +3895,21 @@ int __kmem_cache_shrink(struct kmem_cache *s)
 	unsigned long flags;
 	int ret = 0;
 
+	if (deactivate) {
+		/*
+		 * Disable empty slabs caching. Used to avoid pinning offline
+		 * memory cgroups by kmem pages that can be freed.
+		 */
+		s->cpu_partial = 0;
+		s->min_partial = 0;
+
+		/*
+		 * s->cpu_partial is checked locklessly (see put_cpu_partial),
+		 * so we have to make sure the change is visible.
+		 */
+		synchronize_sched();
+	}
+
 	flush_all(s);
 	for_each_kmem_cache_node(s, node, n) {
 		INIT_LIST_HEAD(&discard);
@@ -3951,7 +3966,7 @@ static int slab_mem_going_offline_callback(void *arg)
 
 	mutex_lock(&slab_mutex);
 	list_for_each_entry(s, &slab_caches, list)
-		__kmem_cache_shrink(s);
+		__kmem_cache_shrink(s, false);
 	mutex_unlock(&slab_mutex);
 
 	return 0;
-- 
2.9.3

next prev parent reply	other threads:[~2017-01-17 23:54 UTC|newest]

Thread overview: 35+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-01-17 23:54 [PATCHSET v3] slab: make memcg slab destruction scalable Tejun Heo
2017-01-17 23:54 ` Tejun Heo
2017-01-17 23:54 ` Tejun Heo [this message]
2017-01-17 23:54   ` [PATCH 01/10] Revert "slub: move synchronize_sched out of slab_mutex on shrink" Tejun Heo
2017-01-17 23:54 ` [PATCH 02/10] slub: separate out sysfs_slab_release() from sysfs_slab_remove() Tejun Heo
2017-01-17 23:54   ` Tejun Heo
     [not found]   ` <20170117235411.9408-3-tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
2017-01-23 22:54     ` [PATCH v2 " Tejun Heo
2017-01-23 22:54       ` Tejun Heo
2017-01-23 22:54       ` Tejun Heo
2017-01-27 18:00       ` Vladimir Davydov
2017-01-27 18:00         ` Vladimir Davydov
2017-01-17 23:54 ` [PATCH 03/10] slab: remove synchronous rcu_barrier() call in memcg cache release path Tejun Heo
2017-01-17 23:54   ` Tejun Heo
2017-01-27 18:03   ` Vladimir Davydov
2017-01-27 18:03     ` Vladimir Davydov
2017-01-17 23:54 ` [PATCH 04/10] slab: reorganize memcg_cache_params Tejun Heo
2017-01-17 23:54   ` Tejun Heo
2017-01-17 23:54 ` [PATCH 05/10] slab: link memcg kmem_caches on their associated memory cgroup Tejun Heo
2017-01-17 23:54   ` Tejun Heo
2017-01-17 23:54 ` [PATCH 06/10] slab: implement slab_root_caches list Tejun Heo
2017-01-17 23:54   ` Tejun Heo
2017-01-27 18:06   ` Vladimir Davydov
2017-01-27 18:06     ` Vladimir Davydov
2017-01-17 23:54 ` [PATCH 07/10] slab: introduce __kmemcg_cache_deactivate() Tejun Heo
2017-01-17 23:54   ` Tejun Heo
2017-01-17 23:54 ` [PATCH 08/10] slab: remove synchronous synchronize_sched() from memcg cache deactivation path Tejun Heo
2017-01-17 23:54   ` Tejun Heo
2017-01-17 23:54 ` [PATCH 09/10] slab: remove slub sysfs interface files early for empty memcg caches Tejun Heo
2017-01-17 23:54   ` Tejun Heo
2017-01-17 23:54 ` [PATCH 10/10] slab: use memcg_kmem_cache_wq for slab destruction operations Tejun Heo
2017-01-17 23:54   ` Tejun Heo
2017-01-29 16:04   ` Vladimir Davydov
2017-01-29 16:04     ` Vladimir Davydov
2017-02-03 17:43 ` [PATCHSET v3] slab: make memcg slab destruction scalable Tejun Heo
2017-02-03 17:43   ` Tejun Heo

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:29bc6c0 dfblob:767e8e4 dfblob:de6579d dfblob:4acc644
dfblob:ae32384 dfblob:46ff746 dfblob:eac04d4 dfblob:5ec1580
dfblob:067598a dfblob:68b84f9 dfblob:29bc6c0 dfblob:767e8e4
dfblob:de6579d dfblob:4acc644 dfblob:ae32384 dfblob:46ff746
dfblob:eac04d4 dfblob:5ec1580 dfblob:067598a dfblob:68b84f9 )
 OR (
bs:"[PATCH 01/10] Revert "
bs:"slub: move synchronize_sched out of slab_mutex on shrink" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20170117235411.9408-2-tj@kernel.org \
    --to=tj@kernel.org \
    --cc=akpm@linux-foundation.org \
    --cc=cgroups@vger.kernel.org \
    --cc=cl@linux.com \
    --cc=hannes@cmpxchg.org \
    --cc=iamjoonsoo.kim@lge.com \
    --cc=jsvana@fb.com \
    --cc=kernel-team@fb.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=penberg@kernel.org \
    --cc=rientjes@google.com \
    --cc=vdavydov.dev@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.