Re: [PATCH RFC 3/3] slub: reparent memcg caches' slabs on memcg offline

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Vladimir Davydov <vdavydov@parallels.com>
To: Christoph Lameter <cl@linux.com>
Cc: hannes@cmpxchg.org, mhocko@suse.cz, akpm@linux-foundation.org,
	linux-kernel@vger.kernel.org, linux-mm@kvack.org
Subject: Re: [PATCH RFC 3/3] slub: reparent memcg caches' slabs on memcg offline
Date: Wed, 21 May 2014 17:58:28 +0400	[thread overview]
Message-ID: <20140521135826.GA23193@esperanza> (raw)
In-Reply-To: <537A4D27.1050909@parallels.com>

On Mon, May 19, 2014 at 10:27:51PM +0400, Vladimir Davydov wrote:
[...]
> However, there is one thing regarding preemptable kernels. The problem
> is after forbidding the cache store free slabs in per-cpu/node partial
> lists by setting min_partial=0 and kmem_cache_has_cpu_partial=false
> (i.e. marking the cache as dead), we have to make sure that all frees
> that saw the cache as alive are over, otherwise they can occasionally
> add a free slab to a per-cpu/node partial list *after* the cache was
> marked dead.

Seems I've found a better way to avoid this race, which does not involve
messing up free hot paths. The idea is to explicitly zap each per-cpu
partial list by setting it pointing to an invalid ptr. Since
put_cpu_partial(), which is called from __slab_free(), uses atomic
cmpxchg for adding a new partial slab to a per cpu partial list, it is
enough to add a check if partials are zapped there and bail out if so.

The patch doing the trick is attached. Could you please take a look at
it once time permit?

Thank you.

--
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 1d9abb7d22a0..c1e318247248 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -526,7 +526,8 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
  * @memcg: pointer to the memcg this cache belongs to
  * @list: list_head for the list of all caches in this memcg
  * @root_cache: pointer to the global, root cache, this cache was derived from
- * @nr_pages: number of pages that belongs to this cache.
+ * @count: the ref count; the cache is destroyed as soon as it reaches 0
+ * @unregister_work: the cache destruction work
  */
 struct memcg_cache_params {
 	bool is_root_cache;
@@ -539,11 +540,20 @@ struct memcg_cache_params {
 			struct mem_cgroup *memcg;
 			struct list_head list;
 			struct kmem_cache *root_cache;
-			atomic_t nr_pages;
+			atomic_t count;
+			struct work_struct unregister_work;
 		};
 	};
 };
 
+/*
+ * Each active slab increments the cache's memcg_params->count, and the owner
+ * memcg, while it's online, adds MEMCG_PARAMS_COUNT_BIAS to the count so that
+ * the cache is dead (i.e. belongs to a memcg that was turned offline) iff
+ * memcg_params->count < MEMCG_PARAMS_COUNT_BIAS.
+ */
+#define MEMCG_PARAMS_COUNT_BIAS		(1U << 31)
+
 int memcg_update_all_caches(int num_memcgs);
 
 struct seq_file;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0fb108e5b905..6c922dd96fd5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3090,6 +3090,8 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
 	return 0;
 }
 
+static void memcg_unregister_cache_func(struct work_struct *w);
+
 int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
 			     struct kmem_cache *root_cache)
 {
@@ -3111,6 +3113,9 @@ int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
 	if (memcg) {
 		s->memcg_params->memcg = memcg;
 		s->memcg_params->root_cache = root_cache;
+		atomic_set(&s->memcg_params->count, MEMCG_PARAMS_COUNT_BIAS);
+		INIT_WORK(&s->memcg_params->unregister_work,
+			  memcg_unregister_cache_func);
 		css_get(&memcg->css);
 	} else
 		s->memcg_params->is_root_cache = true;
@@ -3192,6 +3197,17 @@ static void memcg_unregister_cache(struct kmem_cache *cachep)
 	kmem_cache_destroy(cachep);
 }
 
+static void memcg_unregister_cache_func(struct work_struct *w)
+{
+	struct memcg_cache_params *params =
+		container_of(w, struct memcg_cache_params, unregister_work);
+	struct kmem_cache *cachep = memcg_params_to_cache(params);
+
+	mutex_lock(&memcg_slab_mutex);
+	memcg_unregister_cache(cachep);
+	mutex_unlock(&memcg_slab_mutex);
+}
+
 /*
  * During the creation a new cache, we need to disable our accounting mechanism
  * altogether. This is true even if we are not creating, but rather just
@@ -3254,8 +3270,14 @@ static void memcg_unregister_all_caches(struct mem_cgroup *memcg)
 	mutex_lock(&memcg_slab_mutex);
 	list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) {
 		cachep = memcg_params_to_cache(params);
+
+		/* mark the cache as dead while still holding a ref to it */
+		atomic_sub(MEMCG_PARAMS_COUNT_BIAS - 1, &params->count);
+
 		kmem_cache_shrink(cachep);
-		if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
+
+		/* if nobody except us uses the cache, destroy it immediately */
+		if (atomic_dec_and_test(&params->count))
 			memcg_unregister_cache(cachep);
 	}
 	mutex_unlock(&memcg_slab_mutex);
@@ -3329,14 +3351,15 @@ int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order)
 	res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp,
 				PAGE_SIZE << order);
 	if (!res)
-		atomic_add(1 << order, &cachep->memcg_params->nr_pages);
+		atomic_inc(&cachep->memcg_params->count);
 	return res;
 }
 
 void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
 {
 	memcg_uncharge_kmem(cachep->memcg_params->memcg, PAGE_SIZE << order);
-	atomic_sub(1 << order, &cachep->memcg_params->nr_pages);
+	if (atomic_dec_and_test(&cachep->memcg_params->count))
+		schedule_work(&cachep->memcg_params->unregister_work);
 }
 
 /*
diff --git a/mm/slab.h b/mm/slab.h
index 961a3fb1f5a2..996968c55222 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -179,6 +179,14 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
 	return s->memcg_params->root_cache;
 }
 
+/* Returns true if the cache belongs to a memcg that was turned offline. */
+static inline bool memcg_cache_dead(struct kmem_cache *s)
+{
+	return !is_root_cache(s) &&
+		atomic_read(&s->memcg_params->count) < MEMCG_PARAMS_COUNT_BIAS;
+}
+
+
 static __always_inline int memcg_charge_slab(struct kmem_cache *s,
 					     gfp_t gfp, int order)
 {
@@ -225,6 +233,11 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
 	return s;
 }
 
+static inline bool memcg_cache_dead(struct kmem_cache *s)
+{
+	return false;
+}
+
 static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order)
 {
 	return 0;
diff --git a/mm/slub.c b/mm/slub.c
index fdf0fe4da9a9..21ffae4766e5 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -123,15 +123,6 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
 #endif
 }
 
-static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
-{
-#ifdef CONFIG_SLUB_CPU_PARTIAL
-	return !kmem_cache_debug(s);
-#else
-	return false;
-#endif
-}
-
 /*
  * Issues still to be resolved:
  *
@@ -186,6 +177,24 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
 /* Internal SLUB flags */
 #define __OBJECT_POISON		0x80000000UL /* Poison object */
 #define __CMPXCHG_DOUBLE	0x40000000UL /* Use cmpxchg_double */
+#define __DISABLE_CPU_PARTIAL	0x20000000UL
+
+static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
+{
+#ifdef CONFIG_SLUB_CPU_PARTIAL
+	return !kmem_cache_debug(s) &&
+		!unlikely(s->flags & __DISABLE_CPU_PARTIAL);
+#else
+	return false;
+#endif
+}
+
+#define CPU_PARTIAL_ZAPPED			((struct page *)~0UL)
+
+static inline bool CPU_PARTIAL_VALID(struct page *page)
+{
+	return page && page != CPU_PARTIAL_ZAPPED;
+}
 
 #ifdef CONFIG_SMP
 static struct notifier_block slab_notifier;
@@ -1947,6 +1956,59 @@ redo:
 	}
 }
 
+#ifdef CONFIG_SLUB_CPU_PARTIAL
+/*
+ * Unfreeze partial slab. Called with n->list_lock held.
+ */
+static void __unfreeze_partial(struct kmem_cache *s, struct kmem_cache_node *n,
+			       struct page *page, struct page **discard_page)
+{
+	struct page new, old;
+
+	do {
+
+		old.freelist = page->freelist;
+		old.counters = page->counters;
+		VM_BUG_ON(!old.frozen);
+
+		new.counters = old.counters;
+		new.freelist = old.freelist;
+
+		new.frozen = 0;
+
+	} while (!__cmpxchg_double_slab(s, page,
+			old.freelist, old.counters,
+			new.freelist, new.counters,
+			"unfreezing slab"));
+
+	if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) {
+		page->next = *discard_page;
+		*discard_page = page;
+	} else {
+		add_partial(n, page, DEACTIVATE_TO_TAIL);
+		stat(s, FREE_ADD_PARTIAL);
+	}
+}
+
+static void cancel_cpu_partial(struct kmem_cache *s, struct page *page)
+{
+	struct page *discard_page = NULL;
+	struct kmem_cache_node *n;
+	unsigned long flags;
+
+	n = get_node(s, page_to_nid(page));
+	spin_lock_irqsave(&n->list_lock, flags);
+	__unfreeze_partial(s, n, page, &discard_page);
+	spin_unlock_irqrestore(&n->list_lock, flags);
+
+	if (discard_page) {
+		stat(s, DEACTIVATE_EMPTY);
+		stat(s, FREE_SLAB);
+		discard_slab(s, page);
+	}
+}
+#endif
+
 /*
  * Unfreeze all the cpu partial slabs.
  *
@@ -1961,12 +2023,12 @@ static void unfreeze_partials(struct kmem_cache *s,
 	struct kmem_cache_node *n = NULL, *n2 = NULL;
 	struct page *page, *discard_page = NULL;
 
-	while ((page = c->partial)) {
-		struct page new;
-		struct page old;
-
+	while (CPU_PARTIAL_VALID(page = c->partial)) {
 		c->partial = page->next;
 
+		if (unlikely(s->flags & __DISABLE_CPU_PARTIAL) && !c->partial)
+			c->partial = CPU_PARTIAL_ZAPPED;
+
 		n2 = get_node(s, page_to_nid(page));
 		if (n != n2) {
 			if (n)
@@ -1976,29 +2038,7 @@ static void unfreeze_partials(struct kmem_cache *s,
 			spin_lock(&n->list_lock);
 		}
 
-		do {
-
-			old.freelist = page->freelist;
-			old.counters = page->counters;
-			VM_BUG_ON(!old.frozen);
-
-			new.counters = old.counters;
-			new.freelist = old.freelist;
-
-			new.frozen = 0;
-
-		} while (!__cmpxchg_double_slab(s, page,
-				old.freelist, old.counters,
-				new.freelist, new.counters,
-				"unfreezing slab"));
-
-		if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) {
-			page->next = discard_page;
-			discard_page = page;
-		} else {
-			add_partial(n, page, DEACTIVATE_TO_TAIL);
-			stat(s, FREE_ADD_PARTIAL);
-		}
+		__unfreeze_partial(s, n, page, &discard_page);
 	}
 
 	if (n)
@@ -2036,6 +2076,11 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
 		pobjects = 0;
 		oldpage = this_cpu_read(s->cpu_slab->partial);
 
+		if (unlikely(oldpage == CPU_PARTIAL_ZAPPED)) {
+			cancel_cpu_partial(s, page);
+			return;
+		}
+
 		if (oldpage) {
 			pobjects = oldpage->pobjects;
 			pages = oldpage->pages;
@@ -2106,7 +2151,7 @@ static bool has_cpu_slab(int cpu, void *info)
 	struct kmem_cache *s = info;
 	struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
 
-	return c->page || c->partial;
+	return c->page || CPU_PARTIAL_VALID(c->partial);
 }
 
 static void flush_all(struct kmem_cache *s)
@@ -3411,6 +3456,11 @@ int __kmem_cache_shrink(struct kmem_cache *s)
 	if (!slabs_by_inuse)
 		return -ENOMEM;
 
+	if (memcg_cache_dead(s)) {
+		s->flags |= __DISABLE_CPU_PARTIAL;
+		s->min_partial = 0;
+	}
+
 	flush_all(s);
 	for_each_node_state(node, N_NORMAL_MEMORY) {
 		n = get_node(s, node);
@@ -4315,7 +4365,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
 			nodes[node] += x;
 
 			page = ACCESS_ONCE(c->partial);
-			if (page) {
+			if (CPU_PARTIAL_VALID(page)) {
 				node = page_to_nid(page);
 				if (flags & SO_TOTAL)
 					WARN_ON_ONCE(1);
@@ -4471,7 +4521,8 @@ static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
 	if (err)
 		return err;
 
-	set_min_partial(s, min);
+	if (!memcg_cache_dead(s))
+		set_min_partial(s, min);
 	return length;
 }
 SLAB_ATTR(min_partial);
@@ -4547,7 +4598,7 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
 	for_each_online_cpu(cpu) {
 		struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial;
 
-		if (page) {
+		if (CPU_PARTIAL_VALID(page)) {
 			pages += page->pages;
 			objects += page->pobjects;
 		}
@@ -4559,7 +4610,7 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
 	for_each_online_cpu(cpu) {
 		struct page *page = per_cpu_ptr(s->cpu_slab, cpu) ->partial;
 
-		if (page && len < PAGE_SIZE - 20)
+		if (CPU_PARTIAL_VALID(page) && len < PAGE_SIZE - 20)
 			len += sprintf(buf + len, " C%d=%d(%d)", cpu,
 				page->pobjects, page->pages);
 	}

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

WARNING: multiple messages have this Message-ID (diff)

From: Vladimir Davydov <vdavydov@parallels.com>
To: Christoph Lameter <cl@linux.com>
Cc: <hannes@cmpxchg.org>, <mhocko@suse.cz>,
	<akpm@linux-foundation.org>, <linux-kernel@vger.kernel.org>,
	<linux-mm@kvack.org>
Subject: Re: [PATCH RFC 3/3] slub: reparent memcg caches' slabs on memcg offline
Date: Wed, 21 May 2014 17:58:28 +0400	[thread overview]
Message-ID: <20140521135826.GA23193@esperanza> (raw)
In-Reply-To: <537A4D27.1050909@parallels.com>

On Mon, May 19, 2014 at 10:27:51PM +0400, Vladimir Davydov wrote:
[...]
> However, there is one thing regarding preemptable kernels. The problem
> is after forbidding the cache store free slabs in per-cpu/node partial
> lists by setting min_partial=0 and kmem_cache_has_cpu_partial=false
> (i.e. marking the cache as dead), we have to make sure that all frees
> that saw the cache as alive are over, otherwise they can occasionally
> add a free slab to a per-cpu/node partial list *after* the cache was
> marked dead.

Seems I've found a better way to avoid this race, which does not involve
messing up free hot paths. The idea is to explicitly zap each per-cpu
partial list by setting it pointing to an invalid ptr. Since
put_cpu_partial(), which is called from __slab_free(), uses atomic
cmpxchg for adding a new partial slab to a per cpu partial list, it is
enough to add a check if partials are zapped there and bail out if so.

The patch doing the trick is attached. Could you please take a look at
it once time permit?

Thank you.

--
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 1d9abb7d22a0..c1e318247248 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -526,7 +526,8 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
  * @memcg: pointer to the memcg this cache belongs to
  * @list: list_head for the list of all caches in this memcg
  * @root_cache: pointer to the global, root cache, this cache was derived from
- * @nr_pages: number of pages that belongs to this cache.
+ * @count: the ref count; the cache is destroyed as soon as it reaches 0
+ * @unregister_work: the cache destruction work
  */
 struct memcg_cache_params {
 	bool is_root_cache;
@@ -539,11 +540,20 @@ struct memcg_cache_params {
 			struct mem_cgroup *memcg;
 			struct list_head list;
 			struct kmem_cache *root_cache;
-			atomic_t nr_pages;
+			atomic_t count;
+			struct work_struct unregister_work;
 		};
 	};
 };
 
+/*
+ * Each active slab increments the cache's memcg_params->count, and the owner
+ * memcg, while it's online, adds MEMCG_PARAMS_COUNT_BIAS to the count so that
+ * the cache is dead (i.e. belongs to a memcg that was turned offline) iff
+ * memcg_params->count < MEMCG_PARAMS_COUNT_BIAS.
+ */
+#define MEMCG_PARAMS_COUNT_BIAS		(1U << 31)
+
 int memcg_update_all_caches(int num_memcgs);
 
 struct seq_file;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0fb108e5b905..6c922dd96fd5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3090,6 +3090,8 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
 	return 0;
 }
 
+static void memcg_unregister_cache_func(struct work_struct *w);
+
 int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
 			     struct kmem_cache *root_cache)
 {
@@ -3111,6 +3113,9 @@ int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
 	if (memcg) {
 		s->memcg_params->memcg = memcg;
 		s->memcg_params->root_cache = root_cache;
+		atomic_set(&s->memcg_params->count, MEMCG_PARAMS_COUNT_BIAS);
+		INIT_WORK(&s->memcg_params->unregister_work,
+			  memcg_unregister_cache_func);
 		css_get(&memcg->css);
 	} else
 		s->memcg_params->is_root_cache = true;
@@ -3192,6 +3197,17 @@ static void memcg_unregister_cache(struct kmem_cache *cachep)
 	kmem_cache_destroy(cachep);
 }
 
+static void memcg_unregister_cache_func(struct work_struct *w)
+{
+	struct memcg_cache_params *params =
+		container_of(w, struct memcg_cache_params, unregister_work);
+	struct kmem_cache *cachep = memcg_params_to_cache(params);
+
+	mutex_lock(&memcg_slab_mutex);
+	memcg_unregister_cache(cachep);
+	mutex_unlock(&memcg_slab_mutex);
+}
+
 /*
  * During the creation a new cache, we need to disable our accounting mechanism
  * altogether. This is true even if we are not creating, but rather just
@@ -3254,8 +3270,14 @@ static void memcg_unregister_all_caches(struct mem_cgroup *memcg)
 	mutex_lock(&memcg_slab_mutex);
 	list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) {
 		cachep = memcg_params_to_cache(params);
+
+		/* mark the cache as dead while still holding a ref to it */
+		atomic_sub(MEMCG_PARAMS_COUNT_BIAS - 1, &params->count);
+
 		kmem_cache_shrink(cachep);
-		if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
+
+		/* if nobody except us uses the cache, destroy it immediately */
+		if (atomic_dec_and_test(&params->count))
 			memcg_unregister_cache(cachep);
 	}
 	mutex_unlock(&memcg_slab_mutex);
@@ -3329,14 +3351,15 @@ int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order)
 	res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp,
 				PAGE_SIZE << order);
 	if (!res)
-		atomic_add(1 << order, &cachep->memcg_params->nr_pages);
+		atomic_inc(&cachep->memcg_params->count);
 	return res;
 }
 
 void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
 {
 	memcg_uncharge_kmem(cachep->memcg_params->memcg, PAGE_SIZE << order);
-	atomic_sub(1 << order, &cachep->memcg_params->nr_pages);
+	if (atomic_dec_and_test(&cachep->memcg_params->count))
+		schedule_work(&cachep->memcg_params->unregister_work);
 }
 
 /*
diff --git a/mm/slab.h b/mm/slab.h
index 961a3fb1f5a2..996968c55222 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -179,6 +179,14 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
 	return s->memcg_params->root_cache;
 }
 
+/* Returns true if the cache belongs to a memcg that was turned offline. */
+static inline bool memcg_cache_dead(struct kmem_cache *s)
+{
+	return !is_root_cache(s) &&
+		atomic_read(&s->memcg_params->count) < MEMCG_PARAMS_COUNT_BIAS;
+}
+
+
 static __always_inline int memcg_charge_slab(struct kmem_cache *s,
 					     gfp_t gfp, int order)
 {
@@ -225,6 +233,11 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
 	return s;
 }
 
+static inline bool memcg_cache_dead(struct kmem_cache *s)
+{
+	return false;
+}
+
 static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order)
 {
 	return 0;
diff --git a/mm/slub.c b/mm/slub.c
index fdf0fe4da9a9..21ffae4766e5 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -123,15 +123,6 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
 #endif
 }
 
-static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
-{
-#ifdef CONFIG_SLUB_CPU_PARTIAL
-	return !kmem_cache_debug(s);
-#else
-	return false;
-#endif
-}
-
 /*
  * Issues still to be resolved:
  *
@@ -186,6 +177,24 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
 /* Internal SLUB flags */
 #define __OBJECT_POISON		0x80000000UL /* Poison object */
 #define __CMPXCHG_DOUBLE	0x40000000UL /* Use cmpxchg_double */
+#define __DISABLE_CPU_PARTIAL	0x20000000UL
+
+static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
+{
+#ifdef CONFIG_SLUB_CPU_PARTIAL
+	return !kmem_cache_debug(s) &&
+		!unlikely(s->flags & __DISABLE_CPU_PARTIAL);
+#else
+	return false;
+#endif
+}
+
+#define CPU_PARTIAL_ZAPPED			((struct page *)~0UL)
+
+static inline bool CPU_PARTIAL_VALID(struct page *page)
+{
+	return page && page != CPU_PARTIAL_ZAPPED;
+}
 
 #ifdef CONFIG_SMP
 static struct notifier_block slab_notifier;
@@ -1947,6 +1956,59 @@ redo:
 	}
 }
 
+#ifdef CONFIG_SLUB_CPU_PARTIAL
+/*
+ * Unfreeze partial slab. Called with n->list_lock held.
+ */
+static void __unfreeze_partial(struct kmem_cache *s, struct kmem_cache_node *n,
+			       struct page *page, struct page **discard_page)
+{
+	struct page new, old;
+
+	do {
+
+		old.freelist = page->freelist;
+		old.counters = page->counters;
+		VM_BUG_ON(!old.frozen);
+
+		new.counters = old.counters;
+		new.freelist = old.freelist;
+
+		new.frozen = 0;
+
+	} while (!__cmpxchg_double_slab(s, page,
+			old.freelist, old.counters,
+			new.freelist, new.counters,
+			"unfreezing slab"));
+
+	if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) {
+		page->next = *discard_page;
+		*discard_page = page;
+	} else {
+		add_partial(n, page, DEACTIVATE_TO_TAIL);
+		stat(s, FREE_ADD_PARTIAL);
+	}
+}
+
+static void cancel_cpu_partial(struct kmem_cache *s, struct page *page)
+{
+	struct page *discard_page = NULL;
+	struct kmem_cache_node *n;
+	unsigned long flags;
+
+	n = get_node(s, page_to_nid(page));
+	spin_lock_irqsave(&n->list_lock, flags);
+	__unfreeze_partial(s, n, page, &discard_page);
+	spin_unlock_irqrestore(&n->list_lock, flags);
+
+	if (discard_page) {
+		stat(s, DEACTIVATE_EMPTY);
+		stat(s, FREE_SLAB);
+		discard_slab(s, page);
+	}
+}
+#endif
+
 /*
  * Unfreeze all the cpu partial slabs.
  *
@@ -1961,12 +2023,12 @@ static void unfreeze_partials(struct kmem_cache *s,
 	struct kmem_cache_node *n = NULL, *n2 = NULL;
 	struct page *page, *discard_page = NULL;
 
-	while ((page = c->partial)) {
-		struct page new;
-		struct page old;
-
+	while (CPU_PARTIAL_VALID(page = c->partial)) {
 		c->partial = page->next;
 
+		if (unlikely(s->flags & __DISABLE_CPU_PARTIAL) && !c->partial)
+			c->partial = CPU_PARTIAL_ZAPPED;
+
 		n2 = get_node(s, page_to_nid(page));
 		if (n != n2) {
 			if (n)
@@ -1976,29 +2038,7 @@ static void unfreeze_partials(struct kmem_cache *s,
 			spin_lock(&n->list_lock);
 		}
 
-		do {
-
-			old.freelist = page->freelist;
-			old.counters = page->counters;
-			VM_BUG_ON(!old.frozen);
-
-			new.counters = old.counters;
-			new.freelist = old.freelist;
-
-			new.frozen = 0;
-
-		} while (!__cmpxchg_double_slab(s, page,
-				old.freelist, old.counters,
-				new.freelist, new.counters,
-				"unfreezing slab"));
-
-		if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) {
-			page->next = discard_page;
-			discard_page = page;
-		} else {
-			add_partial(n, page, DEACTIVATE_TO_TAIL);
-			stat(s, FREE_ADD_PARTIAL);
-		}
+		__unfreeze_partial(s, n, page, &discard_page);
 	}
 
 	if (n)
@@ -2036,6 +2076,11 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
 		pobjects = 0;
 		oldpage = this_cpu_read(s->cpu_slab->partial);
 
+		if (unlikely(oldpage == CPU_PARTIAL_ZAPPED)) {
+			cancel_cpu_partial(s, page);
+			return;
+		}
+
 		if (oldpage) {
 			pobjects = oldpage->pobjects;
 			pages = oldpage->pages;
@@ -2106,7 +2151,7 @@ static bool has_cpu_slab(int cpu, void *info)
 	struct kmem_cache *s = info;
 	struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
 
-	return c->page || c->partial;
+	return c->page || CPU_PARTIAL_VALID(c->partial);
 }
 
 static void flush_all(struct kmem_cache *s)
@@ -3411,6 +3456,11 @@ int __kmem_cache_shrink(struct kmem_cache *s)
 	if (!slabs_by_inuse)
 		return -ENOMEM;
 
+	if (memcg_cache_dead(s)) {
+		s->flags |= __DISABLE_CPU_PARTIAL;
+		s->min_partial = 0;
+	}
+
 	flush_all(s);
 	for_each_node_state(node, N_NORMAL_MEMORY) {
 		n = get_node(s, node);
@@ -4315,7 +4365,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
 			nodes[node] += x;
 
 			page = ACCESS_ONCE(c->partial);
-			if (page) {
+			if (CPU_PARTIAL_VALID(page)) {
 				node = page_to_nid(page);
 				if (flags & SO_TOTAL)
 					WARN_ON_ONCE(1);
@@ -4471,7 +4521,8 @@ static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
 	if (err)
 		return err;
 
-	set_min_partial(s, min);
+	if (!memcg_cache_dead(s))
+		set_min_partial(s, min);
 	return length;
 }
 SLAB_ATTR(min_partial);
@@ -4547,7 +4598,7 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
 	for_each_online_cpu(cpu) {
 		struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial;
 
-		if (page) {
+		if (CPU_PARTIAL_VALID(page)) {
 			pages += page->pages;
 			objects += page->pobjects;
 		}
@@ -4559,7 +4610,7 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
 	for_each_online_cpu(cpu) {
 		struct page *page = per_cpu_ptr(s->cpu_slab, cpu) ->partial;
 
-		if (page && len < PAGE_SIZE - 20)
+		if (CPU_PARTIAL_VALID(page) && len < PAGE_SIZE - 20)
 			len += sprintf(buf + len, " C%d=%d(%d)", cpu,
 				page->pobjects, page->pages);
 	}

next prev parent reply	other threads:[~2014-05-21 13:58 UTC|newest]

Thread overview: 62+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-05-13 13:48 [PATCH RFC 0/3] kmemcg slab reparenting Vladimir Davydov
2014-05-13 13:48 ` Vladimir Davydov
2014-05-13 13:48 ` [PATCH RFC 1/3] slub: keep full slabs on list for per memcg caches Vladimir Davydov
2014-05-13 13:48   ` Vladimir Davydov
2014-05-14 16:16   ` Christoph Lameter
2014-05-14 16:16     ` Christoph Lameter
2014-05-15  6:34     ` Vladimir Davydov
2014-05-15  6:34       ` Vladimir Davydov
2014-05-15 15:15       ` Christoph Lameter
2014-05-15 15:15         ` Christoph Lameter
2014-05-16 13:06         ` Vladimir Davydov
2014-05-16 13:06           ` Vladimir Davydov
2014-05-16 15:05           ` Christoph Lameter
2014-05-16 15:05             ` Christoph Lameter
2014-05-13 13:48 ` [PATCH RFC 2/3] percpu-refcount: allow to get dead reference Vladimir Davydov
2014-05-13 13:48   ` Vladimir Davydov
2014-05-13 13:48 ` [PATCH RFC 3/3] slub: reparent memcg caches' slabs on memcg offline Vladimir Davydov
2014-05-13 13:48   ` Vladimir Davydov
2014-05-14 16:20   ` Christoph Lameter
2014-05-14 16:20     ` Christoph Lameter
2014-05-15  7:16     ` Vladimir Davydov
2014-05-15  7:16       ` Vladimir Davydov
2014-05-15 15:16       ` Christoph Lameter
2014-05-15 15:16         ` Christoph Lameter
2014-05-16 13:22         ` Vladimir Davydov
2014-05-16 13:22           ` Vladimir Davydov
2014-05-16 15:03           ` Christoph Lameter
2014-05-16 15:03             ` Christoph Lameter
2014-05-19 15:24             ` Vladimir Davydov
2014-05-19 15:24               ` Vladimir Davydov
2014-05-19 16:03               ` Christoph Lameter
2014-05-19 16:03                 ` Christoph Lameter
2014-05-19 18:27                 ` Vladimir Davydov
2014-05-19 18:27                   ` Vladimir Davydov
2014-05-21 13:58                   ` Vladimir Davydov [this message]
2014-05-21 13:58                     ` Vladimir Davydov
2014-05-21 14:45                     ` Christoph Lameter
2014-05-21 14:45                       ` Christoph Lameter
2014-05-21 15:14                       ` Vladimir Davydov
2014-05-21 15:14                         ` Vladimir Davydov
2014-05-22  0:15                         ` Christoph Lameter
2014-05-22  0:15                           ` Christoph Lameter
2014-05-22 14:07                           ` Vladimir Davydov
2014-05-22 14:07                             ` Vladimir Davydov
2014-05-21 14:41                   ` Christoph Lameter
2014-05-21 14:41                     ` Christoph Lameter
2014-05-21 15:04                     ` Vladimir Davydov
2014-05-21 15:04                       ` Vladimir Davydov
2014-05-22  0:13                       ` Christoph Lameter
2014-05-22  0:13                         ` Christoph Lameter
2014-05-22 13:47                         ` Vladimir Davydov
2014-05-22 13:47                           ` Vladimir Davydov
2014-05-22 19:25                           ` Christoph Lameter
2014-05-22 19:25                             ` Christoph Lameter
2014-05-23 15:26                             ` Vladimir Davydov
2014-05-23 15:26                               ` Vladimir Davydov
2014-05-23 17:45                               ` Christoph Lameter
2014-05-23 17:45                                 ` Christoph Lameter
2014-05-23 19:57                                 ` Vladimir Davydov
2014-05-23 19:57                                   ` Vladimir Davydov
2014-05-27 14:38                                   ` Christoph Lameter
2014-05-27 14:38                                     ` Christoph Lameter

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:1d9abb7d22a dfblob:c1e31824724 dfblob:0fb108e5b90
dfblob:6c922dd96fd dfblob:961a3fb1f5a dfblob:996968c5522
dfblob:fdf0fe4da9a dfblob:21ffae4766e dfblob:1d9abb7d22a
dfblob:c1e31824724 dfblob:0fb108e5b90 dfblob:6c922dd96fd
dfblob:961a3fb1f5a dfblob:996968c5522 dfblob:fdf0fe4da9a
dfblob:21ffae4766e )
 OR (
bs:"Re: [PATCH RFC 3/3] slub: reparent memcg caches' slabs on memcg offline" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20140521135826.GA23193@esperanza \
    --to=vdavydov@parallels.com \
    --cc=akpm@linux-foundation.org \
    --cc=cl@linux.com \
    --cc=hannes@cmpxchg.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mhocko@suse.cz \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.