public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [GIT PULL] percpu: finer grained locking
@ 2009-03-06 15:54 Tejun Heo
  2009-03-06 15:54 ` [PATCH 1/4] percpu: replace pcpu_realloc() with pcpu_mem_alloc() and pcpu_mem_free() Tejun Heo
                   ` (3 more replies)
  0 siblings, 4 replies; 7+ messages in thread
From: Tejun Heo @ 2009-03-06 15:54 UTC (permalink / raw)
  To: mingo, rusty, tglx, x86, linux-kernel, hpa, npiggin, akpm


Hello,

Please pull from the following git tree.

  git://git.kernel.org/pub/scm/linux/kernel/git/tj/misc.git tj-percpu

This patchset makes locking in dynamic percpu allocator finer grained
to break deadlock[1] and allow atomic free.

This patchset is on top of the current x86/core/percpu[2] and contains
the following patches.

 0001-percpu-replace-pcpu_realloc-with-pcpu_mem_alloc.patch
 0002-percpu-move-chunk-area-map-extension-out-of-area-al.patch
 0003-percpu-move-fully-free-chunk-reclamation-into-a-wor.patch
 0004-percpu-finer-grained-locking-to-break-deadlock-and.patch

0001-0003 prepare for finer grained locking.  0004 implements it.

Diffstat follows.

 mm/percpu.c |  365 ++++++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 245 insertions(+), 120 deletions(-)

Thanks.

--
tejun

[1] http://thread.gmane.org/gmane.linux.kernel/802384
[2] 6b19b0c2400437a3c10059ede0e59b517092e1bd

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH 1/4] percpu: replace pcpu_realloc() with pcpu_mem_alloc() and pcpu_mem_free()
  2009-03-06 15:54 [GIT PULL] percpu: finer grained locking Tejun Heo
@ 2009-03-06 15:54 ` Tejun Heo
  2009-03-06 15:54 ` [PATCH 2/4] percpu: move chunk area map extension out of area allocation Tejun Heo
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 7+ messages in thread
From: Tejun Heo @ 2009-03-06 15:54 UTC (permalink / raw)
  To: mingo, rusty, tglx, x86, linux-kernel, hpa, npiggin, akpm; +Cc: Tejun Heo

Impact: code reorganization for later changes

With static map handling moved to pcpu_split_block(), pcpu_realloc()
only clutters the code and it's also unsuitable for scheduled locking
changes.  Implement and use pcpu_mem_alloc/free() instead.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 mm/percpu.c |   85 +++++++++++++++++++++++++++++------------------------------
 1 files changed, 42 insertions(+), 43 deletions(-)

diff --git a/mm/percpu.c b/mm/percpu.c
index ef8e169..f1d0e90 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -164,39 +164,41 @@ static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
 }
 
 /**
- * pcpu_realloc - versatile realloc
- * @p: the current pointer (can be NULL for new allocations)
- * @size: the current size in bytes (can be 0 for new allocations)
- * @new_size: the wanted new size in bytes (can be 0 for free)
+ * pcpu_mem_alloc - allocate memory
+ * @size: bytes to allocate
  *
- * More robust realloc which can be used to allocate, resize or free a
- * memory area of arbitrary size.  If the needed size goes over
- * PAGE_SIZE, kernel VM is used.
+ * Allocate @size bytes.  If @size is smaller than PAGE_SIZE,
+ * kzalloc() is used; otherwise, vmalloc() is used.  The returned
+ * memory is always zeroed.
  *
  * RETURNS:
- * The new pointer on success, NULL on failure.
+ * Pointer to the allocated area on success, NULL on failure.
  */
-static void *pcpu_realloc(void *p, size_t size, size_t new_size)
+static void *pcpu_mem_alloc(size_t size)
 {
-	void *new;
-
-	if (new_size <= PAGE_SIZE)
-		new = kmalloc(new_size, GFP_KERNEL);
-	else
-		new = vmalloc(new_size);
-	if (new_size && !new)
-		return NULL;
-
-	memcpy(new, p, min(size, new_size));
-	if (new_size > size)
-		memset(new + size, 0, new_size - size);
+	if (size <= PAGE_SIZE)
+		return kzalloc(size, GFP_KERNEL);
+	else {
+		void *ptr = vmalloc(size);
+		if (ptr)
+			memset(ptr, 0, size);
+		return ptr;
+	}
+}
 
+/**
+ * pcpu_mem_free - free memory
+ * @ptr: memory to free
+ * @size: size of the area
+ *
+ * Free @ptr.  @ptr should have been allocated using pcpu_mem_alloc().
+ */
+static void pcpu_mem_free(void *ptr, size_t size)
+{
 	if (size <= PAGE_SIZE)
-		kfree(p);
+		kfree(ptr);
 	else
-		vfree(p);
-
-	return new;
+		vfree(ptr);
 }
 
 /**
@@ -331,29 +333,27 @@ static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail)
 	if (chunk->map_alloc < target) {
 		int new_alloc;
 		int *new;
+		size_t size;
 
 		new_alloc = PCPU_DFL_MAP_ALLOC;
 		while (new_alloc < target)
 			new_alloc *= 2;
 
-		if (chunk->map_alloc < PCPU_DFL_MAP_ALLOC) {
-			/*
-			 * map_alloc smaller than the default size
-			 * indicates that the chunk is one of the
-			 * first chunks and still using static map.
-			 * Allocate a dynamic one and copy.
-			 */
-			new = pcpu_realloc(NULL, 0, new_alloc * sizeof(new[0]));
-			if (new)
-				memcpy(new, chunk->map,
-				       chunk->map_alloc * sizeof(new[0]));
-		} else
-			new = pcpu_realloc(chunk->map,
-					   chunk->map_alloc * sizeof(new[0]),
-					   new_alloc * sizeof(new[0]));
+		new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
 		if (!new)
 			return -ENOMEM;
 
+		size = chunk->map_alloc * sizeof(chunk->map[0]);
+		memcpy(new, chunk->map, size);
+
+		/*
+		 * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the
+		 * chunk is one of the first chunks and still using
+		 * static map.
+		 */
+		if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC)
+			pcpu_mem_free(chunk->map, size);
+
 		chunk->map_alloc = new_alloc;
 		chunk->map = new;
 	}
@@ -696,7 +696,7 @@ static void free_pcpu_chunk(struct pcpu_chunk *chunk)
 		return;
 	if (chunk->vm)
 		free_vm_area(chunk->vm);
-	pcpu_realloc(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]), 0);
+	pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
 	kfree(chunk);
 }
 
@@ -708,8 +708,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
 	if (!chunk)
 		return NULL;
 
-	chunk->map = pcpu_realloc(NULL, 0,
-				  PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
+	chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
 	chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
 	chunk->map[chunk->map_used++] = pcpu_unit_size;
 	chunk->page = chunk->page_ar;
-- 
1.6.0.2


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 2/4] percpu: move chunk area map extension out of area allocation
  2009-03-06 15:54 [GIT PULL] percpu: finer grained locking Tejun Heo
  2009-03-06 15:54 ` [PATCH 1/4] percpu: replace pcpu_realloc() with pcpu_mem_alloc() and pcpu_mem_free() Tejun Heo
@ 2009-03-06 15:54 ` Tejun Heo
  2009-03-06 15:54 ` [PATCH 3/4] percpu: move fully free chunk reclamation into a work Tejun Heo
  2009-03-06 15:54 ` [PATCH 4/4] percpu: finer grained locking to break deadlock and allow atomic free Tejun Heo
  3 siblings, 0 replies; 7+ messages in thread
From: Tejun Heo @ 2009-03-06 15:54 UTC (permalink / raw)
  To: mingo, rusty, tglx, x86, linux-kernel, hpa, npiggin, akpm; +Cc: Tejun Heo

Impact: code reorganization for later changes

Separate out chunk area map extension into a separate function -
pcpu_extend_area_map() - and call it directly from pcpu_alloc() such
that pcpu_alloc_area() is guaranteed to have enough area map slots on
invocation.

With this change, pcpu_alloc_area() does only area allocation and the
only failure mode is when the chunk doesn't have enough room, so
there's no need to distinguish it from memory allocation failures.
Make it return -1 on such cases instead of hacky -ENOSPC.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 mm/percpu.c |  108 ++++++++++++++++++++++++++++++++--------------------------
 1 files changed, 60 insertions(+), 48 deletions(-)

diff --git a/mm/percpu.c b/mm/percpu.c
index f1d0e90..7d9bc35 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -307,6 +307,50 @@ static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
 }
 
 /**
+ * pcpu_extend_area_map - extend area map for allocation
+ * @chunk: target chunk
+ *
+ * Extend area map of @chunk so that it can accomodate an allocation.
+ * A single allocation can split an area into three areas, so this
+ * function makes sure that @chunk->map has at least two extra slots.
+ *
+ * RETURNS:
+ * 0 if noop, 1 if successfully extended, -errno on failure.
+ */
+static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
+{
+	int new_alloc;
+	int *new;
+	size_t size;
+
+	/* has enough? */
+	if (chunk->map_alloc >= chunk->map_used + 2)
+		return 0;
+
+	new_alloc = PCPU_DFL_MAP_ALLOC;
+	while (new_alloc < chunk->map_used + 2)
+		new_alloc *= 2;
+
+	new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
+	if (!new)
+		return -ENOMEM;
+
+	size = chunk->map_alloc * sizeof(chunk->map[0]);
+	memcpy(new, chunk->map, size);
+
+	/*
+	 * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is
+	 * one of the first chunks and still using static map.
+	 */
+	if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC)
+		pcpu_mem_free(chunk->map, size);
+
+	chunk->map_alloc = new_alloc;
+	chunk->map = new;
+	return 0;
+}
+
+/**
  * pcpu_split_block - split a map block
  * @chunk: chunk of interest
  * @i: index of map block to split
@@ -321,44 +365,16 @@ static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
  * depending on @head, is reduced by @tail bytes and @tail byte block
  * is inserted after the target block.
  *
- * RETURNS:
- * 0 on success, -errno on failure.
+ * @chunk->map must have enough free slots to accomodate the split.
  */
-static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail)
+static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
+			     int head, int tail)
 {
 	int nr_extra = !!head + !!tail;
-	int target = chunk->map_used + nr_extra;
-
-	/* reallocation required? */
-	if (chunk->map_alloc < target) {
-		int new_alloc;
-		int *new;
-		size_t size;
-
-		new_alloc = PCPU_DFL_MAP_ALLOC;
-		while (new_alloc < target)
-			new_alloc *= 2;
-
-		new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
-		if (!new)
-			return -ENOMEM;
-
-		size = chunk->map_alloc * sizeof(chunk->map[0]);
-		memcpy(new, chunk->map, size);
-
-		/*
-		 * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the
-		 * chunk is one of the first chunks and still using
-		 * static map.
-		 */
-		if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC)
-			pcpu_mem_free(chunk->map, size);
 
-		chunk->map_alloc = new_alloc;
-		chunk->map = new;
-	}
+	BUG_ON(chunk->map_alloc < chunk->map_used + nr_extra);
 
-	/* insert a new subblock */
+	/* insert new subblocks */
 	memmove(&chunk->map[i + nr_extra], &chunk->map[i],
 		sizeof(chunk->map[0]) * (chunk->map_used - i));
 	chunk->map_used += nr_extra;
@@ -371,7 +387,6 @@ static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail)
 		chunk->map[i++] -= tail;
 		chunk->map[i] = tail;
 	}
-	return 0;
 }
 
 /**
@@ -384,8 +399,11 @@ static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail)
  * Note that this function only allocates the offset.  It doesn't
  * populate or map the area.
  *
+ * @chunk->map must have at least two free slots.
+ *
  * RETURNS:
- * Allocated offset in @chunk on success, -errno on failure.
+ * Allocated offset in @chunk on success, -1 if no matching area is
+ * found.
  */
 static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
 {
@@ -433,8 +451,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
 
 		/* split if warranted */
 		if (head || tail) {
-			if (pcpu_split_block(chunk, i, head, tail))
-				return -ENOMEM;
+			pcpu_split_block(chunk, i, head, tail);
 			if (head) {
 				i++;
 				off += head;
@@ -461,14 +478,8 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
 	chunk->contig_hint = max_contig;	/* fully scanned */
 	pcpu_chunk_relocate(chunk, oslot);
 
-	/*
-	 * Tell the upper layer that this chunk has no area left.
-	 * Note that this is not an error condition but a notification
-	 * to upper layer that it needs to look at other chunks.
-	 * -ENOSPC is chosen as it isn't used in memory subsystem and
-	 * matches the meaning in a way.
-	 */
-	return -ENOSPC;
+	/* tell the upper layer that this chunk has no matching area */
+	return -1;
 }
 
 /**
@@ -755,7 +766,8 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
 	/* serve reserved allocations from the reserved chunk if available */
 	if (reserved && pcpu_reserved_chunk) {
 		chunk = pcpu_reserved_chunk;
-		if (size > chunk->contig_hint)
+		if (size > chunk->contig_hint ||
+		    pcpu_extend_area_map(chunk) < 0)
 			goto out_unlock;
 		off = pcpu_alloc_area(chunk, size, align);
 		if (off >= 0)
@@ -768,11 +780,11 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
 		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
 			if (size > chunk->contig_hint)
 				continue;
+			if (pcpu_extend_area_map(chunk) < 0)
+				goto out_unlock;
 			off = pcpu_alloc_area(chunk, size, align);
 			if (off >= 0)
 				goto area_found;
-			if (off != -ENOSPC)
-				goto out_unlock;
 		}
 	}
 
-- 
1.6.0.2


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 3/4] percpu: move fully free chunk reclamation into a work
  2009-03-06 15:54 [GIT PULL] percpu: finer grained locking Tejun Heo
  2009-03-06 15:54 ` [PATCH 1/4] percpu: replace pcpu_realloc() with pcpu_mem_alloc() and pcpu_mem_free() Tejun Heo
  2009-03-06 15:54 ` [PATCH 2/4] percpu: move chunk area map extension out of area allocation Tejun Heo
@ 2009-03-06 15:54 ` Tejun Heo
  2009-03-06 15:54 ` [PATCH 4/4] percpu: finer grained locking to break deadlock and allow atomic free Tejun Heo
  3 siblings, 0 replies; 7+ messages in thread
From: Tejun Heo @ 2009-03-06 15:54 UTC (permalink / raw)
  To: mingo, rusty, tglx, x86, linux-kernel, hpa, npiggin, akpm; +Cc: Tejun Heo

Impact: code reorganization for later changes

Do fully free chunk reclamation using a work.  This change is to
prepare for locking changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 mm/percpu.c |   48 ++++++++++++++++++++++++++++++++++++++----------
 1 files changed, 38 insertions(+), 10 deletions(-)

diff --git a/mm/percpu.c b/mm/percpu.c
index 7d9bc35..4c8a419 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -63,6 +63,7 @@
 #include <linux/rbtree.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
+#include <linux/workqueue.h>
 
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
@@ -118,6 +119,10 @@ static DEFINE_MUTEX(pcpu_mutex);
 static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
 static struct rb_root pcpu_addr_root = RB_ROOT;	/* chunks by address */
 
+/* reclaim work to release fully free chunks, scheduled from free path */
+static void pcpu_reclaim(struct work_struct *work);
+static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim);
+
 static int __pcpu_size_to_slot(int size)
 {
 	int highbit = fls(size);	/* size is in bytes */
@@ -846,13 +851,37 @@ void *__alloc_reserved_percpu(size_t size, size_t align)
 	return pcpu_alloc(size, align, true);
 }
 
-static void pcpu_kill_chunk(struct pcpu_chunk *chunk)
+/**
+ * pcpu_reclaim - reclaim fully free chunks, workqueue function
+ * @work: unused
+ *
+ * Reclaim all fully free chunks except for the first one.
+ */
+static void pcpu_reclaim(struct work_struct *work)
 {
-	WARN_ON(chunk->immutable);
-	pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
-	list_del(&chunk->list);
-	rb_erase(&chunk->rb_node, &pcpu_addr_root);
-	free_pcpu_chunk(chunk);
+	LIST_HEAD(todo);
+	struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1];
+	struct pcpu_chunk *chunk, *next;
+
+	mutex_lock(&pcpu_mutex);
+
+	list_for_each_entry_safe(chunk, next, head, list) {
+		WARN_ON(chunk->immutable);
+
+		/* spare the first one */
+		if (chunk == list_first_entry(head, struct pcpu_chunk, list))
+			continue;
+
+		rb_erase(&chunk->rb_node, &pcpu_addr_root);
+		list_move(&chunk->list, &todo);
+	}
+
+	mutex_unlock(&pcpu_mutex);
+
+	list_for_each_entry_safe(chunk, next, &todo, list) {
+		pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
+		free_pcpu_chunk(chunk);
+	}
 }
 
 /**
@@ -877,14 +906,13 @@ void free_percpu(void *ptr)
 
 	pcpu_free_area(chunk, off);
 
-	/* the chunk became fully free, kill one if there are other free ones */
+	/* if there are more than one fully free chunks, wake up grim reaper */
 	if (chunk->free_size == pcpu_unit_size) {
 		struct pcpu_chunk *pos;
 
-		list_for_each_entry(pos,
-				    &pcpu_slot[pcpu_chunk_slot(chunk)], list)
+		list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
 			if (pos != chunk) {
-				pcpu_kill_chunk(pos);
+				schedule_work(&pcpu_reclaim_work);
 				break;
 			}
 	}
-- 
1.6.0.2


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 4/4] percpu: finer grained locking to break deadlock and allow atomic free
  2009-03-06 15:54 [GIT PULL] percpu: finer grained locking Tejun Heo
                   ` (2 preceding siblings ...)
  2009-03-06 15:54 ` [PATCH 3/4] percpu: move fully free chunk reclamation into a work Tejun Heo
@ 2009-03-06 15:54 ` Tejun Heo
  2009-03-07  5:49   ` [PATCH UPDATED " Tejun Heo
  3 siblings, 1 reply; 7+ messages in thread
From: Tejun Heo @ 2009-03-06 15:54 UTC (permalink / raw)
  To: mingo, rusty, tglx, x86, linux-kernel, hpa, npiggin, akpm; +Cc: Tejun Heo

Impact: fix deadlock and allow atomic free

Percpu allocation always uses GFP_KERNEL and whole alloc/free paths
were protected by single mutex.  All percpu allocations have been from
GFP_KERNEL-safe context and the original allocator had this assumption
too.  However, by protecting both alloc and free paths with the same
mutex, the new allocator creates free -> alloc -> GFP_KERNEL
dependency which the original allocator didn't have.  This can lead to
deadlock if free is called from FS or IO paths.  Also, in general,
allocators are expected to allow free to be called from atomic
context.

This patch implements finer grained locking to break the deadlock and
allow atomic free.  For details, please read the "Synchronization
rules" comment.

While at it, also add CONTEXT: to function comments to describe which
context they expect to be called from and what they do to it.

This problem was reported by Thomas Gleixner and Peter Zijlstra.

  http://thread.gmane.org/gmane.linux.kernel/802384

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Thomas Gleixner <tglx@linutronix.de>
Reported-by: Peter Zijlstra <peterz@infradead.org>
---
 mm/percpu.c |  160 +++++++++++++++++++++++++++++++++++++++++++++--------------
 1 files changed, 123 insertions(+), 37 deletions(-)

diff --git a/mm/percpu.c b/mm/percpu.c
index 4c8a419..d2cda3d 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -62,6 +62,7 @@
 #include <linux/pfn.h>
 #include <linux/rbtree.h>
 #include <linux/slab.h>
+#include <linux/spinlock.h>
 #include <linux/vmalloc.h>
 #include <linux/workqueue.h>
 
@@ -101,20 +102,28 @@ static struct pcpu_chunk *pcpu_reserved_chunk;
 static int pcpu_reserved_chunk_limit;
 
 /*
- * One mutex to rule them all.
- *
- * The following mutex is grabbed in the outermost public alloc/free
- * interface functions and released only when the operation is
- * complete.  As such, every function in this file other than the
- * outermost functions are called under pcpu_mutex.
- *
- * It can easily be switched to use spinlock such that only the area
- * allocation and page population commit are protected with it doing
- * actual [de]allocation without holding any lock.  However, given
- * what this allocator does, I think it's better to let them run
- * sequentially.
+ * Synchronization rules.
+ *
+ * There are two locks - pcpu_alloc_mutex and pcpu_lock.  The former
+ * protects allocation/reclaim paths, chunks and chunk->page arrays.
+ * The latter is a spinlock and protects the index data structures -
+ * chunk slots, rbtree, chunks and area maps in chunks.
+ *
+ * During allocation, pcpu_alloc_mutex is kept locked all the time and
+ * pcpu_lock is grabbed and released as necessary.  All actual memory
+ * allocations are done using GFP_KERNEL with pcpu_lock released.
+ *
+ * Free path accesses and alters only the index data structures, so it
+ * can be safely called from atomic context.  When memory needs to be
+ * returned to the system, free path schedules reclaim_work which
+ * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be
+ * reclaimed, release both locks and frees the chunks.  Note that it's
+ * necessary to grab both locks to remove a chunk from circulation as
+ * allocation path might be referencing the chunk with only
+ * pcpu_alloc_mutex locked.
  */
-static DEFINE_MUTEX(pcpu_mutex);
+static DEFINE_MUTEX(pcpu_alloc_mutex);	/* protects whole alloc and reclaim */
+static DEFINE_SPINLOCK(pcpu_lock);	/* protects index data structures */
 
 static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
 static struct rb_root pcpu_addr_root = RB_ROOT;	/* chunks by address */
@@ -176,6 +185,9 @@ static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
  * kzalloc() is used; otherwise, vmalloc() is used.  The returned
  * memory is always zeroed.
  *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
  * RETURNS:
  * Pointer to the allocated area on success, NULL on failure.
  */
@@ -215,6 +227,9 @@ static void pcpu_mem_free(void *ptr, size_t size)
  * New slot according to the changed state is determined and @chunk is
  * moved to the slot.  Note that the reserved chunk is never put on
  * chunk slots.
+ *
+ * CONTEXT:
+ * pcpu_lock.
  */
 static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
 {
@@ -260,6 +275,9 @@ static struct rb_node **pcpu_chunk_rb_search(void *addr,
  * searchs for the chunk with the highest start address which isn't
  * beyond @addr.
  *
+ * CONTEXT:
+ * pcpu_lock.
+ *
  * RETURNS:
  * The address of the found chunk.
  */
@@ -300,6 +318,9 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
  * @new: chunk to insert
  *
  * Insert @new into address rb tree.
+ *
+ * CONTEXT:
+ * pcpu_lock.
  */
 static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
 {
@@ -319,6 +340,10 @@ static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
  * A single allocation can split an area into three areas, so this
  * function makes sure that @chunk->map has at least two extra slots.
  *
+ * CONTEXT:
+ * pcpu_alloc_mutex, pcpu_lock.  pcpu_lock is released and reacquired
+ * if area map is extended.
+ *
  * RETURNS:
  * 0 if noop, 1 if successfully extended, -errno on failure.
  */
@@ -332,13 +357,25 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
 	if (chunk->map_alloc >= chunk->map_used + 2)
 		return 0;
 
+	spin_unlock(&pcpu_lock);
+
 	new_alloc = PCPU_DFL_MAP_ALLOC;
 	while (new_alloc < chunk->map_used + 2)
 		new_alloc *= 2;
 
 	new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
-	if (!new)
+	if (!new) {
+		spin_lock(&pcpu_lock);
 		return -ENOMEM;
+	}
+
+	/*
+	 * Acquire pcpu_lock and switch to new area map.  Only free
+	 * could have happened inbetween, so map_used couldn't have
+	 * grown.
+	 */
+	spin_lock(&pcpu_lock);
+	BUG_ON(new_alloc < chunk->map_used + 2);
 
 	size = chunk->map_alloc * sizeof(chunk->map[0]);
 	memcpy(new, chunk->map, size);
@@ -371,6 +408,9 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
  * is inserted after the target block.
  *
  * @chunk->map must have enough free slots to accomodate the split.
+ *
+ * CONTEXT:
+ * pcpu_lock.
  */
 static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
 			     int head, int tail)
@@ -406,6 +446,9 @@ static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
  *
  * @chunk->map must have at least two free slots.
  *
+ * CONTEXT:
+ * pcpu_lock.
+ *
  * RETURNS:
  * Allocated offset in @chunk on success, -1 if no matching area is
  * found.
@@ -495,6 +538,9 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
  * Free area starting from @freeme to @chunk.  Note that this function
  * only modifies the allocation map.  It doesn't depopulate or unmap
  * the area.
+ *
+ * CONTEXT:
+ * pcpu_lock.
  */
 static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
 {
@@ -580,6 +626,9 @@ static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
  * For each cpu, depopulate and unmap pages [@page_start,@page_end)
  * from @chunk.  If @flush is true, vcache is flushed before unmapping
  * and tlb after.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex.
  */
 static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
 				  bool flush)
@@ -658,6 +707,9 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
  *
  * For each cpu, populate and map pages [@page_start,@page_end) into
  * @chunk.  The area is cleared on return.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex, does GFP_KERNEL allocation.
  */
 static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
 {
@@ -748,15 +800,16 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
  * @align: alignment of area (max PAGE_SIZE)
  * @reserved: allocate from the reserved chunk if available
  *
- * Allocate percpu area of @size bytes aligned at @align.  Might
- * sleep.  Might trigger writeouts.
+ * Allocate percpu area of @size bytes aligned at @align.
+ *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
  *
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
  */
 static void *pcpu_alloc(size_t size, size_t align, bool reserved)
 {
-	void *ptr = NULL;
 	struct pcpu_chunk *chunk;
 	int slot, off;
 
@@ -766,27 +819,37 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
 		return NULL;
 	}
 
-	mutex_lock(&pcpu_mutex);
+	mutex_lock(&pcpu_alloc_mutex);
+	spin_lock(&pcpu_lock);
 
 	/* serve reserved allocations from the reserved chunk if available */
 	if (reserved && pcpu_reserved_chunk) {
 		chunk = pcpu_reserved_chunk;
 		if (size > chunk->contig_hint ||
 		    pcpu_extend_area_map(chunk) < 0)
-			goto out_unlock;
+			goto fail_unlock;
 		off = pcpu_alloc_area(chunk, size, align);
 		if (off >= 0)
 			goto area_found;
-		goto out_unlock;
+		goto fail_unlock;
 	}
 
+restart:
 	/* search through normal chunks */
 	for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
 		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
 			if (size > chunk->contig_hint)
 				continue;
-			if (pcpu_extend_area_map(chunk) < 0)
-				goto out_unlock;
+
+			switch (pcpu_extend_area_map(chunk)) {
+			case 0:
+				break;
+			case 1:
+				goto restart;	/* pcpu_lock dropped, restart */
+			default:
+				goto fail_unlock;
+			}
+
 			off = pcpu_alloc_area(chunk, size, align);
 			if (off >= 0)
 				goto area_found;
@@ -794,27 +857,36 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
 	}
 
 	/* hmmm... no space left, create a new chunk */
+	spin_unlock(&pcpu_lock);
+
 	chunk = alloc_pcpu_chunk();
 	if (!chunk)
-		goto out_unlock;
+		goto fail_unlock_mutex;
+
+	spin_lock(&pcpu_lock);
 	pcpu_chunk_relocate(chunk, -1);
 	pcpu_chunk_addr_insert(chunk);
-
-	off = pcpu_alloc_area(chunk, size, align);
-	if (off < 0)
-		goto out_unlock;
+	goto restart;
 
 area_found:
+	spin_unlock(&pcpu_lock);
+
 	/* populate, map and clear the area */
 	if (pcpu_populate_chunk(chunk, off, size)) {
+		spin_lock(&pcpu_lock);
 		pcpu_free_area(chunk, off);
-		goto out_unlock;
+		goto fail_unlock;
 	}
 
-	ptr = __addr_to_pcpu_ptr(chunk->vm->addr + off);
-out_unlock:
-	mutex_unlock(&pcpu_mutex);
-	return ptr;
+	mutex_unlock(&pcpu_alloc_mutex);
+
+	return __addr_to_pcpu_ptr(chunk->vm->addr + off);
+
+fail_unlock:
+	spin_unlock(&pcpu_lock);
+fail_unlock_mutex:
+	mutex_unlock(&pcpu_alloc_mutex);
+	return NULL;
 }
 
 /**
@@ -825,6 +897,9 @@ out_unlock:
  * Allocate percpu area of @size bytes aligned at @align.  Might
  * sleep.  Might trigger writeouts.
  *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
  */
@@ -843,6 +918,9 @@ EXPORT_SYMBOL_GPL(__alloc_percpu);
  * percpu area if arch has set it up; otherwise, allocation is served
  * from the same dynamic area.  Might sleep.  Might trigger writeouts.
  *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
  */
@@ -856,6 +934,9 @@ void *__alloc_reserved_percpu(size_t size, size_t align)
  * @work: unused
  *
  * Reclaim all fully free chunks except for the first one.
+ *
+ * CONTEXT:
+ * workqueue context.
  */
 static void pcpu_reclaim(struct work_struct *work)
 {
@@ -863,7 +944,8 @@ static void pcpu_reclaim(struct work_struct *work)
 	struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1];
 	struct pcpu_chunk *chunk, *next;
 
-	mutex_lock(&pcpu_mutex);
+	mutex_lock(&pcpu_alloc_mutex);
+	spin_lock(&pcpu_lock);
 
 	list_for_each_entry_safe(chunk, next, head, list) {
 		WARN_ON(chunk->immutable);
@@ -876,7 +958,8 @@ static void pcpu_reclaim(struct work_struct *work)
 		list_move(&chunk->list, &todo);
 	}
 
-	mutex_unlock(&pcpu_mutex);
+	spin_unlock(&pcpu_lock);
+	mutex_unlock(&pcpu_alloc_mutex);
 
 	list_for_each_entry_safe(chunk, next, &todo, list) {
 		pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
@@ -888,7 +971,10 @@ static void pcpu_reclaim(struct work_struct *work)
  * free_percpu - free percpu area
  * @ptr: pointer to area to free
  *
- * Free percpu area @ptr.  Might sleep.
+ * Free percpu area @ptr.
+ *
+ * CONTEXT:
+ * Can be called from atomic context.
  */
 void free_percpu(void *ptr)
 {
@@ -899,7 +985,7 @@ void free_percpu(void *ptr)
 	if (!ptr)
 		return;
 
-	mutex_lock(&pcpu_mutex);
+	spin_lock(&pcpu_lock);
 
 	chunk = pcpu_chunk_addr_search(addr);
 	off = addr - chunk->vm->addr;
@@ -917,7 +1003,7 @@ void free_percpu(void *ptr)
 			}
 	}
 
-	mutex_unlock(&pcpu_mutex);
+	spin_unlock(&pcpu_lock);
 }
 EXPORT_SYMBOL_GPL(free_percpu);
 
-- 
1.6.0.2


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH UPDATED 4/4] percpu: finer grained locking to break deadlock and allow atomic free
  2009-03-06 15:54 ` [PATCH 4/4] percpu: finer grained locking to break deadlock and allow atomic free Tejun Heo
@ 2009-03-07  5:49   ` Tejun Heo
  2009-03-08 10:20     ` Ingo Molnar
  0 siblings, 1 reply; 7+ messages in thread
From: Tejun Heo @ 2009-03-07  5:49 UTC (permalink / raw)
  To: mingo, rusty, tglx, x86, linux-kernel, hpa, npiggin, akpm

Impact: fix deadlock and allow atomic free

Percpu allocation always uses GFP_KERNEL and whole alloc/free paths
were protected by single mutex.  All percpu allocations have been from
GFP_KERNEL-safe context and the original allocator had this assumption
too.  However, by protecting both alloc and free paths with the same
mutex, the new allocator creates free -> alloc -> GFP_KERNEL
dependency which the original allocator didn't have.  This can lead to
deadlock if free is called from FS or IO paths.  Also, in general,
allocators are expected to allow free to be called from atomic
context.

This patch implements finer grained locking to break the deadlock and
allow atomic free.  For details, please read the "Synchronization
rules" comment.

While at it, also add CONTEXT: to function comments to describe which
context they expect to be called from and what they do to it.

This problem was reported by Thomas Gleixner and Peter Zijlstra.

  http://thread.gmane.org/gmane.linux.kernel/802384

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Thomas Gleixner <tglx@linutronix.de>
Reported-by: Peter Zijlstra <peterz@infradead.org>
---
IRQ spin lock ops should be used to allow free to be called from IRQ
context.  Fixed.  The git tree is updated too.  The new commit ID is
ccea34b5d0fbab081496d1860f31acee99fa8a6d.

Thanks.

 mm/percpu.c |  161 +++++++++++++++++++++++++++++++++++++++++++++--------------
 1 files changed, 124 insertions(+), 37 deletions(-)

diff --git a/mm/percpu.c b/mm/percpu.c
index 4c8a419..bfe6a3a 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -62,6 +62,7 @@
 #include <linux/pfn.h>
 #include <linux/rbtree.h>
 #include <linux/slab.h>
+#include <linux/spinlock.h>
 #include <linux/vmalloc.h>
 #include <linux/workqueue.h>
 
@@ -101,20 +102,28 @@ static struct pcpu_chunk *pcpu_reserved_chunk;
 static int pcpu_reserved_chunk_limit;
 
 /*
- * One mutex to rule them all.
- *
- * The following mutex is grabbed in the outermost public alloc/free
- * interface functions and released only when the operation is
- * complete.  As such, every function in this file other than the
- * outermost functions are called under pcpu_mutex.
- *
- * It can easily be switched to use spinlock such that only the area
- * allocation and page population commit are protected with it doing
- * actual [de]allocation without holding any lock.  However, given
- * what this allocator does, I think it's better to let them run
- * sequentially.
+ * Synchronization rules.
+ *
+ * There are two locks - pcpu_alloc_mutex and pcpu_lock.  The former
+ * protects allocation/reclaim paths, chunks and chunk->page arrays.
+ * The latter is a spinlock and protects the index data structures -
+ * chunk slots, rbtree, chunks and area maps in chunks.
+ *
+ * During allocation, pcpu_alloc_mutex is kept locked all the time and
+ * pcpu_lock is grabbed and released as necessary.  All actual memory
+ * allocations are done using GFP_KERNEL with pcpu_lock released.
+ *
+ * Free path accesses and alters only the index data structures, so it
+ * can be safely called from atomic context.  When memory needs to be
+ * returned to the system, free path schedules reclaim_work which
+ * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be
+ * reclaimed, release both locks and frees the chunks.  Note that it's
+ * necessary to grab both locks to remove a chunk from circulation as
+ * allocation path might be referencing the chunk with only
+ * pcpu_alloc_mutex locked.
  */
-static DEFINE_MUTEX(pcpu_mutex);
+static DEFINE_MUTEX(pcpu_alloc_mutex);	/* protects whole alloc and reclaim */
+static DEFINE_SPINLOCK(pcpu_lock);	/* protects index data structures */
 
 static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
 static struct rb_root pcpu_addr_root = RB_ROOT;	/* chunks by address */
@@ -176,6 +185,9 @@ static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
  * kzalloc() is used; otherwise, vmalloc() is used.  The returned
  * memory is always zeroed.
  *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
  * RETURNS:
  * Pointer to the allocated area on success, NULL on failure.
  */
@@ -215,6 +227,9 @@ static void pcpu_mem_free(void *ptr, size_t size)
  * New slot according to the changed state is determined and @chunk is
  * moved to the slot.  Note that the reserved chunk is never put on
  * chunk slots.
+ *
+ * CONTEXT:
+ * pcpu_lock.
  */
 static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
 {
@@ -260,6 +275,9 @@ static struct rb_node **pcpu_chunk_rb_search(void *addr,
  * searchs for the chunk with the highest start address which isn't
  * beyond @addr.
  *
+ * CONTEXT:
+ * pcpu_lock.
+ *
  * RETURNS:
  * The address of the found chunk.
  */
@@ -300,6 +318,9 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
  * @new: chunk to insert
  *
  * Insert @new into address rb tree.
+ *
+ * CONTEXT:
+ * pcpu_lock.
  */
 static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
 {
@@ -319,6 +340,10 @@ static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
  * A single allocation can split an area into three areas, so this
  * function makes sure that @chunk->map has at least two extra slots.
  *
+ * CONTEXT:
+ * pcpu_alloc_mutex, pcpu_lock.  pcpu_lock is released and reacquired
+ * if area map is extended.
+ *
  * RETURNS:
  * 0 if noop, 1 if successfully extended, -errno on failure.
  */
@@ -332,13 +357,25 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
 	if (chunk->map_alloc >= chunk->map_used + 2)
 		return 0;
 
+	spin_unlock_irq(&pcpu_lock);
+
 	new_alloc = PCPU_DFL_MAP_ALLOC;
 	while (new_alloc < chunk->map_used + 2)
 		new_alloc *= 2;
 
 	new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
-	if (!new)
+	if (!new) {
+		spin_lock_irq(&pcpu_lock);
 		return -ENOMEM;
+	}
+
+	/*
+	 * Acquire pcpu_lock and switch to new area map.  Only free
+	 * could have happened inbetween, so map_used couldn't have
+	 * grown.
+	 */
+	spin_lock_irq(&pcpu_lock);
+	BUG_ON(new_alloc < chunk->map_used + 2);
 
 	size = chunk->map_alloc * sizeof(chunk->map[0]);
 	memcpy(new, chunk->map, size);
@@ -371,6 +408,9 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
  * is inserted after the target block.
  *
  * @chunk->map must have enough free slots to accomodate the split.
+ *
+ * CONTEXT:
+ * pcpu_lock.
  */
 static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
 			     int head, int tail)
@@ -406,6 +446,9 @@ static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
  *
  * @chunk->map must have at least two free slots.
  *
+ * CONTEXT:
+ * pcpu_lock.
+ *
  * RETURNS:
  * Allocated offset in @chunk on success, -1 if no matching area is
  * found.
@@ -495,6 +538,9 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
  * Free area starting from @freeme to @chunk.  Note that this function
  * only modifies the allocation map.  It doesn't depopulate or unmap
  * the area.
+ *
+ * CONTEXT:
+ * pcpu_lock.
  */
 static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
 {
@@ -580,6 +626,9 @@ static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
  * For each cpu, depopulate and unmap pages [@page_start,@page_end)
  * from @chunk.  If @flush is true, vcache is flushed before unmapping
  * and tlb after.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex.
  */
 static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
 				  bool flush)
@@ -658,6 +707,9 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
  *
  * For each cpu, populate and map pages [@page_start,@page_end) into
  * @chunk.  The area is cleared on return.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex, does GFP_KERNEL allocation.
  */
 static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
 {
@@ -748,15 +800,16 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
  * @align: alignment of area (max PAGE_SIZE)
  * @reserved: allocate from the reserved chunk if available
  *
- * Allocate percpu area of @size bytes aligned at @align.  Might
- * sleep.  Might trigger writeouts.
+ * Allocate percpu area of @size bytes aligned at @align.
+ *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
  *
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
  */
 static void *pcpu_alloc(size_t size, size_t align, bool reserved)
 {
-	void *ptr = NULL;
 	struct pcpu_chunk *chunk;
 	int slot, off;
 
@@ -766,27 +819,37 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
 		return NULL;
 	}
 
-	mutex_lock(&pcpu_mutex);
+	mutex_lock(&pcpu_alloc_mutex);
+	spin_lock_irq(&pcpu_lock);
 
 	/* serve reserved allocations from the reserved chunk if available */
 	if (reserved && pcpu_reserved_chunk) {
 		chunk = pcpu_reserved_chunk;
 		if (size > chunk->contig_hint ||
 		    pcpu_extend_area_map(chunk) < 0)
-			goto out_unlock;
+			goto fail_unlock;
 		off = pcpu_alloc_area(chunk, size, align);
 		if (off >= 0)
 			goto area_found;
-		goto out_unlock;
+		goto fail_unlock;
 	}
 
+restart:
 	/* search through normal chunks */
 	for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
 		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
 			if (size > chunk->contig_hint)
 				continue;
-			if (pcpu_extend_area_map(chunk) < 0)
-				goto out_unlock;
+
+			switch (pcpu_extend_area_map(chunk)) {
+			case 0:
+				break;
+			case 1:
+				goto restart;	/* pcpu_lock dropped, restart */
+			default:
+				goto fail_unlock;
+			}
+
 			off = pcpu_alloc_area(chunk, size, align);
 			if (off >= 0)
 				goto area_found;
@@ -794,27 +857,36 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
 	}
 
 	/* hmmm... no space left, create a new chunk */
+	spin_unlock_irq(&pcpu_lock);
+
 	chunk = alloc_pcpu_chunk();
 	if (!chunk)
-		goto out_unlock;
+		goto fail_unlock_mutex;
+
+	spin_lock_irq(&pcpu_lock);
 	pcpu_chunk_relocate(chunk, -1);
 	pcpu_chunk_addr_insert(chunk);
-
-	off = pcpu_alloc_area(chunk, size, align);
-	if (off < 0)
-		goto out_unlock;
+	goto restart;
 
 area_found:
+	spin_unlock_irq(&pcpu_lock);
+
 	/* populate, map and clear the area */
 	if (pcpu_populate_chunk(chunk, off, size)) {
+		spin_lock_irq(&pcpu_lock);
 		pcpu_free_area(chunk, off);
-		goto out_unlock;
+		goto fail_unlock;
 	}
 
-	ptr = __addr_to_pcpu_ptr(chunk->vm->addr + off);
-out_unlock:
-	mutex_unlock(&pcpu_mutex);
-	return ptr;
+	mutex_unlock(&pcpu_alloc_mutex);
+
+	return __addr_to_pcpu_ptr(chunk->vm->addr + off);
+
+fail_unlock:
+	spin_unlock_irq(&pcpu_lock);
+fail_unlock_mutex:
+	mutex_unlock(&pcpu_alloc_mutex);
+	return NULL;
 }
 
 /**
@@ -825,6 +897,9 @@ out_unlock:
  * Allocate percpu area of @size bytes aligned at @align.  Might
  * sleep.  Might trigger writeouts.
  *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
  */
@@ -843,6 +918,9 @@ EXPORT_SYMBOL_GPL(__alloc_percpu);
  * percpu area if arch has set it up; otherwise, allocation is served
  * from the same dynamic area.  Might sleep.  Might trigger writeouts.
  *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
  */
@@ -856,6 +934,9 @@ void *__alloc_reserved_percpu(size_t size, size_t align)
  * @work: unused
  *
  * Reclaim all fully free chunks except for the first one.
+ *
+ * CONTEXT:
+ * workqueue context.
  */
 static void pcpu_reclaim(struct work_struct *work)
 {
@@ -863,7 +944,8 @@ static void pcpu_reclaim(struct work_struct *work)
 	struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1];
 	struct pcpu_chunk *chunk, *next;
 
-	mutex_lock(&pcpu_mutex);
+	mutex_lock(&pcpu_alloc_mutex);
+	spin_lock_irq(&pcpu_lock);
 
 	list_for_each_entry_safe(chunk, next, head, list) {
 		WARN_ON(chunk->immutable);
@@ -876,7 +958,8 @@ static void pcpu_reclaim(struct work_struct *work)
 		list_move(&chunk->list, &todo);
 	}
 
-	mutex_unlock(&pcpu_mutex);
+	spin_unlock_irq(&pcpu_lock);
+	mutex_unlock(&pcpu_alloc_mutex);
 
 	list_for_each_entry_safe(chunk, next, &todo, list) {
 		pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
@@ -888,18 +971,22 @@ static void pcpu_reclaim(struct work_struct *work)
  * free_percpu - free percpu area
  * @ptr: pointer to area to free
  *
- * Free percpu area @ptr.  Might sleep.
+ * Free percpu area @ptr.
+ *
+ * CONTEXT:
+ * Can be called from atomic context.
  */
 void free_percpu(void *ptr)
 {
 	void *addr = __pcpu_ptr_to_addr(ptr);
 	struct pcpu_chunk *chunk;
+	unsigned long flags;
 	int off;
 
 	if (!ptr)
 		return;
 
-	mutex_lock(&pcpu_mutex);
+	spin_lock_irqsave(&pcpu_lock, flags);
 
 	chunk = pcpu_chunk_addr_search(addr);
 	off = addr - chunk->vm->addr;
@@ -917,7 +1004,7 @@ void free_percpu(void *ptr)
 			}
 	}
 
-	mutex_unlock(&pcpu_mutex);
+	spin_unlock_irqrestore(&pcpu_lock, flags);
 }
 EXPORT_SYMBOL_GPL(free_percpu);
 
-- 
1.6.0.2


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH UPDATED 4/4] percpu: finer grained locking to break deadlock and allow atomic free
  2009-03-07  5:49   ` [PATCH UPDATED " Tejun Heo
@ 2009-03-08 10:20     ` Ingo Molnar
  0 siblings, 0 replies; 7+ messages in thread
From: Ingo Molnar @ 2009-03-08 10:20 UTC (permalink / raw)
  To: Tejun Heo; +Cc: rusty, tglx, x86, linux-kernel, hpa, npiggin, akpm


* Tejun Heo <tj@kernel.org> wrote:

> IRQ spin lock ops should be used to allow free to be called 
> from IRQ context.  Fixed.  The git tree is updated too.  The 
> new commit ID is ccea34b5d0fbab081496d1860f31acee99fa8a6d.
> 
> Thanks.

Pulled, thanks Tejun!

	Ingo

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2009-03-08 10:20 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-03-06 15:54 [GIT PULL] percpu: finer grained locking Tejun Heo
2009-03-06 15:54 ` [PATCH 1/4] percpu: replace pcpu_realloc() with pcpu_mem_alloc() and pcpu_mem_free() Tejun Heo
2009-03-06 15:54 ` [PATCH 2/4] percpu: move chunk area map extension out of area allocation Tejun Heo
2009-03-06 15:54 ` [PATCH 3/4] percpu: move fully free chunk reclamation into a work Tejun Heo
2009-03-06 15:54 ` [PATCH 4/4] percpu: finer grained locking to break deadlock and allow atomic free Tejun Heo
2009-03-07  5:49   ` [PATCH UPDATED " Tejun Heo
2009-03-08 10:20     ` Ingo Molnar

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox