* [PATCH 1/2] mm: Take all anon_vma locks in anon_vma_lock
2010-04-29 8:32 [PATCH 0/2] Fix migration races in rmap_walk() V3 Mel Gorman
@ 2010-04-29 8:32 ` Mel Gorman
2010-05-02 17:28 ` Minchan Kim
0 siblings, 1 reply; 19+ messages in thread
From: Mel Gorman @ 2010-04-29 8:32 UTC (permalink / raw)
To: Andrew Morton
Cc: Linux-MM, LKML, Minchan Kim, KAMEZAWA Hiroyuki, Mel Gorman,
Christoph Lameter, Andrea Arcangeli, Rik van Riel
From: Rik van Riel <riel@redhat.com>
Take all the locks for all the anon_vmas in anon_vma_lock; this properly
excludes migration and the transparent hugepage code from VMA changes done
by mmap/munmap/mprotect/expand_stack/etc...
Unfortunately, this requires adding a new lock (mm->anon_vma_chain_lock),
otherwise we have an unavoidable lock ordering conflict. This changes the
locking rules for the "same_vma" list to be either mm->mmap_sem held for
write, or mm->mmap_sem held for read plus the new mm->anon_vma_chain_lock.
This limits the places where the new lock is taken to two:
anon_vma_prepare and expand_downwards.
Document the locking rules for the same_vma list in the anon_vma_chain and
remove the anon_vma_lock call from expand_upwards, which does not need it.
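For illustration, the two legal ways to hold the "same_vma" list stable
under the new rule look roughly like this (my sketch, not code from the
patch):
	/* Either: mmap_sem held for write is sufficient ... */
	down_write(&mm->mmap_sem);
	list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
		; /* ... walk or modify the same_vma list ... */
	up_write(&mm->mmap_sem);
	/* ... or: mmap_sem held for read plus the new spinlock. */
	down_read(&mm->mmap_sem);
	spin_lock(&mm->anon_vma_chain_lock);
	list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
		; /* ... walk or modify the same_vma list ... */
	spin_unlock(&mm->anon_vma_chain_lock);
	up_read(&mm->mmap_sem);
anon_vma_prepare and expand_downwards (below) are the only two callers that
need the second form.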
Signed-off-by: Rik van Riel <riel@redhat.com>
---
include/linux/mm_types.h | 1 +
include/linux/rmap.h | 28 +++++++++++++++++++---------
kernel/fork.c | 1 +
mm/init-mm.c | 1 +
mm/mmap.c | 21 ++++++++++++---------
mm/rmap.c | 10 ++++++----
6 files changed, 40 insertions(+), 22 deletions(-)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index b8bb9a6..a0679c6 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -239,6 +239,7 @@ struct mm_struct {
int map_count; /* number of VMAs */
struct rw_semaphore mmap_sem;
spinlock_t page_table_lock; /* Protects page tables and some counters */
+ spinlock_t anon_vma_chain_lock; /* Protects vma->anon_vma_chain, with mmap_sem */
struct list_head mmlist; /* List of maybe swapped mm's. These are globally strung
* together off init_mm.mmlist, and are protected
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 7721674..e8d480d 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -61,11 +61,15 @@ struct anon_vma {
* all the anon_vmas associated with this VMA.
* The "same_anon_vma" list contains the anon_vma_chains
* which link all the VMAs associated with this anon_vma.
+ *
+ * The "same_vma" list is locked by either having mm->mmap_sem
+ * locked for writing, or having mm->mmap_sem locked for reading
+ * AND holding the mm->anon_vma_chain_lock.
*/
struct anon_vma_chain {
struct vm_area_struct *vma;
struct anon_vma *anon_vma;
- struct list_head same_vma; /* locked by mmap_sem & page_table_lock */
+ struct list_head same_vma; /* see above */
struct list_head same_anon_vma; /* locked by anon_vma->lock */
};
@@ -99,18 +103,24 @@ static inline struct anon_vma *page_anon_vma(struct page *page)
return page_rmapping(page);
}
-static inline void anon_vma_lock(struct vm_area_struct *vma)
-{
- struct anon_vma *anon_vma = vma->anon_vma;
- if (anon_vma)
- spin_lock(&anon_vma->lock);
-}
+#define anon_vma_lock(vma, nest_lock) \
+({ \
+ struct anon_vma *anon_vma = vma->anon_vma; \
+ if (anon_vma) { \
+ struct anon_vma_chain *avc; \
+ list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) \
+ spin_lock_nest_lock(&avc->anon_vma->lock, nest_lock); \
+ } \
+})
static inline void anon_vma_unlock(struct vm_area_struct *vma)
{
struct anon_vma *anon_vma = vma->anon_vma;
- if (anon_vma)
- spin_unlock(&anon_vma->lock);
+ if (anon_vma) {
+ struct anon_vma_chain *avc;
+ list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+ spin_unlock(&avc->anon_vma->lock);
+ }
}
/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 7f4c6fe..98adcdf 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -481,6 +481,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
mm->nr_ptes = 0;
memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
spin_lock_init(&mm->page_table_lock);
+ spin_lock_init(&mm->anon_vma_chain_lock);
mm->free_area_cache = TASK_UNMAPPED_BASE;
mm->cached_hole_size = ~0UL;
mm_init_aio(mm);
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 57aba0d..3ce8a1f 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -15,6 +15,7 @@ struct mm_struct init_mm = {
.mm_count = ATOMIC_INIT(1),
.mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
+ .anon_vma_chain_lock = __SPIN_LOCK_UNLOCKED(init_mm.anon_vma_chain_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
.cpu_vm_mask = CPU_MASK_ALL,
};
diff --git a/mm/mmap.c b/mm/mmap.c
index f90ea92..4602358 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -452,7 +452,7 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
spin_lock(&mapping->i_mmap_lock);
vma->vm_truncate_count = mapping->truncate_count;
}
- anon_vma_lock(vma);
+ anon_vma_lock(vma, &mm->mmap_sem);
__vma_link(mm, vma, prev, rb_link, rb_parent);
__vma_link_file(vma);
@@ -578,6 +578,7 @@ again: remove_next = 1 + (end > next->vm_end);
}
}
+ anon_vma_lock(vma, &mm->mmap_sem);
if (root) {
flush_dcache_mmap_lock(mapping);
vma_prio_tree_remove(vma, root);
@@ -599,6 +600,7 @@ again: remove_next = 1 + (end > next->vm_end);
vma_prio_tree_insert(vma, root);
flush_dcache_mmap_unlock(mapping);
}
+ anon_vma_unlock(vma);
if (remove_next) {
/*
@@ -1705,12 +1707,11 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
return -EFAULT;
/*
- * We must make sure the anon_vma is allocated
- * so that the anon_vma locking is not a noop.
+ * Unlike expand_downwards, we do not need to take the anon_vma lock,
+ * because we leave vma->vm_start and vma->pgoff untouched.
+ * This means rmap lookups of pages inside this VMA stay valid
+ * throughout the stack expansion.
*/
- if (unlikely(anon_vma_prepare(vma)))
- return -ENOMEM;
- anon_vma_lock(vma);
/*
* vma->vm_start/vm_end cannot change under us because the caller
@@ -1721,7 +1722,6 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
if (address < PAGE_ALIGN(address+4))
address = PAGE_ALIGN(address+4);
else {
- anon_vma_unlock(vma);
return -ENOMEM;
}
error = 0;
@@ -1737,7 +1737,6 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
if (!error)
vma->vm_end = address;
}
- anon_vma_unlock(vma);
return error;
}
#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
@@ -1749,6 +1748,7 @@ static int expand_downwards(struct vm_area_struct *vma,
unsigned long address)
{
int error;
+ struct mm_struct *mm = vma->vm_mm;
/*
* We must make sure the anon_vma is allocated
@@ -1762,7 +1762,8 @@ static int expand_downwards(struct vm_area_struct *vma,
if (error)
return error;
- anon_vma_lock(vma);
+ spin_lock(&mm->anon_vma_chain_lock);
+ anon_vma_lock(vma, &mm->anon_vma_chain_lock);
/*
* vma->vm_start/vm_end cannot change under us because the caller
@@ -1784,6 +1785,8 @@ static int expand_downwards(struct vm_area_struct *vma,
}
}
anon_vma_unlock(vma);
+ spin_unlock(&mm->anon_vma_chain_lock);
+
return error;
}
diff --git a/mm/rmap.c b/mm/rmap.c
index 85f203e..f95b66d 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -23,6 +23,7 @@
* inode->i_mutex (while writing or truncating, not reading or faulting)
* inode->i_alloc_sem (vmtruncate_range)
* mm->mmap_sem
+ * mm->anon_vma_chain_lock (mmap_sem for read, protects vma->anon_vma_chain)
* page->flags PG_locked (lock_page)
* mapping->i_mmap_lock
* anon_vma->lock
@@ -133,10 +134,11 @@ int anon_vma_prepare(struct vm_area_struct *vma)
goto out_enomem_free_avc;
allocated = anon_vma;
}
+
+ /* anon_vma_chain_lock to protect against threads */
+ spin_lock(&mm->anon_vma_chain_lock);
spin_lock(&anon_vma->lock);
- /* page_table_lock to protect against threads */
- spin_lock(&mm->page_table_lock);
if (likely(!vma->anon_vma)) {
vma->anon_vma = anon_vma;
avc->anon_vma = anon_vma;
@@ -145,9 +147,9 @@ int anon_vma_prepare(struct vm_area_struct *vma)
list_add(&avc->same_anon_vma, &anon_vma->head);
allocated = NULL;
}
- spin_unlock(&mm->page_table_lock);
-
spin_unlock(&anon_vma->lock);
+ spin_unlock(&mm->anon_vma_chain_lock);
+
if (unlikely(allocated)) {
anon_vma_free(allocated);
anon_vma_chain_free(avc);
--
1.6.5
* Re: [PATCH 1/2] mm: Take all anon_vma locks in anon_vma_lock
2010-04-29 8:32 ` [PATCH 1/2] mm: Take all anon_vma locks in anon_vma_lock Mel Gorman
@ 2010-05-02 17:28 ` Minchan Kim
0 siblings, 0 replies; 19+ messages in thread
From: Minchan Kim @ 2010-05-02 17:28 UTC (permalink / raw)
To: Mel Gorman
Cc: Andrew Morton, Linux-MM, LKML, KAMEZAWA Hiroyuki,
Christoph Lameter, Andrea Arcangeli, Rik van Riel
On Thu, Apr 29, 2010 at 5:32 PM, Mel Gorman <mel@csn.ul.ie> wrote:
> From: Rik van Riel <riel@redhat.com>
>
> Take all the locks for all the anon_vmas in anon_vma_lock, this properly
> excludes migration and the transparent hugepage code from VMA changes done
> by mmap/munmap/mprotect/expand_stack/etc...
>
> Unfortunately, this requires adding a new lock (mm->anon_vma_chain_lock),
> otherwise we have an unavoidable lock ordering conflict. This changes the
> locking rules for the "same_vma" list to be either mm->mmap_sem for write,
> or mm->mmap_sem for read plus the new mm->anon_vma_chain lock. This limits
> the place where the new lock is taken to 2 locations - anon_vma_prepare and
> expand_downwards.
>
> Document the locking rules for the same_vma list in the anon_vma_chain and
> remove the anon_vma_lock call from expand_upwards, which does not need it.
>
> Signed-off-by: Rik van Riel <riel@redhat.com>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
I like this one.
Although it tries to take the locks of all the anon_vmas attached to a VMA,
that number is small, so the latency shouldn't be big. :)
It's a height problem of the tree, not a width problem. :)
Thanks, Rik.
--
Kind regards,
Minchan Kim
* [PATCH 0/2] Fix migration races in rmap_walk() V4
@ 2010-05-03 16:17 Rik van Riel
2010-05-03 16:18 ` [PATCH 1/2] mm: Take all anon_vma locks in anon_vma_lock Rik van Riel
2010-05-03 16:19 ` [PATCH 2/2] mm: fix race between shift_arg_pages and rmap_walk Rik van Riel
0 siblings, 2 replies; 19+ messages in thread
From: Rik van Riel @ 2010-05-03 16:17 UTC (permalink / raw)
To: akpm
Cc: torvalds, Mel Gorman, Linux-MM, LKML, Minchan Kim,
KAMEZAWA Hiroyuki, Andrea Arcangeli, Christoph Lameter
Andrew, I'm posting this version since Mel is out today and Andrea and I
would really like to get this fixed before 2.6.34 :)
Here is Mel's 0/2 text, plus my changelog update:
The sloppy cleanup of migration PTEs in V2 was to no one's liking, the fork()
change was unnecessary and Rik devised a locking scheme for anon_vma that
was more robust for transparent hugepage support than what was proposed in V2.
Andrew, patch one of this series is about the correctness of locking of
anon_vma with respect to migration. While I am not aware of any reproduction
cases, it is potentially racy. Rik will probably release another version
so I'm not expecting this one to be picked up, but I'm including it for
completeness.
Patch two of this series addresses the reported swapops bug, which is a race
between migration due to compaction and execve where pages get migrated from
the temporary stack before it is moved. Technically, it would be best if the
anon_vma lock was held while the temporary stack is moved but it would make
exec significantly more complex, particularly in move_page_tables to handle
a corner case in migration. I don't think adding complexity is justified. If
there are no objections, please pick it up and place it between the patches
mmmigration-allow-the-migration-of-pageswapcache-pages.patch
mm-allow-config_migration-to-be-set-without-config_numa-or-memory-hot-remove.patch
Unfortunately, I'll be offline for a few days but should be back online
Tuesday.
Changelog since V3
o Rediff against the latest upstream tree
o Improve the patch changelog a little (thanks Peterz)
Changelog since V2
o Drop fork changes
o Avoid pages in temporary stacks during exec instead of migration pte
lazy cleanup
o Drop locking-related patch and replace with Rik's
Changelog since V1
o Handle the execve race
o Be sure that rmap_walk() releases the correct VMA lock
o Hold the anon_vma lock for the address lookup and the page remap
o Add reviewed-bys
There are a number of races between migration and other operations that mean a
migration PTE can be left behind. Broadly speaking, migration works by locking
a page, unmapping it, putting a migration PTE in place that looks like a swap
entry, copying the page and remapping the page removing the old migration PTE.
If a fault occurs, the faulting process waits until migration completes.
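Per page, the flow is roughly the following (a sketch of the idea, not the
literal mm/migrate.c call chain):
	lock_page(page);
	/* Replace every PTE mapping the page with a migration entry. */
	try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
	/* ... copy the data and page state over to newpage ... */
	/* Walk rmap again and turn the migration entries into real PTEs. */
	remove_migration_ptes(page, newpage);
	unlock_page(page);
(That wait on the faulting side happens in migration_entry_wait(), sleeping
on the page lock.)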
The problem is that there are some races that either allow migration PTEs to
be copied or a migration PTE to be left behind. Migration still completes and
the page is unlocked but later a fault will call migration_entry_to_page()
and BUG() because the page is not locked. This series aims to close some
of these races.
Patch 1 notes that with the anon_vma changes, taking one lock is not
necessarily enough to guard against changes in all VMAs on a list.
It introduces a new lock to allow taking the locks on all anon_vmas
to exclude migration from VMA changes.
Patch 2 notes that while a VMA is moved under the anon_vma lock, the page
tables are not similarly protected. To avoid migration PTEs being left
behind, pages within a temporary stack are simply not migrated.
The reproduction case was as follows;
1. Run kernel compilation in a loop
2. Start four processes, each of which creates one mapping. Three of them
stress different aspects of the problem. The operations they undertake are:
a) Forks a hundred children, each of which faults the mapping
Purpose: stress tests migration pte removal
b) Forks a hundred children, each which punches a hole in the mapping
and faults what remains
Purpose: stress test VMA manipulations during migration
c) Forks a hundred children, each of which execs and calls echo
Purpose: stress test the execve race
d) Size the mapping to be 1.5 times physical memory. Constantly
memset it
Purpose: stress swapping
3. Constantly compact memory using /proc/sys/vm/compact_memory so migration
is active all the time. In theory, you could also force this using
sys_move_pages or memory hot-remove but it'd be nowhere near as easy
to test.
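For step 3, a loop along these lines keeps compaction going (illustrative
userspace snippet, not part of the series; needs root and CONFIG_COMPACTION):
	#include <stdio.h>
	#include <unistd.h>
	int main(void)
	{
		for (;;) {
			FILE *f = fopen("/proc/sys/vm/compact_memory", "w");
			if (!f)
				return 1;
			fputs("1", f);	/* any write triggers compaction */
			fclose(f);
			sleep(1);
		}
	}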
* [PATCH 1/2] mm: Take all anon_vma locks in anon_vma_lock
2010-05-03 16:17 [PATCH 0/2] Fix migration races in rmap_walk() V4 Rik van Riel
@ 2010-05-03 16:18 ` Rik van Riel
2010-05-03 16:41 ` Linus Torvalds
2010-05-03 16:55 ` Peter Zijlstra
2010-05-03 16:19 ` [PATCH 2/2] mm: fix race between shift_arg_pages and rmap_walk Rik van Riel
1 sibling, 2 replies; 19+ messages in thread
From: Rik van Riel @ 2010-05-03 16:18 UTC (permalink / raw)
To: akpm
Cc: torvalds, Mel Gorman, Linux-MM, LKML, Minchan Kim,
KAMEZAWA Hiroyuki, Andrea Arcangeli, Christoph Lameter
From: Rik van Riel <riel@redhat.com>
Both the page migration code and the transparent hugepage patches expect
100% reliable rmap lookups and use page_lock_anon_vma(page) to prevent
races with mmap, munmap, expand_stack, etc.
Specifically, try_to_unmap indirectly calls vma_address, which uses the
difference between vma->vm_start and vma->vm_pgoff, which can race when a
stack is expanded downwards. VMA splitting and merging present similar
issues.
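For reference, vma_address() in mm/rmap.c boils down to roughly this
(paraphrased, not a verbatim copy):
	static unsigned long
	vma_address(struct page *page, struct vm_area_struct *vma)
	{
		pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
		unsigned long address;
		address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
		if (unlikely(address < vma->vm_start || address >= vma->vm_end))
			return -EFAULT;	/* page falls outside this vma */
		return address;
	}
If vm_start moves while vm_pgoff stays put (or vice versa) in the middle of
a walk, the address computed here is stale.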
With the new anon_vma code, one VMA can be attached to multiple anon_vmas,
however mmap, munmap, expand_stack and others only took one anon_vma->lock.
This patch changes things so we take the anon_vma locks for all of the
anon_vmas attached to a VMA in the code that try_to_unmap would otherwise
race against: mmap, munmap, expand_stack, etc.
Unfortunately, this leads to a lock ordering conflict with the page_table_lock,
which protected the "same_vma" list in the anon_vma_chain. Replacing that
lock with a new lock (mm->anon_vma_chain_lock), which is taken higher up in
the mm locking hierarchy, solves that issue. This changes the locking rules
for the "same_vma" list to be either mm->mmap_sem for write, or mm->mmap_sem
for read plus the new mm->anon_vma_chain lock. This limits the place where
the new lock is taken to 2 locations - anon_vma_prepare and expand_downwards.
Document the locking rules for the same_vma list in the anon_vma_chain and
remove the anon_vma_lock call from expand_upwards, which does not need it.
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Acked-by: Andrea Arcangeli <aarcange@redhat.com>
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index b8bb9a6..a0679c6 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -239,6 +239,7 @@ struct mm_struct {
int map_count; /* number of VMAs */
struct rw_semaphore mmap_sem;
spinlock_t page_table_lock; /* Protects page tables and some counters */
+ spinlock_t anon_vma_chain_lock; /* Protects vma->anon_vma_chain, with mmap_sem */
struct list_head mmlist; /* List of maybe swapped mm's. These are globally strung
* together off init_mm.mmlist, and are protected
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index d25bd22..703c472 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -52,11 +52,15 @@ struct anon_vma {
* all the anon_vmas associated with this VMA.
* The "same_anon_vma" list contains the anon_vma_chains
* which link all the VMAs associated with this anon_vma.
+ *
+ * The "same_vma" list is locked by either having mm->mmap_sem
+ * locked for writing, or having mm->mmap_sem locked for reading
+ * AND holding the mm->anon_vma_chain_lock.
*/
struct anon_vma_chain {
struct vm_area_struct *vma;
struct anon_vma *anon_vma;
- struct list_head same_vma; /* locked by mmap_sem & page_table_lock */
+ struct list_head same_vma; /* see above */
struct list_head same_anon_vma; /* locked by anon_vma->lock */
};
@@ -90,18 +94,24 @@ static inline struct anon_vma *page_anon_vma(struct page *page)
return page_rmapping(page);
}
-static inline void anon_vma_lock(struct vm_area_struct *vma)
-{
- struct anon_vma *anon_vma = vma->anon_vma;
- if (anon_vma)
- spin_lock(&anon_vma->lock);
-}
+#define anon_vma_lock(vma, nest_lock) \
+({ \
+ struct anon_vma *anon_vma = vma->anon_vma; \
+ if (anon_vma) { \
+ struct anon_vma_chain *avc; \
+ list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) \
+ spin_lock_nest_lock(&avc->anon_vma->lock, nest_lock); \
+ } \
+})
static inline void anon_vma_unlock(struct vm_area_struct *vma)
{
struct anon_vma *anon_vma = vma->anon_vma;
- if (anon_vma)
- spin_unlock(&anon_vma->lock);
+ if (anon_vma) {
+ struct anon_vma_chain *avc;
+ list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+ spin_unlock(&avc->anon_vma->lock);
+ }
}
/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 44b0791..83b1ba2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -468,6 +468,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
mm->nr_ptes = 0;
memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
spin_lock_init(&mm->page_table_lock);
+ spin_lock_init(&mm->anon_vma_chain_lock);
mm->free_area_cache = TASK_UNMAPPED_BASE;
mm->cached_hole_size = ~0UL;
mm_init_aio(mm);
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 57aba0d..3ce8a1f 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -15,6 +15,7 @@ struct mm_struct init_mm = {
.mm_count = ATOMIC_INIT(1),
.mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
+ .anon_vma_chain_lock = __SPIN_LOCK_UNLOCKED(init_mm.anon_vma_chain_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
.cpu_vm_mask = CPU_MASK_ALL,
};
diff --git a/mm/mmap.c b/mm/mmap.c
index 456ec6f..81850fc 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -452,7 +452,7 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
spin_lock(&mapping->i_mmap_lock);
vma->vm_truncate_count = mapping->truncate_count;
}
- anon_vma_lock(vma);
+ anon_vma_lock(vma, &mm->mmap_sem);
__vma_link(mm, vma, prev, rb_link, rb_parent);
__vma_link_file(vma);
@@ -578,6 +578,7 @@ again: remove_next = 1 + (end > next->vm_end);
}
}
+ anon_vma_lock(vma, &mm->mmap_sem);
if (root) {
flush_dcache_mmap_lock(mapping);
vma_prio_tree_remove(vma, root);
@@ -599,6 +600,7 @@ again: remove_next = 1 + (end > next->vm_end);
vma_prio_tree_insert(vma, root);
flush_dcache_mmap_unlock(mapping);
}
+ anon_vma_unlock(vma);
if (remove_next) {
/*
@@ -1705,12 +1707,11 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
return -EFAULT;
/*
- * We must make sure the anon_vma is allocated
- * so that the anon_vma locking is not a noop.
+ * Unlike expand_downwards, we do not need to take the anon_vma lock,
+ * because we leave vma->vm_start and vma->pgoff untouched.
+ * This means rmap lookups of pages inside this VMA stay valid
+ * throughout the stack expansion.
*/
- if (unlikely(anon_vma_prepare(vma)))
- return -ENOMEM;
- anon_vma_lock(vma);
/*
* vma->vm_start/vm_end cannot change under us because the caller
@@ -1721,7 +1722,6 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
if (address < PAGE_ALIGN(address+4))
address = PAGE_ALIGN(address+4);
else {
- anon_vma_unlock(vma);
return -ENOMEM;
}
error = 0;
@@ -1737,7 +1737,6 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
if (!error)
vma->vm_end = address;
}
- anon_vma_unlock(vma);
return error;
}
#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
@@ -1749,6 +1748,7 @@ static int expand_downwards(struct vm_area_struct *vma,
unsigned long address)
{
int error;
+ struct mm_struct *mm = vma->vm_mm;
/*
* We must make sure the anon_vma is allocated
@@ -1762,7 +1762,8 @@ static int expand_downwards(struct vm_area_struct *vma,
if (error)
return error;
- anon_vma_lock(vma);
+ spin_lock(&mm->anon_vma_chain_lock);
+ anon_vma_lock(vma, &mm->anon_vma_chain_lock);
/*
* vma->vm_start/vm_end cannot change under us because the caller
@@ -1784,6 +1785,8 @@ static int expand_downwards(struct vm_area_struct *vma,
}
}
anon_vma_unlock(vma);
+ spin_unlock(&mm->anon_vma_chain_lock);
+
return error;
}
diff --git a/mm/rmap.c b/mm/rmap.c
index 07fc947..a2dc8d9 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -23,6 +23,7 @@
* inode->i_mutex (while writing or truncating, not reading or faulting)
* inode->i_alloc_sem (vmtruncate_range)
* mm->mmap_sem
+ * mm->anon_vma_chain_lock (mmap_sem for read, protects vma->anon_vma_chain)
* page->flags PG_locked (lock_page)
* mapping->i_mmap_lock
* anon_vma->lock
@@ -134,9 +135,10 @@ int anon_vma_prepare(struct vm_area_struct *vma)
allocated = anon_vma;
}
+ /* anon_vma_chain_lock to protect against threads */
+ spin_lock(&mm->anon_vma_chain_lock);
spin_lock(&anon_vma->lock);
- /* page_table_lock to protect against threads */
- spin_lock(&mm->page_table_lock);
+
if (likely(!vma->anon_vma)) {
vma->anon_vma = anon_vma;
avc->anon_vma = anon_vma;
@@ -146,8 +148,8 @@ int anon_vma_prepare(struct vm_area_struct *vma)
allocated = NULL;
avc = NULL;
}
- spin_unlock(&mm->page_table_lock);
spin_unlock(&anon_vma->lock);
+ spin_unlock(&mm->anon_vma_chain_lock);
if (unlikely(allocated))
anon_vma_free(allocated);
* [PATCH 2/2] mm: fix race between shift_arg_pages and rmap_walk
2010-05-03 16:17 [PATCH 0/2] Fix migration races in rmap_walk() V4 Rik van Riel
2010-05-03 16:18 ` [PATCH 1/2] mm: Take all anon_vma locks in anon_vma_lock Rik van Riel
@ 2010-05-03 16:19 ` Rik van Riel
2010-05-03 16:34 ` Linus Torvalds
1 sibling, 1 reply; 19+ messages in thread
From: Rik van Riel @ 2010-05-03 16:19 UTC (permalink / raw)
To: akpm
Cc: torvalds, Mel Gorman, Linux-MM, LKML, Minchan Kim,
KAMEZAWA Hiroyuki, Andrea Arcangeli, Christoph Lameter
From: Andrea Arcangeli <aarcange@redhat.com>
migrate.c requires rmap to be able to find all ptes mapping a page at
all times, otherwise the migration entry can be instantiated, but it
can't be removed if the second rmap_walk fails to find the page.
And split_huge_page() will have the same requirements as migrate.c
already has.
The fix is to instantiate a temporary vma during shift_arg_pages.
Every page can be found in either the old or the new VMA. This
essentially is a light-weight implementation of mremap.
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Minchan Kim <minchan.kim@gmail.com>
diff --git a/fs/exec.c b/fs/exec.c
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -55,6 +55,7 @@
#include <linux/fsnotify.h>
#include <linux/fs_struct.h>
#include <linux/pipe_fs_i.h>
+#include <linux/rmap.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
@@ -503,7 +504,9 @@ static int shift_arg_pages(struct vm_are
unsigned long length = old_end - old_start;
unsigned long new_start = old_start - shift;
unsigned long new_end = old_end - shift;
+ unsigned long moved_length;
struct mmu_gather *tlb;
+ struct vm_area_struct *tmp_vma;
BUG_ON(new_start > new_end);
@@ -515,17 +518,46 @@ static int shift_arg_pages(struct vm_are
return -EFAULT;
/*
+ * We need to create a fake temporary vma and index it in the
+ * anon_vma list in order to allow the pages to be reachable
+ * at all times by the rmap walk for migrate, while
+ * move_page_tables() is running.
+ */
+ tmp_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+ if (!tmp_vma)
+ return -ENOMEM;
+ *tmp_vma = *vma;
+ INIT_LIST_HEAD(&tmp_vma->anon_vma_chain);
+ if (unlikely(anon_vma_clone(tmp_vma, vma))) {
+ kmem_cache_free(vm_area_cachep, tmp_vma);
+ return -ENOMEM;
+ }
+
+ /*
* cover the whole range: [new_start, old_end)
+ *
+ * The vma is attached only to vma->anon_vma so one lock is
+ * enough. Even this lock might be removed and we could run it
+ * out of order but it's nicer to make atomic updates to
+ * vm_start and so we won't need a smb_wmb() before calling
+ * move_page_tables.
*/
- if (vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL))
- return -ENOMEM;
+ spin_lock(&vma->anon_vma->lock);
+ vma->vm_start = new_start;
+ spin_unlock(&vma->anon_vma->lock);
/*
* move the page tables downwards, on failure we rely on
* process cleanup to remove whatever mess we made.
*/
- if (length != move_page_tables(vma, old_start,
- vma, new_start, length))
+ moved_length = move_page_tables(vma, old_start,
+ vma, new_start, length);
+
+ /* rmap walk will already find all pages using the new_start */
+ unlink_anon_vmas(tmp_vma);
+ kmem_cache_free(vm_area_cachep, tmp_vma);
+
+ if (length != moved_length)
return -ENOMEM;
lru_add_drain();
@@ -551,7 +583,7 @@ static int shift_arg_pages(struct vm_are
/*
* Shrink the vma to just the new range. Always succeeds.
*/
- vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL);
+ vma->vm_end = new_end;
return 0;
}
* Re: [PATCH 2/2] mm: fix race between shift_arg_pages and rmap_walk
2010-05-03 16:19 ` [PATCH 2/2] mm: fix race between shift_arg_pages and rmap_walk Rik van Riel
@ 2010-05-03 16:34 ` Linus Torvalds
2010-05-03 16:37 ` Linus Torvalds
0 siblings, 1 reply; 19+ messages in thread
From: Linus Torvalds @ 2010-05-03 16:34 UTC (permalink / raw)
To: Rik van Riel
Cc: akpm, Mel Gorman, Linux-MM, LKML, Minchan Kim, KAMEZAWA Hiroyuki,
Andrea Arcangeli, Christoph Lameter
On Mon, 3 May 2010, Rik van Riel wrote:
>
> migrate.c requires rmap to be able to find all ptes mapping a page at
> all times, otherwise the migration entry can be instantiated, but it
> can't be removed if the second rmap_walk fails to find the page.
Please correct me if I'm wrong, but this patch looks like pure and utter
garbage.
It looks like it makes execve() do a totally insane "create and then
immediately destroy temporary vma and anon_vma chain" for a case that is
unlikely to ever matter.
In fact, for a case that isn't even normally _enabled_, namely migration.
Why would we want to slow down execve() for that?
Linus
* Re: [PATCH 2/2] mm: fix race between shift_arg_pages and rmap_walk
2010-05-03 16:34 ` Linus Torvalds
@ 2010-05-03 16:37 ` Linus Torvalds
0 siblings, 0 replies; 19+ messages in thread
From: Linus Torvalds @ 2010-05-03 16:37 UTC (permalink / raw)
To: Rik van Riel
Cc: akpm, Mel Gorman, Linux-MM, LKML, Minchan Kim, KAMEZAWA Hiroyuki,
Andrea Arcangeli, Christoph Lameter
On Mon, 3 May 2010, Linus Torvalds wrote:
>
> It looks like it makes execve() do a totally insane "create and then
> immediately destroy temporary vma and anon_vma chain" for a case that is
> unlikely to ever matter.
>
> In fact, for a case that isn't even normally _enabled_, namely migration.
>
> Why would we want to slow down execve() for that?
Alternate suggestions:
- clean up the patch so that it is explicitly about migration, and
doesn't even get enabled for anything else.
- make the migration code take the VM lock for writing (why doesn't it
already?) and never race with things like this in the first place.
- explain why the new code isn't any slower.
Hmm?
Linus
* Re: [PATCH 1/2] mm: Take all anon_vma locks in anon_vma_lock
2010-05-03 16:18 ` [PATCH 1/2] mm: Take all anon_vma locks in anon_vma_lock Rik van Riel
@ 2010-05-03 16:41 ` Linus Torvalds
2010-05-03 16:53 ` Rik van Riel
2010-05-03 16:55 ` Peter Zijlstra
1 sibling, 1 reply; 19+ messages in thread
From: Linus Torvalds @ 2010-05-03 16:41 UTC (permalink / raw)
To: Rik van Riel
Cc: akpm, Mel Gorman, Linux-MM, LKML, Minchan Kim, KAMEZAWA Hiroyuki,
Andrea Arcangeli, Christoph Lameter
On Mon, 3 May 2010, Rik van Riel wrote:
>
> Both the page migration code and the transparent hugepage patches expect
> 100% reliable rmap lookups and use page_lock_anon_vma(page) to prevent
> races with mmap, munmap, expand_stack, etc.
Pretty much same comments as for the other one. Why are we pandering to
the case that is/should be unusual?
Linus
* Re: [PATCH 1/2] mm: Take all anon_vma locks in anon_vma_lock
2010-05-03 16:41 ` Linus Torvalds
@ 2010-05-03 16:53 ` Rik van Riel
2010-05-03 17:17 ` Linus Torvalds
0 siblings, 1 reply; 19+ messages in thread
From: Rik van Riel @ 2010-05-03 16:53 UTC (permalink / raw)
To: Linus Torvalds
Cc: akpm, Mel Gorman, Linux-MM, LKML, Minchan Kim, KAMEZAWA Hiroyuki,
Andrea Arcangeli, Christoph Lameter
On 05/03/2010 12:41 PM, Linus Torvalds wrote:
> On Mon, 3 May 2010, Rik van Riel wrote:
>>
>> Both the page migration code and the transparent hugepage patches expect
>> 100% reliable rmap lookups and use page_lock_anon_vma(page) to prevent
>> races with mmap, munmap, expand_stack, etc.
>
> Pretty much same comments as for the other one. Why are we pandering to
> the case that is/should be unusual?
In this case, because the fix from the migration side is
difficult and fragile, while fixing things from the mmap
side is straightforward.
I believe the overhead of patch 1/2 should be minimal
as well, because the number of locks we take scales with the _depth_ of
the process tree (truncated at every exec), not the width.
As for patch 2/2, Mel has an alternative approach for that:
http://lkml.org/lkml/2010/4/30/198
Does Mel's patch seem more reasonable to you?
--
All rights reversed
* Re: [PATCH 1/2] mm: Take all anon_vma locks in anon_vma_lock
2010-05-03 16:18 ` [PATCH 1/2] mm: Take all anon_vma locks in anon_vma_lock Rik van Riel
2010-05-03 16:41 ` Linus Torvalds
@ 2010-05-03 16:55 ` Peter Zijlstra
2010-05-03 17:02 ` Andrea Arcangeli
1 sibling, 1 reply; 19+ messages in thread
From: Peter Zijlstra @ 2010-05-03 16:55 UTC (permalink / raw)
To: Rik van Riel
Cc: akpm, torvalds, Mel Gorman, Linux-MM, LKML, Minchan Kim,
KAMEZAWA Hiroyuki, Andrea Arcangeli, Christoph Lameter
On Mon, 2010-05-03 at 12:18 -0400, Rik van Riel wrote:
> From: Rik van Riel <riel@redhat.com>
>
> Both the page migration code and the transparent hugepage patches expect
> 100% reliable rmap lookups and use page_lock_anon_vma(page) to prevent
> races with mmap, munmap, expand_stack, etc.
>
> Specifically, try_to_unmap indirectly calls vma_address, which uses the
> difference between vma->vm_start and vma->vm_pgoff, which can race when a
> stack is expanded downwards. VMA splitting and merging present similar
> issues.
>
> With the new anon_vma code, one VMA can be attached to multiple anon_vmas,
> however mmap, munmap, expand_stack and others only took one anon_vma->lock.
> This patch changes things so we take the anon_vma locks for all of the
> anon_vmas attached to a VMA in the code that try_to_unmap would otherwise
> race against: mmap, munmap, expand_stack, etc.
>
> Unfortunately, this leads to a lock ordering conflict with the page_table_lock,
> which protected the "same_vma" list in the anon_vma_chain. Replacing that
> lock with a new lock (mm->anon_vma_chain_lock), which is taken higher up in
> the mm locking hierarchy, solves that issue. This changes the locking rules
> for the "same_vma" list to be either mm->mmap_sem for write, or mm->mmap_sem
> for read plus the new mm->anon_vma_chain lock. This limits the place where
> the new lock is taken to 2 locations - anon_vma_prepare and expand_downwards.
>
> Document the locking rules for the same_vma list in the anon_vma_chain and
> remove the anon_vma_lock call from expand_upwards, which does not need it.
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 456ec6f..81850fc 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1705,12 +1707,11 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
> return -EFAULT;
>
> /*
> - * We must make sure the anon_vma is allocated
> - * so that the anon_vma locking is not a noop.
> + * Unlike expand_downwards, we do not need to take the anon_vma lock,
> + * because we leave vma->vm_start and vma->pgoff untouched.
> + * This means rmap lookups of pages inside this VMA stay valid
> + * throughout the stack expansion.
> */
> - if (unlikely(anon_vma_prepare(vma)))
> - return -ENOMEM;
> - anon_vma_lock(vma);
>
> /*
> * vma->vm_start/vm_end cannot change under us because the caller
> @@ -1721,7 +1722,6 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
> if (address < PAGE_ALIGN(address+4))
> address = PAGE_ALIGN(address+4);
> else {
> - anon_vma_unlock(vma);
> return -ENOMEM;
> }
> error = 0;
> @@ -1737,7 +1737,6 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
> if (!error)
> vma->vm_end = address;
> }
> - anon_vma_unlock(vma);
> return error;
> }
> #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
This does leave me worrying about concurrent faults poking at
vma->vm_end without synchronization.
* Re: [PATCH 1/2] mm: Take all anon_vma locks in anon_vma_lock
2010-05-03 16:55 ` Peter Zijlstra
@ 2010-05-03 17:02 ` Andrea Arcangeli
2010-05-03 17:11 ` Peter Zijlstra
0 siblings, 1 reply; 19+ messages in thread
From: Andrea Arcangeli @ 2010-05-03 17:02 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Rik van Riel, akpm, torvalds, Mel Gorman, Linux-MM, LKML,
Minchan Kim, KAMEZAWA Hiroyuki, Christoph Lameter
On Mon, May 03, 2010 at 06:55:12PM +0200, Peter Zijlstra wrote:
> This does leave me worrying about concurrent faults poking at
> vma->vm_end without synchronization.
I didn't check this patch in detail yet. I agree it can be removed and
I think it can be safely replaced with the page_table_lock.
* Re: [PATCH 1/2] mm: Take all anon_vma locks in anon_vma_lock
2010-05-03 17:02 ` Andrea Arcangeli
@ 2010-05-03 17:11 ` Peter Zijlstra
2010-05-03 17:18 ` Andrea Arcangeli
0 siblings, 1 reply; 19+ messages in thread
From: Peter Zijlstra @ 2010-05-03 17:11 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Rik van Riel, akpm, torvalds, Mel Gorman, Linux-MM, LKML,
Minchan Kim, KAMEZAWA Hiroyuki, Christoph Lameter
On Mon, 2010-05-03 at 19:02 +0200, Andrea Arcangeli wrote:
> On Mon, May 03, 2010 at 06:55:12PM +0200, Peter Zijlstra wrote:
> > This does leave me worrying about concurrent faults poking at
> > vma->vm_end without synchronization.
>
> I didn't check this patch in detail yet. I agree it can be removed and
> I think it can be safely replaced with the page_table_lock.
Sure, it could probably be replaced with the ptl, but a single
anon_vma->lock would I think be better since there's more of them.
* Re: [PATCH 1/2] mm: Take all anon_vma locks in anon_vma_lock
2010-05-03 16:53 ` Rik van Riel
@ 2010-05-03 17:17 ` Linus Torvalds
2010-05-03 17:58 ` Rik van Riel
0 siblings, 1 reply; 19+ messages in thread
From: Linus Torvalds @ 2010-05-03 17:17 UTC (permalink / raw)
To: Rik van Riel
Cc: akpm, Mel Gorman, Linux-MM, LKML, Minchan Kim, KAMEZAWA Hiroyuki,
Andrea Arcangeli, Christoph Lameter
On Mon, 3 May 2010, Rik van Riel wrote:
> >
> > Pretty much same comments as for the other one. Why are we pandering to
> > the case that is/should be unusual?
>
> In this case, because the fix from the migration side is
> difficult and fragile, while fixing things from the mmap
> side is straightforward.
>
> I believe the overhead of patch 1/2 should be minimal
> as well, because the locks we take are the _depth_ of
> the process tree (truncated every exec), not the width.
Quite frankly, I think it's totally insane to walk a list of all
anon_vma's that are associated with one vma, just to lock them all.
Tell me why you just don't put the lock in the vma itself then? Walking a
list in order to lock multiple things is something we should _never_ do
under any normal circumstances.
I can see why you'd want to do this in theory (the "other side" of the
locker might have to lock just the _one_ anon_vma), but if your argument
is that the list is usually not very deep ("one"?), then there is no
advantage, because putting the lock in the anon_vma vs the vma will get
the same kind of contention.
And if the list _is_ deep, then walking the list to lock them all is a
crime against humanity.
So explain.
> As for patch 2/2, Mel has an alternative approach for that:
>
> http://lkml.org/lkml/2010/4/30/198
>
> Does Mel's patch seem more reasonable to you?
Well, I certainly think that seems to be a lot more targeted, and not add
new allocations in a path that I think is already one of the more
expensive ones. Yes, you can argue that execve() is already so expensive
that a few more allocations don't matter, and you might be right, but
that's how things get to be too expensive to begin with.
That said, I do still wonder why we shouldn't just say that the person who
wants the safety is the one that should do the extra work.
In particular, why don't we just make rmap_walk() be the one that locks
all the anon_vma's? Instead of locking just one? THAT is the function that
cares. THAT is the function that should do proper locking and not expect
others to do extra unnecessary locking.
So again, my gut feel is that if the lock just were in the vma itself,
then the "normal" users would have just one natural lock, while the
special case users (rmap_walk_anon) would have to lock each vma it
traverses. That would seem to be the more natural way to lock things.
I dunno. There may well be reasons why it doesn't work, like just the
allocation lifetime rules for vma's vs anon_vma's. I'm not claiming I've
thought this through. I just get a feeling of "that isn't right" when I look
at the original 2/2 patch, and while Mel's patch certainly looks better,
it seems to be a bit ad-hoc and hacky to me.
Btw, Mel's patch doesn't really match the description of 2/2. 2/2 says
that all pages must always be findable in rmap. Mel's patch seems to
explicitly say "we want to ignore that thing that is busy for execve". Are
we just avoiding a BUG_ON()? Is perhaps the BUG_ON() buggy?
Linus
* Re: [PATCH 1/2] mm: Take all anon_vma locks in anon_vma_lock
2010-05-03 17:11 ` Peter Zijlstra
@ 2010-05-03 17:18 ` Andrea Arcangeli
0 siblings, 0 replies; 19+ messages in thread
From: Andrea Arcangeli @ 2010-05-03 17:18 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Rik van Riel, akpm, torvalds, Mel Gorman, Linux-MM, LKML,
Minchan Kim, KAMEZAWA Hiroyuki, Christoph Lameter
On Mon, May 03, 2010 at 07:11:19PM +0200, Peter Zijlstra wrote:
> On Mon, 2010-05-03 at 19:02 +0200, Andrea Arcangeli wrote:
> > On Mon, May 03, 2010 at 06:55:12PM +0200, Peter Zijlstra wrote:
> > > This does leave me worrying about concurrent faults poking at
> > > vma->vm_end without synchronization.
> >
> > I didn't check this patch in detail yet. I agree it can be removed and
> > I think it can be safely replaced with the page_table_lock.
>
> Sure, it could probably be replaced with the ptl, but a single
> anon_vma->lock would I think be better since there's more of them.
The ptl is not enough, or it'd break if the stack grows by more than the
size of one pmd; the page_table_lock is enough instead.
Keeping the anon_vma lock is certainly fine with me ;). I was informally
asked if it was a must-have, and I couldn't foresee any problem in
_replacing_ it (not removing it) with the page_table_lock (which I hope I
mentioned in my answer ;). But I never had an interest in removing it; I
just couldn't find any good reason to keep it either, other than "paranoid
just in case", which is good enough justification for me ;) considering
these archs are uncommon and by definition get less testing.
* Re: [PATCH 1/2] mm: Take all anon_vma locks in anon_vma_lock
2010-05-03 17:17 ` Linus Torvalds
@ 2010-05-03 17:58 ` Rik van Riel
2010-05-03 18:13 ` Andrea Arcangeli
` (2 more replies)
0 siblings, 3 replies; 19+ messages in thread
From: Rik van Riel @ 2010-05-03 17:58 UTC (permalink / raw)
To: Linus Torvalds
Cc: akpm, Mel Gorman, Linux-MM, LKML, Minchan Kim, KAMEZAWA Hiroyuki,
Andrea Arcangeli, Christoph Lameter
On 05/03/2010 01:17 PM, Linus Torvalds wrote:
> Quite frankly, I think it's totally insane to walk a list of all
> anon_vma's that are associated with one vma, just to lock them all.
>
> Tell me why you just don't put the lock in the vma itself then? Walking a
> list in order to lock multiple things is something we should _never_ do
> under any normal circumstances.
One problem is that we cannot find the VMAs (multiple) from
the page, except by walking the anon_vma_chain.same_anon_vma
list. At the very least, that list requires locking, done
by the anon_vma.lock.
When you already have that lock, also taking per-VMA locks
would be superfluous. It could also be difficult, especially
since the rmap side code and the mmap side code approach the
data structures from the other side, potentially leading to
locking order conflicts.
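(For context, the rmap side looks roughly like this; a sketch modelled on
rmap_walk_anon()/try_to_unmap_anon(), not the exact code:)
	struct anon_vma *anon_vma = page_anon_vma(page);
	struct anon_vma_chain *avc;
	spin_lock(&anon_vma->lock);
	list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
		struct vm_area_struct *vma = avc->vma;
		unsigned long address = vma_address(page, vma);
		/* ... unmap or fix up the pte at address ... */
	}
	spin_unlock(&anon_vma->lock);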
> I can see why you'd want to do this in theory (the "other side" of the
> locker might have to lock just the _one_ anon_vma), but if your argument
> is that the list is usually not very deep ("one"?), then there is no
> advantage, because putting the lock in the anon_vma vs the vma will get
> the same kind of contention.
In a freshly exec()d process, there will be one anon_vma.
In workloads with forking daemons, like apache, postgresql
or sendmail, the child process will end up with two anon_vmas
in VMAs inherited from the parent.
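(That count grows by one per fork generation, because anon_vma_fork() does
roughly the following; from memory, not verbatim:)
	/* Link the child VMA to every anon_vma the parent VMA is on, so
	 * rmap can still find the pages inherited from the parent ... */
	if (anon_vma_clone(vma, pvma))
		return -ENOMEM;
	/* ... then allocate one fresh anon_vma for the pages this child
	 * will COW itself, and chain that on as well. */
	anon_vma = anon_vma_alloc();
So a VMA that has survived N generations of fork without an exec is chained
to N+1 anon_vmas, which is the depth being locked here.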
> And if the list _is_ deep, then walking the list to lock them all is a
> crime against humanity.
A forkbomb could definitely end up getting slowed down by
this patch. Is there any real workload out there that just
forks deeper and deeper from the parent process, without
calling exec() after a generation or two?
>> As for patch 2/2, Mel has an alternative approach for that:
>>
>> http://lkml.org/lkml/2010/4/30/198
>>
>> Does Mel's patch seem more reasonable to you?
>
> Well, I certainly think that seems to be a lot more targeted,
> In particular, why don't we just make rmap_walk() be the one that locks
> all the anon_vma's? Instead of locking just one? THAT is the function that
> cares. THAT is the function that should do proper locking and not expect
> others to do extra unnecessary locking.
That looks like it might work for rmap_walk.
> So again, my gut feel is that if the lock just were in the vma itself,
> then the "normal" users would have just one natural lock, while the
> special case users (rmap_walk_anon) would have to lock each vma it
> traverses. That would seem to be the more natural way to lock things.
However ... there's still the issue of page_lock_anon_vma
in try_to_unmap_anon.
I guess it protects against any of the VMAs going away,
because anon_vma_unlink also takes the anon_vma->lock.
Does it need to protect against anything else?
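(For reference, page_lock_anon_vma() is roughly the following; quoted from
memory, so details may be off:)
	struct anon_vma *page_lock_anon_vma(struct page *page)
	{
		struct anon_vma *anon_vma;
		unsigned long anon_mapping;
		rcu_read_lock();
		anon_mapping = (unsigned long)page->mapping;
		if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
			goto out;
		if (!page_mapped(page))
			goto out;
		anon_vma = (struct anon_vma *)(anon_mapping - PAGE_MAPPING_ANON);
		spin_lock(&anon_vma->lock);
		return anon_vma;
	out:
		rcu_read_unlock();
		return NULL;
	}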
> Btw, Mel's patch doesn't really match the description of 2/2. 2/2 says
> that all pages must always be findable in rmap. Mel's patch seems to
> explicitly say "we want to ignore that thing that is busy for execve". Are
> we just avoiding a BUG_ON()? Is perhaps the BUG_ON() buggy?
I have no good answer to this question.
Mel? Andrea?
--
All rights reversed
* Re: [PATCH 1/2] mm: Take all anon_vma locks in anon_vma_lock
2010-05-03 17:58 ` Rik van Riel
@ 2010-05-03 18:13 ` Andrea Arcangeli
2010-05-03 18:19 ` Linus Torvalds
2010-05-04 13:12 ` Mel Gorman
2 siblings, 0 replies; 19+ messages in thread
From: Andrea Arcangeli @ 2010-05-03 18:13 UTC (permalink / raw)
To: Rik van Riel
Cc: Linus Torvalds, akpm, Mel Gorman, Linux-MM, LKML, Minchan Kim,
KAMEZAWA Hiroyuki, Christoph Lameter
> > Btw, Mel's patch doesn't really match the description of 2/2. 2/2 says
> > that all pages must always be findable in rmap. Mel's patch seems to
> > explicitly say "we want to ignore that thing that is busy for execve". Are
> > we just avoiding a BUG_ON()? Is perhaps the BUG_ON() buggy?
>
> I have no good answer to this question.
>
> Mel? Andrea?
If try_to_unmap is allowed to establish the migration pte, then such
pte has to remain reachable through rmap_walk at all times after that,
or migration_entry_wait will crash because it notices the page has
been migrated already (PG_lock not set) but there is still a migration
pte established. (remove_migration_pte like split_huge_page isn't
allowed to fail finding all migration ptes mapping a page during the
rmap walk)
It's not false positive BUG_ON if that's what you mean, removing the
BUG_ON would still lead to infinite hang waiting on a migration pte
that shouldn't be there anymore.
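(A sketch of what migration_entry_wait() does on such a fault; not the
exact mm/migrate.c code:)
	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
	pte = *ptep;
	if (!is_swap_pte(pte))
		goto out;
	entry = pte_to_swp_entry(pte);
	if (!is_migration_entry(entry))
		goto out;
	/* migration_entry_to_page() contains the BUG_ON(!PageLocked(page)) */
	page = migration_entry_to_page(entry);
	if (!get_page_unless_zero(page))
		goto out;
	pte_unmap_unlock(ptep, ptl);
	wait_on_page_locked(page);	/* sleep until the migration finishes */
	put_page(page);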
* Re: [PATCH 1/2] mm: Take all anon_vma locks in anon_vma_lock
2010-05-03 17:58 ` Rik van Riel
2010-05-03 18:13 ` Andrea Arcangeli
@ 2010-05-03 18:19 ` Linus Torvalds
2010-05-03 18:38 ` Rik van Riel
2010-05-04 13:12 ` Mel Gorman
2 siblings, 1 reply; 19+ messages in thread
From: Linus Torvalds @ 2010-05-03 18:19 UTC (permalink / raw)
To: Rik van Riel
Cc: akpm, Mel Gorman, Linux-MM, LKML, Minchan Kim, KAMEZAWA Hiroyuki,
Andrea Arcangeli, Christoph Lameter
On Mon, 3 May 2010, Rik van Riel wrote:
>
> One problem is that we cannot find the VMAs (multiple) from
> the page, except by walking the anon_vma_chain.same_anon_vma
> list. At the very least, that list requires locking, done
> by the anon_vma.lock.
But that's exactly what we do in rmap_walk() anyway.
But yes, I can well imagine that in other cases we only do the one
anon_vma. I didn't check who used the lock.
So if we do want to keep the lock in the anon_vma, I would just suggest
that instead of making "normal" users do lots of locking, make the
rmap_walk side one.
> A forkbomb could definately end up getting slowed down by
> this patch. Is there any real workload out there that just
> forks deeper and deeper from the parent process, without
> calling exec() after a generation or two?
Heh. AIM7. Wasn't that why we merged the multiple anon_vma's in the first
place?
> > So again, my gut feel is that if the lock just were in the vma itself,
> > then the "normal" users would have just one natural lock, while the
> > special case users (rmap_walk_anon) would have to lock each vma it
> > traverses. That would seem to be the more natural way to lock things.
>
> However ... there's still the issue of page_lock_anon_vma
> in try_to_unmap_anon.
Do we care?
We've not locked them all there, and we've historically not cared about
the rmap list being "perfect", have we?
So I _think_ it's just the migration case (and apparently potentially the
hugepage case) that wants _exact_ information. Which is why I suggest the
onus of the extra locking should be on _them_, not on the regular code.
I dunno. Again, my objections to the patches are really based more on a
gut feel of "that can't be the right thing to do" than anything else.
We have _extremely_ few places that walk lists to lock things. And they
are never "normal" code. Things like that magic "mm_take_all_locks()", for
example. That is why I then react with "that can't be right" to patches
like this.
Linus
* Re: [PATCH 1/2] mm: Take all anon_vma locks in anon_vma_lock
2010-05-03 18:19 ` Linus Torvalds
@ 2010-05-03 18:38 ` Rik van Riel
0 siblings, 0 replies; 19+ messages in thread
From: Rik van Riel @ 2010-05-03 18:38 UTC (permalink / raw)
To: Linus Torvalds
Cc: akpm, Mel Gorman, Linux-MM, LKML, Minchan Kim, KAMEZAWA Hiroyuki,
Andrea Arcangeli, Christoph Lameter
On 05/03/2010 02:19 PM, Linus Torvalds wrote:
> On Mon, 3 May 2010, Rik van Riel wrote:
>>
>> One problem is that we cannot find the VMAs (multiple) from
>> the page, except by walking the anon_vma_chain.same_anon_vma
>> list. At the very least, that list requires locking, done
>> by the anon_vma.lock.
>
> But that's exactly what we do in rmap_walk() anyway.
Mel's original patch adds trylock & retry all code to rmap_walk
and a few other places:
http://lkml.org/lkml/2010/4/26/321
I submitted my patch 1/2 as an alternative, because these repeated
trylocks are pretty complex and easy to accidentally break when
changes to other VM code are made.
>> A forkbomb could definately end up getting slowed down by
>> this patch. Is there any real workload out there that just
>> forks deeper and deeper from the parent process, without
>> calling exec() after a generation or two?
>
> Heh. AIM7. Wasn't that why we merged the multiple anon_vma's in the first
> place?
AIM7, like sendmail, apache or postgresql, is only 2 deep.
>>> So again, my gut feel is that if the lock just were in the vma itself,
>>> then the "normal" users would have just one natural lock, while the
>>> special case users (rmap_walk_anon) would have to lock each vma it
>>> traverses. That would seem to be the more natural way to lock things.
>>
>> However ... there's still the issue of page_lock_anon_vma
>> in try_to_unmap_anon.
>
> Do we care?
>
> We've not locked them all there, and we've historically not cares about
> the rmap list being "perfect", have we?
Well, try_to_unmap_anon walks just one page, and has the anon_vma
for that page locked.
Having said that, for pageout we do indeed not care about getting
it perfect.
> So I _think_ it's just the migration case (and apparently potentially the
> hugepage case) that wants _exact_ information. Which is why I suggest the
> onus of the extra locking should be on _them_, not on the regular code.
It's a matter of cost vs complexity. IMHO the locking changes in
the lowest overhead patches (Mel's) are quite complex and could end
up being hard to maintain in the future. I wanted to introduce
something a little simpler, with hopefully minimal overhead.
But hey, that's just my opinion - what matters is that the bug gets
fixed somehow. If you prefer the more complex but slightly lower
overhead patches from Mel, that's fine too.
--
All rights reversed
* Re: [PATCH 1/2] mm: Take all anon_vma locks in anon_vma_lock
2010-05-03 17:58 ` Rik van Riel
2010-05-03 18:13 ` Andrea Arcangeli
2010-05-03 18:19 ` Linus Torvalds
@ 2010-05-04 13:12 ` Mel Gorman
2 siblings, 0 replies; 19+ messages in thread
From: Mel Gorman @ 2010-05-04 13:12 UTC (permalink / raw)
To: Rik van Riel
Cc: Linus Torvalds, akpm, Linux-MM, LKML, Minchan Kim,
KAMEZAWA Hiroyuki, Andrea Arcangeli, Christoph Lameter
On Mon, May 03, 2010 at 01:58:36PM -0400, Rik van Riel wrote:
>>
>> Btw, Mel's patch doesn't really match the description of 2/2. 2/2 says
>> that all pages must always be findable in rmap. Mel's patch seems to
>> explicitly say "we want to ignore that thing that is busy for execve". Are
>> we just avoiding a BUG_ON()? Is perhaps the BUG_ON() buggy?
>
> I have no good answer to this question.
>
> Mel? Andrea?
>
The wording could have been better.
The problem is that once a migration PTE is established, it is expected that
rmap can find it. In the specific case of exec, this can fail because of
how the temporary stack is moved. As migration colliding with exec is rare,
the approach taken by the patch was to not create migration PTEs that rmap
could not find. On the plus side, exec (the common case) is unaffected. On
the negative side, it's avoiding the exec vs migration problem instead of
fixing it.
The BUG_ON is not a buggy check. While migration is taking place, the page lock
is held and not released until all the migration PTEs have been removed. If
a migration entry exists and the page is unlocked, it means that rmap failed
to find all the entries. If the BUG_ON check were not made, do_swap_page() would
either end up looking up a semi-random entry in swap cache and inserting it
(memory corruption), inserting a random page from swap (memory corruption)
or returning VM_FAULT_OOM to the fault handler (general carnage).
It was considered to lazily clean up the migration PTEs
(http://lkml.org/lkml/2010/4/27/458) but there is no guarantee that the page
the migration PTE pointed to is still the correct one. If it had been freed
and re-used, the results would probably be memory corruption.
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab