* [rfc][patches] fix for munmap/truncate races
From: Al Viro @ 2012-03-18 19:07 UTC
To: Linus Torvalds; +Cc: linux-kernel, linux-mm
Background: truncate() ends up going through the shared mappings
of the file being truncated (under ->i_mmap_mutex, to protect them from
getting removed while we do that) and calling unmap_vmas() on them,
with the range passed to unmap_vmas() sitting entirely within the vma
being passed to it. The trouble is, unmap_vmas() expects a chain of
vmas. It will look into the next vma, see that it's beyond the range
we'd been given and do nothing to it. Fine, except that there's nothing
to protect that next vma from being removed just as we do that - we do
*not* hold ->mmap_sem, and ->i_mmap_mutex held on our file won't do
anything to mappings that have nothing to do with the file in question.
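To make the race concrete, here is the shape of the old loop - a
trimmed sketch, not the exact code:

	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
		unsigned long start = max(vma->vm_start, start_addr);
		/*
		 * For the truncate() caller the range fits entirely
		 * within the first vma, so the second pass around the
		 * loop exists only to look at the next vma and conclude
		 * that it's out of range:
		 */
		if (start >= vma->vm_end)
			continue;
		/* ... actually unmap pages in [start, end) ... */
	}

Nothing pins that next vma - it may map a different file or be
anonymous, so a concurrent munmap() can free it between the
vma = vma->vm_next step and the moment we read its ->vm_start and
->vm_end.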
There's an obvious way to deal with that - introduce a variant
of unmap_vmas() that handles a single vma, and switch those callers
of unmap_vmas() to using it. That requires some preparation; the
combined diff is below. For those who prefer to review the split-up
series, it is in
git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git #vm
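To summarize the resulting interface (names as in the diff below):

	/* walk a chain of vmas, as before (now returns void): */
	unmap_vmas(&tlb, vma, start_addr, end_addr, &nr_accounted, details);

	/* act on exactly one vma; the range must fit within it: */
	zap_page_range_single(vma, address, size, details);

The single-vma helper is what unmap_mapping_range_vma() and
zap_vma_ptes() end up calling, so the truncate path never looks past
the vma it holds.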
include/linux/mm.h | 4 +-
mm/memory.c | 133 +++++++++++++++++++++++++++++++---------------------
mm/mmap.c | 5 +-
3 files changed, 84 insertions(+), 58 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 17b27cd..b5bb54d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -893,9 +893,9 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
unsigned long size);
-unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
+void zap_page_range(struct vm_area_struct *vma, unsigned long address,
unsigned long size, struct zap_details *);
-unsigned long unmap_vmas(struct mmu_gather *tlb,
+void unmap_vmas(struct mmu_gather *tlb,
struct vm_area_struct *start_vma, unsigned long start_addr,
unsigned long end_addr, unsigned long *nr_accounted,
struct zap_details *);
diff --git a/mm/memory.c b/mm/memory.c
index fa2f04e..8ab0918 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1282,10 +1282,10 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
return addr;
}
-static unsigned long unmap_page_range(struct mmu_gather *tlb,
- struct vm_area_struct *vma,
- unsigned long addr, unsigned long end,
- struct zap_details *details)
+static void unmap_page_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end,
+ struct zap_details *details)
{
pgd_t *pgd;
unsigned long next;
@@ -1305,8 +1305,47 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
} while (pgd++, addr = next, addr != end);
tlb_end_vma(tlb, vma);
mem_cgroup_uncharge_end();
+}
- return addr;
+
+static void unmap_single_vma(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long start_addr,
+ unsigned long end_addr, unsigned long *nr_accounted,
+ struct zap_details *details)
+{
+ unsigned long start = max(vma->vm_start, start_addr);
+ unsigned long end;
+
+ if (start >= vma->vm_end)
+ return;
+ end = min(vma->vm_end, end_addr);
+ if (end <= vma->vm_start)
+ return;
+
+ if (vma->vm_flags & VM_ACCOUNT)
+ *nr_accounted += (end - start) >> PAGE_SHIFT;
+
+ if (unlikely(is_pfn_mapping(vma)))
+ untrack_pfn_vma(vma, 0, 0);
+
+ if (start != end) {
+ if (unlikely(is_vm_hugetlb_page(vma))) {
+ /*
+ * It is undesirable to test vma->vm_file as it
+ * should be non-null for valid hugetlb area.
+ * However, vm_file will be NULL in the error
+ * cleanup path of do_mmap_pgoff. When
+ * hugetlbfs ->mmap method fails,
+ * do_mmap_pgoff() nullifies vma->vm_file
+ * before calling this function to clean up.
+ * Since no pte has actually been setup, it is
+ * safe to do nothing in this case.
+ */
+ if (vma->vm_file)
+ unmap_hugepage_range(vma, start, end, NULL);
+ } else
+ unmap_page_range(tlb, vma, start, end, details);
+ }
}
/**
@@ -1318,8 +1357,6 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
* @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
* @details: details of nonlinear truncation or shared cache invalidation
*
- * Returns the end address of the unmapping (restart addr if interrupted).
- *
* Unmap all pages in the vma list.
*
* Only addresses between `start' and `end' will be unmapped.
@@ -1331,55 +1368,18 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
* ensure that any thus-far unmapped pages are flushed before unmap_vmas()
* drops the lock and schedules.
*/
-unsigned long unmap_vmas(struct mmu_gather *tlb,
+void unmap_vmas(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long start_addr,
unsigned long end_addr, unsigned long *nr_accounted,
struct zap_details *details)
{
- unsigned long start = start_addr;
struct mm_struct *mm = vma->vm_mm;
mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
- for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
- unsigned long end;
-
- start = max(vma->vm_start, start_addr);
- if (start >= vma->vm_end)
- continue;
- end = min(vma->vm_end, end_addr);
- if (end <= vma->vm_start)
- continue;
-
- if (vma->vm_flags & VM_ACCOUNT)
- *nr_accounted += (end - start) >> PAGE_SHIFT;
-
- if (unlikely(is_pfn_mapping(vma)))
- untrack_pfn_vma(vma, 0, 0);
-
- while (start != end) {
- if (unlikely(is_vm_hugetlb_page(vma))) {
- /*
- * It is undesirable to test vma->vm_file as it
- * should be non-null for valid hugetlb area.
- * However, vm_file will be NULL in the error
- * cleanup path of do_mmap_pgoff. When
- * hugetlbfs ->mmap method fails,
- * do_mmap_pgoff() nullifies vma->vm_file
- * before calling this function to clean up.
- * Since no pte has actually been setup, it is
- * safe to do nothing in this case.
- */
- if (vma->vm_file)
- unmap_hugepage_range(vma, start, end, NULL);
-
- start = end;
- } else
- start = unmap_page_range(tlb, vma, start, end, details);
- }
- }
-
+ for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
+ unmap_single_vma(tlb, vma, start_addr, end_addr, nr_accounted,
+ details);
mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
- return start; /* which is now the end (or restart) address */
}
/**
@@ -1388,8 +1388,34 @@ unsigned long unmap_vmas(struct mmu_gather *tlb,
* @address: starting address of pages to zap
* @size: number of bytes to zap
* @details: details of nonlinear truncation or shared cache invalidation
+ *
+ * Caller must protect the VMA list
+ */
+void zap_page_range(struct vm_area_struct *vma, unsigned long address,
+ unsigned long size, struct zap_details *details)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct mmu_gather tlb;
+ unsigned long end = address + size;
+ unsigned long nr_accounted = 0;
+
+ lru_add_drain();
+ tlb_gather_mmu(&tlb, mm, 0);
+ update_hiwater_rss(mm);
+ unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
+ tlb_finish_mmu(&tlb, address, end);
+}
+
+/**
+ * zap_page_range_single - remove user pages in a given range
+ * @vma: vm_area_struct holding the applicable pages
+ * @address: starting address of pages to zap
+ * @size: number of bytes to zap
+ * @details: details of nonlinear truncation or shared cache invalidation
+ *
+ * The range must fit into one VMA.
*/
-unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
+static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
unsigned long size, struct zap_details *details)
{
struct mm_struct *mm = vma->vm_mm;
@@ -1400,9 +1426,10 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
lru_add_drain();
tlb_gather_mmu(&tlb, mm, 0);
update_hiwater_rss(mm);
- end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
+ mmu_notifier_invalidate_range_start(mm, address, end);
+ unmap_single_vma(&tlb, vma, address, end, &nr_accounted, details);
+ mmu_notifier_invalidate_range_end(mm, address, end);
tlb_finish_mmu(&tlb, address, end);
- return end;
}
/**
@@ -1423,7 +1450,7 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
if (address < vma->vm_start || address + size > vma->vm_end ||
!(vma->vm_flags & VM_PFNMAP))
return -1;
- zap_page_range(vma, address, size, NULL);
+ zap_page_range_single(vma, address, size, NULL);
return 0;
}
EXPORT_SYMBOL_GPL(zap_vma_ptes);
@@ -2770,7 +2797,7 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma,
unsigned long start_addr, unsigned long end_addr,
struct zap_details *details)
{
- zap_page_range(vma, start_addr, end_addr - start_addr, details);
+ zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
}
static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
diff --git a/mm/mmap.c b/mm/mmap.c
index da15a79..9365a8f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2224,7 +2224,6 @@ void exit_mmap(struct mm_struct *mm)
struct mmu_gather tlb;
struct vm_area_struct *vma;
unsigned long nr_accounted = 0;
- unsigned long end;
/* mm's last user has gone, and its about to be pulled down */
mmu_notifier_release(mm);
@@ -2249,11 +2248,11 @@ void exit_mmap(struct mm_struct *mm)
tlb_gather_mmu(&tlb, mm, 1);
/* update_hiwater_rss(mm) here? but nobody should be looking */
/* Use -1 here to ensure all VMAs in the mm are unmapped */
- end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
+ unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
- tlb_finish_mmu(&tlb, 0, end);
+ tlb_finish_mmu(&tlb, 0, -1);
/*
* Walk the list again, actually closing and freeing it,
* Re: [rfc][patches] fix for munmap/truncate races
From: Linus Torvalds @ 2012-03-18 22:01 UTC
To: Al Viro; +Cc: linux-kernel, linux-mm
On Sun, Mar 18, 2012 at 12:07 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> - tlb_finish_mmu(&tlb, 0, end);
> + tlb_finish_mmu(&tlb, 0, -1);
Hmm. The fact that you drop the end pointer means that some
architectures that optimize the TLB flushing for ranges now
effectively can't do it any more.
Now, I think it's only ia64 that really is affected, but it *might* matter.
In particular, ia64 has some logic for "if you only flush one single
region, you can optimize it", and the region sizes are in the
terabytes. And I'm pretty sure you broke that - I'm just not entirely
sure how much we care.
Linus
* Re: [rfc][patches] fix for munmap/truncate races
From: Al Viro @ 2012-03-18 22:06 UTC
To: Linus Torvalds; +Cc: linux-kernel, linux-mm
On Sun, Mar 18, 2012 at 03:01:08PM -0700, Linus Torvalds wrote:
> On Sun, Mar 18, 2012 at 12:07 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> > -	tlb_finish_mmu(&tlb, 0, end);
> > +	tlb_finish_mmu(&tlb, 0, -1);
>
> Hmm. The fact that you drop the end pointer means that some
> architectures that optimize the TLB flushing for ranges now
> effectively can't do it any more.
>
> Now, I think it's only ia64 that really is affected, but it *might* matter.
>
> In particular, ia64 has some logic for "if you only flush one single
> region, you can optimize it", and the region sizes are in the
> terabytes. And I'm pretty sure you broke that - I'm just not entirely
> sure how much we care.
Nope - ia64 checks explicitly for precisely that case:
static inline void
ia64_tlb_flush_mmu (struct mmu_gather *tlb, unsigned long start, unsigned long end)
{
unsigned int nr;
if (!tlb->need_flush)
return;
tlb->need_flush = 0;
if (tlb->fullmm) {
/*
* Tearing down the entire address space. This happens both as a result
* of exit() and execve(). The latter case necessitates the call to
* flush_tlb_mm() here.
*/
flush_tlb_mm(tlb->mm);
....
and if that condition is true, we don't even look at start or end.
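(And the fullmm case is exactly what exit_mmap() sets up - the patch
keeps

	tlb_gather_mmu(&tlb, mm, 1);

and that third argument is what ends up in tlb->fullmm, so on this path
the range arguments to tlb_finish_mmu() are never looked at.)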
* Re: [rfc][patches] fix for munmap/truncate races
From: Al Viro @ 2012-03-18 22:07 UTC
To: Linus Torvalds; +Cc: linux-kernel, linux-mm
On Sun, Mar 18, 2012 at 10:06:10PM +0000, Al Viro wrote:
> On Sun, Mar 18, 2012 at 03:01:08PM -0700, Linus Torvalds wrote:
> > On Sun, Mar 18, 2012 at 12:07 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> > > -	tlb_finish_mmu(&tlb, 0, end);
> > > +	tlb_finish_mmu(&tlb, 0, -1);
> >
> > Hmm. The fact that you drop the end pointer means that some
> > architectures that optimize the TLB flushing for ranges now
> > effectively can't do it any more.
> >
> > Now, I think it's only ia64 that really is affected, but it *might* matter.
> >
> > In particular, ia64 has some logic for "if you only flush one single
> > region, you can optimize it", and the region sizes are in the
> > terabytes. And I'm pretty sure you broke that - I'm just not entirely
> > sure how much we care.
>
> Nope - ia64 checks explicitly for precisely that case:
[snip]
... and everything else doesn't look at start or end at all.
* Re: [rfc][patches] fix for munmap/truncate races
From: Linus Torvalds @ 2012-03-18 22:09 UTC
To: Al Viro; +Cc: linux-kernel, linux-mm
On Sun, Mar 18, 2012 at 3:07 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
>>
>> Nope - ia64 checks explicitly for precisely that case:
> [snip]
> ... and everything else doesn't look at start or end at all.
Ok, then I don't really care, and it certainly simplifies the calling
conventions.
Linus
* Re: [rfc][patches] fix for munmap/truncate races
From: Al Viro @ 2012-03-18 22:23 UTC
To: Linus Torvalds; +Cc: linux-kernel, linux-mm
On Sun, Mar 18, 2012 at 07:07:45PM +0000, Al Viro wrote:
> Background: truncate() ends up going through the shared mappings
> of the file being truncated (under ->i_mmap_mutex, to protect them from
> getting removed while we do that) and calling unmap_vmas() on them,
> with the range passed to unmap_vmas() sitting entirely within the vma
> being passed to it. The trouble is, unmap_vmas() expects a chain of
> vmas. It will look into the next vma, see that it's beyond the range
> we'd been given and do nothing to it. Fine, except that there's nothing
> to protect that next vma from being removed just as we do that - we do
> *not* hold ->mmap_sem, and ->i_mmap_mutex held on our file won't do
> anything to mappings that have nothing to do with the file in question.
>
> There's an obvious way to deal with that - introduce a variant
> of unmap_vmas() that handles a single vma, and switch those callers
> of unmap_vmas() to using it. That requires some preparation; the
> combined diff is below. For those who prefer to review the split-up
> series, it is in
> git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git #vm
BTW, the missing part of the pull request:
Shortlog:
Al Viro (6):
VM: unmap_page_range() can return void
VM: can't go through the inner loop in unmap_vmas() more than once...
VM: make zap_page_range() return void
VM: don't bother with feeding upper limit to tlb_finish_mmu() in exit_mmap()
VM: make unmap_vmas() return void
VM: make zap_page_range() callers that act on a single VMA use separate helper
Diffstat:
include/linux/mm.h | 4 +-
mm/memory.c | 133 +++++++++++++++++++++++++++++++---------------------
mm/mmap.c | 5 +-
3 files changed, 84 insertions(+), 58 deletions(-)