public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] rmap 14 i_shared_lock fixes
@ 2004-04-27 23:59 Hugh Dickins
  2004-04-28  0:01 ` [PATCH] rmap 15 vma_adjust Hugh Dickins
                   ` (4 more replies)
  0 siblings, 5 replies; 13+ messages in thread
From: Hugh Dickins @ 2004-04-27 23:59 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Rajesh Venkatasubramanian, linux-kernel

First of batch of six patches against 2.6.6-rc2-mm2, which introduce
Rajesh Venkatasubramanian's implementation of a radix priority search
tree of vmas, to handle object-based reverse mapping corner cases well.

rmap 14 i_shared_lock fixes

Start the sequence with a couple of outstanding i_shared_lock fixes.

Since i_shared_sem became i_shared_lock, we've had to shift and then
temporarily remove mremap move's protection of concurrent truncation
- if mremap moves ptes while unmap_mapping_range_list is making its
way through the vmas, there's a danger we'd move a pte from an area
yet to be cleaned back into an area already cleared.

Now site the i_shared_lock with the page_table_lock in move_one_page.
Replace page_table_present by get_one_pte_map, so we know when it's
necessary to allocate a new page table: in which case we have to drop
i_shared_lock, trylock and perhaps reorder locks on the way back.
Yet another fix: must check for NULL dst before pte_unmap(dst).

And over in rmap.c, try_to_unmap_file's cond_resched amidst its
lengthy nonlinear swapping was now causing might_sleep warnings:
moved to a rather unsatisfactory and less frequent cond_resched_lock
on i_shared_lock when we reach the end of the list; and one before
starting on the nonlinears too: the "cursor" may become out-of-date
if we do schedule, but I doubt it's worth bothering about.

 mm/mremap.c |   42 +++++++++++++++++++++++++++++++++---------
 mm/rmap.c   |    3 ++-
 2 files changed, 35 insertions(+), 10 deletions(-)

--- 2.6.6-rc2-mm2/mm/mremap.c	2004-04-26 12:39:46.961068192 +0100
+++ rmap14/mm/mremap.c	2004-04-27 19:18:19.421286456 +0100
@@ -55,16 +55,18 @@ end:
 	return pte;
 }
 
-static inline int page_table_present(struct mm_struct *mm, unsigned long addr)
+static pte_t *get_one_pte_map(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
 	pmd_t *pmd;
 
 	pgd = pgd_offset(mm, addr);
 	if (pgd_none(*pgd))
-		return 0;
+		return NULL;
 	pmd = pmd_offset(pgd, addr);
-	return pmd_present(*pmd);
+	if (!pmd_present(*pmd))
+		return NULL;
+	return pte_offset_map(pmd, addr);
 }
 
 static inline pte_t *alloc_one_pte_map(struct mm_struct *mm, unsigned long addr)
@@ -97,11 +99,23 @@ static int
 move_one_page(struct vm_area_struct *vma, unsigned long old_addr,
 		unsigned long new_addr)
 {
+	struct address_space *mapping = NULL;
 	struct mm_struct *mm = vma->vm_mm;
 	int error = 0;
 	pte_t *src, *dst;
 
+	if (vma->vm_file) {
+		/*
+		 * Subtle point from Rajesh Venkatasubramanian: before
+		 * moving file-based ptes, we must lock vmtruncate out,
+		 * since it might clean the dst vma before the src vma,
+		 * and we propagate stale pages into the dst afterward.
+		 */
+		mapping = vma->vm_file->f_mapping;
+		spin_lock(&mapping->i_shared_lock);
+	}
 	spin_lock(&mm->page_table_lock);
+
 	src = get_one_pte_map_nested(mm, old_addr);
 	if (src) {
 		/*
@@ -109,13 +123,19 @@ move_one_page(struct vm_area_struct *vma
 		 * memory allocation.  If it does then we need to drop the
 		 * atomic kmap
 		 */
-		if (!page_table_present(mm, new_addr)) {
+		dst = get_one_pte_map(mm, new_addr);
+		if (unlikely(!dst)) {
 			pte_unmap_nested(src);
-			src = NULL;
-		}
-		dst = alloc_one_pte_map(mm, new_addr);
-		if (src == NULL)
+			if (mapping)
+				spin_unlock(&mapping->i_shared_lock);
+			dst = alloc_one_pte_map(mm, new_addr);
+			if (mapping && !spin_trylock(&mapping->i_shared_lock)) {
+				spin_unlock(&mm->page_table_lock);
+				spin_lock(&mapping->i_shared_lock);
+				spin_lock(&mm->page_table_lock);
+			}
 			src = get_one_pte_map_nested(mm, old_addr);
+		}
 		/*
 		 * Since alloc_one_pte_map can drop and re-acquire
 		 * page_table_lock, we should re-check the src entry...
@@ -132,9 +152,13 @@ move_one_page(struct vm_area_struct *vma
 			}
 			pte_unmap_nested(src);
 		}
-		pte_unmap(dst);
+		if (dst)
+			pte_unmap(dst);
 	}
+
 	spin_unlock(&mm->page_table_lock);
+	if (mapping)
+		spin_unlock(&mapping->i_shared_lock);
 	return error;
 }
 
--- 2.6.6-rc2-mm2/mm/rmap.c	2004-04-26 12:39:46.000000000 +0100
+++ rmap14/mm/rmap.c	2004-04-27 19:18:19.423286152 +0100
@@ -777,6 +777,7 @@ static inline int try_to_unmap_file(stru
 	 * but even so use it as a guide to how hard we should try?
 	 */
 	rmap_unlock(page);
+	cond_resched_lock(&mapping->i_shared_lock);
 
 	max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
 	if (max_nl_cursor == 0)
@@ -799,13 +800,13 @@ static inline int try_to_unmap_file(stru
 				vma->vm_private_data = (void *) cursor;
 				if ((int)mapcount <= 0)
 					goto relock;
-				cond_resched();
 			}
 			if (ret != SWAP_FAIL)
 				vma->vm_private_data =
 					(void *) max_nl_cursor;
 			ret = SWAP_AGAIN;
 		}
+		cond_resched_lock(&mapping->i_shared_lock);
 		max_nl_cursor += CLUSTER_SIZE;
 	} while (max_nl_cursor <= max_nl_size);
 


^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2004-04-29 15:23 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2004-04-27 23:59 [PATCH] rmap 14 i_shared_lock fixes Hugh Dickins
2004-04-28  0:01 ` [PATCH] rmap 15 vma_adjust Hugh Dickins
2004-04-28  0:02 ` [PATCH] rmap 16 pretend prio_tree Hugh Dickins
2004-04-28  0:03 ` [PATCH] rmap 17 real prio_tree Hugh Dickins
2004-04-28  0:04 ` [PATCH] rmap 18 i_mmap_nonlinear Hugh Dickins
2004-04-28 23:11   ` Rik van Riel
2004-04-28 23:44     ` William Lee Irwin III
2004-04-29  6:10       ` Hugh Dickins
2004-04-29  6:32         ` William Lee Irwin III
2004-04-29 13:43         ` James Bottomley
2004-04-29 14:24           ` Hugh Dickins
2004-04-29 15:20             ` Russell King
2004-04-28  0:06 ` [PATCH] rmap 19 arch prio_tree Hugh Dickins

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox