All of lore.kernel.org
 help / color / mirror / Atom feed
From: Nick Piggin <nickpiggin@yahoo.com.au>
To: Linux Memory Management <linux-mm@kvack.org>
Subject: [PATCH 4/7] abstract pagetable locking and pte updates
Date: Fri, 29 Oct 2004 17:21:58 +1000	[thread overview]
Message-ID: <4181EF96.2030602@yahoo.com.au> (raw)
In-Reply-To: <4181EF80.3030709@yahoo.com.au>

[-- Attachment #1: Type: text/plain, Size: 4 bytes --]

4/7

[-- Attachment #2: vm-abstract-pgtable-locking.patch --]
[-- Type: text/x-patch, Size: 95884 bytes --]



Abstract out page table locking and pte updating. Move over to a
transaction-style API for doing pte updates. See
include/asm-generic/pgtable.h for more details.

* VMAs pin pagetables. You must hold the mmap_sem or anon vma lock
  in order to pin the vmas before doing any page table operations.
  
* The page table lock must also be taken, with mm_lock_page_table(mm),
  when doing page table operations.

* In order to modify a pte, one must do the following:
{
  struct pte_modify pmod; /* This can store the old pteval for cmpxchg */
  pte_t pte;
  pte = ptep_begin_modify(&pmod, mm, ptep);

  /* confirm pte is what we want */
  if (wrong_pte(pte)) {
    ptep_abort(&pmod, mm, ptep);
    goto out;
  }
  
  ... /* modify pte (not *ptep) */

  if (ptep_commit(&pmod, mm, ptep, pte)) {
      /* commit failed - usually cleanup & retry or cleanup & fail */
  } else {
      /*
       * *ptep was updated.
       * The old *ptep value is guaranteed not to have changed between
       * ptep_begin_modify and ptep_commit, _except_ that some
       * implementations may allow hardware bits to have changed, so we
       * need a range of ptep_commit_xxx functions to cope with those
       * situations.
       */
  }
}




---

 linux-2.6-npiggin/arch/i386/kernel/vm86.c       |   19 
 linux-2.6-npiggin/arch/i386/mm/hugetlbpage.c    |   11 
 linux-2.6-npiggin/arch/i386/mm/ioremap.c        |   23 
 linux-2.6-npiggin/fs/exec.c                     |   22 
 linux-2.6-npiggin/include/asm-generic/pgtable.h |  298 +++++++++
 linux-2.6-npiggin/include/asm-generic/tlb.h     |    9 
 linux-2.6-npiggin/include/linux/mm.h            |    1 
 linux-2.6-npiggin/kernel/fork.c                 |   10 
 linux-2.6-npiggin/kernel/futex.c                |    7 
 linux-2.6-npiggin/mm/fremap.c                   |   44 -
 linux-2.6-npiggin/mm/hugetlb.c                  |    4 
 linux-2.6-npiggin/mm/memory.c                   |  780 ++++++++++++++----------
 linux-2.6-npiggin/mm/mmap.c                     |    4 
 linux-2.6-npiggin/mm/mprotect.c                 |   30 
 linux-2.6-npiggin/mm/mremap.c                   |   25 
 linux-2.6-npiggin/mm/msync.c                    |   52 +
 linux-2.6-npiggin/mm/rmap.c                     |  175 +++--
 linux-2.6-npiggin/mm/swap_state.c               |    2 
 linux-2.6-npiggin/mm/swapfile.c                 |   63 -
 linux-2.6-npiggin/mm/vmalloc.c                  |   24 
 20 files changed, 1104 insertions(+), 499 deletions(-)

diff -puN mm/memory.c~vm-abstract-pgtable-locking mm/memory.c
--- linux-2.6/mm/memory.c~vm-abstract-pgtable-locking	2004-10-29 16:28:08.000000000 +1000
+++ linux-2.6-npiggin/mm/memory.c	2004-10-29 16:28:08.000000000 +1000
@@ -145,11 +145,14 @@ static inline void free_one_pgd(struct m
  */
 void clear_page_tables(struct mmu_gather *tlb, unsigned long first, int nr)
 {
-	pgd_t * page_dir = tlb->mm->pgd;
+	struct mm_struct *mm = tlb->mm;
+	pgd_t * page_dir = mm->pgd;
 
 	page_dir += first;
 	do {
+		mm_lock_page_table(mm);
 		free_one_pgd(tlb, page_dir);
+		mm_unlock_page_table(mm);
 		page_dir++;
 	} while (--nr);
 }
@@ -159,35 +162,50 @@ pte_t fastcall * pte_alloc_map(struct mm
 	if (!pmd_present(*pmd)) {
 		struct page *new;
 
-		spin_unlock(&mm->page_table_lock);
+		mm_unlock_page_table(mm);
 		new = pte_alloc_one(mm, address);
-		spin_lock(&mm->page_table_lock);
+		mm_lock_page_table(mm);
 		if (!new)
 			return NULL;
 		/*
 		 * Because we dropped the lock, we should re-check the
 		 * entry, as somebody else could have populated it..
 		 */
-		if (pmd_present(*pmd)) {
+		if (pmd_test_and_populate(mm, pmd, new)) {
 			pte_free(new);
 			goto out;
 		}
 		mm->nr_ptes++;
 		inc_page_state(nr_page_table_pages);
-		pmd_populate(mm, pmd, new);
 	}
 out:
 	return pte_offset_map(pmd, address);
 }
 
+static inline pte_t * __pte_alloc_map_unlocked(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+{
+	if (!pmd_present(*pmd)) {
+		struct page *new;
+
+		new = pte_alloc_one(mm, address);
+		if (!new)
+			return NULL;
+
+		pmd_populate(mm, pmd, new);
+		mm->nr_ptes++;
+		inc_page_state(nr_page_table_pages);
+	}
+	return pte_offset_map(pmd, address);
+}
+
 pte_t fastcall * pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
 {
 	if (!pmd_present(*pmd)) {
 		pte_t *new;
 
-		spin_unlock(&mm->page_table_lock);
+		mm_unlock_page_table(mm);
 		new = pte_alloc_one_kernel(mm, address);
-		spin_lock(&mm->page_table_lock);
+		mm_lock_page_table(mm);
 		if (!new)
 			return NULL;
 
@@ -195,13 +213,9 @@ pte_t fastcall * pte_alloc_kernel(struct
 		 * Because we dropped the lock, we should re-check the
 		 * entry, as somebody else could have populated it..
 		 */
-		if (pmd_present(*pmd)) {
+		if (pmd_test_and_populate_kernel(mm, pmd, new))
 			pte_free_kernel(new);
-			goto out;
-		}
-		pmd_populate_kernel(mm, pmd, new);
 	}
-out:
 	return pte_offset_kernel(pmd, address);
 }
 #define PTE_TABLE_MASK	((PTRS_PER_PTE-1) * sizeof(pte_t))
@@ -214,9 +228,6 @@ out:
  *
  * 08Jan98 Merged into one routine from several inline routines to reduce
  *         variable count and make things faster. -jj
- *
- * dst->page_table_lock is held on entry and exit,
- * but may be dropped within pmd_alloc() and pte_alloc_map().
  */
 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
 			struct vm_area_struct *vma)
@@ -237,9 +248,9 @@ int copy_page_range(struct mm_struct *ds
 		pmd_t * src_pmd, * dst_pmd;
 
 		src_pgd++; dst_pgd++;
-		
+
 		/* copy_pmd_range */
-		
+
 		if (pgd_none(*src_pgd))
 			goto skip_copy_pmd_range;
 		if (unlikely(pgd_bad(*src_pgd))) {
@@ -251,6 +262,7 @@ skip_copy_pmd_range:	address = (address 
 			continue;
 		}
 
+		/* XXX: Don't we worry about the lock for pgd? */
 		src_pmd = pmd_offset(src_pgd, address);
 		dst_pmd = pmd_alloc(dst, dst_pgd, address);
 		if (!dst_pmd)
@@ -258,9 +270,9 @@ skip_copy_pmd_range:	address = (address 
 
 		do {
 			pte_t * src_pte, * dst_pte;
-		
+
 			/* copy_pte_range */
-		
+
 			if (pmd_none(*src_pmd))
 				goto skip_copy_pte_range;
 			if (unlikely(pmd_bad(*src_pmd))) {
@@ -273,24 +285,43 @@ skip_copy_pte_range:
 				goto cont_copy_pmd_range;
 			}
 
-			dst_pte = pte_alloc_map(dst, dst_pmd, address);
+			dst_pte = __pte_alloc_map_unlocked(dst, dst_pmd, address);
 			if (!dst_pte)
 				goto nomem;
-			spin_lock(&src->page_table_lock);	
+			mm_lock_page_table(src);
+			mm_pin_pages(src);
 			src_pte = pte_offset_map_nested(src_pmd, address);
 			do {
-				pte_t pte = *src_pte;
+				struct pte_modify pmod;
+				pte_t new;
 				struct page *page;
 				unsigned long pfn;
 
+again:
 				/* copy_one_pte */
 
-				if (pte_none(pte))
+				/*
+				 * We use this transaction to check that the
+				 * src hasn't changed from under us. Even if
+				 * we don't actually change it.
+				 */
+				new = ptep_begin_modify(&pmod, src, src_pte);
+				if (pte_none(new)) {
+					ptep_abort(&pmod, src, src_pte);
 					goto cont_copy_pte_range_noset;
+				}
 				/* pte contains position in swap, so copy. */
-				if (!pte_present(pte)) {
-					if (!pte_file(pte)) {
-						swap_duplicate(pte_to_swp_entry(pte));
+				if (!pte_present(new)) {
+					if (!pte_file(new))
+						swap_duplicate(pte_to_swp_entry(new));
+					set_pte(dst_pte, new);
+					if (ptep_verify_finish(&pmod, src, src_pte)) {
+						pte_clear(dst_pte);
+						if (!pte_file(new))
+							free_swap_and_cache(pte_to_swp_entry(new));
+						goto again;
+					}
+					if (!pte_file(new)) {
 						if (list_empty(&dst->mmlist)) {
 							spin_lock(&mmlist_lock);
 							list_add(&dst->mmlist,
@@ -298,10 +329,9 @@ skip_copy_pte_range:
 							spin_unlock(&mmlist_lock);
 						}
 					}
-					set_pte(dst_pte, pte);
 					goto cont_copy_pte_range_noset;
 				}
-				pfn = pte_pfn(pte);
+				pfn = pte_pfn(new);
 				/* the pte points outside of valid memory, the
 				 * mapping is assumed to be good, meaningful
 				 * and not mapped via rmap - duplicate the
@@ -312,7 +342,11 @@ skip_copy_pte_range:
 					page = pfn_to_page(pfn); 
 
 				if (!page || PageReserved(page)) {
-					set_pte(dst_pte, pte);
+					set_pte(dst_pte, new);
+					if (ptep_verify_finish(&pmod, src, src_pte)) {
+						pte_clear(dst_pte);
+						goto again;
+					}
 					goto cont_copy_pte_range_noset;
 				}
 
@@ -320,22 +354,26 @@ skip_copy_pte_range:
 				 * If it's a COW mapping, write protect it both
 				 * in the parent and the child
 				 */
-				if (cow) {
-					ptep_set_wrprotect(src_pte);
-					pte = *src_pte;
-				}
+				if (cow)
+					new = pte_wrprotect(new);
 
 				/*
 				 * If it's a shared mapping, mark it clean in
 				 * the child
 				 */
 				if (vma->vm_flags & VM_SHARED)
-					pte = pte_mkclean(pte);
-				pte = pte_mkold(pte);
+					new = pte_mkclean(new);
+				new = pte_mkold(new);
 				get_page(page);
-				dst->rss++;
-				set_pte(dst_pte, pte);
 				page_dup_rmap(page);
+				set_pte(dst_pte, new);
+				if (ptep_commit(&pmod, src, src_pte, new)) {
+					pte_clear(dst_pte);
+					page_remove_rmap(page);
+					put_page(page);
+					goto again;
+				}
+				dst->rss++;
 cont_copy_pte_range_noset:
 				address += PAGE_SIZE;
 				if (address >= end) {
@@ -348,22 +386,23 @@ cont_copy_pte_range_noset:
 			} while ((unsigned long)src_pte & PTE_TABLE_MASK);
 			pte_unmap_nested(src_pte-1);
 			pte_unmap(dst_pte-1);
-			spin_unlock(&src->page_table_lock);
-			cond_resched_lock(&dst->page_table_lock);
+			mm_unpin_pages(src);
+			mm_unlock_page_table(src);
 cont_copy_pmd_range:
 			src_pmd++;
 			dst_pmd++;
 		} while ((unsigned long)src_pmd & PMD_TABLE_MASK);
 	}
 out_unlock:
-	spin_unlock(&src->page_table_lock);
+	mm_unpin_pages(src);
+	mm_unlock_page_table(src);
 out:
 	return 0;
 nomem:
 	return -ENOMEM;
 }
 
-static void zap_pte_range(struct mmu_gather *tlb,
+static void zap_pte_range(struct mmu_gather *tlb, struct mm_struct *mm,
 		pmd_t *pmd, unsigned long address,
 		unsigned long size, struct zap_details *details)
 {
@@ -384,13 +423,17 @@ static void zap_pte_range(struct mmu_gat
 	size &= PAGE_MASK;
 	if (details && !details->check_mapping && !details->nonlinear_vma)
 		details = NULL;
+	mm_pin_pages(mm);
 	for (offset=0; offset < size; ptep++, offset += PAGE_SIZE) {
-		pte_t pte = *ptep;
-		if (pte_none(pte))
-			continue;
-		if (pte_present(pte)) {
+		struct pte_modify pmod;
+		pte_t old, new;
+again:
+		new = ptep_begin_modify(&pmod, mm, ptep);
+		if (pte_none(new))
+			goto trns_abort;
+		if (pte_present(new)) {
 			struct page *page = NULL;
-			unsigned long pfn = pte_pfn(pte);
+			unsigned long pfn = pte_pfn(new);
 			if (pfn_valid(pfn)) {
 				page = pfn_to_page(pfn);
 				if (PageReserved(page))
@@ -404,7 +447,7 @@ static void zap_pte_range(struct mmu_gat
 				 */
 				if (details->check_mapping &&
 				    details->check_mapping != page->mapping)
-					continue;
+					goto trns_abort;
 				/*
 				 * Each page->index must be checked when
 				 * invalidating or truncating nonlinear.
@@ -412,23 +455,27 @@ static void zap_pte_range(struct mmu_gat
 				if (details->nonlinear_vma &&
 				    (page->index < details->first_index ||
 				     page->index > details->last_index))
-					continue;
+					goto trns_abort;
 			}
-			pte = ptep_get_and_clear(ptep);
+			pte_clear(&new);
+			if (likely(page)) {
+				if (unlikely(details) && details->nonlinear_vma
+				    && linear_page_index(details->nonlinear_vma,
+						address+offset) != page->index)
+					new = pgoff_to_pte(page->index);
+			}
+			if (ptep_commit_clear(&pmod, mm, ptep, new, old))
+				goto again;
 			tlb_remove_tlb_entry(tlb, ptep, address+offset);
-			if (unlikely(!page))
-				continue;
-			if (unlikely(details) && details->nonlinear_vma
-			    && linear_page_index(details->nonlinear_vma,
-					address+offset) != page->index)
-				set_pte(ptep, pgoff_to_pte(page->index));
-			if (pte_dirty(pte))
-				set_page_dirty(page);
-			if (pte_young(pte) && !PageAnon(page))
-				mark_page_accessed(page);
-			tlb->freed++;
-			page_remove_rmap(page);
-			tlb_remove_page(tlb, page);
+			if (likely(page)) {
+				if (pte_dirty(old))
+					set_page_dirty(page);
+				if (pte_young(old) && !PageAnon(page))
+					mark_page_accessed(page);
+				tlb->freed++;
+				page_remove_rmap(page);
+				tlb_remove_page(tlb, page);
+			}
 			continue;
 		}
 		/*
@@ -436,15 +483,22 @@ static void zap_pte_range(struct mmu_gat
 		 * if details->nonlinear_vma, we leave file entries.
 		 */
 		if (unlikely(details))
-			continue;
-		if (!pte_file(pte))
-			free_swap_and_cache(pte_to_swp_entry(pte));
-		pte_clear(ptep);
+			goto trns_abort;
+		pte_clear(&new);
+		if (ptep_commit_clear(&pmod, mm, ptep, new, old))
+			goto again;
+		if (!pte_file(old))
+			free_swap_and_cache(pte_to_swp_entry(old));
+
+		continue;
+trns_abort:
+		ptep_abort(&pmod, mm, ptep);
 	}
+	mm_unpin_pages(mm);
 	pte_unmap(ptep-1);
 }
 
-static void zap_pmd_range(struct mmu_gather *tlb,
+static void zap_pmd_range(struct mmu_gather *tlb, struct mm_struct *mm,
 		pgd_t * dir, unsigned long address,
 		unsigned long size, struct zap_details *details)
 {
@@ -463,27 +517,29 @@ static void zap_pmd_range(struct mmu_gat
 	if (end > ((address + PGDIR_SIZE) & PGDIR_MASK))
 		end = ((address + PGDIR_SIZE) & PGDIR_MASK);
 	do {
-		zap_pte_range(tlb, pmd, address, end - address, details);
+		zap_pte_range(tlb, mm, pmd, address, end - address, details);
 		address = (address + PMD_SIZE) & PMD_MASK; 
 		pmd++;
 	} while (address && (address < end));
 }
 
-static void unmap_page_range(struct mmu_gather *tlb,
+static void unmap_page_range(struct mmu_gather *tlb, struct mm_struct *mm,
 		struct vm_area_struct *vma, unsigned long address,
 		unsigned long end, struct zap_details *details)
 {
 	pgd_t * dir;
 
 	BUG_ON(address >= end);
-	dir = pgd_offset(vma->vm_mm, address);
+	mm_lock_page_table(mm);
+	dir = pgd_offset(mm, address);
 	tlb_start_vma(tlb, vma);
 	do {
-		zap_pmd_range(tlb, dir, address, end - address, details);
+		zap_pmd_range(tlb, mm, dir, address, end - address, details);
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		dir++;
 	} while (address && (address < end));
 	tlb_end_vma(tlb, vma);
+	mm_unlock_page_table(mm);
 }
 
 /* Dispose of an entire struct mmu_gather per rescheduling point */
@@ -513,11 +569,7 @@ static void unmap_page_range(struct mmu_
  *
  * Returns the number of vma's which were covered by the unmapping.
  *
- * Unmap all pages in the vma list.  Called under page_table_lock.
- *
- * We aim to not hold page_table_lock for too long (for scheduling latency
- * reasons).  So zap pages in ZAP_BLOCK_SIZE bytecounts.  This means we need to
- * return the ending mmu_gather to the caller.
+ * Unmap all pages in the vma list.
  *
  * Only addresses between `start' and `end' will be unmapped.
  *
@@ -533,7 +585,7 @@ static int __unmap_vmas(struct mmu_gathe
 		unsigned long end_addr, unsigned long *nr_accounted,
 		struct zap_details *details)
 {
-	unsigned long zap_bytes = ZAP_BLOCK_SIZE;
+	unsigned long zap_bytes;
 	unsigned long tlb_start = 0;	/* For tlb_finish_mmu */
 	int tlb_start_valid = 0;
 	int ret = 0;
@@ -556,6 +608,7 @@ static int __unmap_vmas(struct mmu_gathe
 		ret++;
 		while (start != end) {
 			unsigned long block;
+			zap_bytes = ZAP_BLOCK_SIZE;
 
 			if (!tlb_start_valid) {
 				tlb_start = start;
@@ -567,7 +620,7 @@ static int __unmap_vmas(struct mmu_gathe
 				unmap_hugepage_range(vma, start, end);
 			} else {
 				block = min(zap_bytes, end - start);
-				unmap_page_range(*tlbp, vma, start,
+				unmap_page_range(*tlbp, mm, vma, start,
 						start + block, details);
 			}
 
@@ -578,7 +631,7 @@ static int __unmap_vmas(struct mmu_gathe
 			if (!atomic && need_resched()) {
 				int fullmm = tlb_is_full_mm(*tlbp);
 				tlb_finish_mmu(*tlbp, tlb_start, start);
-				cond_resched_lock(&mm->page_table_lock);
+				cond_resched();
 				*tlbp = tlb_gather_mmu(mm, fullmm);
 				tlb_start_valid = 0;
 			}
@@ -594,12 +647,10 @@ void unmap_vmas(struct mm_struct *mm, st
 {
 	struct mmu_gather *tlb;
 	lru_add_drain();
-	spin_lock(&mm->page_table_lock);
 	tlb = tlb_gather_mmu(mm, 0);
 	__unmap_vmas(&tlb, mm, vma,
 			start_addr, end_addr, nr_accounted, details);
 	tlb_finish_mmu(tlb, start_addr, end_addr);
-	spin_unlock(&mm->page_table_lock);
 }
 
 int unmap_all_vmas(struct mm_struct *mm, unsigned long *nr_accounted)
@@ -607,13 +658,11 @@ int unmap_all_vmas(struct mm_struct *mm,
 	struct mmu_gather *tlb;
 	int ret;
 	lru_add_drain();
-	spin_lock(&mm->page_table_lock);
 	tlb = tlb_gather_mmu(mm, 1);
 	flush_cache_mm(mm);
 	/* Use ~0UL here to ensure all VMAs in the mm are unmapped */
 	ret = __unmap_vmas(&tlb, mm, mm->mmap, 0, ~0UL, nr_accounted, NULL);
 	tlb_finish_mmu(tlb, 0, MM_VM_SIZE(mm));
-	spin_unlock(&mm->page_table_lock);
 
 	return ret;
 }
@@ -640,9 +689,14 @@ void zap_page_range(struct vm_area_struc
 	unmap_vmas(mm, vma, address, end, &nr_accounted, details);
 }
 
+void follow_page_finish(struct mm_struct *mm, unsigned long address)
+{
+	mm_unpin_pages(mm);
+	mm_unlock_page_table(mm);
+}
+
 /*
  * Do a quick page-table lookup for a single page.
- * mm->page_table_lock must be held.
  */
 struct page *
 follow_page(struct mm_struct *mm, unsigned long address, int write) 
@@ -653,7 +707,8 @@ follow_page(struct mm_struct *mm, unsign
 	unsigned long pfn;
 	struct page *page;
 
-	page = follow_huge_addr(mm, address, write);
+	mm_lock_page_table(mm);
+	page = follow_huge_addr(mm, address, write); /* XXX: hugepages are broken */
 	if (! IS_ERR(page))
 		return page;
 
@@ -673,11 +728,16 @@ follow_page(struct mm_struct *mm, unsign
 	if (!ptep)
 		goto out;
 
-	pte = *ptep;
+	/* XXX: should be able to drop the mm_pin_pages lock after pinning the
+	 * page with get_page? 
+	 */
+	mm_pin_pages(mm);
+	pte = ptep_atomic_read(ptep);
 	pte_unmap(ptep);
+
 	if (pte_present(pte)) {
 		if (write && !pte_write(pte))
-			goto out;
+			goto out_unpin;
 		pfn = pte_pfn(pte);
 		if (pfn_valid(pfn)) {
 			page = pfn_to_page(pfn);
@@ -688,7 +748,10 @@ follow_page(struct mm_struct *mm, unsign
 		}
 	}
 
+out_unpin:
+	mm_unpin_pages(mm);
 out:
+	mm_unlock_page_table(mm);
 	return NULL;
 }
 
@@ -698,23 +761,29 @@ untouched_anonymous_page(struct mm_struc
 {
 	pgd_t *pgd;
 	pmd_t *pmd;
+	int ret = 1;
 
 	/* Check if the vma is for an anonymous mapping. */
 	if (vma->vm_ops && vma->vm_ops->nopage)
 		return 0;
 
+	mm_lock_page_table(mm);
+
 	/* Check if page directory entry exists. */
 	pgd = pgd_offset(mm, address);
 	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
-		return 1;
+		goto out;
 
 	/* Check if page middle directory entry exists. */
 	pmd = pmd_offset(pgd, address);
 	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
-		return 1;
+		goto out;
 
 	/* There is a pte slot for 'address' in 'mm'. */
-	return 0;
+	ret = 0;
+out:
+	mm_unlock_page_table(mm);
+	return ret;
 }
 
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
@@ -753,6 +822,7 @@ int get_user_pages(struct task_struct *t
 			pte = pte_offset_map(pmd, pg);
 			if (!pte)
 				return i ? : -EFAULT;
+			/* XXX: don't need atomic read for *pte? (guess not) */
 			if (!pte_present(*pte)) {
 				pte_unmap(pte);
 				return i ? : -EFAULT;
@@ -779,7 +849,6 @@ int get_user_pages(struct task_struct *t
 						&start, &len, i);
 			continue;
 		}
-		spin_lock(&mm->page_table_lock);
 		do {
 			struct page *page;
 			int lookup_write = write;
@@ -793,10 +862,10 @@ int get_user_pages(struct task_struct *t
 				 */
 				if (!lookup_write &&
 				    untouched_anonymous_page(mm,vma,start)) {
-					page = ZERO_PAGE(start);
-					break;
+					if (pages)
+						pages[i] = ZERO_PAGE(start);
+					goto set_vmas;
 				}
-				spin_unlock(&mm->page_table_lock);
 				switch (handle_mm_fault(mm,vma,start,write)) {
 				case VM_FAULT_MINOR:
 					tsk->min_flt++;
@@ -819,7 +888,6 @@ int get_user_pages(struct task_struct *t
 				 * we are forcing write access.
 				 */
 				lookup_write = write && !force;
-				spin_lock(&mm->page_table_lock);
 			}
 			if (pages) {
 				pages[i] = page;
@@ -827,21 +895,23 @@ int get_user_pages(struct task_struct *t
 				if (!PageReserved(page))
 					page_cache_get(page);
 			}
+			if (page)
+				follow_page_finish(mm, start);
+set_vmas:
 			if (vmas)
 				vmas[i] = vma;
 			i++;
 			start += PAGE_SIZE;
 			len--;
 		} while(len && start < vma->vm_end);
-		spin_unlock(&mm->page_table_lock);
 	} while(len);
 	return i;
 }
 
 EXPORT_SYMBOL(get_user_pages);
 
-static void zeromap_pte_range(pte_t * pte, unsigned long address,
-                                     unsigned long size, pgprot_t prot)
+static void zeromap_pte_range(struct mm_struct *mm, pte_t * pte,
+		unsigned long address, unsigned long size, pgprot_t prot)
 {
 	unsigned long end;
 
@@ -850,9 +920,14 @@ static void zeromap_pte_range(pte_t * pt
 	if (end > PMD_SIZE)
 		end = PMD_SIZE;
 	do {
-		pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(address), prot));
-		BUG_ON(!pte_none(*pte));
-		set_pte(pte, zero_pte);
+		struct pte_modify pmod;
+		pte_t new;
+again:
+		new = ptep_begin_modify(&pmod, mm, pte);
+		BUG_ON(!pte_none(new));
+		new = pte_wrprotect(mk_pte(ZERO_PAGE(address), prot));
+		if (ptep_commit(&pmod, mm, pte, new))
+			goto again;
 		address += PAGE_SIZE;
 		pte++;
 	} while (address && (address < end));
@@ -872,7 +947,7 @@ static inline int zeromap_pmd_range(stru
 		pte_t * pte = pte_alloc_map(mm, pmd, base + address);
 		if (!pte)
 			return -ENOMEM;
-		zeromap_pte_range(pte, base + address, end - address, prot);
+		zeromap_pte_range(mm, pte, base + address, end - address, prot);
 		pte_unmap(pte);
 		address = (address + PMD_SIZE) & PMD_MASK;
 		pmd++;
@@ -893,7 +968,7 @@ int zeromap_page_range(struct vm_area_st
 	if (address >= end)
 		BUG();
 
-	spin_lock(&mm->page_table_lock);
+	mm_lock_page_table(mm);
 	do {
 		pmd_t *pmd = pmd_alloc(mm, dir, address);
 		error = -ENOMEM;
@@ -909,7 +984,7 @@ int zeromap_page_range(struct vm_area_st
 	 * Why flush? zeromap_pte_range has a BUG_ON for !pte_none()
 	 */
 	flush_tlb_range(vma, beg, end);
-	spin_unlock(&mm->page_table_lock);
+	mm_unlock_page_table(mm);
 	return error;
 }
 
@@ -918,8 +993,9 @@ int zeromap_page_range(struct vm_area_st
  * mappings are removed. any references to nonexistent pages results
  * in null mappings (currently treated as "copy-on-access")
  */
-static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size,
-	unsigned long pfn, pgprot_t prot)
+static inline void remap_pte_range(struct mm_struct *mm, pte_t * pte,
+		unsigned long address, unsigned long size,
+		unsigned long pfn, pgprot_t prot)
 {
 	unsigned long end;
 
@@ -927,14 +1003,26 @@ static inline void remap_pte_range(pte_t
 	end = address + size;
 	if (end > PMD_SIZE)
 		end = PMD_SIZE;
+	mm_pin_pages(mm);
 	do {
-		BUG_ON(!pte_none(*pte));
-		if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn)))
- 			set_pte(pte, pfn_pte(pfn, prot));
+		struct pte_modify pmod;
+		pte_t new;
+
+again:
+		new = ptep_begin_modify(&pmod, mm, pte);
+		BUG_ON(!pte_none(new));
+		if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn))) {
+			new = pfn_pte(pfn, prot);
+			if (ptep_commit(&pmod, mm, pte, new))
+				goto again;
+		} else
+			ptep_abort(&pmod, mm, pte);
+
 		address += PAGE_SIZE;
 		pfn++;
 		pte++;
 	} while (address && (address < end));
+	mm_unpin_pages(mm);
 }
 
 static inline int remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, unsigned long size,
@@ -952,7 +1040,7 @@ static inline int remap_pmd_range(struct
 		pte_t * pte = pte_alloc_map(mm, pmd, base + address);
 		if (!pte)
 			return -ENOMEM;
-		remap_pte_range(pte, base + address, end - address, pfn + (address >> PAGE_SHIFT), prot);
+		remap_pte_range(mm, pte, base + address, end - address, pfn + (address >> PAGE_SHIFT), prot);
 		pte_unmap(pte);
 		address = (address + PMD_SIZE) & PMD_MASK;
 		pmd++;
@@ -984,7 +1072,7 @@ int remap_pfn_range(struct vm_area_struc
 	 *	this region.
 	 */
 	vma->vm_flags |= VM_IO | VM_RESERVED;
-	spin_lock(&mm->page_table_lock);
+	mm_lock_page_table(mm);
 	do {
 		pmd_t *pmd = pmd_alloc(mm, dir, from);
 		error = -ENOMEM;
@@ -1000,7 +1088,7 @@ int remap_pfn_range(struct vm_area_struc
 	 * Why flush? remap_pte_range has a BUG_ON for !pte_none()
 	 */
 	flush_tlb_range(vma, beg, end);
-	spin_unlock(&mm->page_table_lock);
+	mm_unlock_page_table(mm);
 	return error;
 }
 EXPORT_SYMBOL(remap_pfn_range);
@@ -1019,21 +1107,6 @@ static inline pte_t maybe_mkwrite(pte_t 
 }
 
 /*
- * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock
- */
-static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address, 
-		pte_t *page_table)
-{
-	pte_t entry;
-
-	flush_cache_page(vma, address);
-	entry = maybe_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)),
-			      vma);
-	ptep_establish(vma, address, page_table, entry);
-	update_mmu_cache(vma, address, entry);
-}
-
-/*
  * This routine handles present pages, when users try to write
  * to a shared page. It is done by copying the page to a new address
  * and decrementing the shared-page counter for the old page.
@@ -1050,15 +1123,30 @@ static inline void break_cow(struct vm_a
  * change only once the write actually happens. This avoids a few races,
  * and potentially makes it more efficient.
  *
- * We hold the mm semaphore and the page_table_lock on entry and exit
- * with the page_table_lock released.
+ * We hold the mm semaphore and have the page table locked on entry, and exit
+ * with the page table unlocked.
  */
-static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
-	unsigned long address, pte_t *page_table, pmd_t *pmd, pte_t pte)
+static int do_wp_page(struct pte_modify *pmod, struct mm_struct *mm,
+	struct vm_area_struct * vma, unsigned long address,
+	pte_t *page_table, pmd_t *pmd, pte_t pte)
 {
+	pte_t new;
 	struct page *old_page, *new_page;
-	unsigned long pfn = pte_pfn(pte);
-	pte_t entry;
+	unsigned long pfn;
+	int ret = VM_FAULT_OOM;
+
+	/* Audit use of mm_pin_pages nesting with ptep_begin_modify, maybe
+	 * deadlockable if we do pte locks.
+	 */
+	mm_pin_pages(mm);
+
+	/* Make sure the pte hasn't changed under us after pinning */
+	if (ptep_verify(pmod, mm, page_table)) {
+		ret = VM_FAULT_MINOR;
+		goto out_error;
+	}
+
+	pfn = pte_pfn(pte);
 
 	if (unlikely(!pfn_valid(pfn))) {
 		/*
@@ -1066,25 +1154,25 @@ static int do_wp_page(struct mm_struct *
 		 * at least the kernel stops what it's doing before it corrupts
 		 * data, but for the moment just pretend this is OOM.
 		 */
-		pte_unmap(page_table);
 		printk(KERN_ERR "do_wp_page: bogus page at address %08lx\n",
 				address);
-		spin_unlock(&mm->page_table_lock);
-		return VM_FAULT_OOM;
+		goto out_error;
 	}
+
 	old_page = pfn_to_page(pfn);
 
 	if (!TestSetPageLocked(old_page)) {
 		int reuse = can_share_swap_page(old_page);
 		unlock_page(old_page);
 		if (reuse) {
+			mm_unpin_pages(mm);
 			flush_cache_page(vma, address);
-			entry = maybe_mkwrite(pte_mkyoung(pte_mkdirty(pte)),
-					      vma);
-			ptep_set_access_flags(vma, address, page_table, entry, 1);
-			update_mmu_cache(vma, address, entry);
+			new = maybe_mkwrite(pte_mkyoung(pte_mkdirty(pte)), vma);
+			if (!ptep_commit_access_flush(pmod, mm, vma, address,
+							page_table, new, 1))
+				update_mmu_cache(vma, address, new);
 			pte_unmap(page_table);
-			spin_unlock(&mm->page_table_lock);
+			mm_unlock_page_table(mm);
 			return VM_FAULT_MINOR;
 		}
 	}
@@ -1095,41 +1183,70 @@ static int do_wp_page(struct mm_struct *
 	 */
 	if (!PageReserved(old_page))
 		page_cache_get(old_page);
-	spin_unlock(&mm->page_table_lock);
+	ptep_abort(pmod, mm, page_table);
+	mm_unpin_pages(mm);
+	mm_unlock_page_table(mm);
 
-	if (unlikely(anon_vma_prepare(vma)))
+	if (unlikely(anon_vma_prepare(vma))) {
+		ptep_abort(pmod, mm, page_table);
 		goto no_new_page;
+	}
 	new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
-	if (!new_page)
+	if (!new_page) {
+		ptep_abort(pmod, mm, page_table);
 		goto no_new_page;
-	copy_cow_page(old_page,new_page,address);
+	}
+	copy_cow_page(old_page, new_page, address);
 
 	/*
 	 * Re-check the pte - we dropped the lock
 	 */
-	spin_lock(&mm->page_table_lock);
+	mm_lock_page_table(mm);
 	page_table = pte_offset_map(pmd, address);
-	if (likely(pte_same(*page_table, pte))) {
-		if (PageReserved(old_page))
-			++mm->rss;
-		else
-			page_remove_rmap(old_page);
-		break_cow(vma, new_page, address, page_table);
-		lru_cache_add_active(new_page);
-		page_add_anon_rmap(new_page, vma, address);
+	new = ptep_begin_modify(pmod, mm, page_table);
 
-		/* Free the old page.. */
-		new_page = old_page;
+	if (unlikely(!pte_same(new, pte))) {
+		ptep_abort(pmod, mm, page_table);
+		goto out;
+	}
+
+	/* break COW */
+	flush_cache_page(vma, address);
+	new = maybe_mkwrite(pte_mkdirty(
+				mk_pte(new_page, vma->vm_page_prot)), vma);
+	page_add_anon_rmap(new_page, vma, address);
+	if (ptep_commit_establish_flush(pmod, mm, vma, address,
+				page_table, new)) {
+		page_remove_rmap(new_page);
+		goto out;
 	}
+	update_mmu_cache(vma, address, new);
+	if (PageReserved(old_page))
+		++mm->rss;
+	else
+		page_remove_rmap(old_page);
+
+	/* After lru_cache_add_active new_page may disappear, so don't touch! */
+	lru_cache_add_active(new_page);
+
+	/* Free the old page.. */
+	new_page = old_page;
+
+out:
+	ret = VM_FAULT_MINOR;
 	pte_unmap(page_table);
+	mm_unlock_page_table(mm);
 	page_cache_release(new_page);
-	page_cache_release(old_page);
-	spin_unlock(&mm->page_table_lock);
-	return VM_FAULT_MINOR;
-
 no_new_page:
 	page_cache_release(old_page);
-	return VM_FAULT_OOM;
+	return ret;
+
+out_error:
+	ptep_abort(pmod, mm, page_table);
+	pte_unmap(page_table);
+	mm_unpin_pages(mm);
+	mm_unlock_page_table(mm);
+	return ret;
 }
 
 /*
@@ -1201,6 +1318,7 @@ void unmap_mapping_range(struct address_
 	spin_lock(&mapping->i_mmap_lock);
 	/* Protect against page fault */
 	atomic_inc(&mapping->truncate_count);
+	smp_wmb(); /* For truncate_count */
 
 	if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
 		unmap_mapping_range_list(&mapping->i_mmap, &details);
@@ -1329,37 +1447,39 @@ void swapin_readahead(swp_entry_t entry,
 }
 
 /*
- * We hold the mm semaphore and the page_table_lock on entry and
- * should release the pagetable lock on exit..
+ * We hold the mm semaphore and the page table locked on entry.
+ * We release the pagetable lock on exit.
  */
-static int do_swap_page(struct mm_struct * mm,
-	struct vm_area_struct * vma, unsigned long address,
-	pte_t *page_table, pmd_t *pmd, pte_t orig_pte, int write_access)
+static int do_swap_page(struct pte_modify *pmod, struct mm_struct * mm,
+	struct vm_area_struct * vma, unsigned long address, int write_access,
+	pte_t *page_table, pmd_t *pmd, pte_t orig_pte)
 {
+	int used_swap_page = 0;
+	pte_t new, old;
 	struct page *page;
 	swp_entry_t entry = pte_to_swp_entry(orig_pte);
-	pte_t pte;
 	int ret = VM_FAULT_MINOR;
 
+	ptep_abort(pmod, mm, page_table);
 	pte_unmap(page_table);
-	spin_unlock(&mm->page_table_lock);
+	mm_unlock_page_table(mm);
 	page = lookup_swap_cache(entry);
 	if (!page) {
  		swapin_readahead(entry, address, vma);
  		page = read_swap_cache_async(entry, vma, address);
 		if (!page) {
 			/*
-			 * Back out if somebody else faulted in this pte while
-			 * we released the page table lock.
+			 * Back out if somebody else faulted in this pte.
 			 */
-			spin_lock(&mm->page_table_lock);
+			mm_lock_page_table(mm);
 			page_table = pte_offset_map(pmd, address);
-			if (likely(pte_same(*page_table, orig_pte)))
+			if (likely(pte_same(ptep_atomic_read(page_table),
+							orig_pte)))
 				ret = VM_FAULT_OOM;
 			else
 				ret = VM_FAULT_MINOR;
 			pte_unmap(page_table);
-			spin_unlock(&mm->page_table_lock);
+			mm_unlock_page_table(mm);
 			goto out;
 		}
 
@@ -1376,71 +1496,83 @@ static int do_swap_page(struct mm_struct
 	 * Back out if somebody else faulted in this pte while we
 	 * released the page table lock.
 	 */
-	spin_lock(&mm->page_table_lock);
+	mm_lock_page_table(mm);
 	page_table = pte_offset_map(pmd, address);
-	if (unlikely(!pte_same(*page_table, orig_pte))) {
-		pte_unmap(page_table);
-		spin_unlock(&mm->page_table_lock);
+	new = ptep_begin_modify(pmod, mm, page_table);
+	if (unlikely(!pte_same(new, orig_pte))) {
+		ptep_abort(pmod, mm, page_table);
 		unlock_page(page);
-		page_cache_release(page);
-		ret = VM_FAULT_MINOR;
-		goto out;
+		goto out_failed;
 	}
 
 	/* The page isn't present yet, go ahead with the fault. */
-		
+
 	swap_free(entry);
-	if (vm_swap_full())
-		remove_exclusive_swap_page(page);
 
-	mm->rss++;
-	pte = mk_pte(page, vma->vm_page_prot);
+	new = mk_pte(page, vma->vm_page_prot);
 	if (write_access && can_share_swap_page(page)) {
-		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
+		new = maybe_mkwrite(pte_mkdirty(new), vma);
 		write_access = 0;
+		used_swap_page = 1;
 	}
-	unlock_page(page);
 
 	flush_icache_page(vma, page);
-	set_pte(page_table, pte);
 	page_add_anon_rmap(page, vma, address);
+	if (ptep_commit(pmod, mm, page_table, new)) {
+		page_remove_rmap(page);
+		swap_duplicate(entry);
+		unlock_page(page);
+		goto out_failed;
+	}
+	if (!used_swap_page && vm_swap_full())
+		remove_exclusive_swap_page(page);
+	unlock_page(page);
+	mm->rss++;
 
 	if (write_access) {
-		if (do_wp_page(mm, vma, address,
-				page_table, pmd, pte) == VM_FAULT_OOM)
-			ret = VM_FAULT_OOM;
-		goto out;
+		old = new;
+		new = ptep_begin_modify(pmod, mm, page_table);
+		if (likely(pte_same(old, new))) {
+			if (do_wp_page(pmod, mm, vma, address,
+					page_table, pmd, new) == VM_FAULT_OOM)
+				ret = VM_FAULT_OOM;
+			goto out;
+		}
+		ptep_abort(pmod, mm, page_table);
 	}
 
 	/* No need to invalidate - it was non-present before */
-	update_mmu_cache(vma, address, pte);
+	update_mmu_cache(vma, address, new);
 	pte_unmap(page_table);
-	spin_unlock(&mm->page_table_lock);
+	mm_unlock_page_table(mm);
 out:
 	return ret;
+
+out_failed:
+	pte_unmap(page_table);
+	mm_unlock_page_table(mm);
+	page_cache_release(page);
+	return ret;
 }
 
 /*
- * We are called with the MM semaphore and page_table_lock
- * spinlock held to protect against concurrent faults in
- * multithreaded programs. 
+ * We are called with the MM semaphore and page table locked
+ * to protect against concurrent faults in multithreaded programs.
  */
 static int
-do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
-		pte_t *page_table, pmd_t *pmd, int write_access,
-		unsigned long addr)
+do_anonymous_page(struct pte_modify *pmod, struct mm_struct *mm,
+		struct vm_area_struct *vma, unsigned long addr,
+		int write_access, pte_t *page_table, pmd_t *pmd)
 {
-	pte_t entry;
-	struct page * page = ZERO_PAGE(addr);
-
-	/* Read-only mapping of ZERO_PAGE. */
-	entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
+	pte_t new;
+	struct page *page;
 
-	/* ..except if it's a write access */
+	/* XXX: is this really unlikely? The code previously suggested so */
 	if (write_access) {
 		/* Allocate our own private page. */
+		ptep_abort(pmod, mm, page_table);
 		pte_unmap(page_table);
-		spin_unlock(&mm->page_table_lock);
+		mm_unlock_page_table(mm);
 
 		if (unlikely(anon_vma_prepare(vma)))
 			goto no_mem;
@@ -1449,31 +1581,40 @@ do_anonymous_page(struct mm_struct *mm, 
 			goto no_mem;
 		clear_user_highpage(page, addr);
 
-		spin_lock(&mm->page_table_lock);
+		mm_lock_page_table(mm);
 		page_table = pte_offset_map(pmd, addr);
+		new = ptep_begin_modify(pmod, mm, page_table);
 
-		if (!pte_none(*page_table)) {
-			pte_unmap(page_table);
+		if (unlikely(!pte_none(new))) {
+			ptep_abort(pmod, mm, page_table);
+			page_cache_release(page);
+			goto out;
+		}
+		new = maybe_mkwrite(pte_mkdirty(mk_pte(page,
+						vma->vm_page_prot)), vma);
+		page_add_anon_rmap(page, vma, addr);
+		if (ptep_commit(pmod, mm, page_table, new)) {
+			page_remove_rmap(page);
 			page_cache_release(page);
-			spin_unlock(&mm->page_table_lock);
 			goto out;
 		}
+
 		mm->rss++;
-		entry = maybe_mkwrite(pte_mkdirty(mk_pte(page,
-							 vma->vm_page_prot)),
-				      vma);
-		lru_cache_add_active(page);
 		mark_page_accessed(page);
-		page_add_anon_rmap(page, vma, addr);
+		lru_cache_add_active(page);
+	} else {
+		/* Read-only mapping of ZERO_PAGE. */
+		page = ZERO_PAGE(addr);
+		new = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
+		if (ptep_commit(pmod, mm, page_table, new))
+			goto out;
 	}
 
-	set_pte(page_table, entry);
-	pte_unmap(page_table);
-
 	/* No need to invalidate - it was non-present before */
-	update_mmu_cache(vma, addr, entry);
-	spin_unlock(&mm->page_table_lock);
+	update_mmu_cache(vma, addr, new);
 out:
+	pte_unmap(page_table);
+	mm_unlock_page_table(mm);
 	return VM_FAULT_MINOR;
 no_mem:
 	return VM_FAULT_OOM;
@@ -1492,27 +1633,29 @@ no_mem:
  * spinlock held. Exit with the spinlock released.
  */
 static int
-do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
-	unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd)
+do_no_page(struct pte_modify *pmod, struct mm_struct *mm,
+		struct vm_area_struct *vma, unsigned long address,
+		int write_access, pte_t *page_table, pmd_t *pmd, pte_t pte)
 {
+	pte_t new;
 	struct page * new_page;
 	struct address_space *mapping = NULL;
-	pte_t entry;
 	int sequence = 0;
 	int ret = VM_FAULT_MINOR;
 	int anon = 0;
 
 	if (!vma->vm_ops || !vma->vm_ops->nopage)
-		return do_anonymous_page(mm, vma, page_table,
-					pmd, write_access, address);
+		return do_anonymous_page(pmod, mm, vma, address,
+				write_access, page_table, pmd);
+
+	ptep_abort(pmod, mm, page_table);
 	pte_unmap(page_table);
-	spin_unlock(&mm->page_table_lock);
+	mm_unlock_page_table(mm);
 
 	if (vma->vm_file) {
 		mapping = vma->vm_file->f_mapping;
 		sequence = atomic_read(&mapping->truncate_count);
 	}
-	smp_rmb();  /* Prevent CPU from reordering lock-free ->nopage() */
 retry:
 	new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
 
@@ -1539,20 +1682,32 @@ retry:
 		anon = 1;
 	}
 
-	spin_lock(&mm->page_table_lock);
+	mm_lock_page_table(mm);
+	/* XXX: investigate this further WRT lockless page table issues. */
 	/*
 	 * For a file-backed vma, someone could have truncated or otherwise
 	 * invalidated this page.  If unmap_mapping_range got called,
 	 * retry getting the page.
 	 */
-	if (mapping &&
-	      (unlikely(sequence != atomic_read(&mapping->truncate_count)))) {
-		sequence = atomic_read(&mapping->truncate_count);
-		spin_unlock(&mm->page_table_lock);
-		page_cache_release(new_page);
-		goto retry;
+	if (mapping) {
+		smp_rmb(); /* For truncate_count */
+		if (unlikely(sequence !=
+				atomic_read(&mapping->truncate_count))) {
+			sequence = atomic_read(&mapping->truncate_count);
+			mm_unlock_page_table(mm);
+			page_cache_release(new_page);
+			goto retry;
+		}
 	}
 	page_table = pte_offset_map(pmd, address);
+	new = ptep_begin_modify(pmod, mm, page_table);
+
+	/* Only go through if we didn't race with anybody else... */
+	if (unlikely(!pte_none(new))) {
+		/* One of our sibling threads was faster, back out. */
+		ptep_abort(pmod, mm, page_table);
+		goto out_failed;
+	}
 
 	/*
 	 * This silly early PAGE_DIRTY setting removes a race
@@ -1564,34 +1719,39 @@ retry:
 	 * so we can make it writable and dirty to avoid having to
 	 * handle that later.
 	 */
-	/* Only go through if we didn't race with anybody else... */
-	if (pte_none(*page_table)) {
-		if (!PageReserved(new_page))
-			++mm->rss;
-		flush_icache_page(vma, new_page);
-		entry = mk_pte(new_page, vma->vm_page_prot);
-		if (write_access)
-			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-		set_pte(page_table, entry);
-		if (anon) {
-			lru_cache_add_active(new_page);
-			page_add_anon_rmap(new_page, vma, address);
-		} else
-			page_add_file_rmap(new_page);
-		pte_unmap(page_table);
-	} else {
-		/* One of our sibling threads was faster, back out. */
-		pte_unmap(page_table);
-		page_cache_release(new_page);
-		spin_unlock(&mm->page_table_lock);
-		goto out;
-	}
+
+	flush_icache_page(vma, new_page);
+	new = mk_pte(new_page, vma->vm_page_prot);
+	if (write_access)
+		new = maybe_mkwrite(pte_mkdirty(new), vma);
+
+	if (anon) {
+		page_add_anon_rmap(new_page, vma, address);
+	} else
+		page_add_file_rmap(new_page);
+
+	if (ptep_commit(pmod, mm, page_table, new)) {
+		page_remove_rmap(new_page);
+		goto out_failed;
+	}
+	if (!PageReserved(new_page))
+		++mm->rss;
+	if (anon)
+		lru_cache_add_active(new_page);
+
+	pte_unmap(page_table);
 
 	/* no need to invalidate: a not-present page shouldn't be cached */
-	update_mmu_cache(vma, address, entry);
-	spin_unlock(&mm->page_table_lock);
+	update_mmu_cache(vma, address, new);
 out:
+	mm_unlock_page_table(mm);
 	return ret;
+
+out_failed:
+	pte_unmap(page_table);
+	mm_unlock_page_table(mm);
+	page_cache_release(new_page);
+	return VM_FAULT_MINOR;
 oom:
 	page_cache_release(new_page);
 	ret = VM_FAULT_OOM;
@@ -1603,8 +1763,9 @@ oom:
  * from the encoded file_pte if possible. This enables swappable
  * nonlinear vmas.
  */
-static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma,
-	unsigned long address, int write_access, pte_t *pte, pmd_t *pmd)
+static int do_file_page(struct pte_modify *pmod, struct mm_struct * mm,
+		struct vm_area_struct * vma, unsigned long address,
+		int write_access, pte_t *ptep, pmd_t *pmd, pte_t pte)
 {
 	unsigned long pgoff;
 	int err;
@@ -1616,14 +1777,27 @@ static int do_file_page(struct mm_struct
 	 */
 	if (!vma->vm_ops || !vma->vm_ops->populate || 
 			(write_access && !(vma->vm_flags & VM_SHARED))) {
-		pte_clear(pte);
-		return do_no_page(mm, vma, address, write_access, pte, pmd);
+		pte_clear(&pte);
+		if (ptep_commit(pmod, mm, ptep, pte)) {
+			pte_unmap(ptep);
+			mm_unlock_page_table(mm);
+			return VM_FAULT_MINOR;
+		}
+		pte = ptep_begin_modify(pmod, mm, ptep);
+		return do_no_page(pmod, mm, vma, address,
+				write_access, ptep, pmd, pte);
 	}
 
-	pgoff = pte_to_pgoff(*pte);
+	pgoff = pte_to_pgoff(ptep_atomic_read(ptep));
+	/* XXX: is this right? */
+	if (ptep_verify_finish(pmod, mm, ptep)) {
+		pte_unmap(ptep);
+		mm_unlock_page_table(mm);
+		return VM_FAULT_MINOR;
+	}
 
-	pte_unmap(pte);
-	spin_unlock(&mm->page_table_lock);
+	pte_unmap(ptep);
+	mm_unlock_page_table(mm);
 
 	err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0);
 	if (err == -ENOMEM)
@@ -1642,25 +1816,16 @@ static int do_file_page(struct mm_struct
  * with external mmu caches can use to update those (ie the Sparc or
  * PowerPC hashed page tables that act as extended TLBs).
  *
- * Note the "page_table_lock". It is to protect against kswapd removing
- * pages from under us. Note that kswapd only ever _removes_ pages, never
- * adds them. As such, once we have noticed that the page is not present,
- * we can drop the lock early.
- *
- * The adding of pages is protected by the MM semaphore (which we hold),
- * so we don't need to worry about a page being suddenly been added into
- * our VM.
- *
- * We enter with the pagetable spinlock held, we are supposed to
- * release it when done.
+ * We enter with the page table locked, and exit with it unlocked.
  */
 static inline int handle_pte_fault(struct mm_struct *mm,
 	struct vm_area_struct * vma, unsigned long address,
 	int write_access, pte_t *pte, pmd_t *pmd)
 {
+	struct pte_modify pmod;
 	pte_t entry;
 
-	entry = *pte;
+	entry = ptep_begin_modify(&pmod, mm, pte);
 	if (!pte_present(entry)) {
 		/*
 		 * If it truly wasn't present, we know that kswapd
@@ -1668,28 +1833,37 @@ static inline int handle_pte_fault(struc
 		 * drop the lock.
 		 */
 		if (pte_none(entry))
-			return do_no_page(mm, vma, address, write_access, pte, pmd);
+			return do_no_page(&pmod, mm, vma, address,
+					write_access, pte, pmd, entry);
 		if (pte_file(entry))
-			return do_file_page(mm, vma, address, write_access, pte, pmd);
-		return do_swap_page(mm, vma, address, pte, pmd, entry, write_access);
+			return do_file_page(&pmod, mm, vma, address,
+					write_access, pte, pmd, entry);
+
+		return do_swap_page(&pmod, mm, vma, address,
+				write_access, pte, pmd, entry);
 	}
 
 	if (write_access) {
 		if (!pte_write(entry))
-			return do_wp_page(mm, vma, address, pte, pmd, entry);
+			return do_wp_page(&pmod, mm, vma, address,
+							pte, pmd, entry);
 
 		entry = pte_mkdirty(entry);
 	}
 	entry = pte_mkyoung(entry);
-	ptep_set_access_flags(vma, address, pte, entry, write_access);
-	update_mmu_cache(vma, address, entry);
+	if (!ptep_commit_access_flush(&pmod, mm, vma, address,
+					pte, entry, write_access)) {
+		/* Success */
+		update_mmu_cache(vma, address, entry);
+	}
+
 	pte_unmap(pte);
-	spin_unlock(&mm->page_table_lock);
+	mm_unlock_page_table(mm);
 	return VM_FAULT_MINOR;
 }
 
 /*
- * By the time we get here, we already hold the mm semaphore
+ * This must be called with mmap_sem held for reading.
  */
 int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
 	unsigned long address, int write_access)
@@ -1698,26 +1872,22 @@ int handle_mm_fault(struct mm_struct *mm
 	pmd_t *pmd;
 
 	__set_current_state(TASK_RUNNING);
-	pgd = pgd_offset(mm, address);
-
 	inc_page_state(pgfault);
 
 	if (is_vm_hugetlb_page(vma))
 		return VM_FAULT_SIGBUS;	/* mapping truncation does this. */
 
-	/*
-	 * We need the page table lock to synchronize with kswapd
-	 * and the SMP-safe atomic PTE updates.
-	 */
-	spin_lock(&mm->page_table_lock);
+	mm_lock_page_table(mm);
+	pgd = pgd_offset(mm, address);
 	pmd = pmd_alloc(mm, pgd, address);
-
 	if (pmd) {
 		pte_t * pte = pte_alloc_map(mm, pmd, address);
 		if (pte)
-			return handle_pte_fault(mm, vma, address, write_access, pte, pmd);
+			return handle_pte_fault(mm, vma, address,
+						write_access, pte, pmd);
 	}
-	spin_unlock(&mm->page_table_lock);
+	mm_unlock_page_table(mm);
+
 	return VM_FAULT_OOM;
 }
 
@@ -1734,22 +1904,15 @@ pmd_t fastcall *__pmd_alloc(struct mm_st
 {
 	pmd_t *new;
 
-	spin_unlock(&mm->page_table_lock);
+	mm_unlock_page_table(mm);
 	new = pmd_alloc_one(mm, address);
-	spin_lock(&mm->page_table_lock);
+	mm_lock_page_table(mm);
 	if (!new)
 		return NULL;
 
-	/*
-	 * Because we dropped the lock, we should re-check the
-	 * entry, as somebody else could have populated it..
-	 */
-	if (pgd_present(*pgd)) {
+	if (pgd_test_and_populate(mm, pgd, new))
 		pmd_free(new);
-		goto out;
-	}
-	pgd_populate(mm, pgd, new);
-out:
+
 	return pmd_offset(pgd, address);
 }
 
@@ -1784,7 +1947,8 @@ struct page * vmalloc_to_page(void * vma
 	pgd_t *pgd = pgd_offset_k(addr);
 	pmd_t *pmd;
 	pte_t *ptep, pte;
-  
+
+	/* XXX: investigate */
 	if (!pgd_none(*pgd)) {
 		pmd = pmd_offset(pgd, addr);
 		if (!pmd_none(*pmd)) {
diff -puN include/asm-generic/pgtable.h~vm-abstract-pgtable-locking include/asm-generic/pgtable.h
--- linux-2.6/include/asm-generic/pgtable.h~vm-abstract-pgtable-locking	2004-10-29 16:28:08.000000000 +1000
+++ linux-2.6-npiggin/include/asm-generic/pgtable.h	2004-10-29 16:40:39.000000000 +1000
@@ -134,4 +134,302 @@ static inline void ptep_mkdirty(pte_t *p
 #define pgd_offset_gate(mm, addr)	pgd_offset(mm, addr)
 #endif
 
+#ifndef __ASSEMBLY__
+#ifdef __HAVE_ARCH_PTEP_CMPXCHG
+#define mm_lock_page_table(__mm)					\
+do {	/* lockless: ptes are updated via ptep_cmpxchg */		\
+} while (0)
+
+#define mm_unlock_page_table(__mm)					\
+do {	/* nothing to release: mm_lock_page_table took no lock */	\
+} while (0)
+
+#define mm_pin_pages(__mm)						\
+do {	/* pinning is done by taking mm->page_table_lock */		\
+	spin_lock(&(__mm)->page_table_lock);				\
+} while (0)
+
+#define mm_unpin_pages(__mm)						\
+do {	/* drops the page_table_lock taken by mm_pin_pages */		\
+	spin_unlock(&(__mm)->page_table_lock);				\
+} while (0)
+
+/* mm_lock_page_table doesn't actually take a lock, so this can be 0 */
+#define MM_RELOCK_CHECK 0
+
+struct pte_modify {
+	pte_t oldval;
+};
+
+#ifndef __HAVE_ARCH_PTEP_ATOMIC_READ
+#define ptep_atomic_read(__ptep)					\
+({									\
+	*__ptep;							\
+})
+#endif
+
+#define ptep_begin_modify(__pmod, __mm, __ptep)				\
+({									\
+ 	(void)__mm;							\
+ 	(__pmod)->oldval = ptep_atomic_read(__ptep);			\
+ 	(__pmod)->oldval;						\
+})
+
+#define ptep_abort(__pmod, __mm, __ptep)				\
+do {} while (0)
+
+#define ptep_commit(__pmod, __mm, __ptep, __newval)			\
+({									\
+	unlikely(ptep_cmpxchg(__ptep, (__pmod)->oldval, __newval));	\
+})
+
+#define ptep_commit_flush(__pmod, __mm, __vma, __address, __ptep, __newval) \
+({									\
+ 	int ret = ptep_cmpxchg(__ptep, (__pmod)->oldval, __newval);	\
+ 	/* XXX: worthwhile to see if cmpxchg has succeeded before flushing?	\
+ 	 * worthwhile to see if pte_val has changed before flushing?	\
+	 * like so?:							\
+	 *  if (!ret && pte_val((__pmod)->oldval) != pte_val(__newval)) \
+	 */								\
+	flush_tlb_page(__vma, __address);				\
+	unlikely(ret);							\
+})
+
+#define ptep_commit_access_flush(__pmod, __mm, __vma, __address, __ptep, __newval, __dirty) \
+({									\
+ 	int ret = ptep_cmpxchg(__ptep, (__pmod)->oldval, __newval);	\
+	flush_tlb_page(__vma, __address);				\
+	unlikely(ret);							\
+})
+
+#define ptep_commit_establish_flush(__pmod, __mm, __vma, __address, __ptep, __newval) \
+({									\
+	int ret = ptep_cmpxchg(__ptep, (__pmod)->oldval, __newval);	\
+	flush_tlb_page(__vma, __address);				\
+	unlikely(ret);							\
+})
+
+#define ptep_commit_clear(__pmod, __mm, __ptep, __newval, __oldval) 	\
+({									\
+	int ret = ptep_cmpxchg(__ptep, (__pmod)->oldval, __newval);	\
+	__oldval = (__pmod)->oldval;					\
+	unlikely(ret);							\
+})
+
+#define ptep_commit_clear_flush(__pmod, __mm, __vma, __address, __ptep, __newval, __oldval) \
+({									\
+	int ret = ptep_cmpxchg(__ptep, (__pmod)->oldval, __newval);	\
+	flush_tlb_page(__vma, __address);				\
+	__oldval = (__pmod)->oldval;					\
+	unlikely(ret);							\
+})
+
+#define ptep_commit_clear_flush_young(__pmod, __mm, __vma, __address, __ptep, __young) \
+({									\
+ 	pte_t oldval = (__pmod)->oldval;				\
+	int ret = ptep_cmpxchg(__ptep, oldval, pte_mkold(oldval)); 	\
+	*__young = pte_young(oldval);					\
+	if (likely(!ret) && *__young)					\
+ 		flush_tlb_page(__vma, __address);			\
+	unlikely(ret);							\
+})
+
+#define ptep_commit_clear_flush_dirty(__pmod, __mm, __vma, __address, __ptep, __dirty) \
+({									\
+ 	pte_t oldval = (__pmod)->oldval;				\
+	int ret = ptep_cmpxchg(__ptep, oldval, pte_mkclean(oldval)); 	\
+	*__dirty = pte_dirty(oldval);					\
+	if (likely(!ret) && *__dirty)					\
+ 		flush_tlb_page(__vma, __address);			\
+	unlikely(ret);							\
+})
+
+#define ptep_verify(__pmod, __mm, __ptep)				\
+({									\
+ 	/* Prevent writes leaking forward and reads leaking back */	\
+ 	smp_mb();							\
+	unlikely(pte_val((__pmod)->oldval) != pte_val(ptep_atomic_read(__ptep))); \
+})
+
+#define ptep_verify_finish(__pmod, __mm, __ptep)			\
+	ptep_verify(__pmod, __mm, __ptep)
+
+#else /* __HAVE_ARCH_PTEP_CMPXCHG */ /* GENERIC_PTEP_LOCKING follows */
+/* Use the generic mm->page_table_lock serialised scheme */
+/*
+ * XXX: can we make use of this?
+ * At the moment, yes because some code is holding a ptep_begin_modify
+ * transaction across dropping and retaking the mm_lock_page_table (see
+ * mm/memory.c do_??? pagefault routines). A pte cmpxchg system can take
+ * advantage of this (holding the transaction open), but it possibly isn't
+ * exactly clean, and will blow up if ptep_begin_modify takes a lock itself.
+ *
+ * And ptep_begin_modify would probably like to take a lock if an architecture
+ * wants to do per-pte locking (ppc64, maybe).
+ */
+#define MM_RELOCK_CHECK 1
+
+/*
+ * Lock and unlock the pagetable for walking. This guarantees we can safely
+ * walk pgd->pmd->pte, and only that.
+ */
+#define mm_lock_page_table(__mm)					\
+do {									\
+	spin_lock(&(__mm)->page_table_lock);				\
+} while (0)
+
+#define mm_unlock_page_table(__mm)					\
+do {									\
+	spin_unlock(&(__mm)->page_table_lock);				\
+} while (0)
+
+/*
+ * XXX: pin and unpin may be tricky without a page_table_lock.
+ * Use vma locks maybe? Pte page locks? Pte bit?
+ */
+/*
+ * Prevent pages mapped into __mm from being freed.
+ * Taken inside mm_lock_page_table
+ */
+#define mm_pin_pages(__mm)						\
+do {									\
+	(void)__mm;							\
+} while (0)
+
+#define mm_unpin_pages(__mm)						\
+do {									\
+	(void)__mm;							\
+} while (0)
+
+#define ptep_atomic_read(__ptep)					\
+({									\
+	*__ptep;							\
+})
+
+/* XXX: will we want pmd/pgd_atomic_read? Yes. (big job) */
+
+/*
+ * A pte modification sequence goes something like this:
+ * struct pte_modify pmod;
+ * pte_t pte;
+ *
+ * mm_lock_page_table(mm);
+ * // walk page table to find ptep
+ * pte = ptep_begin_modify(&pmod, mm, ptep)
+ * if (!pte is valid) {
+ *	ptep_abort(&pmod, mm, ptep); // XXX: isn't yet part of the API.
+ *	goto out;
+ * }
+ * // modify pte, or make one that we want to install
+ *
+ * if (ptep_commit(&pmod, mm, ptep, pte)) {
+ * 	// commit failed
+ * 	goto out;
+ * }
+ *
+ * // At this point, the pte replaced by the commit is guaranteed to be the
+ * // same as the one returned by ptep_begin_modify, although hardware bits
+ * // may have changed. The other ptep_commit_* functions can provide
+ * // protection against hardware bits changing.
+ */
+struct pte_modify {
+};
+
+#define ptep_begin_modify(__pmod, __mm, __ptep)				\
+({									\
+ 	(void)__pmod;							\
+ 	(void)__mm;							\
+ 	ptep_atomic_read(__ptep);					\
+})
+
+#define ptep_abort(__pmod, __mm, __ptep)				\
+do {} while (0)
+
+#define ptep_commit(__pmod, __mm, __ptep, __newval)			\
+({									\
+	set_pte_atomic(__ptep, __newval);				\
+	0;								\
+})
+
+#define ptep_commit_flush(__pmod, __mm, __vma, __address, __ptep, __newval) \
+({									\
+	set_pte_atomic(__ptep, __newval);				\
+	flush_tlb_page(__vma, __address);				\
+	0;								\
+})
+
+#define ptep_commit_access_flush(__pmod, __mm, __vma, __address, __ptep, __newval, __dirty) \
+({									\
+ 	ptep_set_access_flags(__vma, __address, __ptep, __newval, __dirty); \
+	0;								\
+})
+
+#define ptep_commit_establish_flush(__pmod, __mm, __vma, __address, __ptep, __newval) \
+({									\
+ 	ptep_establish(__vma, __address, __ptep, __newval);		\
+	0;								\
+})
+
+#define ptep_commit_clear(__pmod, __mm, __ptep, __newval, __oldval) \
+({									\
+ 	__oldval = ptep_get_and_clear(__ptep);				\
+ 	set_pte(__ptep, __newval);					\
+	0;								\
+})
+
+#define ptep_commit_clear_flush(__pmod, __mm, __vma, __address, __ptep, __newval, __oldval) \
+({									\
+ 	__oldval = ptep_clear_flush(__vma, __address, __ptep);		\
+ 	set_pte(__ptep, __newval);					\
+	0;								\
+})
+
+#define ptep_commit_clear_flush_young(__pmod, __mm, __vma, __address, __ptep, __young) \
+({									\
+ 	*__young = ptep_clear_flush_young(__vma, __address, __ptep);	\
+ 	0;								\
+})
+
+#define ptep_commit_clear_flush_dirty(__pmod, __mm, __vma, __address, __ptep, __dirty) \
+({									\
+ 	*__dirty = ptep_clear_flush_dirty(__vma, __address, __ptep);	\
+	0;								\
+})
+
+#define ptep_verify(__pmod, __mm, __ptep)				\
+({									\
+ 	(void)__pmod;							\
+	0;								\
+})
+
+#define ptep_verify_finish(__pmod, __mm, __ptep)			\
+	ptep_verify(__pmod, __mm, __ptep)
+
+#define pgd_test_and_populate(__mm, ___pgd, ___pmd)			\
+({									\
+	int ret = pgd_present(*(___pgd));				\
+ 	if (likely(!ret))						\
+ 		pgd_populate(__mm, ___pgd, ___pmd);			\
+ 	unlikely(ret);							\
+})
+
+#define pmd_test_and_populate(__mm, ___pmd, ___page)			\
+({									\
+	int ret = pmd_present(*(___pmd));				\
+ 	if (likely(!ret))						\
+ 		pmd_populate(__mm, ___pmd, ___page);			\
+ 	unlikely(ret);							\
+})
+
+#define pmd_test_and_populate_kernel(__mm, ___pmd, ___page)		\
+({									\
+	int ret = pmd_present(*(___pmd));				\
+ 	if (likely(!ret))						\
+ 		pmd_populate_kernel(__mm, ___pmd, ___page);		\
+ 	unlikely(ret);							\
+})
+
+#endif /* GENERIC_PTEP_LOCKING */
+#endif /* !__ASSEMBLY__ */
+
 #endif /* _ASM_GENERIC_PGTABLE_H */
diff -puN kernel/fork.c~vm-abstract-pgtable-locking kernel/fork.c
--- linux-2.6/kernel/fork.c~vm-abstract-pgtable-locking	2004-10-29 16:28:08.000000000 +1000
+++ linux-2.6-npiggin/kernel/fork.c	2004-10-29 16:28:08.000000000 +1000
@@ -227,7 +227,6 @@ static inline int dup_mmap(struct mm_str
 		 * link in first so that swapoff can see swap entries,
 		 * and try_to_unmap_one's find_vma find the new vma.
 		 */
-		spin_lock(&mm->page_table_lock);
 		*pprev = tmp;
 		pprev = &tmp->vm_next;
 
@@ -237,7 +236,6 @@ static inline int dup_mmap(struct mm_str
 
 		mm->map_count++;
 		retval = copy_page_range(mm, current->mm, tmp);
-		spin_unlock(&mm->page_table_lock);
 
 		if (tmp->vm_ops && tmp->vm_ops->open)
 			tmp->vm_ops->open(tmp);
@@ -446,7 +444,15 @@ static int copy_mm(unsigned long clone_f
 		 * allows optimizing out ipis; the tlb_gather_mmu code
 		 * is an example.
 		 */
+		/*
+		 * XXX: I think this is only needed for sparc64's tlb and
+		 * context switching code - but sparc64 is in big trouble
+		 * anyway, because tlb_gather_mmu can now be done without
+		 * holding the page table lock.
+		 */
+#if 0
 		spin_unlock_wait(&oldmm->page_table_lock);
+#endif
 		goto good_mm;
 	}
 
diff -puN kernel/futex.c~vm-abstract-pgtable-locking kernel/futex.c
--- linux-2.6/kernel/futex.c~vm-abstract-pgtable-locking	2004-10-29 16:28:08.000000000 +1000
+++ linux-2.6-npiggin/kernel/futex.c	2004-10-29 16:28:08.000000000 +1000
@@ -204,15 +204,13 @@ static int get_futex_key(unsigned long u
 	/*
 	 * Do a quick atomic lookup first - this is the fastpath.
 	 */
-	spin_lock(&current->mm->page_table_lock);
 	page = follow_page(mm, uaddr, 0);
 	if (likely(page != NULL)) {
 		key->shared.pgoff =
 			page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
-		spin_unlock(&current->mm->page_table_lock);
+		follow_page_finish(mm, uaddr);
 		return 0;
 	}
-	spin_unlock(&current->mm->page_table_lock);
 
 	/*
 	 * Do it the general way.
@@ -505,7 +503,7 @@ static int futex_wait(unsigned long uadd
 	/*
 	 * Now the futex is queued and we have checked the data, we
 	 * don't want to hold mmap_sem while we sleep.
-	 */	
+	 */
 	up_read(&current->mm->mmap_sem);
 
 	/*
@@ -520,6 +518,7 @@ static int futex_wait(unsigned long uadd
 	/* add_wait_queue is the barrier after __set_current_state. */
 	__set_current_state(TASK_INTERRUPTIBLE);
 	add_wait_queue(&q.waiters, &wait);
+
 	/*
 	 * !list_empty() is safe here without any lock.
 	 * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
diff -puN include/linux/mm.h~vm-abstract-pgtable-locking include/linux/mm.h
--- linux-2.6/include/linux/mm.h~vm-abstract-pgtable-locking	2004-10-29 16:28:08.000000000 +1000
+++ linux-2.6-npiggin/include/linux/mm.h	2004-10-29 16:28:08.000000000 +1000
@@ -758,6 +758,7 @@ extern struct vm_area_struct *find_exten
 extern struct page * vmalloc_to_page(void *addr);
 extern struct page * follow_page(struct mm_struct *mm, unsigned long address,
 		int write);
+extern void follow_page_finish(struct mm_struct *mm, unsigned long address);
 int remap_pfn_range(struct vm_area_struct *, unsigned long,
 		unsigned long, unsigned long, pgprot_t);
 
diff -puN include/asm-generic/tlb.h~vm-abstract-pgtable-locking include/asm-generic/tlb.h
--- linux-2.6/include/asm-generic/tlb.h~vm-abstract-pgtable-locking	2004-10-29 16:28:08.000000000 +1000
+++ linux-2.6-npiggin/include/asm-generic/tlb.h	2004-10-29 16:28:08.000000000 +1000
@@ -53,7 +53,13 @@ DECLARE_PER_CPU(struct mmu_gather, mmu_g
 static inline struct mmu_gather *
 tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
 {
-	struct mmu_gather *tlb = &per_cpu(mmu_gathers, smp_processor_id());
+	/*
+	 * XXX: Now calling this without the page_table_lock!
+	 * This will blow up at least sparc64 (see sparc64's switch_mm
+	 * and kernel/fork.c:copy_mm for more details).
+	 */
+	int cpu = get_cpu();
+	struct mmu_gather *tlb = &per_cpu(mmu_gathers, cpu);
 
 	tlb->mm = mm;
 
@@ -97,6 +103,7 @@ tlb_finish_mmu(struct mmu_gather *tlb, u
 
 	/* keep the page table cache within bounds */
 	check_pgt_cache();
+	put_cpu();
 }
 
 static inline unsigned int
diff -puN mm/mmap.c~vm-abstract-pgtable-locking mm/mmap.c
--- linux-2.6/mm/mmap.c~vm-abstract-pgtable-locking	2004-10-29 16:28:08.000000000 +1000
+++ linux-2.6-npiggin/mm/mmap.c	2004-10-29 16:28:08.000000000 +1000
@@ -1575,14 +1575,12 @@ static void free_dangling_pgtables_regio
 {
 	struct mmu_gather *tlb;
 
-	spin_lock(&mm->page_table_lock);
 	tlb = tlb_gather_mmu(mm, 0);
 	if (is_hugepage_only_range(start, end - start))
 		hugetlb_free_pgtables(tlb, prev, start, end);
 	else
 		free_pgtables(tlb, prev, start, end);
 	tlb_finish_mmu(tlb, start, end);
-	spin_unlock(&mm->page_table_lock);
 }
 
 /*
@@ -1866,11 +1864,9 @@ void exit_mmap(struct mm_struct *mm)
 	 * Finally, free the pagetables. By this point, nothing should
 	 * refer to them.
 	 */
-	spin_lock(&mm->page_table_lock);
 	tlb = tlb_gather_mmu(mm, 1);
 	clear_page_tables(tlb, FIRST_USER_PGD_NR, USER_PTRS_PER_PGD);
 	tlb_finish_mmu(tlb, 0, MM_VM_SIZE(mm));
-	spin_unlock(&mm->page_table_lock);
 }
 
 /* Insert vm structure into process list sorted by address
diff -puN mm/rmap.c~vm-abstract-pgtable-locking mm/rmap.c
--- linux-2.6/mm/rmap.c~vm-abstract-pgtable-locking	2004-10-29 16:28:08.000000000 +1000
+++ linux-2.6-npiggin/mm/rmap.c	2004-10-29 16:28:08.000000000 +1000
@@ -32,7 +32,7 @@
  *   page->flags PG_locked (lock_page)
  *     mapping->i_mmap_lock
  *       anon_vma->lock
- *         mm->page_table_lock
+ *         mm_lock_page_table(mm)
  *           zone->lru_lock (in mark_page_accessed)
  *           swap_list_lock (in swap_free etc's swap_info_get)
  *             mmlist_lock (in mmput, drain_mmlist and others)
@@ -101,7 +101,11 @@ int anon_vma_prepare(struct vm_area_stru
 			locked = NULL;
 		}
 
-		/* page_table_lock to protect against threads */
+		/* protect against threads */
+		/*
+		 * XXX: this only needs to serialise against itself.
+		 * Perhaps we should rename the page table lock at some point.
+		 */
 		spin_lock(&mm->page_table_lock);
 		if (likely(!vma->anon_vma)) {
 			vma->anon_vma = anon_vma;
@@ -256,6 +260,8 @@ unsigned long page_address_in_vma(struct
 static int page_referenced_one(struct page *page,
 	struct vm_area_struct *vma, unsigned int *mapcount)
 {
+	struct pte_modify pmod;
+	pte_t new;
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long address;
 	pgd_t *pgd;
@@ -269,7 +275,7 @@ static int page_referenced_one(struct pa
 	if (address == -EFAULT)
 		goto out;
 
-	spin_lock(&mm->page_table_lock);
+	mm_lock_page_table(mm);
 
 	pgd = pgd_offset(mm, address);
 	if (!pgd_present(*pgd))
@@ -280,14 +286,19 @@ static int page_referenced_one(struct pa
 		goto out_unlock;
 
 	pte = pte_offset_map(pmd, address);
-	if (!pte_present(*pte))
-		goto out_unmap;
-
-	if (page_to_pfn(page) != pte_pfn(*pte))
-		goto out_unmap;
+	new = ptep_begin_modify(&pmod, mm, pte);
+	if (!pte_present(new))
+		goto out_abort;
+
+	/* 
+	 * This doesn't need mm_pin_pages, because the anonvma locks
+	 * serialise against try_to_unmap.
+	 */
+	if (page_to_pfn(page) != pte_pfn(new))
+		goto out_abort;
 
-	if (ptep_clear_flush_young(vma, address, pte))
-		referenced++;
+	/* Doesn't matter much if this fails */
+	ptep_commit_clear_flush_young(&pmod, mm, vma, address, pte, &referenced);
 
 	if (mm != current->mm && has_swap_token(mm))
 		referenced++;
@@ -297,9 +308,13 @@ static int page_referenced_one(struct pa
 out_unmap:
 	pte_unmap(pte);
 out_unlock:
-	spin_unlock(&mm->page_table_lock);
+	mm_unlock_page_table(mm);
 out:
 	return referenced;
+
+out_abort:
+	ptep_abort(&pmod, mm, pte);
+	goto out_unmap;
 }
 
 static int page_referenced_anon(struct page *page)
@@ -420,8 +435,6 @@ int page_referenced(struct page *page, i
  * @page:	the page to add the mapping to
  * @vma:	the vm area in which the mapping is added
  * @address:	the user virtual address mapped
- *
- * The caller needs to hold the mm->page_table_lock.
  */
 void page_add_anon_rmap(struct page *page,
 	struct vm_area_struct *vma, unsigned long address)
@@ -448,8 +461,6 @@ void page_add_anon_rmap(struct page *pag
 /**
  * page_add_file_rmap - add pte mapping to a file page
  * @page: the page to add the mapping to
- *
- * The caller needs to hold the mm->page_table_lock.
  */
 void page_add_file_rmap(struct page *page)
 {
@@ -464,8 +475,6 @@ void page_add_file_rmap(struct page *pag
 /**
  * page_remove_rmap - take down pte mapping from a page
  * @page: page to remove mapping from
- *
- * Caller needs to hold the mm->page_table_lock.
  */
 void page_remove_rmap(struct page *page)
 {
@@ -494,12 +503,14 @@ void page_remove_rmap(struct page *page)
  */
 static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
 {
+	struct pte_modify pmod;
+	swp_entry_t entry;
+	pte_t new, old;
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long address;
 	pgd_t *pgd;
 	pmd_t *pmd;
 	pte_t *pte;
-	pte_t pteval;
 	int ret = SWAP_AGAIN;
 
 	if (!mm->rss)
@@ -509,10 +520,10 @@ static int try_to_unmap_one(struct page 
 		goto out;
 
 	/*
-	 * We need the page_table_lock to protect us from page faults,
-	 * munmap, fork, etc...
+	 * We need to lock the page table to protect from page faults,
+	 * munmap, fork, exit, etc...
 	 */
-	spin_lock(&mm->page_table_lock);
+	mm_lock_page_table(mm);
 
 	pgd = pgd_offset(mm, address);
 	if (!pgd_present(*pgd))
@@ -523,27 +534,37 @@ static int try_to_unmap_one(struct page 
 		goto out_unlock;
 
 	pte = pte_offset_map(pmd, address);
-	if (!pte_present(*pte))
-		goto out_unmap;
+	new = ptep_begin_modify(&pmod, mm, pte);
+	if (!pte_present(new))
+		goto out_abort;
 
-	if (page_to_pfn(page) != pte_pfn(*pte))
-		goto out_unmap;
+	/*
+	 * XXX: don't need to pin pages here because anonvma locking means
+	 * this page can't come out from underneath us (ie. we serialise
+	 * with other try_to_unmap's).
+	 */
+	if (page_to_pfn(page) != pte_pfn(new))
+		goto out_abort;
 
 	/*
 	 * If the page is mlock()d, we cannot swap it out.
 	 * If it's recently referenced (perhaps page_referenced
 	 * skipped over this mm) then we should reactivate it.
 	 */
-	if ((vma->vm_flags & (VM_LOCKED|VM_RESERVED)) ||
-			ptep_clear_flush_young(vma, address, pte)) {
+	if (vma->vm_flags & (VM_LOCKED|VM_RESERVED)) {
 		ret = SWAP_FAIL;
-		goto out_unmap;
+		goto out_abort;
+	}
+
+	if (pte_young(new)) {
+		ret = SWAP_AGAIN;
+		goto out_abort;
 	}
 
 	/*
 	 * Don't pull an anonymous page out from under get_user_pages.
-	 * GUP carefully breaks COW and raises page count (while holding
-	 * page_table_lock, as we have here) to make sure that the page
+	 * GUP carefully breaks COW and raises page count (while the page
+	 * table is locked, as we have here) to make sure that the page
 	 * cannot be freed.  If we unmap that page here, a user write
 	 * access to the virtual address will bring back the page, but
 	 * its raised count will (ironically) be taken to mean it's not
@@ -555,22 +576,27 @@ static int try_to_unmap_one(struct page 
 	 * to drop page lock: its reference to the page stops existing
 	 * ptes from being unmapped, so swapoff can make progress.
 	 */
+	/*
+	 * XXX: this should be ok, as GUP is doing atomic checking...?
+	 * Well maybe not because neither are serialised. But hmm, GUP
+	 * and friends need to pin pages anyway, so it may be that these
+	 * paths will actually get serialised even without the page table
+	 * lock.
+	 */
+	/* XXX: Should this be enough? (Obviously a finer lock would be nice) */
+	mm_pin_pages(mm);
 	if (PageSwapCache(page) &&
 	    page_count(page) != page_mapcount(page) + 2) {
-		ret = SWAP_FAIL;
-		goto out_unmap;
+		mm_unpin_pages(mm);
+		ret = SWAP_AGAIN;
+		goto out_abort;
 	}
 
 	/* Nuke the page table entry. */
 	flush_cache_page(vma, address);
-	pteval = ptep_clear_flush(vma, address, pte);
-
-	/* Move the dirty bit to the physical page now the pte is gone. */
-	if (pte_dirty(pteval))
-		set_page_dirty(page);
-
+	pte_clear(&new);
 	if (PageAnon(page)) {
-		swp_entry_t entry = { .val = page->private };
+		entry.val = page->private;
 		/*
 		 * Store the swap location in the pte.
 		 * See handle_pte_fault() ...
@@ -582,9 +608,22 @@ static int try_to_unmap_one(struct page 
 			list_add(&mm->mmlist, &init_mm.mmlist);
 			spin_unlock(&mmlist_lock);
 		}
-		set_pte(pte, swp_entry_to_pte(entry));
-		BUG_ON(pte_file(*pte));
+		new = swp_entry_to_pte(entry);
+		BUG_ON(pte_file(new));
+	}
+
+	if (ptep_commit_clear_flush(&pmod, mm, vma, address, pte, new, old)) {
+		ret = SWAP_AGAIN;
+		mm_unpin_pages(mm);
+		if (PageAnon(page))
+			free_swap_and_cache(entry);
+		goto out_unmap;
 	}
+	mm_unpin_pages(mm);
+
+	/* Move the dirty bit to the physical page now the pte is gone. */
+	if (pte_dirty(old))
+		set_page_dirty(page);
 
 	mm->rss--;
 	page_remove_rmap(page);
@@ -593,9 +632,13 @@ static int try_to_unmap_one(struct page 
 out_unmap:
 	pte_unmap(pte);
 out_unlock:
-	spin_unlock(&mm->page_table_lock);
+	mm_unlock_page_table(mm);
 out:
 	return ret;
+
+out_abort:
+	ptep_abort(&pmod, mm, pte);
+	goto out_unmap;
 }
 
 /*
@@ -627,18 +670,11 @@ static void try_to_unmap_cluster(unsigne
 	pgd_t *pgd;
 	pmd_t *pmd;
 	pte_t *pte;
-	pte_t pteval;
 	struct page *page;
 	unsigned long address;
 	unsigned long end;
 	unsigned long pfn;
 
-	/*
-	 * We need the page_table_lock to protect us from page faults,
-	 * munmap, fork, etc...
-	 */
-	spin_lock(&mm->page_table_lock);
-
 	address = (vma->vm_start + cursor) & CLUSTER_MASK;
 	end = address + CLUSTER_SIZE;
 	if (address < vma->vm_start)
@@ -646,6 +682,12 @@ static void try_to_unmap_cluster(unsigne
 	if (end > vma->vm_end)
 		end = vma->vm_end;
 
+	/*
+	 * We need to lock the page table to protect from page faults,
+	 * munmap, fork, exit, etc...
+	 */
+	mm_lock_page_table(mm);
+
 	pgd = pgd_offset(mm, address);
 	if (!pgd_present(*pgd))
 		goto out_unlock;
@@ -656,44 +698,57 @@ static void try_to_unmap_cluster(unsigne
 
 	for (pte = pte_offset_map(pmd, address);
 			address < end; pte++, address += PAGE_SIZE) {
+		struct pte_modify pmod;
+		pte_t new, old;
 
-		if (!pte_present(*pte))
-			continue;
+again:
+		new = ptep_begin_modify(&pmod, mm, pte);
+
+		if (!pte_present(new))
+			goto out_abort;
 
-		pfn = pte_pfn(*pte);
+		pfn = pte_pfn(new);
 		if (!pfn_valid(pfn))
-			continue;
+			goto out_abort;
 
 		page = pfn_to_page(pfn);
 		BUG_ON(PageAnon(page));
 		if (PageReserved(page))
-			continue;
+			goto out_abort;
 
-		if (ptep_clear_flush_young(vma, address, pte))
-			continue;
+		if (pte_young(new))
+			goto out_abort;
 
 		/* Nuke the page table entry. */
 		flush_cache_page(vma, address);
-		pteval = ptep_clear_flush(vma, address, pte);
+		pte_clear(&new);
 
 		/* If nonlinear, store the file page offset in the pte. */
 		if (page->index != linear_page_index(vma, address))
-			set_pte(pte, pgoff_to_pte(page->index));
+			new = pgoff_to_pte(page->index);
+
+		if (ptep_commit_clear_flush(&pmod, mm, vma, address, pte, new, old))
+			goto again;
+		flush_tlb_page(vma, address);
 
 		/* Move the dirty bit to the physical page now the pte is gone. */
-		if (pte_dirty(pteval))
+		if (pte_dirty(old))
 			set_page_dirty(page);
 
 		page_remove_rmap(page);
 		page_cache_release(page);
 		mm->rss--;
 		(*mapcount)--;
+
+		continue;
+out_abort:
+		ptep_abort(&pmod, mm, pte);
 	}
 
 	pte_unmap(pte);
 
 out_unlock:
-	spin_unlock(&mm->page_table_lock);
+	mm_unlock_page_table(mm);
 }
 
 static int try_to_unmap_anon(struct page *page)
diff -puN mm/mremap.c~vm-abstract-pgtable-locking mm/mremap.c
--- linux-2.6/mm/mremap.c~vm-abstract-pgtable-locking	2004-10-29 16:28:08.000000000 +1000
+++ linux-2.6-npiggin/mm/mremap.c	2004-10-29 16:28:08.000000000 +1000
@@ -99,7 +99,7 @@ move_one_page(struct vm_area_struct *vma
 		mapping = vma->vm_file->f_mapping;
 		spin_lock(&mapping->i_mmap_lock);
 	}
-	spin_lock(&mm->page_table_lock);
+	mm_lock_page_table(mm);
 
 	src = get_one_pte_map_nested(mm, old_addr);
 	if (src) {
@@ -115,21 +115,28 @@ move_one_page(struct vm_area_struct *vma
 				spin_unlock(&mapping->i_mmap_lock);
 			dst = alloc_one_pte_map(mm, new_addr);
 			if (mapping && !spin_trylock(&mapping->i_mmap_lock)) {
-				spin_unlock(&mm->page_table_lock);
+				mm_unlock_page_table(mm);
 				spin_lock(&mapping->i_mmap_lock);
-				spin_lock(&mm->page_table_lock);
+				mm_lock_page_table(mm);
 			}
 			src = get_one_pte_map_nested(mm, old_addr);
 		}
+
 		/*
-		 * Since alloc_one_pte_map can drop and re-acquire
-		 * page_table_lock, we should re-check the src entry...
+		 * Since alloc_one_pte_map can drop and re-lock the
+		 * page table, we should re-check the src entry...
 		 */
 		if (src) {
 			if (dst) {
-				pte_t pte;
-				pte = ptep_clear_flush(vma, old_addr, src);
-				set_pte(dst, pte);
+				struct pte_modify pmod;
+				pte_t new, old;
+again:
+				new = ptep_begin_modify(&pmod, mm, src);
+				pte_clear(&new);
+				if (ptep_commit_clear_flush(&pmod, mm, vma,
+						old_addr, src, new, old))
+					goto again;
+				set_pte(dst, old);
 			} else
 				error = -ENOMEM;
 			pte_unmap_nested(src);
@@ -137,7 +144,7 @@ move_one_page(struct vm_area_struct *vma
 		if (dst)
 			pte_unmap(dst);
 	}
-	spin_unlock(&mm->page_table_lock);
+	mm_unlock_page_table(mm);
 	if (mapping)
 		spin_unlock(&mapping->i_mmap_lock);
 	return error;
diff -puN mm/msync.c~vm-abstract-pgtable-locking mm/msync.c
--- linux-2.6/mm/msync.c~vm-abstract-pgtable-locking	2004-10-29 16:28:08.000000000 +1000
+++ linux-2.6-npiggin/mm/msync.c	2004-10-29 16:28:08.000000000 +1000
@@ -18,27 +18,40 @@
 #include <asm/tlbflush.h>
 
 /*
- * Called with mm->page_table_lock held to protect against other
+ * Called with the page table locked to protect against other
  * threads/the swapper from ripping pte's out from under us.
  */
-static int filemap_sync_pte(pte_t *ptep, struct vm_area_struct *vma,
-	unsigned long address, unsigned int flags)
-{
-	pte_t pte = *ptep;
-	unsigned long pfn = pte_pfn(pte);
+static int filemap_sync_pte(struct mm_struct *mm, pte_t *ptep,
+		struct vm_area_struct *vma, unsigned long address,
+		unsigned int flags)
+{
+	struct pte_modify pmod;
+	pte_t new;
+	unsigned long pfn;
 	struct page *page;
+	int dirty;
+
+again:
+	new = ptep_begin_modify(&pmod, mm, ptep);
 
-	if (pte_present(pte) && pfn_valid(pfn)) {
+	pfn = pte_pfn(new);
+	if (pte_present(new) && pfn_valid(pfn)) {
 		page = pfn_to_page(pfn);
-		if (!PageReserved(page) &&
-		    (ptep_clear_flush_dirty(vma, address, ptep) ||
-		     page_test_and_clear_dirty(page)))
-			set_page_dirty(page);
+		if (!PageReserved(page)) {
+			new = pte_mkclean(new);
+			if (ptep_commit_clear_flush_dirty(&pmod, mm, vma, address, ptep, &dirty))
+					goto again;
+			if (dirty || page_test_and_clear_dirty(page))
+				set_page_dirty(page);
+			goto out;
+		}
 	}
+	ptep_abort(&pmod, mm, ptep);
+out:
 	return 0;
 }
 
-static int filemap_sync_pte_range(pmd_t * pmd,
+static int filemap_sync_pte_range(struct mm_struct *mm, pmd_t * pmd,
 	unsigned long address, unsigned long end, 
 	struct vm_area_struct *vma, unsigned int flags)
 {
@@ -52,22 +65,25 @@ static int filemap_sync_pte_range(pmd_t 
 		pmd_clear(pmd);
 		return 0;
 	}
+
+	mm_pin_pages(mm); /* Required for filemap_sync_pte */
 	pte = pte_offset_map(pmd, address);
 	if ((address & PMD_MASK) != (end & PMD_MASK))
 		end = (address & PMD_MASK) + PMD_SIZE;
 	error = 0;
 	do {
-		error |= filemap_sync_pte(pte, vma, address, flags);
+		error |= filemap_sync_pte(mm, pte, vma, address, flags);
 		address += PAGE_SIZE;
 		pte++;
 	} while (address && (address < end));
 
 	pte_unmap(pte - 1);
+	mm_unpin_pages(mm);
 
 	return error;
 }
 
-static inline int filemap_sync_pmd_range(pgd_t * pgd,
+static inline int filemap_sync_pmd_range(struct mm_struct *mm, pgd_t * pgd,
 	unsigned long address, unsigned long end, 
 	struct vm_area_struct *vma, unsigned int flags)
 {
@@ -86,7 +102,7 @@ static inline int filemap_sync_pmd_range
 		end = (address & PGDIR_MASK) + PGDIR_SIZE;
 	error = 0;
 	do {
-		error |= filemap_sync_pte_range(pmd, address, end, vma, flags);
+		error |= filemap_sync_pte_range(mm, pmd, address, end, vma, flags);
 		address = (address + PMD_SIZE) & PMD_MASK;
 		pmd++;
 	} while (address && (address < end));
@@ -103,7 +119,7 @@ static int filemap_sync(struct vm_area_s
 	/* Aquire the lock early; it may be possible to avoid dropping
 	 * and reaquiring it repeatedly.
 	 */
-	spin_lock(&vma->vm_mm->page_table_lock);
+	mm_lock_page_table(vma->vm_mm);
 
 	dir = pgd_offset(vma->vm_mm, address);
 	flush_cache_range(vma, address, end);
@@ -117,7 +133,7 @@ static int filemap_sync(struct vm_area_s
 	if (address >= end)
 		BUG();
 	do {
-		error |= filemap_sync_pmd_range(dir, address, end, vma, flags);
+		error |= filemap_sync_pmd_range(vma->vm_mm, dir, address, end, vma, flags);
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		dir++;
 	} while (address && (address < end));
@@ -127,7 +143,7 @@ static int filemap_sync(struct vm_area_s
 	 */
 	flush_tlb_range(vma, end - size, end);
  out:
-	spin_unlock(&vma->vm_mm->page_table_lock);
+	mm_unlock_page_table(vma->vm_mm);
 
 	return error;
 }
diff -puN mm/mprotect.c~vm-abstract-pgtable-locking mm/mprotect.c
--- linux-2.6/mm/mprotect.c~vm-abstract-pgtable-locking	2004-10-29 16:28:08.000000000 +1000
+++ linux-2.6-npiggin/mm/mprotect.c	2004-10-29 16:28:08.000000000 +1000
@@ -26,7 +26,7 @@
 #include <asm/tlbflush.h>
 
 static inline void
-change_pte_range(pmd_t *pmd, unsigned long address,
+change_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long address,
 		unsigned long size, pgprot_t newprot)
 {
 	pte_t * pte;
@@ -45,16 +45,21 @@ change_pte_range(pmd_t *pmd, unsigned lo
 	if (end > PMD_SIZE)
 		end = PMD_SIZE;
 	do {
-		if (pte_present(*pte)) {
-			pte_t entry;
-
+		struct pte_modify pmod;
+		pte_t new, old;
+again:
+		new = ptep_begin_modify(&pmod, mm, pte);
+		if (pte_present(new)) {
 			/* Avoid an SMP race with hardware updated dirty/clean
 			 * bits by wiping the pte and then setting the new pte
 			 * into place.
 			 */
-			entry = ptep_get_and_clear(pte);
-			set_pte(pte, pte_modify(entry, newprot));
-		}
+			new = pte_modify(new, newprot);
+			if (ptep_commit_clear(&pmod, mm, pte, new, old))
+				goto again;
+		} else
+			ptep_abort(&pmod, mm, pte);
+
 		address += PAGE_SIZE;
 		pte++;
 	} while (address && (address < end));
@@ -62,7 +67,7 @@ change_pte_range(pmd_t *pmd, unsigned lo
 }
 
 static inline void
-change_pmd_range(pgd_t *pgd, unsigned long address,
+change_pmd_range(struct mm_struct *mm, pgd_t *pgd, unsigned long address,
 		unsigned long size, pgprot_t newprot)
 {
 	pmd_t * pmd;
@@ -81,7 +86,7 @@ change_pmd_range(pgd_t *pgd, unsigned lo
 	if (end > PGDIR_SIZE)
 		end = PGDIR_SIZE;
 	do {
-		change_pte_range(pmd, address, end - address, newprot);
+		change_pte_range(mm, pmd, address, end - address, newprot);
 		address = (address + PMD_SIZE) & PMD_MASK;
 		pmd++;
 	} while (address && (address < end));
@@ -93,19 +98,20 @@ change_protection(struct vm_area_struct 
 {
 	pgd_t *dir;
 	unsigned long beg = start;
+	struct mm_struct *mm = current->mm;
 
 	dir = pgd_offset(current->mm, start);
 	flush_cache_range(vma, beg, end);
 	if (start >= end)
 		BUG();
-	spin_lock(&current->mm->page_table_lock);
+	mm_lock_page_table(mm);
 	do {
-		change_pmd_range(dir, start, end - start, newprot);
+		change_pmd_range(mm, dir, start, end - start, newprot);
 		start = (start + PGDIR_SIZE) & PGDIR_MASK;
 		dir++;
 	} while (start && (start < end));
 	flush_tlb_range(vma, beg, end);
-	spin_unlock(&current->mm->page_table_lock);
+	mm_unlock_page_table(mm);
 	return;
 }
 
diff -puN mm/swap_state.c~vm-abstract-pgtable-locking mm/swap_state.c
--- linux-2.6/mm/swap_state.c~vm-abstract-pgtable-locking	2004-10-29 16:28:08.000000000 +1000
+++ linux-2.6-npiggin/mm/swap_state.c	2004-10-29 16:28:08.000000000 +1000
@@ -273,7 +273,7 @@ static inline void free_swap_cache(struc
 /* 
  * Perform a free_page(), also freeing any swap cache associated with
  * this page if it is the last user of the page. Can not do a lock_page,
- * as we are holding the page_table_lock spinlock.
+ * as the page table is locked.
  */
 void free_page_and_swap_cache(struct page *page)
 {
diff -puN fs/exec.c~vm-abstract-pgtable-locking fs/exec.c
--- linux-2.6/fs/exec.c~vm-abstract-pgtable-locking	2004-10-29 16:28:08.000000000 +1000
+++ linux-2.6-npiggin/fs/exec.c	2004-10-29 16:28:08.000000000 +1000
@@ -298,10 +298,12 @@ EXPORT_SYMBOL(copy_strings_kernel);
 void install_arg_page(struct vm_area_struct *vma,
 			struct page *page, unsigned long address)
 {
+	struct pte_modify pmod;
 	struct mm_struct *mm = vma->vm_mm;
 	pgd_t * pgd;
 	pmd_t * pmd;
 	pte_t * pte;
+	pte_t new;
 
 	if (unlikely(anon_vma_prepare(vma)))
 		goto out_sig;
@@ -309,29 +311,35 @@ void install_arg_page(struct vm_area_str
 	flush_dcache_page(page);
 	pgd = pgd_offset(mm, address);
 
-	spin_lock(&mm->page_table_lock);
+	mm_lock_page_table(mm);
 	pmd = pmd_alloc(mm, pgd, address);
 	if (!pmd)
 		goto out;
 	pte = pte_alloc_map(mm, pmd, address);
 	if (!pte)
 		goto out;
-	if (!pte_none(*pte)) {
+again:
+	new = ptep_begin_modify(&pmod, mm, pte);
+	if (!pte_none(new)) {
+		ptep_abort(&pmod, mm, pte);
 		pte_unmap(pte);
 		goto out;
 	}
+	new = pte_mkdirty(pte_mkwrite(mk_pte(page, vma->vm_page_prot)));
+	page_add_anon_rmap(page, vma, address);
+	if (ptep_commit(&pmod, mm, pte, new)) {
+		page_remove_rmap(page);
+		goto again;
+	}
 	mm->rss++;
 	lru_cache_add_active(page);
-	set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(
-					page, vma->vm_page_prot))));
-	page_add_anon_rmap(page, vma, address);
 	pte_unmap(pte);
-	spin_unlock(&mm->page_table_lock);
+	mm_unlock_page_table(mm);
 
 	/* no need for flush_tlb */
 	return;
 out:
-	spin_unlock(&mm->page_table_lock);
+	mm_unlock_page_table(mm);
 out_sig:
 	__free_page(page);
 	force_sig(SIGKILL, current);
diff -puN arch/i386/kernel/vm86.c~vm-abstract-pgtable-locking arch/i386/kernel/vm86.c
--- linux-2.6/arch/i386/kernel/vm86.c~vm-abstract-pgtable-locking	2004-10-29 16:28:08.000000000 +1000
+++ linux-2.6-npiggin/arch/i386/kernel/vm86.c	2004-10-29 16:28:08.000000000 +1000
@@ -136,13 +136,13 @@ struct pt_regs * fastcall save_v86_state
 
 static void mark_screen_rdonly(struct task_struct * tsk)
 {
+	struct mm_struct *mm = tsk->mm;
 	pgd_t *pgd;
 	pmd_t *pmd;
 	pte_t *pte, *mapped;
 	int i;
 
-	preempt_disable();
-	spin_lock(&tsk->mm->page_table_lock);
+	mm_lock_page_table(mm);
 	pgd = pgd_offset(tsk->mm, 0xA0000);
 	if (pgd_none(*pgd))
 		goto out;
@@ -161,14 +161,21 @@ static void mark_screen_rdonly(struct ta
 	}
 	pte = mapped = pte_offset_map(pmd, 0xA0000);
 	for (i = 0; i < 32; i++) {
-		if (pte_present(*pte))
-			set_pte(pte, pte_wrprotect(*pte));
+		struct pte_modify pmod;
+		pte_t new;
+again:
+		new = ptep_begin_modify(&pmod, mm, pte);
+		if (pte_present(new)) {
+			new = pte_wrprotect(new);
+			if (ptep_commit(&pmod, mm, pte, new))
+				goto again;
+		} else
+			ptep_abort(&pmod, mm, pte);
 		pte++;
 	}
 	pte_unmap(mapped);
 out:
-	spin_unlock(&tsk->mm->page_table_lock);
-	preempt_enable();
+	mm_unlock_page_table(mm);
 	flush_tlb();
 }
 
diff -puN arch/i386/mm/hugetlbpage.c~vm-abstract-pgtable-locking arch/i386/mm/hugetlbpage.c
--- linux-2.6/arch/i386/mm/hugetlbpage.c~vm-abstract-pgtable-locking	2004-10-29 16:28:08.000000000 +1000
+++ linux-2.6-npiggin/arch/i386/mm/hugetlbpage.c	2004-10-29 16:28:08.000000000 +1000
@@ -40,6 +40,7 @@ static pte_t *huge_pte_offset(struct mm_
 
 static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page, pte_t * page_table, int write_access)
 {
+	struct pte_modify pmod;
 	pte_t entry;
 
 	mm->rss += (HPAGE_SIZE / PAGE_SIZE);
@@ -50,7 +51,11 @@ static void set_huge_pte(struct mm_struc
 		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
 	entry = pte_mkyoung(entry);
 	mk_pte_huge(entry);
-	set_pte(page_table, entry);
+	
+	/* XXX: ... */
+	do {
+		ptep_begin_modify(&pmod, mm, page_table);
+	} while (ptep_commit(&pmod, mm, page_table, entry));
 }
 
 /*
@@ -231,7 +236,7 @@ int hugetlb_prefault(struct address_spac
 	BUG_ON(vma->vm_start & ~HPAGE_MASK);
 	BUG_ON(vma->vm_end & ~HPAGE_MASK);
 
-	spin_lock(&mm->page_table_lock);
+	mm_lock_page_table(mm);
 	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
 		unsigned long idx;
 		pte_t *pte = huge_pte_alloc(mm, addr);
@@ -279,7 +284,7 @@ int hugetlb_prefault(struct address_spac
 		set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE);
 	}
 out:
-	spin_unlock(&mm->page_table_lock);
+	mm_unlock_page_table(mm);
 	return ret;
 }
 
diff -puN mm/swapfile.c~vm-abstract-pgtable-locking mm/swapfile.c
--- linux-2.6/mm/swapfile.c~vm-abstract-pgtable-locking	2004-10-29 16:28:08.000000000 +1000
+++ linux-2.6-npiggin/mm/swapfile.c	2004-10-29 16:28:08.000000000 +1000
@@ -426,22 +426,9 @@ void free_swap_and_cache(swp_entry_t ent
  * share this swap entry, so be cautious and let do_wp_page work out
  * what to do if a write is requested later.
  */
-/* vma->vm_mm->page_table_lock is held */
-static void
-unuse_pte(struct vm_area_struct *vma, unsigned long address, pte_t *dir,
-	swp_entry_t entry, struct page *page)
-{
-	vma->vm_mm->rss++;
-	get_page(page);
-	set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot)));
-	page_add_anon_rmap(page, vma, address);
-	swap_free(entry);
-}
-
-/* vma->vm_mm->page_table_lock is held */
-static unsigned long unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
-	unsigned long address, unsigned long size, unsigned long offset,
-	swp_entry_t entry, struct page *page)
+static unsigned long unuse_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+	pmd_t *dir, unsigned long address, unsigned long size,
+	unsigned long offset, swp_entry_t entry, struct page *page)
 {
 	pte_t * pte;
 	unsigned long end;
@@ -461,12 +448,26 @@ static unsigned long unuse_pmd(struct vm
 	if (end > PMD_SIZE)
 		end = PMD_SIZE;
 	do {
+		struct pte_modify pmod;
+		pte_t new;
 		/*
 		 * swapoff spends a _lot_ of time in this loop!
 		 * Test inline before going to call unuse_pte.
 		 */
-		if (unlikely(pte_same(*pte, swp_pte))) {
-			unuse_pte(vma, offset + address, pte, entry, page);
+again:
+		new = ptep_begin_modify(&pmod, mm, pte);
+		if (unlikely(pte_same(new, swp_pte))) {
+			get_page(page);
+			new = pte_mkold(mk_pte(page, vma->vm_page_prot));
+			if (ptep_commit(&pmod, mm, pte, new)) {
+				put_page(page);
+				goto again;
+			}
+			/* Pass the same offset + address the old unuse_pte() call got. */
+			vma->vm_mm->rss++;
+			page_add_anon_rmap(page, vma, offset + address);
+			swap_free(entry);
+
 			pte_unmap(pte);
 
 			/*
@@ -477,7 +478,9 @@ static unsigned long unuse_pmd(struct vm
 
 			/* add 1 since address may be 0 */
 			return 1 + offset + address;
-		}
+		} else
+			ptep_abort(&pmod, mm, pte);
+
 		address += PAGE_SIZE;
 		pte++;
 	} while (address && (address < end));
@@ -485,9 +488,8 @@ static unsigned long unuse_pmd(struct vm
 	return 0;
 }
 
-/* vma->vm_mm->page_table_lock is held */
-static unsigned long unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
-	unsigned long address, unsigned long size,
+static unsigned long unuse_pgd(struct mm_struct *mm, struct vm_area_struct *vma,
+	pgd_t *dir, unsigned long address, unsigned long size,
 	swp_entry_t entry, struct page *page)
 {
 	pmd_t * pmd;
@@ -510,7 +512,7 @@ static unsigned long unuse_pgd(struct vm
 	if (address >= end)
 		BUG();
 	do {
-		foundaddr = unuse_pmd(vma, pmd, address, end - address,
+		foundaddr = unuse_pmd(mm, vma, pmd, address, end - address,
 						offset, entry, page);
 		if (foundaddr)
 			return foundaddr;
@@ -520,9 +522,8 @@ static unsigned long unuse_pgd(struct vm
 	return 0;
 }
 
-/* vma->vm_mm->page_table_lock is held */
-static unsigned long unuse_vma(struct vm_area_struct * vma,
-	swp_entry_t entry, struct page *page)
+static unsigned long unuse_vma(struct mm_struct *mm, struct vm_area_struct *vma,
+		swp_entry_t entry, struct page *page)
 {
 	pgd_t *pgdir;
 	unsigned long start, end;
@@ -538,15 +539,17 @@ static unsigned long unuse_vma(struct vm
 		start = vma->vm_start;
 		end = vma->vm_end;
 	}
+	mm_lock_page_table(vma->vm_mm);
 	pgdir = pgd_offset(vma->vm_mm, start);
 	do {
-		foundaddr = unuse_pgd(vma, pgdir, start, end - start,
-						entry, page);
+		foundaddr = unuse_pgd(mm, vma, pgdir, start,
+						end - start, entry, page);
 		if (foundaddr)
-			return foundaddr;
+			break;	/* leave via the unlock below, never with the lock held */
 		start = (start + PGDIR_SIZE) & PGDIR_MASK;
 		pgdir++;
 	} while (start && (start < end));
-	return 0;
+	mm_unlock_page_table(vma->vm_mm);
+	return foundaddr;	/* 0 when no pgd range matched */
 }
 
@@ -568,15 +571,13 @@ static int unuse_process(struct mm_struc
 		down_read(&mm->mmap_sem);
 		lock_page(page);
 	}
-	spin_lock(&mm->page_table_lock);
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 		if (vma->anon_vma) {
-			foundaddr = unuse_vma(vma, entry, page);
+			foundaddr = unuse_vma(mm, vma, entry, page);
 			if (foundaddr)
 				break;
 		}
 	}
-	spin_unlock(&mm->page_table_lock);
 	up_read(&mm->mmap_sem);
 	/*
 	 * Currently unuse_process cannot fail, but leave error handling
diff -puN mm/vmalloc.c~vm-abstract-pgtable-locking mm/vmalloc.c
--- linux-2.6/mm/vmalloc.c~vm-abstract-pgtable-locking	2004-10-29 16:28:08.000000000 +1000
+++ linux-2.6-npiggin/mm/vmalloc.c	2004-10-29 16:28:08.000000000 +1000
@@ -45,6 +45,7 @@ static void unmap_area_pte(pmd_t *pmd, u
 
 	do {
 		pte_t page;
+		/* XXX: make this use ptep_begin_modify */
 		page = ptep_get_and_clear(pte);
 		address += PAGE_SIZE;
 		pte++;
@@ -57,7 +58,7 @@ static void unmap_area_pte(pmd_t *pmd, u
 }
 
 static void unmap_area_pmd(pgd_t *dir, unsigned long address,
-				  unsigned long size)
+				unsigned long size)
 {
 	unsigned long end;
 	pmd_t *pmd;
@@ -84,8 +85,7 @@ static void unmap_area_pmd(pgd_t *dir, u
 }
 
 static int map_area_pte(pte_t *pte, unsigned long address,
-			       unsigned long size, pgprot_t prot,
-			       struct page ***pages)
+		unsigned long size, pgprot_t prot, struct page ***pages)
 {
 	unsigned long end;
 
@@ -95,13 +95,18 @@ static int map_area_pte(pte_t *pte, unsi
 		end = PMD_SIZE;
 
 	do {
+		struct pte_modify pmod;
+		pte_t new;
 		struct page *page = **pages;
-
-		WARN_ON(!pte_none(*pte));
 		if (!page)
 			return -ENOMEM;
 
-		set_pte(pte, mk_pte(page, prot));
+again:
+		new = ptep_begin_modify(&pmod, &init_mm, pte);
+		WARN_ON(!pte_none(new));
+		new = mk_pte(page, prot);
+		if (ptep_commit(&pmod, &init_mm, pte, new))
+			goto again;
 		address += PAGE_SIZE;
 		pte++;
 		(*pages)++;
@@ -110,8 +115,7 @@ static int map_area_pte(pte_t *pte, unsi
 }
 
 static int map_area_pmd(pmd_t *pmd, unsigned long address,
-			       unsigned long size, pgprot_t prot,
-			       struct page ***pages)
+		unsigned long size, pgprot_t prot, struct page ***pages)
 {
 	unsigned long base, end;
 
@@ -158,7 +162,7 @@ int map_vm_area(struct vm_struct *area, 
 	int err = 0;
 
 	dir = pgd_offset_k(address);
-	spin_lock(&init_mm.page_table_lock);
+	mm_lock_page_table(&init_mm);
 	do {
 		pmd_t *pmd = pmd_alloc(&init_mm, dir, address);
 		if (!pmd) {
@@ -174,7 +178,7 @@ int map_vm_area(struct vm_struct *area, 
 		dir++;
 	} while (address && (address < end));
 
-	spin_unlock(&init_mm.page_table_lock);
+	mm_unlock_page_table(&init_mm);
 	flush_cache_vmap((unsigned long) area->addr, end);
 	return err;
 }
diff -puN mm/hugetlb.c~vm-abstract-pgtable-locking mm/hugetlb.c
--- linux-2.6/mm/hugetlb.c~vm-abstract-pgtable-locking	2004-10-29 16:28:08.000000000 +1000
+++ linux-2.6-npiggin/mm/hugetlb.c	2004-10-29 16:28:08.000000000 +1000
@@ -253,7 +253,7 @@ void zap_hugepage_range(struct vm_area_s
 {
 	struct mm_struct *mm = vma->vm_mm;
 
-	spin_lock(&mm->page_table_lock);
+	mm_lock_page_table(mm);
 	unmap_hugepage_range(vma, start, start + length);
-	spin_unlock(&mm->page_table_lock);
+	mm_unlock_page_table(mm);
 }
diff -puN mm/fremap.c~vm-abstract-pgtable-locking mm/fremap.c
--- linux-2.6/mm/fremap.c~vm-abstract-pgtable-locking	2004-10-29 16:28:08.000000000 +1000
+++ linux-2.6-npiggin/mm/fremap.c	2004-10-29 16:28:08.000000000 +1000
@@ -23,19 +23,28 @@
 static inline void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long addr, pte_t *ptep)
 {
-	pte_t pte = *ptep;
+	struct pte_modify pmod;
+	pte_t new, old;
 
-	if (pte_none(pte))
+again:
+	new = ptep_begin_modify(&pmod, mm, ptep);
+	if (pte_none(new)) {
+		ptep_abort(&pmod, mm, ptep);
 		return;
-	if (pte_present(pte)) {
-		unsigned long pfn = pte_pfn(pte);
+	}
+	if (pte_present(new)) {
+		/* XXX: needs mm_pin_pages */
+		unsigned long pfn = pte_pfn(new);
 
 		flush_cache_page(vma, addr);
-		pte = ptep_clear_flush(vma, addr, ptep);
+		pte_clear(&new);
+		if (ptep_commit_clear_flush(&pmod, mm, vma, addr,
+							ptep, new, old))
+			goto again;
 		if (pfn_valid(pfn)) {
 			struct page *page = pfn_to_page(pfn);
 			if (!PageReserved(page)) {
-				if (pte_dirty(pte))
+				if (pte_dirty(old))
 					set_page_dirty(page);
 				page_remove_rmap(page);
 				page_cache_release(page);
@@ -43,9 +52,12 @@ static inline void zap_pte(struct mm_str
 			}
 		}
 	} else {
-		if (!pte_file(pte))
-			free_swap_and_cache(pte_to_swp_entry(pte));
-		pte_clear(ptep);
+		/* XXX: this will need to be done under a lock. Or maybe
+		 * we should clear the pte first?
+		 */
+		if (!pte_file(new))
+			free_swap_and_cache(pte_to_swp_entry(new));
+		ptep_abort(&pmod, mm, ptep);
 	}
 }
 
@@ -65,7 +77,7 @@ int install_page(struct mm_struct *mm, s
 	pte_t pte_val;
 
 	pgd = pgd_offset(mm, addr);
-	spin_lock(&mm->page_table_lock);
+	mm_lock_page_table(mm);
 
 	pmd = pmd_alloc(mm, pgd, addr);
 	if (!pmd)
@@ -85,6 +97,10 @@ int install_page(struct mm_struct *mm, s
 	if (!page->mapping || page->index >= size)
 		goto err_unlock;
 
+	/*
+	 * XXX: locking probably becomes very broken - all this will now
+	 * be non-atomic with lockless pagetables. Investigate.
+	 */
 	zap_pte(mm, vma, addr, pte);
 
 	mm->rss++;
@@ -97,7 +113,7 @@ int install_page(struct mm_struct *mm, s
 
 	err = 0;
 err_unlock:
-	spin_unlock(&mm->page_table_lock);
+	mm_unlock_page_table(mm);
 	return err;
 }
 EXPORT_SYMBOL(install_page);
@@ -117,7 +133,7 @@ int install_file_pte(struct mm_struct *m
 	pte_t pte_val;
 
 	pgd = pgd_offset(mm, addr);
-	spin_lock(&mm->page_table_lock);
+	mm_lock_page_table(mm);
 
 	pmd = pmd_alloc(mm, pgd, addr);
 	if (!pmd)
@@ -133,11 +149,11 @@ int install_file_pte(struct mm_struct *m
 	pte_val = *pte;
 	pte_unmap(pte);
 	update_mmu_cache(vma, addr, pte_val);
-	spin_unlock(&mm->page_table_lock);
+	mm_unlock_page_table(mm);
 	return 0;
 
 err_unlock:
-	spin_unlock(&mm->page_table_lock);
+	mm_unlock_page_table(mm);
 	return err;
 }
 
diff -puN arch/i386/mm/ioremap.c~vm-abstract-pgtable-locking arch/i386/mm/ioremap.c
--- linux-2.6/arch/i386/mm/ioremap.c~vm-abstract-pgtable-locking	2004-10-29 16:28:08.000000000 +1000
+++ linux-2.6-npiggin/arch/i386/mm/ioremap.c	2004-10-29 16:28:08.000000000 +1000
@@ -17,8 +17,9 @@
 #include <asm/tlbflush.h>
 #include <asm/pgtable.h>
 
-static inline void remap_area_pte(pte_t * pte, unsigned long address, unsigned long size,
-	unsigned long phys_addr, unsigned long flags)
+static inline void remap_area_pte(pte_t * pte, unsigned long address,
+		unsigned long size, unsigned long phys_addr,
+		unsigned long flags)
 {
 	unsigned long end;
 	unsigned long pfn;
@@ -31,12 +32,20 @@ static inline void remap_area_pte(pte_t 
 		BUG();
 	pfn = phys_addr >> PAGE_SHIFT;
 	do {
-		if (!pte_none(*pte)) {
+		struct pte_modify pmod;
+		pte_t new;
+again:
+		new = ptep_begin_modify(&pmod, &init_mm, pte);
+		if (!pte_none(new)) {
 			printk("remap_area_pte: page already exists\n");
 			BUG();
 		}
-		set_pte(pte, pfn_pte(pfn, __pgprot(_PAGE_PRESENT | _PAGE_RW | 
-					_PAGE_DIRTY | _PAGE_ACCESSED | flags)));
+		new = pfn_pte(pfn, __pgprot(_PAGE_PRESENT | _PAGE_RW |
+					_PAGE_DIRTY | _PAGE_ACCESSED | flags));
+		if (ptep_commit(&pmod, &init_mm, pte, new)) {
+			printk("remap_area_pte: ptep_commit raced\n");
+			goto again;
+		}
 		address += PAGE_SIZE;
 		pfn++;
 		pte++;
@@ -78,7 +87,7 @@ static int remap_area_pages(unsigned lon
 	flush_cache_all();
 	if (address >= end)
 		BUG();
-	spin_lock(&init_mm.page_table_lock);
+	mm_lock_page_table(&init_mm);
 	do {
 		pmd_t *pmd;
 		pmd = pmd_alloc(&init_mm, dir, address);
@@ -92,7 +101,7 @@ static int remap_area_pages(unsigned lon
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		dir++;
 	} while (address && (address < end));
-	spin_unlock(&init_mm.page_table_lock);
+	mm_unlock_page_table(&init_mm);
 	flush_tlb_all();
 	return error;
 }

_

  reply	other threads:[~2004-10-29  7:21 UTC|newest]

Thread overview: 20+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2004-10-29  7:20 [PATCH 0/7] abstract pagetable locking and pte updates Nick Piggin
2004-10-29  7:20 ` [PATCH 1/7] " Nick Piggin
2004-10-29  7:21   ` [PATCH 2/7] " Nick Piggin
2004-10-29  7:21     ` [PATCH 3/7] " Nick Piggin
2004-10-29  7:21       ` Nick Piggin [this message]
2004-10-29  7:22         ` [PATCH 5/7] " Nick Piggin
2004-10-29  7:23           ` [PATCH 6/7] " Nick Piggin
2004-10-29  7:23             ` [PATCH 7/7] " Nick Piggin
2004-10-29  7:46 ` [PATCH 0/7] " William Lee Irwin III
2004-11-02  0:15   ` Christoph Lameter
2004-11-02  0:54     ` William Lee Irwin III
2004-11-02  1:34       ` Nick Piggin
2004-11-02  1:55         ` William Lee Irwin III
2004-11-02  2:38           ` Nick Piggin
2004-11-02  6:57             ` William Lee Irwin III
2004-11-02 17:55         ` Christoph Lameter
2004-10-29 11:45 ` Nick Piggin
2004-10-29 20:52   ` William Lee Irwin III
2004-10-30  2:46     ` Nick Piggin
2004-11-02  0:19       ` Christoph Lameter

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4181EF96.2030602@yahoo.com.au \
    --to=nickpiggin@yahoo.com.au \
    --cc=linux-mm@kvack.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes;
see the mirroring instructions for how to clone and mirror
all data and code used by this external index.