* [PATCH/RFC 0/9] VMA lookup with RCU
@ 2007-10-22 10:45 Vaidyanathan Srinivasan
  2007-10-22 10:45 ` [PATCH/RFC 1/9] Data structure changes Vaidyanathan Srinivasan
                   ` (8 more replies)
  0 siblings, 9 replies; 10+ messages in thread
From: Vaidyanathan Srinivasan @ 2007-10-22 10:45 UTC (permalink / raw)
  To: linux-mm; +Cc: Alexis Bruemmer, Peter Zijlstra

This is a forward port of Peter Zijlstra's RCU-based
VMA lookup patch set to kernel version 2.6.23.1.

Reference:
http://programming.kicks-ass.net/kernel-patches/vma_lookup/

The patch set still needs some cleanup.  I am sending it out
so that anybody can review it and run further experiments.

I have been posting results based on this patch set to linux-mm:
http://marc.info/?l=linux-mm&m=119186241210311&w=2

--Vaidy


* [PATCH/RFC 1/9] Data structure changes
  2007-10-22 10:45 [PATCH/RFC 0/9] VMA lookup with RCU Vaidyanathan Srinivasan
@ 2007-10-22 10:45 ` Vaidyanathan Srinivasan
  2007-10-22 10:45 ` [PATCH/RFC 2/9] lib: RCU friendly B+tree Vaidyanathan Srinivasan
                   ` (7 subsequent siblings)
  8 siblings, 0 replies; 10+ messages in thread
From: Vaidyanathan Srinivasan @ 2007-10-22 10:45 UTC (permalink / raw)
  To: linux-mm; +Cc: Alexis Bruemmer, Peter Zijlstra

[-- Attachment #1: 1_mm_vmas.patch --]
[-- Type: text/plain, Size: 39930 bytes --]

* Replace the vma->vm_next list with list_heads: mm->mm_vmas and vma->vm_list
* Convert all users of vma->vm_next and mm->mmap to the new list helpers
  (see the iteration sketch below)
* TBD: not all architectures are covered yet
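
For illustration only (this snippet is not part of the diff below): after the
conversion, an open-coded vm_next walk becomes either a list_for_each_entry()
over mm->mm_vmas or a loop using the new vma_next() helper.  A minimal sketch;
do_vma_work() is a hypothetical stand-in for whatever per-VMA work the caller
does:

	static void walk_all_vmas(struct mm_struct *mm)
	{
		struct vm_area_struct *vma;

		/* was: for (vma = mm->mmap; vma; vma = vma->vm_next) */
		list_for_each_entry(vma, &mm->mm_vmas, vm_list)
			do_vma_work(vma);
	}

	static void walk_from(struct mm_struct *mm, unsigned long addr)
	{
		struct vm_area_struct *vma;

		/* was: for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) */
		for (vma = find_vma(mm, addr); vma; vma = vma_next(vma))
			do_vma_work(vma);
	}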


Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
---
 arch/alpha/kernel/osf_sys.c     |    2 
 arch/arm/mm/mmap.c              |    2 
 arch/frv/mm/elf-fdpic.c         |    4 -
 arch/i386/mm/hugetlbpage.c      |    2 
 arch/ia64/kernel/sys_ia64.c     |    2 
 arch/ia64/mm/hugetlbpage.c      |    2 
 arch/mips/kernel/irixelf.c      |   19 +++---
 arch/mips/kernel/syscall.c      |    2 
 arch/parisc/kernel/sys_parisc.c |    4 -
 arch/powerpc/mm/tlb_32.c        |    2 
 arch/ppc/mm/tlb.c               |    2 
 arch/x86_64/ia32/ia32_aout.c    |    2 
 arch/x86_64/kernel/sys_x86_64.c |    2 
 drivers/char/mem.c              |    2 
 drivers/oprofile/buffer_sync.c  |    4 -
 fs/binfmt_aout.c                |    2 
 fs/binfmt_elf.c                 |    4 -
 fs/binfmt_elf_fdpic.c           |    5 +
 fs/exec.c                       |    4 -
 fs/hugetlbfs/inode.c            |    2 
 fs/proc/task_mmu.c              |   18 +++---
 include/linux/init_task.h       |    1 
 include/linux/mm.h              |   44 ++++++++++++++-
 include/linux/sched.h           |    2 
 ipc/shm.c                       |    4 -
 kernel/acct.c                   |    5 -
 kernel/auditsc.c                |    4 -
 kernel/fork.c                   |   11 +--
 mm/madvise.c                    |    2 
 mm/memory.c                     |   21 +++----
 mm/mempolicy.c                  |   10 +--
 mm/migrate.c                    |    2 
 mm/mlock.c                      |    5 +
 mm/mmap.c                       |  114 +++++++++++++++++++---------------------
 mm/mprotect.c                   |    2 
 mm/mremap.c                     |    7 +-
 mm/msync.c                      |    2 
 mm/swapfile.c                   |    2 
 38 files changed, 180 insertions(+), 146 deletions(-)

--- linux-2.6.23-rc9.orig/arch/alpha/kernel/osf_sys.c
+++ linux-2.6.23-rc9/arch/alpha/kernel/osf_sys.c
@@ -1255,7 +1255,7 @@ arch_get_unmapped_area_1(unsigned long a
 		if (!vma || addr + len <= vma->vm_start)
 			return addr;
 		addr = vma->vm_end;
-		vma = vma->vm_next;
+		vma = vma_next(vma);
 	}
 }
 
--- linux-2.6.23-rc9.orig/arch/arm/mm/mmap.c
+++ linux-2.6.23-rc9/arch/arm/mm/mmap.c
@@ -84,7 +84,7 @@ full_search:
 	else
 		addr = PAGE_ALIGN(addr);
 
-	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
+	for (vma = find_vma(mm, addr); ; vma = vma_next(vma)) {
 		/* At this point:  (!vma || addr < vma->vm_end). */
 		if (TASK_SIZE - len < addr) {
 			/*
--- linux-2.6.23-rc9.orig/arch/frv/mm/elf-fdpic.c
+++ linux-2.6.23-rc9/arch/frv/mm/elf-fdpic.c
@@ -86,7 +86,7 @@ unsigned long arch_get_unmapped_area(str
 
 		if (addr <= limit) {
 			vma = find_vma(current->mm, PAGE_SIZE);
-			for (; vma; vma = vma->vm_next) {
+			for (; vma; vma = vma_next(vma)) {
 				if (addr > limit)
 					break;
 				if (addr + len <= vma->vm_start)
@@ -101,7 +101,7 @@ unsigned long arch_get_unmapped_area(str
 	limit = TASK_SIZE - len;
 	if (addr <= limit) {
 		vma = find_vma(current->mm, addr);
-		for (; vma; vma = vma->vm_next) {
+		for (; vma; vma = vma_next(vma)) {
 			if (addr > limit)
 				break;
 			if (addr + len <= vma->vm_start)
--- linux-2.6.23-rc9.orig/arch/i386/mm/hugetlbpage.c
+++ linux-2.6.23-rc9/arch/i386/mm/hugetlbpage.c
@@ -241,7 +241,7 @@ static unsigned long hugetlb_get_unmappe
 full_search:
 	addr = ALIGN(start_addr, HPAGE_SIZE);
 
-	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
+	for (vma = find_vma(mm, addr); ; vma = vma_next(vma)) {
 		/* At this point:  (!vma || addr < vma->vm_end). */
 		if (TASK_SIZE - len < addr) {
 			/*
--- linux-2.6.23-rc9.orig/arch/ia64/kernel/sys_ia64.c
+++ linux-2.6.23-rc9/arch/ia64/kernel/sys_ia64.c
@@ -58,7 +58,7 @@ arch_get_unmapped_area (struct file *fil
   full_search:
 	start_addr = addr = (addr + align_mask) & ~align_mask;
 
-	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
+	for (vma = find_vma(mm, addr); ; vma = vma_next(vma)) {
 		/* At this point:  (!vma || addr < vma->vm_end). */
 		if (TASK_SIZE - len < addr || RGN_MAP_LIMIT - len < REGION_OFFSET(addr)) {
 			if (start_addr != TASK_UNMAPPED_BASE) {
--- linux-2.6.23-rc9.orig/arch/ia64/mm/hugetlbpage.c
+++ linux-2.6.23-rc9/arch/ia64/mm/hugetlbpage.c
@@ -159,7 +159,7 @@ unsigned long hugetlb_get_unmapped_area(
 		addr = HPAGE_REGION_BASE;
 	else
 		addr = ALIGN(addr, HPAGE_SIZE);
-	for (vmm = find_vma(current->mm, addr); ; vmm = vmm->vm_next) {
+	for (vmm = find_vma(current->mm, addr); ; vmm = vma_next(vmm)) {
 		/* At this point:  (!vmm || addr < vmm->vm_end). */
 		if (REGION_OFFSET(addr) + len > RGN_MAP_LIMIT)
 			return -ENOMEM;
--- linux-2.6.23-rc9.orig/arch/mips/kernel/irixelf.c
+++ linux-2.6.23-rc9/arch/mips/kernel/irixelf.c
@@ -718,7 +718,7 @@ static int load_irix_binary(struct linux
 	/* OK, This is the point of no return */
 	current->mm->end_data = 0;
 	current->mm->end_code = 0;
-	current->mm->mmap = NULL;
+	INIT_LIST_HEAD(&current->mm->mm_vmas);
 	current->flags &= ~PF_FORKNOEXEC;
 	elf_entry = (unsigned int) elf_ex.e_entry;
 
@@ -1108,7 +1108,7 @@ static int irix_core_dump(long signr, st
 	/* Count what's needed to dump, up to the limit of coredump size. */
 	segs = 0;
 	size = 0;
-	for (vma = current->mm->mmap; vma != NULL; vma = vma->vm_next) {
+	for (vma = current->mm->mmap; vma != NULL; vma = vma_next(vma)) {
 		if (maydump(vma))
 		{
 			int sz = vma->vm_end-vma->vm_start;
@@ -1267,12 +1267,13 @@ static int irix_core_dump(long signr, st
 	dataoff = offset = roundup(offset, PAGE_SIZE);
 
 	/* Write program headers for segments dump. */
-	for (vma = current->mm->mmap, i = 0;
-		i < segs && vma != NULL; vma = vma->vm_next) {
+	i = 0;
+	list_for_each_entry(vma, &current->mm->mm_vmas, vm_list) {
 		struct elf_phdr phdr;
 		size_t sz;
 
-		i++;
+		if (i++ == segs)
+			break;
 
 		sz = vma->vm_end - vma->vm_start;
 
@@ -1301,15 +1302,15 @@ static int irix_core_dump(long signr, st
 
 	DUMP_SEEK(dataoff);
 
-	for (i = 0, vma = current->mm->mmap;
-	    i < segs && vma != NULL;
-	    vma = vma->vm_next) {
+	i = 0;
+	list_for_each_entry(vma, &current->mm->mm_vmas, vm_list) {
 		unsigned long addr = vma->vm_start;
 		unsigned long len = vma->vm_end - vma->vm_start;
 
 		if (!maydump(vma))
 			continue;
-		i++;
+		if (i++ == segs)
+			break;
 		pr_debug("elf_core_dump: writing %08lx %lx\n", addr, len);
 		DUMP_WRITE((void __user *)addr, len);
 	}
--- linux-2.6.23-rc9.orig/arch/mips/kernel/syscall.c
+++ linux-2.6.23-rc9/arch/mips/kernel/syscall.c
@@ -104,7 +104,7 @@ unsigned long arch_get_unmapped_area(str
 	else
 		addr = PAGE_ALIGN(addr);
 
-	for (vmm = find_vma(current->mm, addr); ; vmm = vmm->vm_next) {
+	for (vmm = find_vma(current->mm, addr); ; vmm = vma_next(vmm)) {
 		/* At this point:  (!vmm || addr < vmm->vm_end). */
 		if (task_size - len < addr)
 			return -ENOMEM;
--- linux-2.6.23-rc9.orig/arch/parisc/kernel/sys_parisc.c
+++ linux-2.6.23-rc9/arch/parisc/kernel/sys_parisc.c
@@ -52,7 +52,7 @@ static unsigned long get_unshared_area(u
 
 	addr = PAGE_ALIGN(addr);
 
-	for (vma = find_vma(current->mm, addr); ; vma = vma->vm_next) {
+	for (vma = find_vma(current->mm, addr); ; vma = vma_next(vma)) {
 		/* At this point:  (!vma || addr < vma->vm_end). */
 		if (TASK_SIZE - len < addr)
 			return -ENOMEM;
@@ -88,7 +88,7 @@ static unsigned long get_shared_area(str
 
 	addr = DCACHE_ALIGN(addr - offset) + offset;
 
-	for (vma = find_vma(current->mm, addr); ; vma = vma->vm_next) {
+	for (vma = find_vma(current->mm, addr); ; vma = vma_next(vma)) {
 		/* At this point:  (!vma || addr < vma->vm_end). */
 		if (TASK_SIZE - len < addr)
 			return -ENOMEM;
--- linux-2.6.23-rc9.orig/arch/powerpc/mm/tlb_32.c
+++ linux-2.6.23-rc9/arch/powerpc/mm/tlb_32.c
@@ -155,7 +155,7 @@ void flush_tlb_mm(struct mm_struct *mm)
 	 * unmap_region or exit_mmap, but not from vmtruncate on SMP -
 	 * but it seems dup_mmap is the only SMP case which gets here.
 	 */
-	for (mp = mm->mmap; mp != NULL; mp = mp->vm_next)
+	list_for_each_entry(mp, &mm->mm_vmas, vm_list)
 		flush_range(mp->vm_mm, mp->vm_start, mp->vm_end);
 	FINISH_FLUSH;
 }
--- linux-2.6.23-rc9.orig/arch/ppc/mm/tlb.c
+++ linux-2.6.23-rc9/arch/ppc/mm/tlb.c
@@ -149,7 +149,7 @@ void flush_tlb_mm(struct mm_struct *mm)
 		return;
 	}
 
-	for (mp = mm->mmap; mp != NULL; mp = mp->vm_next)
+	list_for_each_entry(mp, &mm->mm_vmas, vm_list)
 		flush_range(mp->vm_mm, mp->vm_start, mp->vm_end);
 	FINISH_FLUSH;
 }
--- linux-2.6.23-rc9.orig/arch/x86_64/ia32/ia32_aout.c
+++ linux-2.6.23-rc9/arch/x86_64/ia32/ia32_aout.c
@@ -311,7 +311,7 @@ static int load_aout_binary(struct linux
 	current->mm->free_area_cache = TASK_UNMAPPED_BASE;
 	current->mm->cached_hole_size = 0;
 
-	current->mm->mmap = NULL;
+	INIT_LIST_HEAD(&current->mm->mm_vmas);
 	compute_creds(bprm);
  	current->flags &= ~PF_FORKNOEXEC;
 
--- linux-2.6.23-rc9.orig/arch/x86_64/kernel/sys_x86_64.c
+++ linux-2.6.23-rc9/arch/x86_64/kernel/sys_x86_64.c
@@ -119,7 +119,7 @@ arch_get_unmapped_area(struct file *filp
 	start_addr = addr;
 
 full_search:
-	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
+	for (vma = find_vma(mm, addr); ; vma = vma_next(vma)) {
 		/* At this point:  (!vma || addr < vma->vm_end). */
 		if (end - len < addr) {
 			/*
--- linux-2.6.23-rc9.orig/drivers/char/mem.c
+++ linux-2.6.23-rc9/drivers/char/mem.c
@@ -640,7 +640,7 @@ static inline size_t read_zero_pagealign
 	down_read(&mm->mmap_sem);
 
 	/* For private mappings, just map in zero pages. */
-	for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) {
+	for (vma = find_vma(mm, addr); vma; vma = vma_next(vma)) {
 		unsigned long count;
 
 		if (vma->vm_start > addr || (vma->vm_flags & VM_WRITE) == 0)
--- linux-2.6.23-rc9.orig/drivers/oprofile/buffer_sync.c
+++ linux-2.6.23-rc9/drivers/oprofile/buffer_sync.c
@@ -217,7 +217,7 @@ static unsigned long get_exec_dcookie(st
 	if (!mm)
 		goto out;
  
-	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+	list_for_each_entry(vma, &mm->mm_vmas, vm_list) {
 		if (!vma->vm_file)
 			continue;
 		if (!(vma->vm_flags & VM_EXECUTABLE))
@@ -242,7 +242,7 @@ static unsigned long lookup_dcookie(stru
 	unsigned long cookie = NO_COOKIE;
 	struct vm_area_struct * vma;
 
-	for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) {
+	for (vma = find_vma(mm, addr); vma; vma = vma_next(vma)) {
  
 		if (addr < vma->vm_start || addr >= vma->vm_end)
 			continue;
--- linux-2.6.23-rc9.orig/fs/binfmt_aout.c
+++ linux-2.6.23-rc9/fs/binfmt_aout.c
@@ -323,7 +323,7 @@ static int load_aout_binary(struct linux
 	current->mm->free_area_cache = current->mm->mmap_base;
 	current->mm->cached_hole_size = 0;
 
-	current->mm->mmap = NULL;
+	INIT_LIST_HEAD(&current->mm->mm_vmas);
 	compute_creds(bprm);
  	current->flags &= ~PF_FORKNOEXEC;
 #ifdef __sparc__
--- linux-2.6.23-rc9.orig/fs/binfmt_elf.c
+++ linux-2.6.23-rc9/fs/binfmt_elf.c
@@ -1458,7 +1458,7 @@ static int elf_dump_thread_status(long s
 static struct vm_area_struct *first_vma(struct task_struct *tsk,
 					struct vm_area_struct *gate_vma)
 {
-	struct vm_area_struct *ret = tsk->mm->mmap;
+	struct vm_area_struct *ret = __vma_next(&tsk->mm->mm_vmas, NULL);
 
 	if (ret)
 		return ret;
@@ -1473,7 +1473,7 @@ static struct vm_area_struct *next_vma(s
 {
 	struct vm_area_struct *ret;
 
-	ret = this_vma->vm_next;
+	ret = vma_next(this_vma);
 	if (ret)
 		return ret;
 	if (this_vma == gate_vma)
--- linux-2.6.23-rc9.orig/fs/binfmt_elf_fdpic.c
+++ linux-2.6.23-rc9/fs/binfmt_elf_fdpic.c
@@ -1471,7 +1471,8 @@ static int elf_fdpic_dump_segments(struc
 {
 	struct vm_area_struct *vma;
 
-	for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
+	list_for_each_entry(vma, &current->mm->mm_vmas, vm_list) {
+
 		unsigned long addr;
 
 		if (!maydump(vma, mm_flags))
@@ -1728,7 +1729,7 @@ static int elf_fdpic_core_dump(long sign
 	/* write program headers for segments dump */
 	for (
 #ifdef CONFIG_MMU
-		vma = current->mm->mmap; vma; vma = vma->vm_next
+		vma = __vma_next(&current->mm->mm_vmas, NULL); vma; vma = vma_next(vma)
 #else
 			vml = current->mm->context.vmlist; vml; vml = vml->next
 #endif
--- linux-2.6.23-rc9.orig/fs/exec.c
+++ linux-2.6.23-rc9/fs/exec.c
@@ -555,7 +555,7 @@ static int shift_arg_pages(struct vm_are
 		 * when the old and new regions overlap clear from new_end.
 		 */
 		free_pgd_range(&tlb, new_end, old_end, new_end,
-			vma->vm_next ? vma->vm_next->vm_start : 0);
+			vma_next(vma) ? vma_next(vma)->vm_start : 0);
 	} else {
 		/*
 		 * otherwise, clean from old_start; this is done to not touch
@@ -564,7 +564,7 @@ static int shift_arg_pages(struct vm_are
 		 * for the others its just a little faster.
 		 */
 		free_pgd_range(&tlb, old_start, old_end, new_end,
-			vma->vm_next ? vma->vm_next->vm_start : 0);
+			vma_next(vma) ? vma_next(vma)->vm_start : 0);
 	}
 	tlb_finish_mmu(tlb, new_end, old_end);
 
--- linux-2.6.23-rc9.orig/fs/hugetlbfs/inode.c
+++ linux-2.6.23-rc9/fs/hugetlbfs/inode.c
@@ -158,7 +158,7 @@ hugetlb_get_unmapped_area(struct file *f
 full_search:
 	addr = ALIGN(start_addr, HPAGE_SIZE);
 
-	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
+	for (vma = find_vma(mm, addr); ; vma = vma_next(vma)) {
 		/* At this point:  (!vma || addr < vma->vm_end). */
 		if (TASK_SIZE - len < addr) {
 			/*
--- linux-2.6.23-rc9.orig/fs/proc/task_mmu.c
+++ linux-2.6.23-rc9/fs/proc/task_mmu.c
@@ -87,11 +87,11 @@ int proc_exe_link(struct inode *inode, s
 		goto out;
 	down_read(&mm->mmap_sem);
 
-	vma = mm->mmap;
+	vma = __vma_next(&mm->mm_vmas, NULL);
 	while (vma) {
 		if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file)
 			break;
-		vma = vma->vm_next;
+		vma = vma_next(vma);
 	}
 
 	if (vma) {
@@ -364,7 +364,7 @@ void clear_refs_smap(struct mm_struct *m
 	struct vm_area_struct *vma;
 
 	down_read(&mm->mmap_sem);
-	for (vma = mm->mmap; vma; vma = vma->vm_next)
+	list_for_each_entry(vma, &mm->mm_vmas, vm_list)
 		if (vma->vm_mm && !is_vm_hugetlb_page(vma))
 			walk_page_range(vma, clear_refs_pte_range, NULL);
 	flush_tlb_mm(mm);
@@ -406,7 +406,7 @@ static void *m_start(struct seq_file *m,
 
 	/* Start with last addr hint */
 	if (last_addr && (vma = find_vma(mm, last_addr))) {
-		vma = vma->vm_next;
+		vma = vma_next(vma);
 		goto out;
 	}
 
@@ -416,9 +416,9 @@ static void *m_start(struct seq_file *m,
 	 */
 	vma = NULL;
 	if ((unsigned long)l < mm->map_count) {
-		vma = mm->mmap;
+		vma = __vma_next(&mm->mm_vmas, NULL);
 		while (l-- && vma)
-			vma = vma->vm_next;
+			vma = vma_next(vma);
 		goto out;
 	}
 
@@ -448,12 +448,12 @@ static void vma_stop(struct proc_maps_pr
 static void *m_next(struct seq_file *m, void *v, loff_t *pos)
 {
 	struct proc_maps_private *priv = m->private;
-	struct vm_area_struct *vma = v;
+	struct vm_area_struct *vma = v, *next;
 	struct vm_area_struct *tail_vma = priv->tail_vma;
 
 	(*pos)++;
-	if (vma && (vma != tail_vma) && vma->vm_next)
-		return vma->vm_next;
+	if (vma && (vma != tail_vma) && (next = vma_next(vma)))
+		return next;
 	vma_stop(priv, vma);
 	return (vma != tail_vma)? tail_vma: NULL;
 }
--- linux-2.6.23-rc9.orig/include/linux/init_task.h
+++ linux-2.6.23-rc9/include/linux/init_task.h
@@ -47,6 +47,7 @@
 
 #define INIT_MM(name) \
 {			 					\
+	.mm_vmas	= LIST_HEAD_INIT(name.mm_vmas),		\
 	.mm_rb		= RB_ROOT,				\
 	.pgd		= swapper_pg_dir, 			\
 	.mm_users	= ATOMIC_INIT(2), 			\
--- linux-2.6.23-rc9.orig/include/linux/mm.h
+++ linux-2.6.23-rc9/include/linux/mm.h
@@ -35,6 +35,7 @@ extern int sysctl_legacy_va_layout;
 #define sysctl_legacy_va_layout 0
 #endif
 
+#include <linux/sched.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/processor.h>
@@ -63,7 +64,7 @@ struct vm_area_struct {
 					   within vm_mm. */
 
 	/* linked list of VM areas per task, sorted by address */
-	struct vm_area_struct *vm_next;
+	struct list_head vm_list;
 
 	pgprot_t vm_page_prot;		/* Access permissions of this VMA. */
 	unsigned long vm_flags;		/* Flags, listed below. */
@@ -113,6 +114,42 @@ struct vm_area_struct {
 #endif
 };
 
+static inline struct vm_area_struct *
+__vma_next(struct list_head *head, struct vm_area_struct *vma)
+{
+	if (unlikely(!vma))
+		vma = container_of(head, struct vm_area_struct, vm_list);
+
+	if (vma->vm_list.next == head)
+		return NULL;
+
+	return list_entry(vma->vm_list.next, struct vm_area_struct, vm_list);
+}
+
+static inline struct vm_area_struct *
+vma_next(struct vm_area_struct *vma)
+{
+	return __vma_next(&vma->vm_mm->mm_vmas, vma);
+}
+
+static inline struct vm_area_struct *
+__vma_prev(struct list_head *head, struct vm_area_struct *vma)
+{
+	if (unlikely(!vma))
+		vma = container_of(head, struct vm_area_struct, vm_list);
+
+	if (vma->vm_list.prev == head)
+		return NULL;
+
+	return list_entry(vma->vm_list.prev, struct vm_area_struct, vm_list);
+}
+
+static inline struct vm_area_struct *
+vma_prev(struct vm_area_struct *vma)
+{
+	return __vma_prev(&vma->vm_mm->mm_vmas, vma);
+}
+
 extern struct kmem_cache *vm_area_cachep;
 
 /*
@@ -769,13 +806,14 @@ struct zap_details {
 struct page *vm_normal_page(struct vm_area_struct *, unsigned long, pte_t);
 unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
 		unsigned long size, struct zap_details *);
-unsigned long unmap_vmas(struct mmu_gather **tlb,
+unsigned long unmap_vmas(struct mmu_gather **tlb, struct list_head *vmas,
 		struct vm_area_struct *start_vma, unsigned long start_addr,
 		unsigned long end_addr, unsigned long *nr_accounted,
 		struct zap_details *);
 void free_pgd_range(struct mmu_gather **tlb, unsigned long addr,
 		unsigned long end, unsigned long floor, unsigned long ceiling);
-void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *start_vma,
+void free_pgtables(struct mmu_gather **tlb, struct list_head *vmas,
+		struct vm_area_struct *start_vma,
 		unsigned long floor, unsigned long ceiling);
 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
 			struct vm_area_struct *vma);
--- linux-2.6.23-rc9.orig/include/linux/sched.h
+++ linux-2.6.23-rc9/include/linux/sched.h
@@ -367,7 +367,7 @@ extern int get_dumpable(struct mm_struct
 	((1 << MMF_DUMP_ANON_PRIVATE) |	(1 << MMF_DUMP_ANON_SHARED))
 
 struct mm_struct {
-	struct vm_area_struct * mmap;		/* list of VMAs */
+	struct list_head mm_vmas;
 	struct rb_root mm_rb;
 	struct vm_area_struct * mmap_cache;	/* last find_vma result */
 	unsigned long (*get_unmapped_area) (struct file *filp,
--- linux-2.6.23-rc9.orig/ipc/shm.c
+++ linux-2.6.23-rc9/ipc/shm.c
@@ -1034,7 +1034,7 @@ asmlinkage long sys_shmdt(char __user *s
 	vma = find_vma(mm, addr);
 
 	while (vma) {
-		next = vma->vm_next;
+		next = vma_next(vma);
 
 		/*
 		 * Check if the starting address would match, i.e. it's
@@ -1067,7 +1067,7 @@ asmlinkage long sys_shmdt(char __user *s
 	 */
 	size = PAGE_ALIGN(size);
 	while (vma && (loff_t)(vma->vm_end - addr) <= size) {
-		next = vma->vm_next;
+		next = vma_next(vma);
 
 		/* finding a matching vma now does not alter retval */
 		if ((vma->vm_ops == &shm_vm_ops) &&
--- linux-2.6.23-rc9.orig/kernel/acct.c
+++ linux-2.6.23-rc9/kernel/acct.c
@@ -540,11 +540,8 @@ void acct_collect(long exitcode, int gro
 	if (group_dead && current->mm) {
 		struct vm_area_struct *vma;
 		down_read(&current->mm->mmap_sem);
-		vma = current->mm->mmap;
-		while (vma) {
+		list_for_each_entry(vma, &current->mm->mm_vmas, vm_list)
 			vsize += vma->vm_end - vma->vm_start;
-			vma = vma->vm_next;
-		}
 		up_read(&current->mm->mmap_sem);
 	}
 
--- linux-2.6.23-rc9.orig/kernel/auditsc.c
+++ linux-2.6.23-rc9/kernel/auditsc.c
@@ -780,8 +780,7 @@ static void audit_log_task_info(struct a
 
 	if (mm) {
 		down_read(&mm->mmap_sem);
-		vma = mm->mmap;
-		while (vma) {
+		list_for_each_entry(vma, &mm->mm_vmas, vm_list) {
 			if ((vma->vm_flags & VM_EXECUTABLE) &&
 			    vma->vm_file) {
 				audit_log_d_path(ab, "exe=",
@@ -789,7 +788,6 @@ static void audit_log_task_info(struct a
 						 vma->vm_file->f_path.mnt);
 				break;
 			}
-			vma = vma->vm_next;
 		}
 		up_read(&mm->mmap_sem);
 	}
--- linux-2.6.23-rc9.orig/kernel/fork.c
+++ linux-2.6.23-rc9/kernel/fork.c
@@ -197,7 +197,7 @@ static struct task_struct *dup_task_stru
 #ifdef CONFIG_MMU
 static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 {
-	struct vm_area_struct *mpnt, *tmp, **pprev;
+	struct vm_area_struct *mpnt, *tmp;
 	struct rb_node **rb_link, *rb_parent;
 	int retval;
 	unsigned long charge;
@@ -211,7 +211,6 @@ static inline int dup_mmap(struct mm_str
 	down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
 
 	mm->locked_vm = 0;
-	mm->mmap = NULL;
 	mm->mmap_cache = NULL;
 	mm->free_area_cache = oldmm->mmap_base;
 	mm->cached_hole_size = ~0UL;
@@ -220,9 +219,8 @@ static inline int dup_mmap(struct mm_str
 	mm->mm_rb = RB_ROOT;
 	rb_link = &mm->mm_rb.rb_node;
 	rb_parent = NULL;
-	pprev = &mm->mmap;
 
-	for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
+	list_for_each_entry(mpnt, &oldmm->mm_vmas, vm_list) {
 		struct file *file;
 
 		if (mpnt->vm_flags & VM_DONTCOPY) {
@@ -250,7 +248,6 @@ static inline int dup_mmap(struct mm_str
 		vma_set_policy(tmp, pol);
 		tmp->vm_flags &= ~VM_LOCKED;
 		tmp->vm_mm = mm;
-		tmp->vm_next = NULL;
 		anon_vma_link(tmp);
 		file = tmp->vm_file;
 		if (file) {
@@ -271,9 +268,8 @@ static inline int dup_mmap(struct mm_str
 		/*
 		 * Link in the new vma and copy the page table entries.
 		 */
-		*pprev = tmp;
-		pprev = &tmp->vm_next;
 
+		list_add_tail(&tmp->vm_list, &mm->mm_vmas);
 		__vma_link_rb(mm, tmp, rb_link, rb_parent);
 		rb_link = &tmp->vm_rb.rb_right;
 		rb_parent = &tmp->vm_rb;
@@ -330,6 +326,7 @@ static inline void mm_free_pgd(struct mm
 
 static struct mm_struct * mm_init(struct mm_struct * mm)
 {
+	INIT_LIST_HEAD(&mm->mm_vmas);
 	atomic_set(&mm->mm_users, 1);
 	atomic_set(&mm->mm_count, 1);
 	init_rwsem(&mm->mmap_sem);
--- linux-2.6.23-rc9.orig/mm/madvise.c
+++ linux-2.6.23-rc9/mm/madvise.c
@@ -351,7 +351,7 @@ asmlinkage long sys_madvise(unsigned lon
 		if (start >= end)
 			goto out;
 		if (prev)
-			vma = prev->vm_next;
+			vma = vma_next(prev);
 		else	/* madvise_remove dropped mmap_sem */
 			vma = find_vma(current->mm, start);
 	}
--- linux-2.6.23-rc9.orig/mm/memory.c
+++ linux-2.6.23-rc9/mm/memory.c
@@ -264,11 +264,11 @@ void free_pgd_range(struct mmu_gather **
 		flush_tlb_pgtables((*tlb)->mm, start, end);
 }
 
-void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
-		unsigned long floor, unsigned long ceiling)
+void free_pgtables(struct mmu_gather **tlb, struct list_head *vmas,
+		struct vm_area_struct *vma, unsigned long floor, unsigned long ceiling)
 {
 	while (vma) {
-		struct vm_area_struct *next = vma->vm_next;
+		struct vm_area_struct *next = __vma_next(vmas, vma);
 		unsigned long addr = vma->vm_start;
 
 		/*
@@ -287,7 +287,7 @@ void free_pgtables(struct mmu_gather **t
 			while (next && next->vm_start <= vma->vm_end + PMD_SIZE
 			       && !is_vm_hugetlb_page(next)) {
 				vma = next;
-				next = vma->vm_next;
+				next = __vma_next(vmas, vma);
 				anon_vma_unlink(vma);
 				unlink_file_vma(vma);
 			}
@@ -806,7 +806,7 @@ static unsigned long unmap_page_range(st
  * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
  * drops the lock and schedules.
  */
-unsigned long unmap_vmas(struct mmu_gather **tlbp,
+unsigned long unmap_vmas(struct mmu_gather **tlbp, struct list_head *vmas,
 		struct vm_area_struct *vma, unsigned long start_addr,
 		unsigned long end_addr, unsigned long *nr_accounted,
 		struct zap_details *details)
@@ -818,7 +818,7 @@ unsigned long unmap_vmas(struct mmu_gath
 	spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
 	int fullmm = (*tlbp)->fullmm;
 
-	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
+	for ( ; vma && vma->vm_start < end_addr; vma = __vma_next(vmas, vma)) {
 		unsigned long end;
 
 		start = max(vma->vm_start, start_addr);
@@ -889,7 +889,8 @@ unsigned long zap_page_range(struct vm_a
 	lru_add_drain();
 	tlb = tlb_gather_mmu(mm, 0);
 	update_hiwater_rss(mm);
-	end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
+	end = unmap_vmas(&tlb, &vma->vm_mm->mm_vmas, vma,
+			address, end, &nr_accounted, details);
 	if (tlb)
 		tlb_finish_mmu(tlb, address, end);
 	return end;
@@ -2093,7 +2094,7 @@ int vmtruncate_range(struct inode *inode
 void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struct *vma)
 {
 #ifdef CONFIG_NUMA
-	struct vm_area_struct *next_vma = vma ? vma->vm_next : NULL;
+	struct vm_area_struct *next_vma = vma ? vma_next(vma) : NULL;
 #endif
 	int i, num;
 	struct page *new_page;
@@ -2120,14 +2121,14 @@ void swapin_readahead(swp_entry_t entry,
 		if (vma) {
 			if (addr >= vma->vm_end) {
 				vma = next_vma;
-				next_vma = vma ? vma->vm_next : NULL;
+				next_vma = vma ? vma_next(vma) : NULL;
 			}
 			if (vma && addr < vma->vm_start)
 				vma = NULL;
 		} else {
 			if (next_vma && addr >= next_vma->vm_start) {
 				vma = next_vma;
-				next_vma = vma->vm_next;
+				next_vma = vma_next(vma);
 			}
 		}
 #endif
--- linux-2.6.23-rc9.orig/mm/mempolicy.c
+++ linux-2.6.23-rc9/mm/mempolicy.c
@@ -344,9 +344,9 @@ check_range(struct mm_struct *mm, unsign
 	if (!first)
 		return ERR_PTR(-EFAULT);
 	prev = NULL;
-	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
+	for (vma = first; vma && vma->vm_start < end; vma = vma_next(vma)) {
 		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
-			if (!vma->vm_next && vma->vm_end < end)
+			if (!vma_next(vma) && vma->vm_end < end)
 				return ERR_PTR(-EFAULT);
 			if (prev && prev->vm_end < vma->vm_start)
 				return ERR_PTR(-EFAULT);
@@ -403,7 +403,7 @@ static int mbind_range(struct vm_area_st
 
 	err = 0;
 	for (; vma && vma->vm_start < end; vma = next) {
-		next = vma->vm_next;
+		next = vma_next(vma);
 		if (vma->vm_start < start)
 			err = split_vma(vma->vm_mm, vma, start, 1);
 		if (!err && vma->vm_end > end)
@@ -610,7 +610,7 @@ int migrate_to_node(struct mm_struct *mm
 	nodes_clear(nmask);
 	node_set(source, nmask);
 
-	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
+	check_range(mm, __vma_next(&mm->mm_vmas, NULL)->vm_start, TASK_SIZE, &nmask,
 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 
 	if (!list_empty(&pagelist))
@@ -1785,7 +1785,7 @@ void mpol_rebind_mm(struct mm_struct *mm
 	struct vm_area_struct *vma;
 
 	down_write(&mm->mmap_sem);
-	for (vma = mm->mmap; vma; vma = vma->vm_next)
+	list_for_each_entry(vma, &mm->mm_vmas, vm_list)
 		mpol_rebind_policy(vma->vm_policy, new);
 	up_write(&mm->mmap_sem);
 }
--- linux-2.6.23-rc9.orig/mm/migrate.c
+++ linux-2.6.23-rc9/mm/migrate.c
@@ -1030,7 +1030,7 @@ int migrate_vmas(struct mm_struct *mm, c
  	struct vm_area_struct *vma;
  	int err = 0;
 
- 	for(vma = mm->mmap; vma->vm_next && !err; vma = vma->vm_next) {
+	list_for_each_entry(vma, &mm->mm_vmas, vm_list) {
  		if (vma->vm_ops && vma->vm_ops->migrate) {
  			err = vma->vm_ops->migrate(vma, to, from, flags);
  			if (err)
--- linux-2.6.23-rc9.orig/mm/mlock.c
+++ linux-2.6.23-rc9/mm/mlock.c
@@ -123,7 +123,7 @@ static int do_mlock(unsigned long start,
 		if (nstart >= end)
 			break;
 
-		vma = prev->vm_next;
+		vma = vma_next(prev);
 		if (!vma || vma->vm_start != nstart) {
 			error = -ENOMEM;
 			break;
@@ -181,7 +181,7 @@ static int do_mlockall(int flags)
 	if (flags == MCL_FUTURE)
 		goto out;
 
-	for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
+	list_for_each_entry(vma, &current->mm->mm_vmas, vm_list) {
 		unsigned int newflags;
 
 		newflags = vma->vm_flags | VM_LOCKED;
@@ -190,6 +190,7 @@ static int do_mlockall(int flags)
 
 		/* Ignore errors */
 		mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
+		vma = prev;
 	}
 out:
 	return 0;
--- linux-2.6.23-rc9.orig/mm/mmap.c
+++ linux-2.6.23-rc9/mm/mmap.c
@@ -35,7 +35,7 @@
 #define arch_mmap_check(addr, len, flags)	(0)
 #endif
 
-static void unmap_region(struct mm_struct *mm,
+static void unmap_region(struct mm_struct *mm, struct list_head *vmas,
 		struct vm_area_struct *vma, struct vm_area_struct *prev,
 		unsigned long start, unsigned long end);
 
@@ -220,18 +220,17 @@ void unlink_file_vma(struct vm_area_stru
 /*
  * Close a vm structure and free it, returning the next.
  */
-static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
+static void remove_vma(struct vm_area_struct *vma)
 {
-	struct vm_area_struct *next = vma->vm_next;
-
 	might_sleep();
+	list_del(&vma->vm_list);
 	if (vma->vm_ops && vma->vm_ops->close)
 		vma->vm_ops->close(vma);
 	if (vma->vm_file)
 		fput(vma->vm_file);
 	mpol_free(vma_policy(vma));
 	kmem_cache_free(vm_area_cachep, vma);
-	return next;
+	return;
 }
 
 asmlinkage unsigned long sys_brk(unsigned long brk)
@@ -316,11 +315,9 @@ void validate_mm(struct mm_struct *mm)
 {
 	int bug = 0;
 	int i = 0;
-	struct vm_area_struct *tmp = mm->mmap;
-	while (tmp) {
-		tmp = tmp->vm_next;
+	struct vm_area_struct *vma;
+	list_for_each_entry(vma, &mm->mm_vmas, vm_list)
 		i++;
-	}
 	if (i != mm->map_count)
 		printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1;
 	i = browse_rb(&mm->mm_rb);
@@ -374,15 +371,14 @@ __vma_link_list(struct mm_struct *mm, st
 		struct vm_area_struct *prev, struct rb_node *rb_parent)
 {
 	if (prev) {
-		vma->vm_next = prev->vm_next;
-		prev->vm_next = vma;
+		list_add(&vma->vm_list, &prev->vm_list);
 	} else {
-		mm->mmap = vma;
-		if (rb_parent)
-			vma->vm_next = rb_entry(rb_parent,
+		if (rb_parent) {
+			struct vm_area_struct *next = rb_entry(rb_parent,
 					struct vm_area_struct, vm_rb);
-		else
-			vma->vm_next = NULL;
+			list_add_tail(&vma->vm_list, &next->vm_list);
+		} else
+			list_add(&vma->vm_list, &mm->mm_vmas);
 	}
 }
 
@@ -472,7 +468,7 @@ static inline void
 __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
 		struct vm_area_struct *prev)
 {
-	prev->vm_next = vma->vm_next;
+	list_del(&vma->vm_list);
 	rb_erase(&vma->vm_rb, &mm->mm_rb);
 	if (mm->mmap_cache == vma)
 		mm->mmap_cache = prev;
@@ -489,7 +485,7 @@ void vma_adjust(struct vm_area_struct *v
 	unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
 {
 	struct mm_struct *mm = vma->vm_mm;
-	struct vm_area_struct *next = vma->vm_next;
+	struct vm_area_struct *next = vma_next(vma);
 	struct vm_area_struct *importer = NULL;
 	struct address_space *mapping = NULL;
 	struct prio_tree_root *root = NULL;
@@ -630,7 +626,7 @@ again:			remove_next = 1 + (end > next->
 		 * up the code too much to do both in one go.
 		 */
 		if (remove_next == 2) {
-			next = vma->vm_next;
+			next = vma_next(vma);
 			goto again;
 		}
 	}
@@ -751,13 +747,10 @@ struct vm_area_struct *vma_merge(struct 
 	if (vm_flags & VM_SPECIAL)
 		return NULL;
 
-	if (prev)
-		next = prev->vm_next;
-	else
-		next = mm->mmap;
+	next = __vma_next(&mm->mm_vmas, prev);
 	area = next;
 	if (next && next->vm_end == end)		/* cases 6, 7, 8 */
-		next = next->vm_next;
+		next = vma_next(next);
 
 	/*
 	 * Can it merge with the predecessor?
@@ -816,7 +809,7 @@ struct anon_vma *find_mergeable_anon_vma
 	struct vm_area_struct *near;
 	unsigned long vm_flags;
 
-	near = vma->vm_next;
+	near = vma_next(vma);
 	if (!near)
 		goto try_prev;
 
@@ -899,6 +892,7 @@ unsigned long do_mmap_pgoff(struct file 
 	int error;
 	int accountable = 1;
 	unsigned long reqprot = prot;
+	LIST_HEAD(vmas);
 
 	/*
 	 * Does the application expect PROT_READ to imply PROT_EXEC?
@@ -1075,6 +1069,7 @@ unsigned long mmap_region(struct file *f
 	struct rb_node **rb_link, *rb_parent;
 	unsigned long charged = 0;
 	struct inode *inode =  file ? file->f_path.dentry->d_inode : NULL;
+	LIST_HEAD(vmas);
 
 	/* Clear old maps */
 	error = -ENOMEM;
@@ -1210,7 +1205,8 @@ unmap_and_free_vma:
 	fput(file);
 
 	/* Undo any partial mapping done by a device driver. */
-	unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
+	list_add(&vma->vm_list, &vmas);
+	unmap_region(mm, &vmas, vma, prev, vma->vm_start, vma->vm_end);
 	charged = 0;
 free_vma:
 	kmem_cache_free(vm_area_cachep, vma);
@@ -1261,7 +1257,7 @@ arch_get_unmapped_area(struct file *filp
 	}
 
 full_search:
-	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
+	for (vma = find_vma(mm, addr); ; vma = vma_next(vma)) {
 		/* At this point:  (!vma || addr < vma->vm_end). */
 		if (TASK_SIZE - len < addr) {
 			/*
@@ -1472,14 +1468,11 @@ struct vm_area_struct *
 find_vma_prev(struct mm_struct *mm, unsigned long addr,
 			struct vm_area_struct **pprev)
 {
-	struct vm_area_struct *vma = NULL, *prev = NULL;
+	struct vm_area_struct *prev = NULL, *next;
 	struct rb_node * rb_node;
 	if (!mm)
 		goto out;
 
-	/* Guard against addr being lower than the first VMA */
-	vma = mm->mmap;
-
 	/* Go through the RB tree quickly. */
 	rb_node = mm->mm_rb.rb_node;
 
@@ -1491,7 +1484,8 @@ find_vma_prev(struct mm_struct *mm, unsi
 			rb_node = rb_node->rb_left;
 		} else {
 			prev = vma_tmp;
-			if (!prev->vm_next || (addr < prev->vm_next->vm_end))
+			next = __vma_next(&mm->mm_vmas, prev);
+			if (!next || (addr < next->vm_end))
 				break;
 			rb_node = rb_node->rb_right;
 		}
@@ -1499,7 +1493,7 @@ find_vma_prev(struct mm_struct *mm, unsi
 
 out:
 	*pprev = prev;
-	return prev ? prev->vm_next : vma;
+	return __vma_next(&mm->mm_vmas, prev);
 }
 
 /*
@@ -1707,18 +1701,21 @@ find_extend_vma(struct mm_struct * mm, u
  *
  * Called with the mm semaphore held.
  */
-static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
+static void remove_vma_list(struct mm_struct *mm, struct list_head *vmas,
+		struct vm_area_struct *vma)
 {
 	/* Update high watermark before we lower total_vm */
 	update_hiwater_vm(mm);
 	do {
+		struct vm_area_struct *next = __vma_next(vmas, vma);
 		long nrpages = vma_pages(vma);
 
 		mm->total_vm -= nrpages;
 		if (vma->vm_flags & VM_LOCKED)
 			mm->locked_vm -= nrpages;
 		vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
-		vma = remove_vma(vma);
+		remove_vma(vma);
+		vma = next;
 	} while (vma);
 	validate_mm(mm);
 }
@@ -1728,21 +1725,22 @@ static void remove_vma_list(struct mm_st
  *
  * Called with the mm semaphore held.
  */
-static void unmap_region(struct mm_struct *mm,
+static void unmap_region(struct mm_struct *mm, struct list_head *vmas,
 		struct vm_area_struct *vma, struct vm_area_struct *prev,
 		unsigned long start, unsigned long end)
 {
-	struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
+	struct vm_area_struct *next = __vma_next(&mm->mm_vmas, prev);
 	struct mmu_gather *tlb;
 	unsigned long nr_accounted = 0;
 
 	lru_add_drain();
 	tlb = tlb_gather_mmu(mm, 0);
 	update_hiwater_rss(mm);
-	unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
+	unmap_vmas(&tlb, vmas, vma, start, end, &nr_accounted, NULL);
 	vm_unacct_memory(nr_accounted);
-	free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
-				 next? next->vm_start: 0);
+	free_pgtables(&tlb, vmas, vma,
+			prev ? prev->vm_end : FIRST_USER_ADDRESS,
+			next ? next->vm_start : 0);
 	tlb_finish_mmu(tlb, start, end);
 }
 
@@ -1752,21 +1750,17 @@ static void unmap_region(struct mm_struc
  */
 static void
 detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
-	struct vm_area_struct *prev, unsigned long end)
+	struct vm_area_struct *prev, unsigned long end, struct list_head *vmas)
 {
-	struct vm_area_struct **insertion_point;
-	struct vm_area_struct *tail_vma = NULL;
 	unsigned long addr;
 
-	insertion_point = (prev ? &prev->vm_next : &mm->mmap);
 	do {
+		struct vm_area_struct *next = vma_next(vma);
 		rb_erase(&vma->vm_rb, &mm->mm_rb);
 		mm->map_count--;
-		tail_vma = vma;
-		vma = vma->vm_next;
+		list_move_tail(&vma->vm_list, vmas);
+		vma = next;
 	} while (vma && vma->vm_start < end);
-	*insertion_point = vma;
-	tail_vma->vm_next = NULL;
 	if (mm->unmap_area == arch_unmap_area)
 		addr = prev ? prev->vm_end : mm->mmap_base;
 	else
@@ -1836,6 +1830,7 @@ int do_munmap(struct mm_struct *mm, unsi
 {
 	unsigned long end;
 	struct vm_area_struct *vma, *prev, *last;
+	LIST_HEAD(vmas);
 
 	if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
 		return -EINVAL;
@@ -1875,16 +1870,16 @@ int do_munmap(struct mm_struct *mm, unsi
 		if (error)
 			return error;
 	}
-	vma = prev? prev->vm_next: mm->mmap;
+	vma = __vma_next(&mm->mm_vmas, prev);
 
 	/*
 	 * Remove the vma's, and unmap the actual pages
 	 */
-	detach_vmas_to_be_unmapped(mm, vma, prev, end);
-	unmap_region(mm, vma, prev, start, end);
+	detach_vmas_to_be_unmapped(mm, vma, prev, end, &vmas);
+	unmap_region(mm, &vmas, vma, prev, start, end);
 
 	/* Fix up all other VM information */
-	remove_vma_list(mm, vma);
+	remove_vma_list(mm, &vmas, vma);
 
 	return 0;
 }
@@ -2021,7 +2016,9 @@ EXPORT_SYMBOL(do_brk);
 void exit_mmap(struct mm_struct *mm)
 {
 	struct mmu_gather *tlb;
-	struct vm_area_struct *vma = mm->mmap;
+	LIST_HEAD(vmas);
+	struct vm_area_struct *vma = __vma_next(&mm->mm_vmas, NULL);
+	struct vm_area_struct *next;
 	unsigned long nr_accounted = 0;
 	unsigned long end;
 
@@ -2030,22 +2027,23 @@ void exit_mmap(struct mm_struct *mm)
 
 	lru_add_drain();
 	flush_cache_mm(mm);
+	detach_vmas_to_be_unmapped(mm, vma, NULL, -1, &vmas);
 	tlb = tlb_gather_mmu(mm, 1);
 	/* Don't update_hiwater_rss(mm) here, do_exit already did */
 	/* Use -1 here to ensure all VMAs in the mm are unmapped */
-	end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
+	end = unmap_vmas(&tlb, &vmas, vma, 0, -1, &nr_accounted, NULL);
 	vm_unacct_memory(nr_accounted);
-	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
+	free_pgtables(&tlb, &vmas, vma, FIRST_USER_ADDRESS, 0);
 	tlb_finish_mmu(tlb, 0, end);
 
 	/*
 	 * Walk the list again, actually closing and freeing it,
 	 * with preemption enabled, without holding any MM locks.
 	 */
-	while (vma)
-		vma = remove_vma(vma);
+	list_for_each_entry_safe(vma, next, &vmas, vm_list)
+		remove_vma(vma);
 
-	BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
+	/* BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); */
 }
 
 /* Insert vm structure into process list sorted by address
--- linux-2.6.23-rc9.orig/mm/mprotect.c
+++ linux-2.6.23-rc9/mm/mprotect.c
@@ -302,7 +302,7 @@ sys_mprotect(unsigned long start, size_t
 		if (nstart >= end)
 			goto out;
 
-		vma = prev->vm_next;
+		vma = vma_next(prev);
 		if (!vma || vma->vm_start != nstart) {
 			error = -ENOMEM;
 			goto out;
--- linux-2.6.23-rc9.orig/mm/mremap.c
+++ linux-2.6.23-rc9/mm/mremap.c
@@ -226,7 +226,7 @@ static unsigned long move_vma(struct vm_
 	if (excess) {
 		vma->vm_flags |= VM_ACCOUNT;
 		if (split)
-			vma->vm_next->vm_flags |= VM_ACCOUNT;
+			vma_next(vma)->vm_flags |= VM_ACCOUNT;
 	}
 
 	if (vm_flags & VM_LOCKED) {
@@ -360,8 +360,9 @@ unsigned long do_mremap(unsigned long ad
 	    !((flags & MREMAP_FIXED) && (addr != new_addr)) &&
 	    (old_len != new_len || !(flags & MREMAP_MAYMOVE))) {
 		unsigned long max_addr = TASK_SIZE;
-		if (vma->vm_next)
-			max_addr = vma->vm_next->vm_start;
+		struct vm_area_struct *next = vma_next(vma);
+		if (next)
+			max_addr = next->vm_start;
 		/* can we just expand the current mapping? */
 		if (max_addr - addr >= new_len) {
 			int pages = (new_len - old_len) >> PAGE_SHIFT;
--- linux-2.6.23-rc9.orig/mm/msync.c
+++ linux-2.6.23-rc9/mm/msync.c
@@ -93,7 +93,7 @@ asmlinkage long sys_msync(unsigned long 
 				error = 0;
 				goto out_unlock;
 			}
-			vma = vma->vm_next;
+			vma = vma_next(vma);
 		}
 	}
 out_unlock:
--- linux-2.6.23-rc9.orig/mm/swapfile.c
+++ linux-2.6.23-rc9/mm/swapfile.c
@@ -626,7 +626,7 @@ static int unuse_mm(struct mm_struct *mm
 		down_read(&mm->mmap_sem);
 		lock_page(page);
 	}
-	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+	list_for_each_entry(vma, &mm->mm_vmas, vm_list) {
 		if (vma->anon_vma && unuse_vma(vma, entry, page))
 			break;
 	}


* [PATCH/RFC 2/9] lib: RCU friendly B+tree
  2007-10-22 10:45 [PATCH/RFC 0/9] VMA lookup with RCU Vaidyanathan Srinivasan
  2007-10-22 10:45 ` [PATCH/RFC 1/9] Data structure changes Vaidyanathan Srinivasan
@ 2007-10-22 10:45 ` Vaidyanathan Srinivasan
  2007-10-22 10:45 ` [PATCH/RFC 3/9] mm: use the B+tree for vma lookups Vaidyanathan Srinivasan
                   ` (6 subsequent siblings)
  8 siblings, 0 replies; 10+ messages in thread
From: Vaidyanathan Srinivasan @ 2007-10-22 10:45 UTC (permalink / raw)
  To: linux-mm; +Cc: Alexis Bruemmer, Peter Zijlstra

[-- Attachment #1: 2_btree.patch --]
[-- Type: text/plain, Size: 33151 bytes --]

Introduce an RCU-friendly B+tree in lib/
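
For reviewers, a minimal usage sketch of the interface declared in
include/linux/btree.h below.  This is illustrative only and not part of the
patch; btree_demo(), demo_tree and payload are made-up names, and error
handling is kept to a minimum:

	#include <linux/btree.h>

	static int payload;		/* dummy object; data pointers must have bit 0 clear */
	static struct btree_root demo_tree;

	static int btree_demo(void)
	{
		void *found;
		int err;

		demo_tree = BTREE_INIT(GFP_KERNEL);

		/* optionally pre-claim node memory before modifying the tree */
		err = btree_preload(&demo_tree, GFP_KERNEL);
		if (err)
			return err;

		err = btree_insert(&demo_tree, 0x1000, &payload);
		if (err)		/* -EEXIST if the key already exists */
			return err;

		/* lockless lookups are meant to run under rcu_read_lock() */
		rcu_read_lock();
		found = btree_lookup(&demo_tree, 0x1000);
		rcu_read_unlock();

		btree_remove(&demo_tree, 0x1000);
		btree_root_destroy(&demo_tree);

		return found == &payload ? 0 : -EINVAL;
	}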

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/btree.h |  151 ++++++
 init/main.c           |    2 
 lib/Makefile          |    3 
 lib/btree.c           | 1189 ++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 1344 insertions(+), 1 deletion(-)

--- /dev/null
+++ linux-2.6.23-rc6/include/linux/btree.h
@@ -0,0 +1,151 @@
+#ifndef _LINUX_BTREE_H
+#define _LINUX_BTREE_H
+
+#define BTREE_DEBUG	1
+
+#include <linux/mempool.h>
+#include <linux/log2.h>
+#include <linux/rcupdate.h>
+#include <asm/cache.h>
+#include <asm/bitops.h>
+
+struct btree_item {
+	unsigned long key;
+	void *data;
+};
+
+#define BTREE_ITEM_CACHELINE (L1_CACHE_BYTES / sizeof(struct btree_item))
+#define BTREE_ITEM_MAX (BTREE_ITEM_CACHELINE < 16 ? 16 : BTREE_ITEM_CACHELINE)
+#define BTREE_ITEM_HALF (BTREE_ITEM_MAX/2)
+
+struct btree_node {
+	struct btree_item item[BTREE_ITEM_MAX];
+};
+
+/*
+ * Max depth of the tree.
+ * Assumes BTREE_ITEM_MAX is power of two.
+ */
+#define BTREE_MAX_DEPTH \
+	(1 + DIV_ROUND_UP(BITS_PER_LONG, ilog2(BTREE_ITEM_MAX)))
+
+/*
+ * Max number of nodes to be RCUed per modification.
+ */
+#define BTREE_NODE_REPLACE (2 * BTREE_MAX_DEPTH)
+
+struct btree_freevec {
+	struct rcu_head rcu_head;
+	int pos;
+	void *slot[BTREE_NODE_REPLACE];
+};
+
+struct btree_root {
+	struct btree_node *root;
+	int height;
+	mempool_t *mempool;
+	struct btree_freevec *freevec;
+	gfp_t gfp_mask;
+	void (*flush)(struct btree_freevec *);
+};
+
+#ifdef BTREE_DEBUG
+void btree_print(struct btree_root *);
+#else
+#define btree_print(root) do { } while (0)
+#endif
+
+static inline void *btree_item_deref(struct btree_item *itemp)
+{
+	void *ptr = rcu_dereference(itemp->data);
+	__clear_bit(0, (void *)&ptr);
+	return ptr;
+}
+
+static inline unsigned long btree_item_key(struct btree_item *itemp)
+{
+	unsigned long key = itemp->key;
+	/*
+	 * match the smp_wmb() in btree_item_key_set()
+	 */
+	smp_rmb();
+	return key;
+}
+
+extern struct btree_item *btree_search_next(struct btree_root *root,
+		unsigned long index, struct btree_item **nextp);
+
+static inline void *btree_lookup(struct btree_root *root, unsigned long index)
+{
+	struct btree_item *item;
+
+	item = btree_search_next(root, index, NULL);
+	if (!item || btree_item_key(item) != index)
+		return NULL;
+
+	return item->data;
+}
+
+static inline void *btree_lookup_next(struct btree_root *root,
+		unsigned long index, void **nextp)
+{
+	struct btree_item *next = NULL, *item;
+
+	item = btree_search_next(root, index, &next);
+	if (next)
+		*nextp = next->data;
+	if (!item || btree_item_key(item) != index)
+		return NULL;
+
+	return item->data;
+}
+
+static inline void *btree_stab(struct btree_root *root, unsigned long index)
+{
+	struct btree_item *item;
+
+	item = btree_search_next(root, index, NULL);
+	if (!item)
+		return NULL;
+
+	return item->data;
+}
+
+static inline void *btree_stab_next(struct btree_root *root,
+		unsigned long index, void **nextp)
+{
+	struct btree_item *next = NULL, *item;
+
+	item = btree_search_next(root, index, &next);
+	if (next)
+		*nextp = next->data;
+	if (!item)
+		return NULL;
+
+	return item->data;
+}
+
+extern int btree_preload(struct btree_root *, gfp_t);
+extern int btree_insert(struct btree_root *, unsigned long, void *);
+extern int btree_update(struct btree_root *, unsigned long, unsigned long);
+extern void *btree_remove(struct btree_root *, unsigned long);
+
+extern void btree_root_init(struct btree_root *, gfp_t);
+extern void btree_root_destroy(struct btree_root *);
+
+extern void btree_freevec_flush(struct btree_freevec *);
+
+#define BTREE_INIT_FLUSH(gfp, f) (struct btree_root){ 	\
+	.root = NULL, 					\
+	.height = 0,					\
+	.mempool = NULL,				\
+	.freevec = NULL,				\
+	.gfp_mask = (gfp),				\
+	.flush = (f),					\
+}
+
+#define BTREE_INIT(gfp) BTREE_INIT_FLUSH(gfp, btree_freevec_flush)
+
+extern void __init btree_init(void);
+
+#endif /* _LINUX_BTREE_H */
--- linux-2.6.23-rc6.orig/init/main.c
+++ linux-2.6.23-rc6/init/main.c
@@ -55,6 +55,7 @@
 #include <linux/pid_namespace.h>
 #include <linux/device.h>
 #include <linux/kthread.h>
+#include <linux/btree.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
@@ -613,6 +614,7 @@ asmlinkage void __init start_kernel(void
 	cpuset_init_early();
 	mem_init();
 	kmem_cache_init();
+	btree_init();
 	setup_per_cpu_pageset();
 	numa_policy_init();
 	if (late_time_init)
--- linux-2.6.23-rc6.orig/lib/Makefile
+++ linux-2.6.23-rc6/lib/Makefile
@@ -5,7 +5,8 @@
 lib-y := ctype.o string.o vsprintf.o kasprintf.o cmdline.o \
 	 rbtree.o radix-tree.o dump_stack.o \
 	 idr.o int_sqrt.o bitmap.o extable.o prio_tree.o \
-	 sha1.o irq_regs.o reciprocal_div.o argv_split.o
+	 sha1.o irq_regs.o reciprocal_div.o argv_split.o \
+	 btree.o
 
 lib-$(CONFIG_MMU) += ioremap.o
 lib-$(CONFIG_SMP) += cpumask.o
--- /dev/null
+++ linux-2.6.23-rc6/lib/btree.c
@@ -0,0 +1,1189 @@
+/*
+ * Copyright 2007, Red Hat Inc. Peter Zijlstra <pzijlstr@redhat.com>
+ * GPLv2
+ *
+ * Something that started out as an RCU friendly B+tree.
+ *
+ * Contrary to most B-trees, the nodes have an even number of slots.  This can
+ * be done because we carry all the children in the leaves and thus we don't
+ * need to push one up on a split.  We can just duplicate the first.
+ *
+ * The inner nodes are basically an N-way space partition.
+ *
+ * Another difference to the typical B+tree is that this implementation does
+ * not thread the leaves; this is left up to the user if so desired.
+ *
+ *
+ *   [------------------------------>[------------------------------>
+ *   |                               |
+ *   [------------------->[--------->[-------------->[-------------->
+ *   |                    |          |               |
+ *   [..) [..) [..) [..)  [..) [..)  [..) [..) [..)  [..) [..) [..)
+ *
+ *
+ */
+#include <linux/btree.h>
+#include <linux/slab.h>
+#include <linux/log2.h>
+#include <linux/err.h>
+
+#ifdef BTREE_DEBUG
+void btree_print(struct btree_root *root);
+int btree_validate(struct btree_root *root);
+#else
+#define btree_print(root) do { } while (0)
+#define btree_validate(root) (0)
+#endif
+
+static struct kmem_cache *btree_cachep;
+
+static inline void btree_free(struct btree_root *root, void *ptr)
+{
+	root->freevec->slot[root->freevec->pos++] = ptr;
+}
+
+static inline void btree_ptr_flip(struct btree_root *root,
+		void **nodep, void *new)
+{
+	void *old = rcu_dereference(*nodep);
+	rcu_assign_pointer(*nodep, new);
+	__clear_bit(0, (void *)&old);
+	btree_free(root, old);
+}
+
+/*
+ * node pointers have bit 0 set.
+ */
+static inline int btree_item_node(struct btree_item *itemp)
+{
+	return test_bit(0, (void *)&itemp->data);
+}
+
+static inline struct btree_node *btree_node_ptr(struct btree_node *nodep)
+{
+	__set_bit(0, (void *)&nodep);
+	return nodep;
+}
+
+static inline void btree_item_key_set(struct btree_item *itemp,
+		unsigned long index)
+{
+	/*
+	 * we can only tighten when the new node is linked in
+	 */
+	smp_wmb();
+	itemp->key = index;
+}
+
+static void btree_item_flip(struct btree_root *root,
+		struct btree_item *itemp, struct btree_node *new)
+{
+	unsigned long index;
+
+	btree_ptr_flip(root, &itemp->data, btree_node_ptr(new));
+
+	/* possibly tighten */
+	index = btree_item_key(&new->item[0]);
+	if (btree_item_key(itemp) != index)
+		btree_item_key_set(itemp, index);
+}
+
+static inline void btree_item_copy(void *dst, void *src, int nr)
+{
+	memcpy(dst, src, nr*sizeof(struct btree_item));
+}
+
+static inline int btree_item_count(struct btree_node *node)
+{
+	int j;
+
+	for (j = 0; j < BTREE_ITEM_MAX &&
+			node->item[j].data ; j++)
+		;
+
+	return j;
+}
+
+static struct btree_node *btree_node_alloc(struct btree_root *root)
+{
+	struct btree_node *node;
+
+	node = mempool_alloc(root->mempool, root->gfp_mask);
+	memset(node, 0, sizeof(struct btree_node));
+
+	return node;
+}
+
+static inline void btree_item_free(struct btree_root *root,
+		struct btree_item *itemp)
+{
+	if (btree_item_node(itemp))
+		btree_free(root, btree_item_deref(itemp));
+}
+
+/*
+ * node storage; expose interface to claim all the needed memory in advance
+ * this avoids getting stuck in a situation where going bad is as impossible
+ * as going forward.
+ */
+int btree_preload(struct btree_root *root, gfp_t gfp_mask)
+{
+	int ret = 0;
+
+	if (!root->mempool) {
+		root->mempool = mempool_create_slab_pool(0, btree_cachep);
+		if (!root->mempool)
+			return -ENOMEM;
+	}
+
+	if (!root->freevec) {
+		root->freevec = kzalloc(sizeof(struct btree_freevec), gfp_mask);
+		if (!root->freevec)
+			return -ENOMEM;
+	}
+
+	ret = mempool_resize(root->mempool, 1+2*root->height, gfp_mask);
+
+	return ret;
+}
+
+static int btree_mod_init(struct btree_root *root)
+{
+	return btree_preload(root, root->gfp_mask);
+}
+
+static int btree_mod_finish(struct btree_root *root, unsigned long index,
+		char *op)
+{
+	int ret = 0;
+	if (root->mempool)
+		ret = mempool_resize(root->mempool, 1, root->gfp_mask);
+
+	if (btree_validate(root)) {
+		printk(KERN_DEBUG "modified(%s): %lu\n", op, index);
+		btree_print(root);
+		BUG();
+	}
+
+	root->flush(root->freevec);
+	root->freevec = NULL;
+
+	return ret;
+}
+
+/*
+ * stack, to store traversal path.
+ */
+struct btree_path {
+	struct btree_node *node;
+	int offset;
+};
+
+struct btree_stack {
+	int p;
+	struct btree_path path[BTREE_MAX_DEPTH];
+};
+
+static inline void btree_stack_init(struct btree_stack *stack)
+{
+	memset(stack, 0, sizeof(*stack));
+}
+
+static inline void btree_stack_push(struct btree_stack *stack,
+		struct btree_node *node, int offset)
+{
+	stack->path[stack->p].node = node;
+	stack->path[stack->p].offset = offset;
+	stack->p++;
+
+	BUG_ON(stack->p > ARRAY_SIZE(stack->path));
+}
+
+static inline void btree_stack_pop(struct btree_stack *stack,
+		struct btree_node **nodep, int *offsetp)
+{
+	stack->p--;
+	*nodep = stack->path[stack->p].node;
+	*offsetp = stack->path[stack->p].offset;
+
+	BUG_ON(stack->p < 0);
+}
+
+static inline int btree_stack_size(struct btree_stack *stack)
+{
+	return stack->p;
+}
+
+static struct btree_node *btree_stack_peek_left(struct btree_stack *stack)
+{
+	struct btree_node *node;
+	struct btree_item *item;
+	int offset;
+
+	if (!btree_stack_size(stack))
+		return NULL;
+
+	node = stack->path[stack->p - 1].node;
+	offset = stack->path[stack->p - 1].offset;
+
+	if (offset == 0)
+		return NULL;
+
+	offset--;
+	item = &node->item[offset];
+	return btree_item_deref(item);
+}
+
+static struct btree_node *btree_stack_peek_right(struct btree_stack *stack)
+{
+	struct btree_node *node;
+	struct btree_item *item;
+	int offset;
+
+	if (!btree_stack_size(stack))
+		return NULL;
+
+	node = stack->path[stack->p - 1].node;
+	offset = stack->path[stack->p - 1].offset;
+
+	offset++;
+	if (offset == BTREE_ITEM_MAX)
+		return NULL;
+
+	item = &node->item[offset];
+	return btree_item_deref(item);
+}
+
+/* ------ search ------- */
+
+/*
+ * Since we can re-adjust the space partitioning in delete and update a
+ * lockless lookup might hit a wrong branch once in a while, hence we might
+ * need to backtrack.
+ *
+ * Because we do a search for a less than or equal index we can only backtrack
+ * to the left. So the split can be off to the left but never to the right.
+ * Hence we need to move splits left on descend and right on ascend of the
+ * tree. See update and remove.
+ *
+ * NOTE the backtrack should be limited to one entry so worst time is:
+ *   O(2*log(n)-1)
+ */
+
+/*
+ * find which branch to take
+ */
+static inline
+int __btree_item_search(struct btree_root *root,
+		struct btree_node *node, unsigned long index)
+{
+	int i;
+
+	for (i = 1; i < BTREE_ITEM_MAX; i++)
+		if (node->item[i].data == NULL ||
+				index < btree_item_key(&node->item[i]))
+			break;
+	i--;
+	return i;
+}
+
+/*
+ * find item and the path to it.
+ */
+static struct btree_item *__btree_search(struct btree_root *root,
+		struct btree_stack *stack, unsigned long index)
+{
+	struct btree_node *node = rcu_dereference(root->root);
+	struct btree_item *item;
+	int offset;
+
+	btree_stack_init(stack);
+
+	if (!node)
+		return NULL;
+
+	do {
+		/*
+		 * if the split was too low, back-track and take the previous
+		 * branch
+		 */
+		if (unlikely(btree_item_key(&node->item[0]) > index)) {
+			if (!btree_stack_size(stack))
+				return NULL;
+
+			do {
+				btree_stack_pop(stack, &node, &offset);
+			} while (btree_stack_size(stack) && offset == 0);
+
+			if (!btree_stack_size(stack) && offset == 0)
+				return NULL;
+
+			offset--;
+		} else
+			offset = __btree_item_search(root, node, index);
+
+		btree_stack_push(stack, node, offset);
+
+		item = &node->item[offset];
+		if (!btree_item_node(item))
+			break;
+
+		node = btree_item_deref(item);
+	} while (node);
+
+	return item;
+}
+
+/*
+ * find the left-most item in a sub-tree, and the path to it.
+ */
+static struct btree_item *__btree_find_first(struct btree_root *root,
+		struct btree_stack *stack, struct btree_node *node, int offset)
+{
+	struct btree_item *item;
+
+	do {
+		btree_stack_push(stack, node, offset);
+
+		item = &node->item[offset];
+		if (!btree_item_node(item))
+			break;
+
+		node = btree_item_deref(item);
+		offset = 0;
+	} while (node);
+
+	return item;
+}
+
+/*
+ * find the item with a lesser or equal index
+ * and optionally the next item.
+ */
+struct btree_item *btree_search_next(struct btree_root *root,
+		unsigned long index, struct btree_item **nextp)
+{
+	struct btree_stack stack;
+	struct btree_node *node;
+	struct btree_item *item, *next;
+	int offset;
+
+	item = __btree_search(root, &stack, index);
+
+	if (item && nextp) {
+		do {
+			btree_stack_pop(&stack, &node, &offset);
+		} while (btree_stack_size(&stack) &&
+				offset == BTREE_ITEM_MAX-1);
+
+		if (offset != BTREE_ITEM_MAX-1) {
+			offset++;
+			next = __btree_find_first(root, &stack, node, offset);
+		} else
+			next = NULL;
+
+		*nextp = next;
+	} else if (nextp) {
+		node = rcu_dereference(root->root);
+		offset = 0;
+
+		if (node) {
+			next = __btree_find_first(root, &stack, node, offset);
+			*nextp = next;
+		}
+	}
+
+	return item;
+}
+
+/* ------ insert ------- */
+
+/*
+ * insert item; split nodes when needed
+ */
+int btree_insert(struct btree_root *root,
+		unsigned long index, void *data)
+{
+	struct btree_stack stack;
+	struct btree_node *node = NULL,
+			  *old_node,
+			  *update,
+			  *new = NULL,
+			  *split = NULL,
+			  *uleft,
+			  *uright,
+			  *left = NULL,
+			  *right = NULL;
+	struct btree_item *item;
+	struct btree_item nitem = {
+		.key = index,
+		.data = data,
+	};
+	unsigned long key;
+	int i, j, n, ret = 0;
+
+	ret = btree_mod_init(root);
+	if (ret)
+		return ret;
+
+	item = __btree_search(root, &stack, index);
+	if (!item) {
+		if (!root->root) {
+			root->root = btree_node_alloc(root);
+			root->height = 1;
+			btree_stack_push(&stack, root->root, 0);
+		} else
+			__btree_find_first(root, &stack, root->root, 0);
+	} else if (btree_item_key(item) == index) {
+		ret = -EEXIST;
+		goto out;
+	}
+
+	do {
+		old_node = node;
+		btree_stack_pop(&stack, &node, &i);
+		item = &node->item[i];
+
+		update = NULL;
+		if (btree_item_node(item)) {
+			/* no change */
+			if (!new && !split && !left && !right) {
+				BUG_ON(btree_item_deref(item) != old_node);
+
+				/* possibly tighten the split on our way back up  */
+				key = btree_item_key(&old_node->item[0]);
+				if (btree_item_key(item) != key)
+					btree_item_key_set(item, key);
+				continue;
+			}
+
+			/* single change */
+			if (new && !split && !left && !right) {
+				btree_item_flip(root, item, new);
+				new = NULL;
+				continue;
+			}
+
+			/* merge left */
+			if (new && !split && left && !right) {
+				update = btree_node_alloc(root);
+				btree_item_copy(update->item, node->item,
+						BTREE_ITEM_MAX);
+				update->item[i-1] = (struct btree_item){
+					.key = btree_item_key(&left->item[0]),
+					.data = btree_node_ptr(left),
+				};
+				update->item[i] = (struct btree_item){
+					.key = btree_item_key(&new->item[0]),
+					.data = btree_node_ptr(new),
+				};
+				btree_item_free(root, &node->item[i-1]);
+				btree_item_free(root, &node->item[i]);
+
+				left = NULL;
+				new = update;
+				continue;
+			}
+
+			/* merge right */
+			if (new && !split && !left && right) {
+				update = btree_node_alloc(root);
+				btree_item_copy(update->item, node->item,
+						BTREE_ITEM_MAX);
+				update->item[i] = (struct btree_item){
+					.key = btree_item_key(&new->item[0]),
+					.data = btree_node_ptr(new),
+				};
+				update->item[i+1] = (struct btree_item){
+					.key = btree_item_key(&right->item[0]),
+					.data = btree_node_ptr(right),
+				};
+				btree_item_free(root, &node->item[i]);
+				btree_item_free(root, &node->item[i+1]);
+
+				new = update;
+				right = NULL;
+				continue;
+			}
+
+			/* split */
+			BUG_ON(left || right);
+			update = new;
+		}
+
+		if (btree_item_deref(item) &&
+				btree_item_key(item) < nitem.key)
+			i++;
+
+		split = NULL;
+		left = NULL;
+		right = NULL;
+		new = btree_node_alloc(root);
+		/* insert has room */
+		if (node->item[BTREE_ITEM_MAX-1].data == NULL) {
+			btree_item_copy(new->item, node->item, i);
+			new->item[i] = nitem;
+			btree_item_copy(new->item + i + 1, node->item + i,
+					BTREE_ITEM_MAX - i - 1);
+
+			if (update)
+				btree_item_flip(root, &new->item[i-1], update);
+
+			continue;
+		}
+
+#if 0	/* there be dragons here ! */
+
+		/* insert overflows */
+
+		/* see if left sibling will take some overflow */
+		uleft = btree_stack_peek_left(&stack);
+		if (uleft && uleft->item[BTREE_ITEM_MAX-1].data == NULL) {
+
+			j = btree_item_count(uleft);
+			n = (BTREE_ITEM_MAX - j + 1)/2;
+
+			left = btree_node_alloc(root);
+			/*
+			 * LLLLl... CCCCcccc
+			 * LLLLlC.. CCCcc*cc
+			 */
+			btree_item_copy(left->item, uleft->item, j);
+			if (i < n) {
+				btree_item_copy(left->item+j, node->item, i);
+				left->item[j+i] = nitem;
+				btree_item_copy(left->item+j+i+1, node->item+i,
+						n-i);
+				btree_item_copy(new->item, node->item+n,
+						BTREE_ITEM_MAX-n);
+			} else {
+				btree_item_copy(left->item+j, node->item, n);
+				btree_item_copy(new->item, node->item+n, i-n);
+				new->item[i-n] = nitem;
+				btree_item_copy(new->item+i-n+1, node->item+i,
+						BTREE_ITEM_MAX-i);
+			}
+
+			if (update) {
+				if (i-1 < n)
+					btree_item_flip(root,
+							&left->item[j+i-1],
+							update);
+				else
+					btree_item_flip(root, &new->item[i-n-1],
+							update);
+			}
+
+			continue;
+		}
+
+		/* see if right sibling will take some overflow */
+		uright = btree_stack_peek_right(&stack);
+		if (uright && uright->item[BTREE_ITEM_MAX-1].data == NULL) {
+
+			j = btree_item_count(uright);
+			n = (BTREE_ITEM_MAX - j + 1)/2;
+
+			right = btree_node_alloc(root);
+			/*
+			 * CCCCcccc RRRR....
+			 * CCCC*cc. ccRRRR..
+			 */
+			if (i <= BTREE_ITEM_MAX-n) {
+				btree_item_copy(new->item, node->item, i);
+				new->item[i] = nitem;
+				btree_item_copy(new->item+i+1, node->item+i,
+						BTREE_ITEM_MAX-n-i);
+				btree_item_copy(right->item,
+						node->item+BTREE_ITEM_MAX-n, n);
+				btree_item_copy(right->item+n, uright->item, j);
+			} else {
+				btree_item_copy(new->item, node->item,
+					       	BTREE_ITEM_MAX-n);
+
+				btree_item_copy(right->item,
+						node->item+(BTREE_ITEM_MAX-n),
+						i-(BTREE_ITEM_MAX-n));
+				right->item[i-(BTREE_ITEM_MAX-n)] = nitem;
+				btree_item_copy(right->item+i-(BTREE_ITEM_MAX-n)+1,
+						node->item+i, BTREE_ITEM_MAX-i);
+				btree_item_copy(right->item+n+1, uright->item,
+						j);
+			}
+
+			if (update) {
+				if (i-1 <= BTREE_ITEM_MAX-n)
+					btree_item_flip(root, &new->item[i-1],
+							update);
+				else
+					btree_item_flip(root,
+						&right->item[i-1-BTREE_ITEM_MAX-n],
+							update);
+			}
+
+			continue;
+		}
+#endif
+
+		/* split node */
+		split = btree_node_alloc(root);
+		if (i < BTREE_ITEM_HALF) {
+			btree_item_copy(new->item, node->item, i);
+			new->item[i] = nitem;
+			btree_item_copy(new->item + i + 1, node->item + i,
+					BTREE_ITEM_HALF - i);
+			btree_item_copy(split->item,
+					node->item + BTREE_ITEM_HALF,
+					BTREE_ITEM_HALF);
+		} else {
+			btree_item_copy(new->item, node->item,
+					BTREE_ITEM_HALF);
+			btree_item_copy(split->item,
+					node->item + BTREE_ITEM_HALF,
+					i - BTREE_ITEM_HALF);
+			split->item[i - BTREE_ITEM_HALF] = nitem;
+			btree_item_copy(split->item + i - BTREE_ITEM_HALF + 1,
+					node->item + i,
+					BTREE_ITEM_MAX - i);
+		}
+
+		if (update) {
+			if (i-1 < BTREE_ITEM_HALF)
+				btree_item_flip(root, &new->item[i-1], update);
+			else
+				btree_item_flip(root,
+					&split->item[i-1-BTREE_ITEM_HALF],
+						update);
+		}
+
+		nitem.key = split->item[0].key;
+		nitem.data = btree_node_ptr(split);
+	} while (btree_stack_size(&stack));
+
+	if (new) {
+		if (!split) {
+			btree_ptr_flip(root, (void **)&root->root, new);
+		} else {
+			update = btree_node_alloc(root);
+			update->item[0] = (struct btree_item){
+				.key = new->item[0].key,
+				.data = btree_node_ptr(new),
+			};
+			update->item[1] = (struct btree_item){
+				.key = split->item[0].key,
+				.data = btree_node_ptr(split),
+			};
+			btree_ptr_flip(root, (void **)&root->root, update);
+			root->height++;
+		}
+	}
+
+out:
+	btree_mod_finish(root, index, "insert");
+	return ret;
+}
+
+/* ------ update ------- */
+
+/*
+ * update the index of an item
+ *
+ * be careful: the range
+ *   [min(index, new_index), max(index, new_index)]
+ * _MUST_ be free; otherwise remove and re-insert.
+ *
+ * see the search comment for lockless lookup constraints
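+ *
+ * For example (illustrative): moving a key from 40 down to 30 is only
+ * valid while no other key lies between 30 and 40; the interior split
+ * keys are adjusted on the way down (when the key moves down) or on the
+ * way back up (when it moves up) so concurrent lockless lookups never
+ * miss the moving item.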
+ */
+int btree_update(struct btree_root *root,
+		unsigned long index, unsigned long new_index)
+{
+	struct btree_stack stack;
+	struct btree_node *node = rcu_dereference(root->root);
+	struct btree_item *item;
+	unsigned long key;
+	int offset, ret = 0;
+
+	if (index == new_index)
+		return 0;
+
+	if (!node)
+		return -EINVAL;
+
+	btree_stack_init(&stack);
+
+	do {
+		offset = __btree_item_search(root, node, index);
+		btree_stack_push(&stack, node, offset);
+		item = &node->item[offset];
+		if (btree_item_node(item)) {
+			/* loosen downwards */
+			if (index > new_index && btree_item_key(item) == index)
+				btree_item_key_set(item, new_index);
+			node = btree_item_deref(item);
+		} else
+			node = NULL;
+	} while (node);
+
+	if (btree_item_key(item) == index)
+		btree_item_key_set(item, new_index);
+	else
+		ret = -EINVAL;
+
+	do {
+		btree_stack_pop(&stack, &node, &offset);
+		item = &node->item[offset];
+		if (!btree_item_node(item))
+			continue;
+
+		key = btree_item_key(item);
+		/* undo on error */
+		if (ret && index > new_index && key == new_index)
+			btree_item_key_set(item, index);
+		/* tighten upwards */
+		if (!ret && index < new_index && key == index)
+			btree_item_key_set(item, new_index);
+	} while (btree_stack_size(&stack));
+
+	if (btree_validate(root)) {
+		printk(KERN_DEBUG "btree_update: index: %lu new_index: %lu\n",
+				index, new_index);
+		btree_print(root);
+		BUG();
+	}
+
+	return ret;
+}
+
+/* ------ delete ------- */
+
+/*
+ * delete an item; borrow items or merge nodes when needed.
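+ *
+ * (when a node would drop below BTREE_ITEM_HALF entries it first tries
+ *  to borrow entries from a sibling that has spares and only merges
+ *  with a sibling when neither side can spare any; only the root is
+ *  allowed to underflow)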
+ */
+void *btree_remove(struct btree_root *root, unsigned long index)
+{
+	struct btree_stack stack;
+	struct btree_node *node = NULL,
+			  *old_node,
+			  *left = NULL,
+			  *new = NULL,
+			  *right = NULL,
+			  *update,
+			  *uleft,
+			  *uright;
+	struct btree_item *item;
+	void *ret = NULL;
+	unsigned long key;
+	int i, j, n;
+	int err;
+
+	if (!root->root)
+		return ERR_PTR(-EINVAL);
+
+	err = btree_mod_init(root);
+	if (err)
+		return ERR_PTR(err);
+
+	item = __btree_search(root, &stack, index);
+	if (!item || btree_item_key(item) != index) {
+		ret = ERR_PTR(-EINVAL);
+		goto out;
+	} else
+		ret = item->data;
+
+	do {
+		old_node = node;
+		btree_stack_pop(&stack, &node, &i);
+		item = &node->item[i];
+
+		update = NULL;
+		if (btree_item_node(item)) {
+			/* no change */
+			if (!new && !left && !right) {
+				BUG_ON(btree_item_deref(item) != old_node);
+
+				/* tighten the split on our way back up */
+				key = btree_item_key(&old_node->item[0]);
+				if (btree_item_key(item) != key)
+					btree_item_key_set(item, key);
+				continue;
+			}
+
+			/* single change */
+			if (!left && new && !right) {
+				btree_item_flip(root, &node->item[i], new);
+				new = NULL;
+				continue;
+			}
+
+			/* borrowed left */
+			if (left && new && !right) {
+				update = btree_node_alloc(root);
+				btree_item_copy(update->item, node->item,
+						BTREE_ITEM_MAX);
+				update->item[i-1] = (struct btree_item){
+					.key = btree_item_key(&left->item[0]),
+					.data = btree_node_ptr(left),
+				};
+				update->item[i] = (struct btree_item){
+					.key = btree_item_key(&new->item[0]),
+					.data = btree_node_ptr(new),
+				};
+				btree_item_free(root, &node->item[i-1]);
+				btree_item_free(root, &node->item[i]);
+
+				left = NULL;
+				new = update;
+				continue;
+			}
+
+			/* borrowed right */
+			if (!left && new && right) {
+				update = btree_node_alloc(root);
+				btree_item_copy(update->item, node->item,
+						BTREE_ITEM_MAX);
+				update->item[i] = (struct btree_item){
+					.key = btree_item_key(&new->item[0]),
+					.data = btree_node_ptr(new),
+				};
+				update->item[i+1] = (struct btree_item){
+					.key = btree_item_key(&right->item[0]),
+					.data = btree_node_ptr(right),
+				};
+				btree_item_free(root, &node->item[i]);
+				btree_item_free(root, &node->item[i+1]);
+
+				new = update;
+				right = NULL;
+				continue;
+			}
+
+			/* merged left */
+			if (left && !new && !right) {
+				update = left;
+			}
+			/* merged right */
+			else if (!left && !new && right) {
+				update = right;
+				i++;
+			}
+			/* impossible */
+			else {
+				printk("%p %p %p\n", left, new, right);
+				BUG();
+			}
+		}
+
+		/* delete */
+		left = NULL;
+		right = NULL;
+		new = btree_node_alloc(root);
+		if (node->item[BTREE_ITEM_HALF].data) {
+delete_one:
+			/*
+			 * CCCxcc..
+			 * CCCcc...
+			 */
+			btree_item_copy(new->item, node->item, i);
+			btree_item_copy(new->item+i, node->item+i+1,
+					BTREE_ITEM_MAX-i-1);
+
+			if (update)
+				btree_item_flip(root, &new->item[i-1], update);
+
+			btree_item_free(root, &node->item[i]);
+			continue;
+		}
+
+		/* delete underflows */
+
+		/* borrow left */
+		uleft = btree_stack_peek_left(&stack);
+		if (uleft && uleft->item[BTREE_ITEM_HALF].data) {
+			left = btree_node_alloc(root);
+
+			j = btree_item_count(uleft);
+			n = (j-BTREE_ITEM_HALF+1)/2;
+
+			/*
+			 * LLLLlll. CxCC....
+			 * LLLLl... llCCC...
+			 */
+			btree_item_copy(left->item, uleft->item, (j-n));
+			btree_item_copy(new->item, uleft->item+(j-n), n);
+			btree_item_copy(new->item+n, node->item, i);
+			btree_item_copy(new->item+n+i, node->item+i+1,
+					BTREE_ITEM_HALF-i);
+
+			if (update)
+				btree_item_flip(root, &new->item[n+i-1],
+						update);
+
+			btree_item_free(root, &node->item[i]);
+
+			continue;
+		}
+
+		/* borrow right */
+		uright = btree_stack_peek_right(&stack);
+		if (uright && uright->item[BTREE_ITEM_HALF].data) {
+			right = btree_node_alloc(root);
+
+			j = btree_item_count(uright);
+			n = (j-BTREE_ITEM_HALF+1)/2;
+
+			/*
+			 * CCxC.... RRRRrrr.
+			 * CCCRR... RRrrr...
+			 */
+			btree_item_copy(new->item, node->item, i);
+			btree_item_copy(new->item+i, node->item+i+1,
+					BTREE_ITEM_HALF-i-1);
+			btree_item_copy(new->item+BTREE_ITEM_HALF-1,
+					uright->item, n);
+			btree_item_copy(right->item, uright->item+n,
+					BTREE_ITEM_MAX-n);
+
+			if (update)
+				btree_item_flip(root, &new->item[i-1], update);
+
+			btree_item_free(root, &node->item[i]);
+
+			continue;
+		}
+
+		/* merge left */
+		if (uleft) {
+			/*
+			 * LLLL.... CCCxc...
+			 * LLLLCCCc
+			 */
+			btree_item_copy(new->item, uleft->item,
+					BTREE_ITEM_HALF);
+			btree_item_copy(new->item+BTREE_ITEM_HALF,
+					node->item, i);
+			btree_item_copy(new->item+BTREE_ITEM_HALF+i,
+				       	node->item+i+1, BTREE_ITEM_HALF-i-1);
+
+			if (update)
+				btree_item_flip(root,
+						&new->item[BTREE_ITEM_HALF+i-1],
+						update);
+
+			btree_item_free(root, &node->item[i]);
+
+			left = new;
+			new = NULL;
+			continue;
+		}
+
+		/* merge right */
+		if (uright) {
+			/*
+			 * CCxCc... RRRR....
+			 * CCCcRRRR
+			 */
+			btree_item_copy(new->item, node->item, i);
+			btree_item_copy(new->item+i, node->item+i+1,
+					BTREE_ITEM_HALF-i-1);
+			btree_item_copy(new->item+BTREE_ITEM_HALF-1,
+				       	uright->item, BTREE_ITEM_HALF);
+
+			if (update)
+				btree_item_flip(root, &new->item[i-1], update);
+
+			btree_item_free(root, &node->item[i]);
+
+			/*
+			 * use right to carry the new node to avoid having
+			 * the same pattern as single change
+			 */
+			right = new;
+			new = NULL;
+			continue;
+		}
+
+		/* only the root may underflow */
+		BUG_ON(root->root != node);
+		goto delete_one;
+
+	} while (btree_stack_size(&stack));
+
+	BUG_ON(left || right);
+
+	if (new)
+		btree_ptr_flip(root, (void **)&root->root, new);
+
+	if (!root->root->item[1].data &&
+			btree_item_node(&root->root->item[0])) {
+		btree_ptr_flip(root, (void **)&root->root,
+				btree_item_deref(&root->root->item[0]));
+		root->height--;
+	}
+
+	if (!root->root->item[0].data) {
+		btree_ptr_flip(root, (void **)&root->root, NULL);
+		root->height = 0;
+	}
+
+out:
+	btree_mod_finish(root, index, "remove");
+	return ret;
+}
+
+/* ------ */
+
+void btree_root_init(struct btree_root *root, gfp_t gfp_mask)
+{
+	*root = BTREE_INIT(gfp_mask);
+}
+
+void btree_freevec_flush(struct btree_freevec *freevec)
+{
+	int i;
+	for (i = 0; i < freevec->pos; i++)
+		kmem_cache_free(btree_cachep, freevec->slot[i]);
+	kfree(freevec);
+}
+
+void btree_root_destroy(struct btree_root *root)
+{
+	/* assume the tree is empty */
+	BUG_ON(root->height);
+	BUG_ON(root->freevec);
+	if (root->mempool)
+		mempool_destroy(root->mempool);
+}
+
+void __init btree_init(void)
+{
+	btree_cachep = KMEM_CACHE(btree_node, SLAB_HWCACHE_ALIGN);
+}
+
+#ifdef BTREE_DEBUG
+static void __btree_node_print(struct btree_root *root,
+		struct btree_node *node, int recurse)
+{
+	int i, j;
+	for (i = 0; i < BTREE_ITEM_MAX; i++) {
+		if (node->item[i].data) {
+			printk(KERN_DEBUG);
+			for (j=0; j<recurse; j++)
+				printk(" ");
+			if (!btree_item_node(&node->item[i])) {
+
+	printk("-> leaf: %p, item: %d, key: %lu, data: %p\n",
+		node, i, node->item[i].key, node->item[i].data);
+
+			} else {
+
+	printk("node: %p, item: %d, key: %lu, child: %p\n",
+		node, i, node->item[i].key, btree_item_deref(&node->item[i]));
+				if (recurse)
+	__btree_node_print(root, btree_item_deref(&node->item[i]), recurse+1);
+
+			}
+		}
+	}
+}
+
+void btree_node_print(struct btree_root *root, struct btree_node *node)
+{
+	printk(KERN_DEBUG "node: %p\n", node);
+	__btree_node_print(root, node, 0);
+}
+
+void btree_print(struct btree_root *root)
+{
+	printk(KERN_DEBUG "[%p] root: %p, height: %d\n",
+			root, root->root, root->height);
+	if (root->root)
+		__btree_node_print(root, root->root, 1);
+}
+
+static unsigned long __btree_key(struct btree_root *root,
+	       	struct btree_node *node, int height)
+{
+	if (height == 0)
+		return node->item[0].key;
+
+	return __btree_key(root, btree_item_deref(&node->item[0]), height - 1);
+}
+
+static int __btree_validate(struct btree_root *root, struct btree_node *node,
+		unsigned long *pindex, int height)
+{
+	unsigned long parent_key = *pindex;
+	unsigned long node_key = 0;
+	unsigned long child_key = 0;
+
+	int i;
+	unsigned long key;
+	int nr = 0;
+	int bug = 0;
+
+	for (i = 0; i < BTREE_ITEM_MAX; i++) {
+		struct btree_node *child = btree_item_deref(&node->item[i]);
+		if (!child)
+			continue;
+
+		nr++;
+
+		key = node->item[i].key;
+		if (key < parent_key || (!i && key != parent_key) ||
+				(i && key == parent_key)) {
+			printk(KERN_DEBUG
+	"wrong parent split: key: %lu parent_key: %lu index: %d\n",
+				       key, parent_key, i);
+			bug++;
+		}
+
+		if (key < node_key || (i && key == node_key)) {
+			printk(KERN_DEBUG
+	"wrong order: key: %lu node_key: %lu index: %d\n",
+				       key, node_key, i);
+			bug++;
+		}
+		node_key = key;
+
+		if (key < child_key || (i && key == child_key)) {
+			printk(KERN_DEBUG
+	"wrong child split: key: %lu child_key: %lu index: %d\n",
+					key, child_key, i);
+			bug++;
+		}
+
+		child_key = max(node_key, child_key);
+
+		if (height)
+			bug += __btree_validate(root, child,
+					&child_key, height - 1);
+
+		*pindex = max(node_key, child_key);
+	}
+
+	if (node != root->root && nr < BTREE_ITEM_HALF) {
+		printk(KERN_DEBUG "node short\n");
+		bug++;
+	}
+
+	if (bug) {
+		printk(KERN_DEBUG "bug in node: %p\n", node);
+	}
+
+	return bug;
+}
+
+int btree_validate(struct btree_root *root)
+{
+	unsigned long key;
+
+	if (root->root) {
+		key = __btree_key(root, root->root, root->height - 1);
+		return __btree_validate(root, root->root, &key,
+				root->height - 1);
+	}
+
+	return 0;
+}
+#endif

-- 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH/RFC 3/9] mm: use the B+tree for vma lookups
  2007-10-22 10:45 [PATCH/RFC 0/9] VMA lookup with RCU Vaidyanathan Srinivasan
  2007-10-22 10:45 ` [PATCH/RFC 1/9] Data structure changes Vaidyanathan Srinivasan
  2007-10-22 10:45 ` [PATCH/RFC 2/9] lib: RCU friendly B+tree Vaidyanathan Srinivasan
@ 2007-10-22 10:45 ` Vaidyanathan Srinivasan
  2007-10-22 10:45 ` [PATCH/RFC 4/9] mm: RCU " Vaidyanathan Srinivasan
                   ` (5 subsequent siblings)
  8 siblings, 0 replies; 10+ messages in thread
From: Vaidyanathan Srinivasan @ 2007-10-22 10:45 UTC (permalink / raw)
  To: linux-mm; +Cc: Alexis Bruemmer, Peter Zijlstra

[-- Attachment #1: 3_mm-btree.patch --]
[-- Type: text/plain, Size: 16893 bytes --]

---
 include/linux/init_task.h |    2 
 include/linux/mm.h        |    5 -
 include/linux/sched.h     |    4 
 kernel/fork.c             |   17 +--
 mm/mmap.c                 |  230 +++++++++++++++-------------------------------
 5 files changed, 92 insertions(+), 166 deletions(-)

--- linux-2.6.23-rc6.orig/include/linux/init_task.h
+++ linux-2.6.23-rc6/include/linux/init_task.h
@@ -48,7 +48,7 @@
 #define INIT_MM(name) \
 {			 					\
 	.mm_vmas	= LIST_HEAD_INIT(name.mm_vmas),		\
-	.mm_rb		= RB_ROOT,				\
+	.mm_btree	= BTREE_INIT(GFP_ATOMIC),		\
 	.pgd		= swapper_pg_dir, 			\
 	.mm_users	= ATOMIC_INIT(2), 			\
 	.mm_count	= ATOMIC_INIT(1), 			\
--- linux-2.6.23-rc6.orig/include/linux/mm.h
+++ linux-2.6.23-rc6/include/linux/mm.h
@@ -69,8 +69,6 @@ struct vm_area_struct {
 	pgprot_t vm_page_prot;		/* Access permissions of this VMA. */
 	unsigned long vm_flags;		/* Flags, listed below. */
 
-	struct rb_node vm_rb;
-
 	/*
 	 * For areas with an address space and backing store,
 	 * linkage into the address_space->i_mmap prio tree, or
@@ -1091,8 +1089,7 @@ extern struct anon_vma *find_mergeable_a
 extern int split_vma(struct mm_struct *,
 	struct vm_area_struct *, unsigned long addr, int new_below);
 extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
-extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
-	struct rb_node **, struct rb_node *);
+extern void __vma_link_btree(struct mm_struct *, struct vm_area_struct *);
 extern void unlink_file_vma(struct vm_area_struct *);
 extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
 	unsigned long addr, unsigned long len, pgoff_t pgoff);
--- linux-2.6.23-rc6.orig/include/linux/sched.h
+++ linux-2.6.23-rc6/include/linux/sched.h
@@ -52,7 +52,7 @@ struct sched_param {
 #include <linux/types.h>
 #include <linux/timex.h>
 #include <linux/jiffies.h>
-#include <linux/rbtree.h>
+#include <linux/btree.h>
 #include <linux/thread_info.h>
 #include <linux/cpumask.h>
 #include <linux/errno.h>
@@ -368,7 +368,7 @@ extern int get_dumpable(struct mm_struct
 
 struct mm_struct {
 	struct list_head mm_vmas;
-	struct rb_root mm_rb;
+	struct btree_root mm_btree;
 	struct vm_area_struct * mmap_cache;	/* last find_vma result */
 	unsigned long (*get_unmapped_area) (struct file *filp,
 				unsigned long addr, unsigned long len,
--- linux-2.6.23-rc6.orig/kernel/fork.c
+++ linux-2.6.23-rc6/kernel/fork.c
@@ -198,7 +198,6 @@ static struct task_struct *dup_task_stru
 static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 {
 	struct vm_area_struct *mpnt, *tmp;
-	struct rb_node **rb_link, *rb_parent;
 	int retval;
 	unsigned long charge;
 	struct mempolicy *pol;
@@ -216,9 +215,6 @@ static inline int dup_mmap(struct mm_str
 	mm->cached_hole_size = ~0UL;
 	mm->map_count = 0;
 	cpus_clear(mm->cpu_vm_mask);
-	mm->mm_rb = RB_ROOT;
-	rb_link = &mm->mm_rb.rb_node;
-	rb_parent = NULL;
 
 	list_for_each_entry(mpnt, &oldmm->mm_vmas, vm_list) {
 		struct file *file;
@@ -270,9 +266,8 @@ static inline int dup_mmap(struct mm_str
 		 */
 
 		list_add_tail(&tmp->vm_list, &mm->mm_vmas);
-		__vma_link_rb(mm, tmp, rb_link, rb_parent);
-		rb_link = &tmp->vm_rb.rb_right;
-		rb_parent = &tmp->vm_rb;
+		btree_preload(&mm->mm_btree, GFP_KERNEL|__GFP_NOFAIL); // XXX create error path
+		__vma_link_btree(mm, tmp);
 
 		mm->map_count++;
 		retval = copy_page_range(mm, oldmm, mpnt);
@@ -320,13 +315,19 @@ static inline void mm_free_pgd(struct mm
  __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
 
 #define allocate_mm()	(kmem_cache_alloc(mm_cachep, GFP_KERNEL))
-#define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))
+
+static void free_mm(struct mm_struct *mm)
+{
+	btree_root_destroy(&mm->mm_btree);
+	kmem_cache_free(mm_cachep, mm);
+}
 
 #include <linux/init_task.h>
 
 static struct mm_struct * mm_init(struct mm_struct * mm)
 {
 	INIT_LIST_HEAD(&mm->mm_vmas);
+	mm->mm_btree = BTREE_INIT(GFP_ATOMIC);
 	atomic_set(&mm->mm_users, 1);
 	atomic_set(&mm->mm_count, 1);
 	init_rwsem(&mm->mmap_sem);
--- linux-2.6.23-rc6.orig/mm/mmap.c
+++ linux-2.6.23-rc6/mm/mmap.c
@@ -4,6 +4,7 @@
  * Written by obz.
  *
  * Address space accounting code	<alan@redhat.com>
+ * Btree adaptation			<pzijlstr@redhat.com>
  */
 
 #include <linux/slab.h>
@@ -283,35 +284,6 @@ out:
 }
 
 #ifdef DEBUG_MM_RB
-static int browse_rb(struct rb_root *root)
-{
-	int i = 0, j;
-	struct rb_node *nd, *pn = NULL;
-	unsigned long prev = 0, pend = 0;
-
-	for (nd = rb_first(root); nd; nd = rb_next(nd)) {
-		struct vm_area_struct *vma;
-		vma = rb_entry(nd, struct vm_area_struct, vm_rb);
-		if (vma->vm_start < prev)
-			printk("vm_start %lx prev %lx\n", vma->vm_start, prev), i = -1;
-		if (vma->vm_start < pend)
-			printk("vm_start %lx pend %lx\n", vma->vm_start, pend);
-		if (vma->vm_start > vma->vm_end)
-			printk("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start);
-		i++;
-		pn = nd;
-		prev = vma->vm_start;
-		pend = vma->vm_end;
-	}
-	j = 0;
-	for (nd = pn; nd; nd = rb_prev(nd)) {
-		j++;
-	}
-	if (i != j)
-		printk("backwards %d, forwards %d\n", j, i), i = 0;
-	return i;
-}
-
 void validate_mm(struct mm_struct *mm)
 {
 	int bug = 0;
@@ -321,9 +293,6 @@ void validate_mm(struct mm_struct *mm)
 		i++;
 	if (i != mm->map_count)
 		printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1;
-	i = browse_rb(&mm->mm_rb);
-	if (i != mm->map_count)
-		printk("map_count %d rb %d\n", mm->map_count, i), bug = 1;
 	BUG_ON(bug);
 }
 #else
@@ -332,62 +301,36 @@ void validate_mm(struct mm_struct *mm)
 
 static struct vm_area_struct *
 find_vma_prepare(struct mm_struct *mm, unsigned long addr,
-		struct vm_area_struct **pprev, struct rb_node ***rb_link,
-		struct rb_node ** rb_parent)
+ 		struct vm_area_struct **pprev)
 {
-	struct vm_area_struct * vma;
-	struct rb_node ** __rb_link, * __rb_parent, * rb_prev;
-
-	__rb_link = &mm->mm_rb.rb_node;
-	rb_prev = __rb_parent = NULL;
-	vma = NULL;
-
-	while (*__rb_link) {
-		struct vm_area_struct *vma_tmp;
-
-		__rb_parent = *__rb_link;
-		vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
-
-		if (vma_tmp->vm_end > addr) {
-			vma = vma_tmp;
-			if (vma_tmp->vm_start <= addr)
-				return vma;
-			__rb_link = &__rb_parent->rb_left;
-		} else {
-			rb_prev = __rb_parent;
-			__rb_link = &__rb_parent->rb_right;
-		}
-	}
-
-	*pprev = NULL;
-	if (rb_prev)
-		*pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
-	*rb_link = __rb_link;
-	*rb_parent = __rb_parent;
+ 	struct vm_area_struct *vma, *prev = NULL;
+ 	vma = btree_stab(&mm->mm_btree, addr);
+ 	if (!vma || addr >= vma->vm_end)
+ 		vma = __vma_next(&mm->mm_vmas, vma);
+ 	if (!(vma && addr < vma->vm_end))
+ 		prev = __vma_prev(&mm->mm_vmas, vma);
+ 	*pprev = prev;
 	return vma;
 }
 
 static inline void
 __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
-		struct vm_area_struct *prev, struct rb_node *rb_parent)
+ 		struct vm_area_struct *prev)
 {
-	if (prev) {
+ 	if (!prev)
+ 		prev = btree_stab(&mm->mm_btree, vma->vm_start);
+
+ 	if (prev)
 		list_add(&vma->vm_list, &prev->vm_list);
-	} else {
-		if (rb_parent) {
-			struct vm_area_struct *next = rb_entry(rb_parent,
-					struct vm_area_struct, vm_rb);
-			list_add_tail(&vma->vm_list, &next->vm_list);
-		} else
-			list_add(&vma->vm_list, &mm->mm_vmas);
-	}
+ 	else
+ 		list_add(&vma->vm_list, &mm->mm_vmas);
 }
 
-void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
-		struct rb_node **rb_link, struct rb_node *rb_parent)
+void __vma_link_btree(struct mm_struct *mm, struct vm_area_struct *vma)
 {
-	rb_link_node(&vma->vm_rb, rb_parent, rb_link);
-	rb_insert_color(&vma->vm_rb, &mm->mm_rb);
+ 	int err;
+ 	err = btree_insert(&mm->mm_btree, vma->vm_start, vma);
+ 	BUG_ON(err);
 }
 
 static inline void __vma_link_file(struct vm_area_struct *vma)
@@ -414,20 +357,20 @@ static inline void __vma_link_file(struc
 
 static void
 __vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
-	struct vm_area_struct *prev, struct rb_node **rb_link,
-	struct rb_node *rb_parent)
+	struct vm_area_struct *prev)
 {
-	__vma_link_list(mm, vma, prev, rb_parent);
-	__vma_link_rb(mm, vma, rb_link, rb_parent);
+	__vma_link_list(mm, vma, prev);
+	__vma_link_btree(mm, vma);
 	__anon_vma_link(vma);
 }
 
 static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
-			struct vm_area_struct *prev, struct rb_node **rb_link,
-			struct rb_node *rb_parent)
+			struct vm_area_struct *prev)
 {
 	struct address_space *mapping = NULL;
 
+	btree_preload(&mm->mm_btree, GFP_KERNEL|__GFP_NOFAIL); // XXX create error path
+
 	if (vma->vm_file)
 		mapping = vma->vm_file->f_mapping;
 
@@ -437,7 +380,7 @@ static void vma_link(struct mm_struct *m
 	}
 	anon_vma_lock(vma);
 
-	__vma_link(mm, vma, prev, rb_link, rb_parent);
+	__vma_link(mm, vma, prev);
 	__vma_link_file(vma);
 
 	anon_vma_unlock(vma);
@@ -456,12 +399,7 @@ static void vma_link(struct mm_struct *m
 static void
 __insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
 {
-	struct vm_area_struct * __vma, * prev;
-	struct rb_node ** rb_link, * rb_parent;
-
-	__vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent);
-	BUG_ON(__vma && __vma->vm_start < vma->vm_end);
-	__vma_link(mm, vma, prev, rb_link, rb_parent);
+	__vma_link(mm, vma, NULL);
 	mm->map_count++;
 }
 
@@ -469,8 +407,21 @@ static inline void
 __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
 		struct vm_area_struct *prev)
 {
+	struct vm_area_struct *vma_tmp;
+
 	list_del(&vma->vm_list);
-	rb_erase(&vma->vm_rb, &mm->mm_rb);
+
+	vma_tmp = btree_remove(&mm->mm_btree, vma->vm_start);
+	if (vma_tmp != vma) {
+#ifdef CONFIG_DEBUG_VM
+		printk(KERN_DEBUG
+			"btree_remove(%lu) returned %p instead of %p\n"
+			,vma->vm_start, vma_tmp, vma);
+		btree_print(&mm->mm_btree);
+#endif
+		BUG();
+	}
+
 	if (mm->mmap_cache == vma)
 		mm->mmap_cache = prev;
 }
@@ -525,6 +476,8 @@ again:			remove_next = 1 + (end > next->
 		}
 	}
 
+	btree_preload(&mm->mm_btree, GFP_KERNEL|__GFP_NOFAIL); // XXX error path
+
 	if (file) {
 		mapping = file->f_mapping;
 		if (!(vma->vm_flags & VM_NONLINEAR))
@@ -576,10 +529,14 @@ again:			remove_next = 1 + (end > next->
 			vma_prio_tree_remove(next, root);
 	}
 
+	btree_update(&mm->mm_btree, vma->vm_start, start);
 	vma->vm_start = start;
 	vma->vm_end = end;
 	vma->vm_pgoff = pgoff;
+
 	if (adjust_next) {
+		btree_update(&mm->mm_btree, next->vm_start,
+				next->vm_start + (adjust_next << PAGE_SHIFT));
 		next->vm_start += adjust_next << PAGE_SHIFT;
 		next->vm_pgoff += adjust_next;
 	}
@@ -1075,7 +1032,7 @@ unsigned long mmap_region(struct file *f
 	/* Clear old maps */
 	error = -ENOMEM;
 munmap_back:
-	vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
+	vma = find_vma_prepare(mm, addr, &prev);
 	if (vma && vma->vm_start < addr + len) {
 		if (do_munmap(mm, addr, len))
 			return -ENOMEM;
@@ -1176,7 +1133,7 @@ munmap_back:
 	if (!file || !vma_merge(mm, prev, addr, vma->vm_end,
 			vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) {
 		file = vma->vm_file;
-		vma_link(mm, vma, prev, rb_link, rb_parent);
+		vma_link(mm, vma, prev);
 		if (correct_wcount)
 			atomic_inc(&inode->i_writecount);
 	} else {
@@ -1436,25 +1393,9 @@ struct vm_area_struct * find_vma(struct 
 		/* (Cache hit rate is typically around 35%.) */
 		vma = mm->mmap_cache;
 		if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
-			struct rb_node * rb_node;
-
-			rb_node = mm->mm_rb.rb_node;
-			vma = NULL;
-
-			while (rb_node) {
-				struct vm_area_struct * vma_tmp;
-
-				vma_tmp = rb_entry(rb_node,
-						struct vm_area_struct, vm_rb);
-
-				if (vma_tmp->vm_end > addr) {
-					vma = vma_tmp;
-					if (vma_tmp->vm_start <= addr)
-						break;
-					rb_node = rb_node->rb_left;
-				} else
-					rb_node = rb_node->rb_right;
-			}
+			vma = btree_stab(&mm->mm_btree, addr);
+			if (!vma || addr >= vma->vm_end)
+				vma = __vma_next(&mm->mm_vmas, vma);
 			if (vma)
 				mm->mmap_cache = vma;
 		}
@@ -1469,32 +1410,11 @@ struct vm_area_struct *
 find_vma_prev(struct mm_struct *mm, unsigned long addr,
 			struct vm_area_struct **pprev)
 {
-	struct vm_area_struct *prev = NULL, *next;
-	struct rb_node * rb_node;
-	if (!mm)
-		goto out;
-
-	/* Go through the RB tree quickly. */
-	rb_node = mm->mm_rb.rb_node;
-
-	while (rb_node) {
-		struct vm_area_struct *vma_tmp;
-		vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
-
-		if (addr < vma_tmp->vm_end) {
-			rb_node = rb_node->rb_left;
-		} else {
-			prev = vma_tmp;
-			next = __vma_next(&mm->mm_vmas, prev);
-			if (!next || (addr < next->vm_end))
-				break;
-			rb_node = rb_node->rb_right;
-		}
-	}
+	struct vm_area_struct *vma;
 
-out:
-	*pprev = prev;
-	return __vma_next(&mm->mm_vmas, prev);
+	vma = find_vma(mm, addr);
+	*pprev = __vma_prev(&mm->mm_vmas, vma);
+	return vma;
 }
 
 /*
@@ -1633,6 +1553,17 @@ static inline int expand_downwards(struc
 
 		error = acct_stack_growth(vma, size, grow);
 		if (!error) {
+			/*
+			 * This is safe even under the read lock: anon_vma_lock
+			 * guards against concurrent expand_stack calls, and the
+			 * read lock keeps other structural changes to the
+			 * btree away.
+			 *
+			 * btree_update is carefully constructed to shift the
+			 * space partitioning within the btree in a safe manner.
+			 */
+			btree_update(&vma->vm_mm->mm_btree,
+					vma->vm_start, address);
 			vma->vm_start = address;
 			vma->vm_pgoff -= grow;
 		}
@@ -1757,9 +1688,9 @@ detach_vmas_to_be_unmapped(struct mm_str
 
 	do {
 		struct vm_area_struct *next = vma_next(vma);
-		rb_erase(&vma->vm_rb, &mm->mm_rb);
+  		__vma_unlink(mm, vma, NULL);
 		mm->map_count--;
-		list_move_tail(&vma->vm_list, vmas);
+		list_add_tail(&vma->vm_list, vmas);
 		vma = next;
 	} while (vma && vma->vm_start < end);
 	if (mm->unmap_area == arch_unmap_area)
@@ -1917,10 +1848,9 @@ static inline void verify_mm_writelocked
  */
 unsigned long do_brk(unsigned long addr, unsigned long len)
 {
-	struct mm_struct * mm = current->mm;
-	struct vm_area_struct * vma, * prev;
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma, *prev;
 	unsigned long flags;
-	struct rb_node ** rb_link, * rb_parent;
 	pgoff_t pgoff = addr >> PAGE_SHIFT;
 	int error;
 
@@ -1963,7 +1893,7 @@ unsigned long do_brk(unsigned long addr,
 	 * Clear old maps.  this also does some error checking for us
 	 */
  munmap_back:
-	vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
+	vma = find_vma_prepare(mm, addr, &prev);
 	if (vma && vma->vm_start < addr + len) {
 		if (do_munmap(mm, addr, len))
 			return -ENOMEM;
@@ -2001,7 +1931,7 @@ unsigned long do_brk(unsigned long addr,
 	vma->vm_flags = flags;
 	vma->vm_page_prot = protection_map[flags &
 				(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
-	vma_link(mm, vma, prev, rb_link, rb_parent);
+	vma_link(mm, vma, prev);
 out:
 	mm->total_vm += len >> PAGE_SHIFT;
 	if (flags & VM_LOCKED) {
@@ -2053,8 +1983,7 @@ void exit_mmap(struct mm_struct *mm)
  */
 int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
 {
-	struct vm_area_struct * __vma, * prev;
-	struct rb_node ** rb_link, * rb_parent;
+	struct vm_area_struct *__vma, *prev;
 
 	/*
 	 * The vm_pgoff of a purely anonymous vma should be irrelevant
@@ -2072,13 +2001,13 @@ int insert_vm_struct(struct mm_struct * 
 		BUG_ON(vma->anon_vma);
 		vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
 	}
-	__vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent);
+	__vma = find_vma_prepare(mm,vma->vm_start,&prev);
 	if (__vma && __vma->vm_start < vma->vm_end)
 		return -ENOMEM;
 	if ((vma->vm_flags & VM_ACCOUNT) &&
 	     security_vm_enough_memory_mm(mm, vma_pages(vma)))
 		return -ENOMEM;
-	vma_link(mm, vma, prev, rb_link, rb_parent);
+	vma_link(mm, vma, prev);
 	return 0;
 }
 
@@ -2093,7 +2022,6 @@ struct vm_area_struct *copy_vma(struct v
 	unsigned long vma_start = vma->vm_start;
 	struct mm_struct *mm = vma->vm_mm;
 	struct vm_area_struct *new_vma, *prev;
-	struct rb_node **rb_link, *rb_parent;
 	struct mempolicy *pol;
 
 	/*
@@ -2103,7 +2031,7 @@ struct vm_area_struct *copy_vma(struct v
 	if (!vma->vm_file && !vma->anon_vma)
 		pgoff = addr >> PAGE_SHIFT;
 
-	find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
+	find_vma_prepare(mm, addr, &prev);
 	new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
 			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
 	if (new_vma) {
@@ -2130,7 +2058,7 @@ struct vm_area_struct *copy_vma(struct v
 				get_file(new_vma->vm_file);
 			if (new_vma->vm_ops && new_vma->vm_ops->open)
 				new_vma->vm_ops->open(new_vma);
-			vma_link(mm, new_vma, prev, rb_link, rb_parent);
+			vma_link(mm, new_vma, prev);
 		}
 	}
 	return new_vma;

-- 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH/RFC 4/9] mm: RCU vma lookups
  2007-10-22 10:45 [PATCH/RFC 0/9] VMA lookup with RCU Vaidyanathan Srinivasan
                   ` (2 preceding siblings ...)
  2007-10-22 10:45 ` [PATCH/RFC 3/9] mm: use the B+tree for vma lookups Vaidyanathan Srinivasan
@ 2007-10-22 10:45 ` Vaidyanathan Srinivasan
  2007-10-22 10:45 ` [PATCH/RFC 5/9] i386: rcu vma lookups for faults Vaidyanathan Srinivasan
                   ` (4 subsequent siblings)
  8 siblings, 0 replies; 10+ messages in thread
From: Vaidyanathan Srinivasan @ 2007-10-22 10:45 UTC (permalink / raw)
  To: linux-mm; +Cc: Alexis Bruemmer, Peter Zijlstra

[-- Attachment #1: 4_mm-rcu.patch --]
[-- Type: text/plain, Size: 12516 bytes --]

Mostly lockless vma lookup using the new B+tree.
Pin the looked-up vma (via the read side of its vm_sem) so it stays
valid without holding mmap_sem.
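
The intended lookup pattern is roughly the following (an illustrative
sketch distilled from the fault-path changes in the later patches; error
handling omitted):

	int locked = 0;
	struct vm_area_struct *vma;

	vma = __find_get_vma(mm, address, &locked);	/* RCU fast path */
	if (vma && vma->vm_start <= address) {
		/* ... operate on the pinned vma ... */
	}
	put_vma(vma);			/* NULL-safe, drops vma->vm_sem */
	if (locked)			/* lookup fell back to mmap_sem */
		up_read(&mm->mmap_sem);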

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
---
 include/linux/init_task.h |    2 
 include/linux/mm.h        |    9 +
 kernel/fork.c             |    2 
 mm/mmap.c                 |  216 +++++++++++++++++++++++++++++++++++++++++-----
 4 files changed, 206 insertions(+), 23 deletions(-)

--- linux-2.6.23-rc9.orig/include/linux/init_task.h
+++ linux-2.6.23-rc9/include/linux/init_task.h
@@ -48,7 +48,7 @@
 #define INIT_MM(name) \
 {			 					\
 	.mm_vmas	= LIST_HEAD_INIT(name.mm_vmas),		\
-	.mm_btree	= BTREE_INIT(GFP_ATOMIC),		\
+	.mm_btree	= BTREE_INIT_FLUSH(GFP_ATOMIC, btree_rcu_flush), \
 	.pgd		= swapper_pg_dir, 			\
 	.mm_users	= ATOMIC_INIT(2), 			\
 	.mm_count	= ATOMIC_INIT(1), 			\
--- linux-2.6.23-rc9.orig/include/linux/mm.h
+++ linux-2.6.23-rc9/include/linux/mm.h
@@ -104,12 +104,14 @@ struct vm_area_struct {
 	void * vm_private_data;		/* was vm_pte (shared mem) */
 	unsigned long vm_truncate_count;/* truncate_count or restart_addr */
 
+	struct rw_semaphore vm_sem;
 #ifndef CONFIG_MMU
 	atomic_t vm_usage;		/* refcount (VMAs shared if !MMU) */
 #endif
 #ifdef CONFIG_NUMA
 	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
 #endif
+	struct rcu_head vm_rcu_head;
 };
 
 static inline struct vm_area_struct *
@@ -1078,6 +1080,8 @@ static inline void vma_nonlinear_insert(
 }
 
 /* mmap.c */
+extern void btree_rcu_flush(struct btree_freevec *);
+extern void free_vma(struct vm_area_struct *vma);
 extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
 extern void vma_adjust(struct vm_area_struct *vma, unsigned long start,
 	unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert);
@@ -1177,6 +1181,11 @@ extern struct vm_area_struct * find_vma(
 extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
 					     struct vm_area_struct **pprev);
 
+extern struct vm_area_struct * __find_get_vma(struct mm_struct *mm, unsigned long addr, int *locked);
+extern struct vm_area_struct * find_get_vma(struct mm_struct *mm, unsigned long addr);
+extern void get_vma(struct vm_area_struct *vma);
+extern void put_vma(struct vm_area_struct *vma);
+
 /* Look up the first VMA which intersects the interval start_addr..end_addr-1,
    NULL if none.  Assume start_addr < end_addr. */
 static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr)
--- linux-2.6.23-rc9.orig/kernel/fork.c
+++ linux-2.6.23-rc9/kernel/fork.c
@@ -327,7 +327,7 @@ static void free_mm(struct mm_struct *mm
 static struct mm_struct * mm_init(struct mm_struct * mm)
 {
 	INIT_LIST_HEAD(&mm->mm_vmas);
-	mm->mm_btree = BTREE_INIT(GFP_ATOMIC);
+	mm->mm_btree = BTREE_INIT_FLUSH(GFP_ATOMIC, btree_rcu_flush);
 	atomic_set(&mm->mm_users, 1);
 	atomic_set(&mm->mm_count, 1);
 	init_rwsem(&mm->mmap_sem);
--- linux-2.6.23-rc9.orig/mm/mmap.c
+++ linux-2.6.23-rc9/mm/mmap.c
@@ -40,6 +40,18 @@ static void unmap_region(struct mm_struc
 		struct vm_area_struct *vma, struct vm_area_struct *prev,
 		unsigned long start, unsigned long end);
 
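+/*
+ * Btree nodes displaced by an update are collected on a freevec and only
+ * freed once an RCU grace period has passed, so concurrent lockless
+ * lookups can keep walking the old nodes until they leave their RCU
+ * read-side critical section.
+ */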
+static void __btree_rcu_flush(struct rcu_head *head)
+{
+	struct btree_freevec *freevec =
+		container_of(head, struct btree_freevec, rcu_head);
+	btree_freevec_flush(freevec);
+}
+
+void btree_rcu_flush(struct btree_freevec *freevec)
+{
+	call_rcu(&freevec->rcu_head, __btree_rcu_flush);
+}
+
 /*
  * WARNING: the debugging will use recursive algorithms so never enable this
  * unless you know what you are doing.
@@ -218,6 +230,28 @@ void unlink_file_vma(struct vm_area_stru
 	}
 }
 
+static void __free_vma(struct rcu_head *head)
+{
+	struct vm_area_struct *vma =
+		container_of(head, struct vm_area_struct, vm_rcu_head);
+	kmem_cache_free(vm_area_cachep, vma);
+}
+
+void free_vma(struct vm_area_struct *vma)
+{
+	call_rcu(&vma->vm_rcu_head, __free_vma);
+}
+
+static void lock_vma(struct vm_area_struct *vma)
+{
+	down_write(&vma->vm_sem);
+}
+
+static void unlock_vma(struct vm_area_struct *vma)
+{
+	up_write(&vma->vm_sem);
+}
+
 /*
  * Close a vm structure and free it, returning the next.
  */
@@ -230,7 +264,7 @@ static void remove_vma(struct vm_area_st
 	if (vma->vm_file)
 		fput(vma->vm_file);
 	mpol_free(vma_policy(vma));
-	kmem_cache_free(vm_area_cachep, vma);
+	free_vma(vma);
 	return;
 }
 
@@ -288,8 +322,15 @@ void validate_mm(struct mm_struct *mm)
 	int bug = 0;
 	int i = 0;
 	struct vm_area_struct *vma;
-	list_for_each_entry(vma, &mm->mm_vmas, vm_list)
+	unsigned long addr = 0UL;
+	list_for_each_entry(vma, &mm->mm_vmas, vm_list) {
+		if (addr > vma->vm_start) {
+			printk("vma list unordered: %lu %lu\n",
+					addr, vma->vm_start);
+		}
+		addr = vma->vm_start;
 		i++;
+	}
 	if (i != mm->map_count)
 		printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1;
 	BUG_ON(bug);
@@ -328,6 +369,7 @@ __vma_link_list(struct mm_struct *mm, st
 void __vma_link_btree(struct mm_struct *mm, struct vm_area_struct *vma)
 {
  	int err;
+	init_rwsem(&vma->vm_sem);
  	err = btree_insert(&mm->mm_btree, vma->vm_start, vma);
  	BUG_ON(err);
 }
@@ -421,8 +463,8 @@ __vma_unlink(struct mm_struct *mm, struc
 		BUG();
 	}
 
-	if (mm->mmap_cache == vma)
-		mm->mmap_cache = prev;
+	if (rcu_dereference(mm->mmap_cache) == vma)
+		rcu_assign_pointer(mm->mmap_cache, prev);
 }
 
 /*
@@ -438,6 +480,7 @@ void vma_adjust(struct vm_area_struct *v
 	struct mm_struct *mm = vma->vm_mm;
 	struct vm_area_struct *next = vma_next(vma);
 	struct vm_area_struct *importer = NULL;
+	struct vm_area_struct *locked = NULL;
 	struct address_space *mapping = NULL;
 	struct prio_tree_root *root = NULL;
 	struct file *file = vma->vm_file;
@@ -448,6 +491,14 @@ void vma_adjust(struct vm_area_struct *v
 	if (next && !insert) {
 		if (end >= next->vm_end) {
 			/*
+			 * We need to lock the vma to force the lockless
+			 * lookup into the slow path. Because there is a
+			 * lookup into the slow path, because there is a
+			 * hole between removing the next vma and updating
+			 * the current one.
+			lock_vma(vma);
+			locked = vma;
+			/*
 			 * vma expands, overlapping all the next, and
 			 * perhaps the one after too (mprotect case 6).
 			 */
@@ -475,6 +526,25 @@ again:			remove_next = 1 + (end > next->
 		}
 	}
 
+	if (insert) {
+		/*
+		 * In order to make the adjust + insert look atomic wrt. the
+		 * lockless lookups we need to force those into the slow path.
+		 */
+		if (insert->vm_start < start) {
+			/*
+			 * If the new vma is to be placed in front of the
+			 * current one, we must lock the previous.
+			 */
+			locked = vma_prev(vma);
+			if (!locked)
+				locked = vma;
+		} else
+			locked = vma;
+
+		lock_vma(locked);
+	}
+
 	btree_preload(&mm->mm_btree, GFP_KERNEL|__GFP_NOFAIL); // XXX error path
 
 	if (file) {
@@ -521,6 +591,23 @@ again:			remove_next = 1 + (end > next->
 		}
 	}
 
+	/*
+	 * Remove the next vma before updating the address of the current,
+	 * because it might end up having the address of next.
+	 */
+	if (remove_next) {
+		/*
+		 * vma_merge has merged next into vma, and needs
+		 * us to remove next before dropping the locks.
+		 */
+		lock_vma(next);
+		__vma_unlink(mm, next, vma);
+		if (file)
+			__remove_shared_vm_struct(next, file, mapping);
+		if (next->anon_vma)
+			__anon_vma_merge(vma, next);
+	}
+
 	if (root) {
 		flush_dcache_mmap_lock(mapping);
 		vma_prio_tree_remove(vma, root);
@@ -547,17 +634,11 @@ again:			remove_next = 1 + (end > next->
 		flush_dcache_mmap_unlock(mapping);
 	}
 
-	if (remove_next) {
-		/*
-		 * vma_merge has merged next into vma, and needs
-		 * us to remove next before dropping the locks.
-		 */
-		__vma_unlink(mm, next, vma);
-		if (file)
-			__remove_shared_vm_struct(next, file, mapping);
-		if (next->anon_vma)
-			__anon_vma_merge(vma, next);
-	} else if (insert) {
+	/*
+	 * Insert after updating the address of the current vma, because it
+	 * might end up having the previous address.
+	 */
+	if (insert) {
 		/*
 		 * split_vma has split insert from vma, and needs
 		 * us to insert it before dropping the locks
@@ -576,7 +657,7 @@ again:			remove_next = 1 + (end > next->
 			fput(file);
 		mm->map_count--;
 		mpol_free(vma_policy(next));
-		kmem_cache_free(vm_area_cachep, next);
+		free_vma(next);
 		/*
 		 * In mprotect's case 6 (see comments on vma_merge),
 		 * we must remove another next too. It would clutter
@@ -588,6 +669,13 @@ again:			remove_next = 1 + (end > next->
 		}
 	}
 
+	if (locked) {
+		/*
+		 * unlock the vma, enabling lockless lookups.
+		 */
+		unlock_vma(locked);
+	}
+
 	validate_mm(mm);
 }
 
@@ -1142,7 +1230,7 @@ munmap_back:
 			fput(file);
 		}
 		mpol_free(vma_policy(vma));
-		kmem_cache_free(vm_area_cachep, vma);
+		free_vma(vma);
 	}
 out:	
 	mm->total_vm += len >> PAGE_SHIFT;
@@ -1166,7 +1254,7 @@ unmap_and_free_vma:
 	unmap_region(mm, &vmas, vma, prev, vma->vm_start, vma->vm_end);
 	charged = 0;
 free_vma:
-	kmem_cache_free(vm_area_cachep, vma);
+	free_vma(vma);
 unacct_error:
 	if (charged)
 		vm_unacct_memory(charged);
@@ -1390,13 +1478,13 @@ struct vm_area_struct * find_vma(struct 
 	if (mm) {
 		/* Check the cache first. */
 		/* (Cache hit rate is typically around 35%.) */
-		vma = mm->mmap_cache;
+		vma = rcu_dereference(mm->mmap_cache);
 		if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
 			vma = btree_stab(&mm->mm_btree, addr);
 			if (!vma || addr >= vma->vm_end)
 				vma = __vma_next(&mm->mm_vmas, vma);
 			if (vma)
-				mm->mmap_cache = vma;
+				rcu_assign_pointer(mm->mmap_cache, vma);
 		}
 	}
 	return vma;
@@ -1404,6 +1492,90 @@ struct vm_area_struct * find_vma(struct 
 
 EXPORT_SYMBOL(find_vma);
 
+/*
+ * Differs only from the above in that it uses the slightly more expensive
+ * btree_stab_next() in order to avoid using the mm->mm_vmas list without
+ * locks.
+ */
+static
+struct vm_area_struct *find_vma_rcu(struct mm_struct * mm, unsigned long addr)
+{
+	struct vm_area_struct *vma = NULL, *next;
+
+	if (mm) {
+		/* Check the cache first. */
+		/* (Cache hit rate is typically around 35%.) */
+		vma = rcu_dereference(mm->mmap_cache);
+		if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
+			vma = btree_stab_next(&mm->mm_btree, addr,
+					(void **)&next);
+			if (!vma || addr >= vma->vm_end)
+				vma = next;
+
+			if (vma)
+				rcu_assign_pointer(mm->mmap_cache, vma);
+		}
+	}
+	return vma;
+}
+
+/*
+ * Lockless lookup and pinning of vmas:
+ *
+ * In order to be able to do vma modifications and have them appear atomic
+ * we must sometimes still take the read lock. We do this when we fail to
+ * get a reference on the vma.
+ *
+ */
+struct vm_area_struct *
+__find_get_vma(struct mm_struct *mm, unsigned long addr, int *locked)
+{
+	struct vm_area_struct *vma;
+
+	if (!mm)
+		return NULL;
+
+	rcu_read_lock();
+	vma = find_vma_rcu(mm, addr);
+	if (!vma || !down_read_trylock(&vma->vm_sem))
+		goto slow;
+	rcu_read_unlock();
+	return vma;
+
+slow:
+	rcu_read_unlock();
+	down_read(&mm->mmap_sem);
+	vma = find_vma(mm, addr);
+	if (vma)
+		down_read(&vma->vm_sem);
+	*locked = 1;
+	return vma;
+}
+
+struct vm_area_struct *
+find_get_vma(struct mm_struct *mm, unsigned long addr)
+{
+	int locked = 0;
+	struct vm_area_struct *vma;
+
+	vma = __find_get_vma(mm, addr, &locked);
+	if (unlikely(locked))
+		up_read(&mm->mmap_sem);
+	return vma;
+}
+
+void get_vma(struct vm_area_struct *vma)
+{
+	if (likely(vma))
+		down_read(&vma->vm_sem);
+}
+
+void put_vma(struct vm_area_struct *vma)
+{
+	if (likely(vma))
+		up_read(&vma->vm_sem);
+}
+
 /* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */
 struct vm_area_struct *
 find_vma_prev(struct mm_struct *mm, unsigned long addr,
@@ -1687,8 +1859,10 @@ detach_vmas_to_be_unmapped(struct mm_str
 
 	do {
 		struct vm_area_struct *next = vma_next(vma);
+		lock_vma(vma);
   		__vma_unlink(mm, vma, NULL);
 		mm->map_count--;
+		unlock_vma(vma);
 		list_add_tail(&vma->vm_list, vmas);
 		vma = next;
 	} while (vma && vma->vm_start < end);
@@ -1697,7 +1871,7 @@ detach_vmas_to_be_unmapped(struct mm_str
 	else
 		addr = vma ?  vma->vm_start : mm->mmap_base;
 	mm->unmap_area(mm, addr);
-	mm->mmap_cache = NULL;		/* Kill the cache. */
+	/* mm->mmap_cache = NULL;*/		/* Kill the cache. */
 }
 
 /*

-- 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH/RFC 5/9] i386: rcu vma lookups for faults
  2007-10-22 10:45 [PATCH/RFC 0/9] VMA lookup with RCU Vaidyanathan Srinivasan
                   ` (3 preceding siblings ...)
  2007-10-22 10:45 ` [PATCH/RFC 4/9] mm: RCU " Vaidyanathan Srinivasan
@ 2007-10-22 10:45 ` Vaidyanathan Srinivasan
  2007-10-22 10:45 ` [PATCH/RFC 6/9] x86_64: " Vaidyanathan Srinivasan
                   ` (3 subsequent siblings)
  8 siblings, 0 replies; 10+ messages in thread
From: Vaidyanathan Srinivasan @ 2007-10-22 10:45 UTC (permalink / raw)
  To: linux-mm; +Cc: Alexis Bruemmer, Peter Zijlstra

[-- Attachment #1: 5_fault-i386.patch --]
[-- Type: text/plain, Size: 3768 bytes --]

Use the new lockless vma lookup in the i386 fault handler.
This avoids the exclusive cacheline access for the mmap_sem.
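
The handler only retries with mmap_sem held when it has to grow the
stack, since expand_stack() still requires the read lock.  Roughly (an
illustrative sketch of the flow in the diff below; needs_stack_growth
stands in for the existing stack-pointer checks):

again:
	if (!locked)
		vma = __find_get_vma(mm, address, &locked);
	else {
		down_read(&mm->mmap_sem);
		vma = find_vma(mm, address);
		get_vma(vma);
	}
	/* ... */
	if (needs_stack_growth && !locked) {
		put_vma(vma);
		locked = 1;
		goto again;
	}
	if (expand_stack(vma, address))
		goto bad_area;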

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 arch/i386/mm/fault.c |   56 +++++++++++++++++++++++++++------------------------
 1 file changed, 30 insertions(+), 26 deletions(-)

--- linux-2.6.23-rc6.orig/arch/i386/mm/fault.c
+++ linux-2.6.23-rc6/arch/i386/mm/fault.c
@@ -307,6 +307,7 @@ fastcall void __kprobes do_page_fault(st
 	unsigned long address;
 	int write, si_code;
 	int fault;
+	int locked = 0;
 
 	/* get the address */
         address = read_cr2();
@@ -357,29 +358,14 @@ fastcall void __kprobes do_page_fault(st
 	if (in_atomic() || !mm)
 		goto bad_area_nosemaphore;
 
-	/* When running in the kernel we expect faults to occur only to
-	 * addresses in user space.  All other faults represent errors in the
-	 * kernel and should generate an OOPS.  Unfortunatly, in the case of an
-	 * erroneous fault occurring in a code path which already holds mmap_sem
-	 * we will deadlock attempting to validate the fault against the
-	 * address space.  Luckily the kernel only validly references user
-	 * space from well defined areas of code, which are listed in the
-	 * exceptions table.
-	 *
-	 * As the vast majority of faults will be valid we will only perform
-	 * the source reference check when there is a possibilty of a deadlock.
-	 * Attempt to lock the address space, if we cannot we then validate the
-	 * source.  If this is invalid we can skip the address space check,
-	 * thus avoiding the deadlock.
-	 */
-	if (!down_read_trylock(&mm->mmap_sem)) {
-		if ((error_code & 4) == 0 &&
-		    !search_exception_tables(regs->eip))
-			goto bad_area_nosemaphore;
+again:
+	if (likely(!locked)) {
+		vma = __find_get_vma(mm, address, &locked);
+	} else {
 		down_read(&mm->mmap_sem);
+		vma = find_vma(mm, address);
+		get_vma(vma);
 	}
-
-	vma = find_vma(mm, address);
 	if (!vma)
 		goto bad_area;
 	if (vma->vm_start <= address)
@@ -396,6 +382,15 @@ fastcall void __kprobes do_page_fault(st
 		if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp)
 			goto bad_area;
 	}
+	/*
+	 * expand_stack needs the read lock, hence retry the whole thing
+	 * read locked.
+	 */
+	if (!locked) {
+		put_vma(vma);
+		locked = 1;
+		goto again;
+	}
 	if (expand_stack(vma, address))
 		goto bad_area;
 /*
@@ -403,6 +398,11 @@ fastcall void __kprobes do_page_fault(st
  * we can handle it..
  */
 good_area:
+	if (unlikely(locked)) {
+		up_read(&mm->mmap_sem);
+		locked = 0;
+	}
+
 	si_code = SEGV_ACCERR;
 	write = 0;
 	switch (error_code & 3) {
@@ -447,7 +447,7 @@ good_area:
 		if (bit < 32)
 			tsk->thread.screen_bitmap |= 1 << bit;
 	}
-	up_read(&mm->mmap_sem);
+	put_vma(vma);
 	return;
 
 /*
@@ -455,7 +455,11 @@ good_area:
  * Fix it, but check if it's kernel or user first..
  */
 bad_area:
-	up_read(&mm->mmap_sem);
+	if (unlikely(locked)) {
+		up_read(&mm->mmap_sem);
+		locked = 0;
+	}
+	put_vma(vma);
 
 bad_area_nosemaphore:
 	/* User mode accesses just cause a SIGSEGV */
@@ -590,10 +594,10 @@ no_context:
  * us unable to handle the page fault gracefully.
  */
 out_of_memory:
-	up_read(&mm->mmap_sem);
+	put_vma(vma);
 	if (is_init(tsk)) {
 		yield();
-		down_read(&mm->mmap_sem);
+		vma = find_get_vma(mm, address);
 		goto survive;
 	}
 	printk("VM: killing process %s\n", tsk->comm);
@@ -602,7 +606,7 @@ out_of_memory:
 	goto no_context;
 
 do_sigbus:
-	up_read(&mm->mmap_sem);
+	put_vma(vma);
 
 	/* Kernel mode? Handle exceptions or die */
 	if (!(error_code & 4))

-- 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH/RFC 6/9] x86_64: rcu vma lookups for faults
  2007-10-22 10:45 [PATCH/RFC 0/9] VMA lookup with RCU Vaidyanathan Srinivasan
                   ` (4 preceding siblings ...)
  2007-10-22 10:45 ` [PATCH/RFC 5/9] i386: rcu vma lookups for faults Vaidyanathan Srinivasan
@ 2007-10-22 10:45 ` Vaidyanathan Srinivasan
  2007-10-22 10:45 ` [PATCH/RFC 7/9] Add page fault code for PPC64 path Vaidyanathan Srinivasan
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 10+ messages in thread
From: Vaidyanathan Srinivasan @ 2007-10-22 10:45 UTC (permalink / raw)
  To: linux-mm; +Cc: Alexis Bruemmer, Peter Zijlstra

[-- Attachment #1: 6_fault-x86_64.patch --]
[-- Type: text/plain, Size: 2629 bytes --]

---
 arch/x86_64/mm/fault.c |   36 +++++++++++++++++++++++++-----------
 1 file changed, 25 insertions(+), 11 deletions(-)

--- linux-2.6.23-rc6.orig/arch/x86_64/mm/fault.c
+++ linux-2.6.23-rc6/arch/x86_64/mm/fault.c
@@ -310,10 +310,10 @@ asmlinkage void __kprobes do_page_fault(
 	int write, fault;
 	unsigned long flags;
 	siginfo_t info;
+	int locked = 0;
 
 	tsk = current;
 	mm = tsk->mm;
-	prefetchw(&mm->mmap_sem);
 
 	/* get the address */
 	address = read_cr2();
@@ -390,14 +390,14 @@ asmlinkage void __kprobes do_page_fault(
 	 * source.  If this is invalid we can skip the address space check,
 	 * thus avoiding the deadlock.
 	 */
-	if (!down_read_trylock(&mm->mmap_sem)) {
-		if ((error_code & PF_USER) == 0 &&
-		    !search_exception_tables(regs->rip))
-			goto bad_area_nosemaphore;
+
+	if (likely(!locked)) {
+		vma = __find_get_vma(mm, address, &locked);
+	} else {
 		down_read(&mm->mmap_sem);
+		vma = find_vma(mm, address);
+		get_vma(vma);
 	}
-
-	vma = find_vma(mm, address);
 	if (!vma)
 		goto bad_area;
 	if (likely(vma->vm_start <= address))
@@ -411,6 +411,11 @@ asmlinkage void __kprobes do_page_fault(
 		if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp)
 			goto bad_area;
 	}
+	if (!locked) {
+		put_vma(vma);
+		locked = 1;
+		goto again;
+	}
 	if (expand_stack(vma, address))
 		goto bad_area;
 /*
@@ -418,6 +423,11 @@ asmlinkage void __kprobes do_page_fault(
  * we can handle it..
  */
 good_area:
+	if (locked) {
+		up_read(&mm->mmap_sem);
+		locked = 0;
+	}
+
 	info.si_code = SEGV_ACCERR;
 	write = 0;
 	switch (error_code & (PF_PROT|PF_WRITE)) {
@@ -452,7 +462,7 @@ good_area:
 		tsk->maj_flt++;
 	else
 		tsk->min_flt++;
-	up_read(&mm->mmap_sem);
+	put_vma(vma);
 	return;
 
 /*
@@ -460,7 +470,11 @@ good_area:
  * Fix it, but check if it's kernel or user first..
  */
 bad_area:
-	up_read(&mm->mmap_sem);
+	if (locked) {
+		up_read(&mm->mmap_sem);
+		locked = 0;
+	}
+	put_vma(vma);
 
 bad_area_nosemaphore:
 	/* User mode accesses just cause a SIGSEGV */
@@ -552,7 +566,7 @@ no_context:
  * us unable to handle the page fault gracefully.
  */
 out_of_memory:
-	up_read(&mm->mmap_sem);
+	put_vma(vma);
 	if (is_init(current)) {
 		yield();
 		goto again;
@@ -563,7 +577,7 @@ out_of_memory:
 	goto no_context;
 
 do_sigbus:
-	up_read(&mm->mmap_sem);
+	put_vma(vma);
 
 	/* Kernel mode? Handle exceptions or die */
 	if (!(error_code & PF_USER))


* [PATCH/RFC 7/9] Add page fault code for PPC64 path
  2007-10-22 10:45 [PATCH/RFC 0/9] VMA lookup with RCU Vaidyanathan Srinivasan
                   ` (5 preceding siblings ...)
  2007-10-22 10:45 ` [PATCH/RFC 6/9] x86_64: " Vaidyanathan Srinivasan
@ 2007-10-22 10:45 ` Vaidyanathan Srinivasan
  2007-10-22 10:45 ` [PATCH/RFC 8/9] debug: instrument the fault path Vaidyanathan Srinivasan
  2007-10-22 10:45 ` [PATCH/RFC 9/9] mm: nr_ptes needs to be atomic Vaidyanathan Srinivasan
  8 siblings, 0 replies; 10+ messages in thread
From: Vaidyanathan Srinivasan @ 2007-10-22 10:45 UTC (permalink / raw)
  To: linux-mm; +Cc: Alexis Bruemmer, Peter Zijlstra

[-- Attachment #1: 7_fault-powerpc.patch --]
[-- Type: text/plain, Size: 2828 bytes --]
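
The PowerPC fault path follows the same pattern as the i386/x86_64
patches.  The one step that still needs mmap_sem is stack growth,
because expand_stack() rewrites the vma; when the speculative path
reaches that point it drops its reference and retries with the
semaphore held.  Condensed from the diff below, with comments added
("locked" and the "again:" label are introduced by this patch):

	if (!locked) {
		put_vma(vma);	/* drop the speculative reference */
		locked = 1;	/* force the down_read()+find_vma() path */
		goto again;	/* redo the lookup with mmap_sem held */
	}
	if (expand_stack(vma, address))
		goto bad_area;

good_area:
	if (locked) {
		/* The reference pins the vma from here on, so the
		 * semaphore can be dropped before handling the fault. */
		up_read(&mm->mmap_sem);
		locked = 0;
	}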

---
 arch/powerpc/mm/fault.c |   44 +++++++++++++++++++++++++++++++++-----------
 1 file changed, 33 insertions(+), 11 deletions(-)

--- linux-2.6.23-rc8.orig/arch/powerpc/mm/fault.c
+++ linux-2.6.23-rc8/arch/powerpc/mm/fault.c
@@ -148,6 +148,7 @@ int __kprobes do_page_fault(struct pt_re
 	int is_write = 0, ret;
 	int trap = TRAP(regs);
  	int is_exec = trap == 0x400;
+	int locked = 0;
 
 #if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
 	/*
@@ -211,14 +212,22 @@ int __kprobes do_page_fault(struct pt_re
 	 * source.  If this is invalid we can skip the address space check,
 	 * thus avoiding the deadlock.
 	 */
-	if (!down_read_trylock(&mm->mmap_sem)) {
-		if (!user_mode(regs) && !search_exception_tables(regs->nip))
-			goto bad_area_nosemaphore;
-
+//	if (!down_read_trylock(&mm->mmap_sem)) {
+//		if (!user_mode(regs) && !search_exception_tables(regs->nip))
+//			goto bad_area_nosemaphore;
+//
+//		down_read(&mm->mmap_sem);
+//	}
+
+again:
+	if (likely(!locked)) {
+		vma = __find_get_vma(mm, address, &locked);
+	} else {
 		down_read(&mm->mmap_sem);
-	}
+		vma = find_vma(mm, address);
+		get_vma(vma);
+	}
 
-	vma = find_vma(mm, address);
 	if (!vma)
 		goto bad_area;
 	if (vma->vm_start <= address)
@@ -257,10 +266,19 @@ int __kprobes do_page_fault(struct pt_re
 		    && (!user_mode(regs) || !store_updates_sp(regs)))
 			goto bad_area;
 	}
+	if (!locked) {
+		put_vma(vma);
+		locked = 1;
+		goto again;
+	}
 	if (expand_stack(vma, address))
 		goto bad_area;
 
 good_area:
+	if (locked) {
+		up_read(&mm->mmap_sem);
+		locked = 0;
+	}
 	code = SEGV_ACCERR;
 #if defined(CONFIG_6xx)
 	if (error_code & 0x95700000)
@@ -311,7 +329,7 @@ good_area:
 				pte_update(ptep, 0, _PAGE_HWEXEC);
 				_tlbie(address);
 				pte_unmap_unlock(ptep, ptl);
-				up_read(&mm->mmap_sem);
+				put_vma(vma);
 				return 0;
 			}
 			pte_unmap_unlock(ptep, ptl);
@@ -348,11 +366,15 @@ good_area:
 		current->maj_flt++;
 	else
 		current->min_flt++;
-	up_read(&mm->mmap_sem);
+	put_vma(vma);
 	return 0;
 
 bad_area:
-	up_read(&mm->mmap_sem);
+	if (locked) {
+		up_read(&mm->mmap_sem);
+		locked = 0;
+	}
+	put_vma(vma);
 
 bad_area_nosemaphore:
 	/* User mode accesses cause a SIGSEGV */
@@ -374,7 +396,7 @@ bad_area_nosemaphore:
  * us unable to handle the page fault gracefully.
  */
 out_of_memory:
-	up_read(&mm->mmap_sem);
+	put_vma(vma);
 	if (is_init(current)) {
 		yield();
 		down_read(&mm->mmap_sem);
@@ -386,7 +408,7 @@ out_of_memory:
 	return SIGKILL;
 
 do_sigbus:
-	up_read(&mm->mmap_sem);
+	put_vma(vma);
 	if (user_mode(regs)) {
 		info.si_signo = SIGBUS;
 		info.si_errno = 0;


* [PATCH/RFC 8/9] debug: instrument the fault path
  2007-10-22 10:45 [PATCH/RFC 0/9] VMA lookup with RCU Vaidyanathan Srinivasan
                   ` (6 preceding siblings ...)
  2007-10-22 10:45 ` [PATCH/RFC 7/9] Add page fault code for PPC64 path Vaidyanathan Srinivasan
@ 2007-10-22 10:45 ` Vaidyanathan Srinivasan
  2007-10-22 10:45 ` [PATCH/RFC 9/9] mm: nr_ptes needs to be atomic Vaidyanathan Srinivasan
  8 siblings, 0 replies; 10+ messages in thread
From: Vaidyanathan Srinivasan @ 2007-10-22 10:45 UTC (permalink / raw)
  To: linux-mm; +Cc: Alexis Bruemmer, Peter Zijlstra

[-- Attachment #1: 8_stats.patch --]
[-- Type: text/plain, Size: 1856 bytes --]
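
The new events land in /proc/vmstat next to the existing counters (when
CONFIG_VM_EVENT_COUNTERS is enabled), so the fast-path/slow-path split
can be watched from userspace.  A small, hypothetical helper to dump
just these counters, assuming a kernel with this patch applied:

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char line[128];
		FILE *f = fopen("/proc/vmstat", "r");

		if (!f) {
			perror("/proc/vmstat");
			return 1;
		}
		while (fgets(line, sizeof(line), f)) {
			/* fault_rcu, fault_rcu_slow, fault_locked and
			 * fault_stack -- the names added by this patch */
			if (!strncmp(line, "fault_", 6))
				fputs(line, stdout);
		}
		fclose(f);
		return 0;
	}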

---
 arch/x86_64/mm/fault.c |    8 ++++++++
 include/linux/vmstat.h |    1 +
 mm/vmstat.c            |    5 +++++
 3 files changed, 14 insertions(+)

--- linux-2.6.23-rc6.orig/arch/x86_64/mm/fault.c
+++ linux-2.6.23-rc6/arch/x86_64/mm/fault.c
@@ -25,6 +25,7 @@
 #include <linux/kprobes.h>
 #include <linux/uaccess.h>
 #include <linux/kdebug.h>
+#include <linux/vmstat.h>
 
 #include <asm/system.h>
 #include <asm/pgalloc.h>
@@ -393,10 +394,16 @@ asmlinkage void __kprobes do_page_fault(
 
 	if (likely(!locked)) {
 		vma = __find_get_vma(mm, address, &locked);
+		if (unlikely(locked))
+			count_vm_event(FAULT_RCU_SLOW);
+		else
+			count_vm_event(FAULT_RCU);
+
 	} else {
 		down_read(&mm->mmap_sem);
 		vma = find_vma(mm, address);
 		get_vma(vma);
+		count_vm_event(FAULT_LOCKED);
 	}
 	if (!vma)
 		goto bad_area;
@@ -416,6 +423,7 @@ asmlinkage void __kprobes do_page_fault(
 		locked = 1;
 		goto again;
 	}
+	count_vm_event(FAULT_STACK);
 	if (expand_stack(vma, address))
 		goto bad_area;
 /*
--- linux-2.6.23-rc6.orig/include/linux/vmstat.h
+++ linux-2.6.23-rc6/include/linux/vmstat.h
@@ -37,6 +37,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PS
 		FOR_ALL_ZONES(PGSCAN_DIRECT),
 		PGINODESTEAL, SLABS_SCANNED, KSWAPD_STEAL, KSWAPD_INODESTEAL,
 		PAGEOUTRUN, ALLOCSTALL, PGROTATED,
+		FAULT_RCU, FAULT_RCU_SLOW, FAULT_LOCKED, FAULT_STACK,
 		NR_VM_EVENT_ITEMS
 };
 
--- linux-2.6.23-rc6.orig/mm/vmstat.c
+++ linux-2.6.23-rc6/mm/vmstat.c
@@ -529,6 +529,11 @@ static const char * const vmstat_text[] 
 	"allocstall",
 
 	"pgrotated",
+
+	"fault_rcu",
+	"fault_rcu_slow",
+	"fault_locked",
+	"fault_stack",
 #endif
 };
 


* [PATCH/RFC 9/9] mm: nr_ptes needs to be atomic
  2007-10-22 10:45 [PATCH/RFC 0/9] VMA lookup with RCU Vaidyanathan Srinivasan
                   ` (7 preceding siblings ...)
  2007-10-22 10:45 ` [PATCH/RFC 8/9] debug: instrument the fault path Vaidyanathan Srinivasan
@ 2007-10-22 10:45 ` Vaidyanathan Srinivasan
  8 siblings, 0 replies; 10+ messages in thread
From: Vaidyanathan Srinivasan @ 2007-10-22 10:45 UTC (permalink / raw)
  To: linux-mm; +Cc: Alexis Bruemmer, Peter Zijlstra

[-- Attachment #1: 9_nr_ptes.patch --]
[-- Type: text/plain, Size: 3692 bytes --]
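
With the fault path no longer serialized against the rest of the mm by
mmap_sem alone, the nr_ptes accounting in mm_struct can presumably be
updated concurrently, so the plain unsigned long becomes an
atomic_long_t.  The conversion itself is mechanical; a sketch of the
pattern (the helper names are made up for illustration, the diff below
applies the same change at each call site):

	/* updaters: mm->nr_ptes++ / mm->nr_ptes-- become ... */
	static inline void nr_ptes_inc(struct mm_struct *mm)
	{
		atomic_long_inc(&mm->nr_ptes);
	}

	static inline void nr_ptes_dec(struct mm_struct *mm)
	{
		atomic_long_dec(&mm->nr_ptes);
	}

	/* readers switch from a plain load to atomic_long_read() */
	static inline long nr_ptes_read(struct mm_struct *mm)
	{
		return atomic_long_read(&mm->nr_ptes);
	}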

---
 arch/powerpc/mm/fault.c   |    6 ++++++
 arch/sh/mm/cache-sh4.c    |    2 +-
 arch/um/kernel/skas/mmu.c |    2 +-
 fs/proc/task_mmu.c        |    2 +-
 include/linux/sched.h     |    4 +++-
 kernel/fork.c             |    2 +-
 mm/memory.c               |    4 ++--
 7 files changed, 15 insertions(+), 7 deletions(-)

--- linux-2.6.23-rc8.orig/arch/powerpc/mm/fault.c
+++ linux-2.6.23-rc8/arch/powerpc/mm/fault.c
@@ -235,6 +235,12 @@ again:
 	if (!(vma->vm_flags & VM_GROWSDOWN))
 		goto bad_area;
 
+	if (!locked) {
+		put_vma(vma);
+		locked = 1;
+		goto again;
+	}
+
 	/*
 	 * N.B. The POWER/Open ABI allows programs to access up to
 	 * 288 bytes below the stack pointer.
--- linux-2.6.23-rc8.orig/arch/sh/mm/cache-sh4.c
+++ linux-2.6.23-rc8/arch/sh/mm/cache-sh4.c
@@ -373,7 +373,7 @@ void flush_cache_mm(struct mm_struct *mm
 	 * Don't bother groveling around the dcache for the VMA ranges
 	 * if there are too many PTEs to make it worthwhile.
 	 */
-	if (mm->nr_ptes >= MAX_DCACHE_PAGES)
+	if (atomic_long_read(&mm->nr_ptes) >= MAX_DCACHE_PAGES)
 		flush_dcache_all();
 	else {
 		struct vm_area_struct *vma;
--- linux-2.6.23-rc8.orig/arch/um/kernel/skas/mmu.c
+++ linux-2.6.23-rc8/arch/um/kernel/skas/mmu.c
@@ -98,7 +98,7 @@ int init_new_context_skas(struct task_st
 		if(ret)
 			goto out_free;
 
-		mm->nr_ptes--;
+		atomic_long_dec(&mm->nr_ptes);
 	}
 
 	to_mm->id.stack = stack;
--- linux-2.6.23-rc8.orig/fs/proc/task_mmu.c
+++ linux-2.6.23-rc8/fs/proc/task_mmu.c
@@ -52,7 +52,7 @@ char *task_mem(struct mm_struct *mm, cha
 		total_rss << (PAGE_SHIFT-10),
 		data << (PAGE_SHIFT-10),
 		mm->stack_vm << (PAGE_SHIFT-10), text, lib,
-		(PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10);
+		(PTRS_PER_PTE*sizeof(pte_t)*atomic_long_read(&mm->nr_ptes)) >> 10);
 	return buffer;
 }
 
--- linux-2.6.23-rc8.orig/include/linux/sched.h
+++ linux-2.6.23-rc8/include/linux/sched.h
@@ -400,11 +400,13 @@ struct mm_struct {
 	unsigned long hiwater_vm;	/* High-water virtual memory usage */
 
 	unsigned long total_vm, locked_vm, shared_vm, exec_vm;
-	unsigned long stack_vm, reserved_vm, def_flags, nr_ptes;
+	unsigned long stack_vm, reserved_vm, def_flags;
 	unsigned long start_code, end_code, start_data, end_data;
 	unsigned long start_brk, brk, start_stack;
 	unsigned long arg_start, arg_end, env_start, env_end;
 
+	atomic_long_t nr_ptes;
+
 	unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
 
 	cpumask_t cpu_vm_mask;
--- linux-2.6.23-rc8.orig/kernel/fork.c
+++ linux-2.6.23-rc8/kernel/fork.c
@@ -335,7 +335,7 @@ static struct mm_struct * mm_init(struct
 	mm->flags = (current->mm) ? current->mm->flags
 				  : MMF_DUMP_FILTER_DEFAULT;
 	mm->core_waiters = 0;
-	mm->nr_ptes = 0;
+	atomic_long_set(&mm->nr_ptes, 0);
 	set_mm_counter(mm, file_rss, 0);
 	set_mm_counter(mm, anon_rss, 0);
 	spin_lock_init(&mm->page_table_lock);
--- linux-2.6.23-rc8.orig/mm/memory.c
+++ linux-2.6.23-rc8/mm/memory.c
@@ -127,7 +127,7 @@ static void free_pte_range(struct mmu_ga
 	pte_lock_deinit(page);
 	pte_free_tlb(tlb, page);
 	dec_zone_page_state(page, NR_PAGETABLE);
-	tlb->mm->nr_ptes--;
+	atomic_long_dec(&tlb->mm->nr_ptes);
 }
 
 static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
@@ -310,7 +310,7 @@ int __pte_alloc(struct mm_struct *mm, pm
 		pte_lock_deinit(new);
 		pte_free(new);
 	} else {
-		mm->nr_ptes++;
+		atomic_long_inc(&mm->nr_ptes);
 		inc_zone_page_state(new, NR_PAGETABLE);
 		pmd_populate(mm, pmd, new);
 	}


Thread overview: 10+ messages
2007-10-22 10:45 [PATCH/RFC 0/9] VMA lookup with RCU Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 1/9] Data structure changes Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 2/9] lib: RCU friendly B+tree Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 3/9] mm: use the B+tree for vma lookups Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 4/9] mm: RCU " Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 5/9] i386: rcu vma lookups for faults Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 6/9] x86_64: " Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 7/9] Add page fault code for PPC64 path Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 8/9] debug: instrument the fault path Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 9/9] mm: nr_ptes needs to be atomic Vaidyanathan Srinivasan
