* [PATCH/RFC 0/9] VMA lookup with RCU
@ 2007-10-22 10:45 Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 1/9] Data structure changes Vaidyanathan Srinivasan
` (8 more replies)
0 siblings, 9 replies; 10+ messages in thread
From: Vaidyanathan Srinivasan @ 2007-10-22 10:45 UTC (permalink / raw)
To: linux-mm; +Cc: Alexis Bruemmer, Peter Zijlstra
This is a forward port of Peter Zijlstra's RCU-based VMA lookup
patch set to kernel version 2.6.23.1.
Reference:
http://programming.kicks-ass.net/kernel-patches/vma_lookup/
The patch set still needs some cleanup. I am sending it out for
anybody to review and experiment with further.
I have been posting results based on this patch set to linux-mm:
http://marc.info/?l=linux-mm&m=119186241210311&w=2
--Vaidy
* [PATCH/RFC 1/9] Data structure changes
2007-10-22 10:45 [PATCH/RFC 0/9] VMA lookup with RCU Vaidyanathan Srinivasan
@ 2007-10-22 10:45 ` Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 2/9] lib: RCU friendly B+tree Vaidyanathan Srinivasan
` (7 subsequent siblings)
8 siblings, 0 replies; 10+ messages in thread
From: Vaidyanathan Srinivasan @ 2007-10-22 10:45 UTC (permalink / raw)
To: linux-mm; +Cc: Alexis Bruemmer, Peter Zijlstra
[-- Attachment #1: 1_mm_vmas.patch --]
[-- Type: text/plain, Size: 39930 bytes --]
* Replace the vma->vm_next chain with a list_head: mm->mm_vmas in
mm_struct and vma->vm_list in vm_area_struct
* Change all users of vma->vm_next and mm->mmap
* TBD: not all architectures are covered yet
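For reference, the conversion pattern used throughout the patch is
roughly the following (do_something() is only a placeholder, not a
function from this patch):

	/* before: NULL-terminated singly linked chain */
	for (vma = mm->mmap; vma; vma = vma->vm_next)
		do_something(vma);

	/* after: either walk the list_head directly ... */
	list_for_each_entry(vma, &mm->mm_vmas, vm_list)
		do_something(vma);

	/* ... or step with the new helpers, which return NULL at the end */
	for (vma = __vma_next(&mm->mm_vmas, NULL); vma; vma = vma_next(vma))
		do_something(vma);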
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
---
arch/alpha/kernel/osf_sys.c | 2
arch/arm/mm/mmap.c | 2
arch/frv/mm/elf-fdpic.c | 4 -
arch/i386/mm/hugetlbpage.c | 2
arch/ia64/kernel/sys_ia64.c | 2
arch/ia64/mm/hugetlbpage.c | 2
arch/mips/kernel/irixelf.c | 19 +++---
arch/mips/kernel/syscall.c | 2
arch/parisc/kernel/sys_parisc.c | 4 -
arch/powerpc/mm/tlb_32.c | 2
arch/ppc/mm/tlb.c | 2
arch/x86_64/ia32/ia32_aout.c | 2
arch/x86_64/kernel/sys_x86_64.c | 2
drivers/char/mem.c | 2
drivers/oprofile/buffer_sync.c | 4 -
fs/binfmt_aout.c | 2
fs/binfmt_elf.c | 4 -
fs/binfmt_elf_fdpic.c | 5 +
fs/exec.c | 4 -
fs/hugetlbfs/inode.c | 2
fs/proc/task_mmu.c | 18 +++---
include/linux/init_task.h | 1
include/linux/mm.h | 44 ++++++++++++++-
include/linux/sched.h | 2
ipc/shm.c | 4 -
kernel/acct.c | 5 -
kernel/auditsc.c | 4 -
kernel/fork.c | 11 +--
mm/madvise.c | 2
mm/memory.c | 21 +++----
mm/mempolicy.c | 10 +--
mm/migrate.c | 2
mm/mlock.c | 5 +
mm/mmap.c | 114 +++++++++++++++++++---------------------
mm/mprotect.c | 2
mm/mremap.c | 7 +-
mm/msync.c | 2
mm/swapfile.c | 2
38 files changed, 180 insertions(+), 146 deletions(-)
--- linux-2.6.23-rc9.orig/arch/alpha/kernel/osf_sys.c
+++ linux-2.6.23-rc9/arch/alpha/kernel/osf_sys.c
@@ -1255,7 +1255,7 @@ arch_get_unmapped_area_1(unsigned long a
if (!vma || addr + len <= vma->vm_start)
return addr;
addr = vma->vm_end;
- vma = vma->vm_next;
+ vma = vma_next(vma);
}
}
--- linux-2.6.23-rc9.orig/arch/arm/mm/mmap.c
+++ linux-2.6.23-rc9/arch/arm/mm/mmap.c
@@ -84,7 +84,7 @@ full_search:
else
addr = PAGE_ALIGN(addr);
- for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
+ for (vma = find_vma(mm, addr); ; vma = vma_next(vma)) {
/* At this point: (!vma || addr < vma->vm_end). */
if (TASK_SIZE - len < addr) {
/*
--- linux-2.6.23-rc9.orig/arch/frv/mm/elf-fdpic.c
+++ linux-2.6.23-rc9/arch/frv/mm/elf-fdpic.c
@@ -86,7 +86,7 @@ unsigned long arch_get_unmapped_area(str
if (addr <= limit) {
vma = find_vma(current->mm, PAGE_SIZE);
- for (; vma; vma = vma->vm_next) {
+ for (; vma; vma = vma_next(vma)) {
if (addr > limit)
break;
if (addr + len <= vma->vm_start)
@@ -101,7 +101,7 @@ unsigned long arch_get_unmapped_area(str
limit = TASK_SIZE - len;
if (addr <= limit) {
vma = find_vma(current->mm, addr);
- for (; vma; vma = vma->vm_next) {
+ for (; vma; vma = vma_next(vma)) {
if (addr > limit)
break;
if (addr + len <= vma->vm_start)
--- linux-2.6.23-rc9.orig/arch/i386/mm/hugetlbpage.c
+++ linux-2.6.23-rc9/arch/i386/mm/hugetlbpage.c
@@ -241,7 +241,7 @@ static unsigned long hugetlb_get_unmappe
full_search:
addr = ALIGN(start_addr, HPAGE_SIZE);
- for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
+ for (vma = find_vma(mm, addr); ; vma = vma_next(vma)) {
/* At this point: (!vma || addr < vma->vm_end). */
if (TASK_SIZE - len < addr) {
/*
--- linux-2.6.23-rc9.orig/arch/ia64/kernel/sys_ia64.c
+++ linux-2.6.23-rc9/arch/ia64/kernel/sys_ia64.c
@@ -58,7 +58,7 @@ arch_get_unmapped_area (struct file *fil
full_search:
start_addr = addr = (addr + align_mask) & ~align_mask;
- for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
+ for (vma = find_vma(mm, addr); ; vma = vma_next(vma)) {
/* At this point: (!vma || addr < vma->vm_end). */
if (TASK_SIZE - len < addr || RGN_MAP_LIMIT - len < REGION_OFFSET(addr)) {
if (start_addr != TASK_UNMAPPED_BASE) {
--- linux-2.6.23-rc9.orig/arch/ia64/mm/hugetlbpage.c
+++ linux-2.6.23-rc9/arch/ia64/mm/hugetlbpage.c
@@ -159,7 +159,7 @@ unsigned long hugetlb_get_unmapped_area(
addr = HPAGE_REGION_BASE;
else
addr = ALIGN(addr, HPAGE_SIZE);
- for (vmm = find_vma(current->mm, addr); ; vmm = vmm->vm_next) {
+ for (vmm = find_vma(current->mm, addr); ; vmm = vma_next(vmm)) {
/* At this point: (!vmm || addr < vmm->vm_end). */
if (REGION_OFFSET(addr) + len > RGN_MAP_LIMIT)
return -ENOMEM;
--- linux-2.6.23-rc9.orig/arch/mips/kernel/irixelf.c
+++ linux-2.6.23-rc9/arch/mips/kernel/irixelf.c
@@ -718,7 +718,7 @@ static int load_irix_binary(struct linux
/* OK, This is the point of no return */
current->mm->end_data = 0;
current->mm->end_code = 0;
- current->mm->mmap = NULL;
+ INIT_LIST_HEAD(&current->mm->mm_vmas);
current->flags &= ~PF_FORKNOEXEC;
elf_entry = (unsigned int) elf_ex.e_entry;
@@ -1108,7 +1108,7 @@ static int irix_core_dump(long signr, st
/* Count what's needed to dump, up to the limit of coredump size. */
segs = 0;
size = 0;
- for (vma = current->mm->mmap; vma != NULL; vma = vma->vm_next) {
+ for (vma = __vma_next(&current->mm->mm_vmas, NULL); vma != NULL; vma = vma_next(vma)) {
if (maydump(vma))
{
int sz = vma->vm_end-vma->vm_start;
@@ -1267,12 +1267,13 @@ static int irix_core_dump(long signr, st
dataoff = offset = roundup(offset, PAGE_SIZE);
/* Write program headers for segments dump. */
- for (vma = current->mm->mmap, i = 0;
- i < segs && vma != NULL; vma = vma->vm_next) {
+ i = 0;
+ list_for_each_entry(vma, &current->mm->mm_vmas, vm_list) {
struct elf_phdr phdr;
size_t sz;
- i++;
+ if (i++ == segs)
+ break;
sz = vma->vm_end - vma->vm_start;
@@ -1301,15 +1302,15 @@ static int irix_core_dump(long signr, st
DUMP_SEEK(dataoff);
- for (i = 0, vma = current->mm->mmap;
- i < segs && vma != NULL;
- vma = vma->vm_next) {
+ i = 0;
+ list_for_each_entry(vma, &current->mm->mm_vmas, vm_list) {
unsigned long addr = vma->vm_start;
unsigned long len = vma->vm_end - vma->vm_start;
if (!maydump(vma))
continue;
- i++;
+ if (i++ == segs)
+ break;
pr_debug("elf_core_dump: writing %08lx %lx\n", addr, len);
DUMP_WRITE((void __user *)addr, len);
}
--- linux-2.6.23-rc9.orig/arch/mips/kernel/syscall.c
+++ linux-2.6.23-rc9/arch/mips/kernel/syscall.c
@@ -104,7 +104,7 @@ unsigned long arch_get_unmapped_area(str
else
addr = PAGE_ALIGN(addr);
- for (vmm = find_vma(current->mm, addr); ; vmm = vmm->vm_next) {
+ for (vmm = find_vma(current->mm, addr); ; vmm = vma_next(vmm)) {
/* At this point: (!vmm || addr < vmm->vm_end). */
if (task_size - len < addr)
return -ENOMEM;
--- linux-2.6.23-rc9.orig/arch/parisc/kernel/sys_parisc.c
+++ linux-2.6.23-rc9/arch/parisc/kernel/sys_parisc.c
@@ -52,7 +52,7 @@ static unsigned long get_unshared_area(u
addr = PAGE_ALIGN(addr);
- for (vma = find_vma(current->mm, addr); ; vma = vma->vm_next) {
+ for (vma = find_vma(current->mm, addr); ; vma = vma_next(vma)) {
/* At this point: (!vma || addr < vma->vm_end). */
if (TASK_SIZE - len < addr)
return -ENOMEM;
@@ -88,7 +88,7 @@ static unsigned long get_shared_area(str
addr = DCACHE_ALIGN(addr - offset) + offset;
- for (vma = find_vma(current->mm, addr); ; vma = vma->vm_next) {
+ for (vma = find_vma(current->mm, addr); ; vma = vma_next(vma)) {
/* At this point: (!vma || addr < vma->vm_end). */
if (TASK_SIZE - len < addr)
return -ENOMEM;
--- linux-2.6.23-rc9.orig/arch/powerpc/mm/tlb_32.c
+++ linux-2.6.23-rc9/arch/powerpc/mm/tlb_32.c
@@ -155,7 +155,7 @@ void flush_tlb_mm(struct mm_struct *mm)
* unmap_region or exit_mmap, but not from vmtruncate on SMP -
* but it seems dup_mmap is the only SMP case which gets here.
*/
- for (mp = mm->mmap; mp != NULL; mp = mp->vm_next)
+ list_for_each_entry(mp, &mm->mm_vmas, vm_list)
flush_range(mp->vm_mm, mp->vm_start, mp->vm_end);
FINISH_FLUSH;
}
--- linux-2.6.23-rc9.orig/arch/ppc/mm/tlb.c
+++ linux-2.6.23-rc9/arch/ppc/mm/tlb.c
@@ -149,7 +149,7 @@ void flush_tlb_mm(struct mm_struct *mm)
return;
}
- for (mp = mm->mmap; mp != NULL; mp = mp->vm_next)
+ list_for_each_entry(mp, &mm->mm_vmas, vm_list)
flush_range(mp->vm_mm, mp->vm_start, mp->vm_end);
FINISH_FLUSH;
}
--- linux-2.6.23-rc9.orig/arch/x86_64/ia32/ia32_aout.c
+++ linux-2.6.23-rc9/arch/x86_64/ia32/ia32_aout.c
@@ -311,7 +311,7 @@ static int load_aout_binary(struct linux
current->mm->free_area_cache = TASK_UNMAPPED_BASE;
current->mm->cached_hole_size = 0;
- current->mm->mmap = NULL;
+ INIT_LIST_HEAD(&current->mm->mm_vmas);
compute_creds(bprm);
current->flags &= ~PF_FORKNOEXEC;
--- linux-2.6.23-rc9.orig/arch/x86_64/kernel/sys_x86_64.c
+++ linux-2.6.23-rc9/arch/x86_64/kernel/sys_x86_64.c
@@ -119,7 +119,7 @@ arch_get_unmapped_area(struct file *filp
start_addr = addr;
full_search:
- for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
+ for (vma = find_vma(mm, addr); ; vma = vma_next(vma)) {
/* At this point: (!vma || addr < vma->vm_end). */
if (end - len < addr) {
/*
--- linux-2.6.23-rc9.orig/drivers/char/mem.c
+++ linux-2.6.23-rc9/drivers/char/mem.c
@@ -640,7 +640,7 @@ static inline size_t read_zero_pagealign
down_read(&mm->mmap_sem);
/* For private mappings, just map in zero pages. */
- for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) {
+ for (vma = find_vma(mm, addr); vma; vma = vma_next(vma)) {
unsigned long count;
if (vma->vm_start > addr || (vma->vm_flags & VM_WRITE) == 0)
--- linux-2.6.23-rc9.orig/drivers/oprofile/buffer_sync.c
+++ linux-2.6.23-rc9/drivers/oprofile/buffer_sync.c
@@ -217,7 +217,7 @@ static unsigned long get_exec_dcookie(st
if (!mm)
goto out;
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ list_for_each_entry(vma, &mm->mm_vmas, vm_list) {
if (!vma->vm_file)
continue;
if (!(vma->vm_flags & VM_EXECUTABLE))
@@ -242,7 +242,7 @@ static unsigned long lookup_dcookie(stru
unsigned long cookie = NO_COOKIE;
struct vm_area_struct * vma;
- for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) {
+ for (vma = find_vma(mm, addr); vma; vma = vma_next(vma)) {
if (addr < vma->vm_start || addr >= vma->vm_end)
continue;
--- linux-2.6.23-rc9.orig/fs/binfmt_aout.c
+++ linux-2.6.23-rc9/fs/binfmt_aout.c
@@ -323,7 +323,7 @@ static int load_aout_binary(struct linux
current->mm->free_area_cache = current->mm->mmap_base;
current->mm->cached_hole_size = 0;
- current->mm->mmap = NULL;
+ INIT_LIST_HEAD(&current->mm->mm_vmas);
compute_creds(bprm);
current->flags &= ~PF_FORKNOEXEC;
#ifdef __sparc__
--- linux-2.6.23-rc9.orig/fs/binfmt_elf.c
+++ linux-2.6.23-rc9/fs/binfmt_elf.c
@@ -1458,7 +1458,7 @@ static int elf_dump_thread_status(long s
static struct vm_area_struct *first_vma(struct task_struct *tsk,
struct vm_area_struct *gate_vma)
{
- struct vm_area_struct *ret = tsk->mm->mmap;
+ struct vm_area_struct *ret = __vma_next(&tsk->mm->mm_vmas, NULL);
if (ret)
return ret;
@@ -1473,7 +1473,7 @@ static struct vm_area_struct *next_vma(s
{
struct vm_area_struct *ret;
- ret = this_vma->vm_next;
+ ret = vma_next(this_vma);
if (ret)
return ret;
if (this_vma == gate_vma)
--- linux-2.6.23-rc9.orig/fs/binfmt_elf_fdpic.c
+++ linux-2.6.23-rc9/fs/binfmt_elf_fdpic.c
@@ -1471,7 +1471,8 @@ static int elf_fdpic_dump_segments(struc
{
struct vm_area_struct *vma;
- for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
+ list_for_each_entry(vma, &current->mm->mm_vmas, vm_list) {
+
unsigned long addr;
if (!maydump(vma, mm_flags))
@@ -1728,7 +1729,7 @@ static int elf_fdpic_core_dump(long sign
/* write program headers for segments dump */
for (
#ifdef CONFIG_MMU
- vma = current->mm->mmap; vma; vma = vma->vm_next
+ vma = __vma_next(&current->mm->mm_vmas, NULL); vma; vma = vma_next(vma)
#else
vml = current->mm->context.vmlist; vml; vml = vml->next
#endif
--- linux-2.6.23-rc9.orig/fs/exec.c
+++ linux-2.6.23-rc9/fs/exec.c
@@ -555,7 +555,7 @@ static int shift_arg_pages(struct vm_are
* when the old and new regions overlap clear from new_end.
*/
free_pgd_range(&tlb, new_end, old_end, new_end,
- vma->vm_next ? vma->vm_next->vm_start : 0);
+ vma_next(vma) ? vma_next(vma)->vm_start : 0);
} else {
/*
* otherwise, clean from old_start; this is done to not touch
@@ -564,7 +564,7 @@ static int shift_arg_pages(struct vm_are
* for the others its just a little faster.
*/
free_pgd_range(&tlb, old_start, old_end, new_end,
- vma->vm_next ? vma->vm_next->vm_start : 0);
+ vma_next(vma) ? vma_next(vma)->vm_start : 0);
}
tlb_finish_mmu(tlb, new_end, old_end);
--- linux-2.6.23-rc9.orig/fs/hugetlbfs/inode.c
+++ linux-2.6.23-rc9/fs/hugetlbfs/inode.c
@@ -158,7 +158,7 @@ hugetlb_get_unmapped_area(struct file *f
full_search:
addr = ALIGN(start_addr, HPAGE_SIZE);
- for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
+ for (vma = find_vma(mm, addr); ; vma = vma_next(vma)) {
/* At this point: (!vma || addr < vma->vm_end). */
if (TASK_SIZE - len < addr) {
/*
--- linux-2.6.23-rc9.orig/fs/proc/task_mmu.c
+++ linux-2.6.23-rc9/fs/proc/task_mmu.c
@@ -87,11 +87,11 @@ int proc_exe_link(struct inode *inode, s
goto out;
down_read(&mm->mmap_sem);
- vma = mm->mmap;
+ vma = __vma_next(&mm->mm_vmas, NULL);
while (vma) {
if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file)
break;
- vma = vma->vm_next;
+ vma = vma_next(vma);
}
if (vma) {
@@ -364,7 +364,7 @@ void clear_refs_smap(struct mm_struct *m
struct vm_area_struct *vma;
down_read(&mm->mmap_sem);
- for (vma = mm->mmap; vma; vma = vma->vm_next)
+ list_for_each_entry(vma, &mm->mm_vmas, vm_list)
if (vma->vm_mm && !is_vm_hugetlb_page(vma))
walk_page_range(vma, clear_refs_pte_range, NULL);
flush_tlb_mm(mm);
@@ -406,7 +406,7 @@ static void *m_start(struct seq_file *m,
/* Start with last addr hint */
if (last_addr && (vma = find_vma(mm, last_addr))) {
- vma = vma->vm_next;
+ vma = vma_next(vma);
goto out;
}
@@ -416,9 +416,9 @@ static void *m_start(struct seq_file *m,
*/
vma = NULL;
if ((unsigned long)l < mm->map_count) {
- vma = mm->mmap;
+ vma = __vma_next(&mm->mm_vmas, NULL);
while (l-- && vma)
- vma = vma->vm_next;
+ vma = vma_next(vma);
goto out;
}
@@ -448,12 +448,12 @@ static void vma_stop(struct proc_maps_pr
static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
struct proc_maps_private *priv = m->private;
- struct vm_area_struct *vma = v;
+ struct vm_area_struct *vma = v, *next;
struct vm_area_struct *tail_vma = priv->tail_vma;
(*pos)++;
- if (vma && (vma != tail_vma) && vma->vm_next)
- return vma->vm_next;
+ if (vma && (vma != tail_vma) && (next = vma_next(vma)))
+ return next;
vma_stop(priv, vma);
return (vma != tail_vma)? tail_vma: NULL;
}
--- linux-2.6.23-rc9.orig/include/linux/init_task.h
+++ linux-2.6.23-rc9/include/linux/init_task.h
@@ -47,6 +47,7 @@
#define INIT_MM(name) \
{ \
+ .mm_vmas = LIST_HEAD_INIT(name.mm_vmas), \
.mm_rb = RB_ROOT, \
.pgd = swapper_pg_dir, \
.mm_users = ATOMIC_INIT(2), \
--- linux-2.6.23-rc9.orig/include/linux/mm.h
+++ linux-2.6.23-rc9/include/linux/mm.h
@@ -35,6 +35,7 @@ extern int sysctl_legacy_va_layout;
#define sysctl_legacy_va_layout 0
#endif
+#include <linux/sched.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/processor.h>
@@ -63,7 +64,7 @@ struct vm_area_struct {
within vm_mm. */
/* linked list of VM areas per task, sorted by address */
- struct vm_area_struct *vm_next;
+ struct list_head vm_list;
pgprot_t vm_page_prot; /* Access permissions of this VMA. */
unsigned long vm_flags; /* Flags, listed below. */
@@ -113,6 +114,42 @@ struct vm_area_struct {
#endif
};
+static inline struct vm_area_struct *
+__vma_next(struct list_head *head, struct vm_area_struct *vma)
+{
+ if (unlikely(!vma))
+ vma = container_of(head, struct vm_area_struct, vm_list);
+
+ if (vma->vm_list.next == head)
+ return NULL;
+
+ return list_entry(vma->vm_list.next, struct vm_area_struct, vm_list);
+}
+
+static inline struct vm_area_struct *
+vma_next(struct vm_area_struct *vma)
+{
+ return __vma_next(&vma->vm_mm->mm_vmas, vma);
+}
+
+static inline struct vm_area_struct *
+__vma_prev(struct list_head *head, struct vm_area_struct *vma)
+{
+ if (unlikely(!vma))
+ vma = container_of(head, struct vm_area_struct, vm_list);
+
+ if (vma->vm_list.prev == head)
+ return NULL;
+
+ return list_entry(vma->vm_list.prev, struct vm_area_struct, vm_list);
+}
+
+static inline struct vm_area_struct *
+vma_prev(struct vm_area_struct *vma)
+{
+ return __vma_prev(&vma->vm_mm->mm_vmas, vma);
+}
+
extern struct kmem_cache *vm_area_cachep;
/*
@@ -769,13 +806,14 @@ struct zap_details {
struct page *vm_normal_page(struct vm_area_struct *, unsigned long, pte_t);
unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
unsigned long size, struct zap_details *);
-unsigned long unmap_vmas(struct mmu_gather **tlb,
+unsigned long unmap_vmas(struct mmu_gather **tlb, struct list_head *vmas,
struct vm_area_struct *start_vma, unsigned long start_addr,
unsigned long end_addr, unsigned long *nr_accounted,
struct zap_details *);
void free_pgd_range(struct mmu_gather **tlb, unsigned long addr,
unsigned long end, unsigned long floor, unsigned long ceiling);
-void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *start_vma,
+void free_pgtables(struct mmu_gather **tlb, struct list_head *vmas,
+ struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *vma);
--- linux-2.6.23-rc9.orig/include/linux/sched.h
+++ linux-2.6.23-rc9/include/linux/sched.h
@@ -367,7 +367,7 @@ extern int get_dumpable(struct mm_struct
((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED))
struct mm_struct {
- struct vm_area_struct * mmap; /* list of VMAs */
+ struct list_head mm_vmas;
struct rb_root mm_rb;
struct vm_area_struct * mmap_cache; /* last find_vma result */
unsigned long (*get_unmapped_area) (struct file *filp,
--- linux-2.6.23-rc9.orig/ipc/shm.c
+++ linux-2.6.23-rc9/ipc/shm.c
@@ -1034,7 +1034,7 @@ asmlinkage long sys_shmdt(char __user *s
vma = find_vma(mm, addr);
while (vma) {
- next = vma->vm_next;
+ next = vma_next(vma);
/*
* Check if the starting address would match, i.e. it's
@@ -1067,7 +1067,7 @@ asmlinkage long sys_shmdt(char __user *s
*/
size = PAGE_ALIGN(size);
while (vma && (loff_t)(vma->vm_end - addr) <= size) {
- next = vma->vm_next;
+ next = vma_next(vma);
/* finding a matching vma now does not alter retval */
if ((vma->vm_ops == &shm_vm_ops) &&
--- linux-2.6.23-rc9.orig/kernel/acct.c
+++ linux-2.6.23-rc9/kernel/acct.c
@@ -540,11 +540,8 @@ void acct_collect(long exitcode, int gro
if (group_dead && current->mm) {
struct vm_area_struct *vma;
down_read(&current->mm->mmap_sem);
- vma = current->mm->mmap;
- while (vma) {
+ list_for_each_entry(vma, &current->mm->mm_vmas, vm_list)
vsize += vma->vm_end - vma->vm_start;
- vma = vma->vm_next;
- }
up_read(&current->mm->mmap_sem);
}
--- linux-2.6.23-rc9.orig/kernel/auditsc.c
+++ linux-2.6.23-rc9/kernel/auditsc.c
@@ -780,8 +780,7 @@ static void audit_log_task_info(struct a
if (mm) {
down_read(&mm->mmap_sem);
- vma = mm->mmap;
- while (vma) {
+ list_for_each_entry(vma, &mm->mm_vmas, vm_list) {
if ((vma->vm_flags & VM_EXECUTABLE) &&
vma->vm_file) {
audit_log_d_path(ab, "exe=",
@@ -789,7 +788,6 @@ static void audit_log_task_info(struct a
vma->vm_file->f_path.mnt);
break;
}
- vma = vma->vm_next;
}
up_read(&mm->mmap_sem);
}
--- linux-2.6.23-rc9.orig/kernel/fork.c
+++ linux-2.6.23-rc9/kernel/fork.c
@@ -197,7 +197,7 @@ static struct task_struct *dup_task_stru
#ifdef CONFIG_MMU
static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
{
- struct vm_area_struct *mpnt, *tmp, **pprev;
+ struct vm_area_struct *mpnt, *tmp;
struct rb_node **rb_link, *rb_parent;
int retval;
unsigned long charge;
@@ -211,7 +211,6 @@ static inline int dup_mmap(struct mm_str
down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
mm->locked_vm = 0;
- mm->mmap = NULL;
mm->mmap_cache = NULL;
mm->free_area_cache = oldmm->mmap_base;
mm->cached_hole_size = ~0UL;
@@ -220,9 +219,8 @@ static inline int dup_mmap(struct mm_str
mm->mm_rb = RB_ROOT;
rb_link = &mm->mm_rb.rb_node;
rb_parent = NULL;
- pprev = &mm->mmap;
- for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
+ list_for_each_entry(mpnt, &oldmm->mm_vmas, vm_list) {
struct file *file;
if (mpnt->vm_flags & VM_DONTCOPY) {
@@ -250,7 +248,6 @@ static inline int dup_mmap(struct mm_str
vma_set_policy(tmp, pol);
tmp->vm_flags &= ~VM_LOCKED;
tmp->vm_mm = mm;
- tmp->vm_next = NULL;
anon_vma_link(tmp);
file = tmp->vm_file;
if (file) {
@@ -271,9 +268,8 @@ static inline int dup_mmap(struct mm_str
/*
* Link in the new vma and copy the page table entries.
*/
- *pprev = tmp;
- pprev = &tmp->vm_next;
+ list_add_tail(&tmp->vm_list, &mm->mm_vmas);
__vma_link_rb(mm, tmp, rb_link, rb_parent);
rb_link = &tmp->vm_rb.rb_right;
rb_parent = &tmp->vm_rb;
@@ -330,6 +326,7 @@ static inline void mm_free_pgd(struct mm
static struct mm_struct * mm_init(struct mm_struct * mm)
{
+ INIT_LIST_HEAD(&mm->mm_vmas);
atomic_set(&mm->mm_users, 1);
atomic_set(&mm->mm_count, 1);
init_rwsem(&mm->mmap_sem);
--- linux-2.6.23-rc9.orig/mm/madvise.c
+++ linux-2.6.23-rc9/mm/madvise.c
@@ -351,7 +351,7 @@ asmlinkage long sys_madvise(unsigned lon
if (start >= end)
goto out;
if (prev)
- vma = prev->vm_next;
+ vma = vma_next(prev);
else /* madvise_remove dropped mmap_sem */
vma = find_vma(current->mm, start);
}
--- linux-2.6.23-rc9.orig/mm/memory.c
+++ linux-2.6.23-rc9/mm/memory.c
@@ -264,11 +264,11 @@ void free_pgd_range(struct mmu_gather **
flush_tlb_pgtables((*tlb)->mm, start, end);
}
-void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
- unsigned long floor, unsigned long ceiling)
+void free_pgtables(struct mmu_gather **tlb, struct list_head *vmas,
+ struct vm_area_struct *vma, unsigned long floor, unsigned long ceiling)
{
while (vma) {
- struct vm_area_struct *next = vma->vm_next;
+ struct vm_area_struct *next = __vma_next(vmas, vma);
unsigned long addr = vma->vm_start;
/*
@@ -287,7 +287,7 @@ void free_pgtables(struct mmu_gather **t
while (next && next->vm_start <= vma->vm_end + PMD_SIZE
&& !is_vm_hugetlb_page(next)) {
vma = next;
- next = vma->vm_next;
+ next = __vma_next(vmas, vma);
anon_vma_unlink(vma);
unlink_file_vma(vma);
}
@@ -806,7 +806,7 @@ static unsigned long unmap_page_range(st
* ensure that any thus-far unmapped pages are flushed before unmap_vmas()
* drops the lock and schedules.
*/
-unsigned long unmap_vmas(struct mmu_gather **tlbp,
+unsigned long unmap_vmas(struct mmu_gather **tlbp, struct list_head *vmas,
struct vm_area_struct *vma, unsigned long start_addr,
unsigned long end_addr, unsigned long *nr_accounted,
struct zap_details *details)
@@ -818,7 +818,7 @@ unsigned long unmap_vmas(struct mmu_gath
spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
int fullmm = (*tlbp)->fullmm;
- for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
+ for ( ; vma && vma->vm_start < end_addr; vma = __vma_next(vmas, vma)) {
unsigned long end;
start = max(vma->vm_start, start_addr);
@@ -889,7 +889,8 @@ unsigned long zap_page_range(struct vm_a
lru_add_drain();
tlb = tlb_gather_mmu(mm, 0);
update_hiwater_rss(mm);
- end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
+ end = unmap_vmas(&tlb, &vma->vm_mm->mm_vmas, vma,
+ address, end, &nr_accounted, details);
if (tlb)
tlb_finish_mmu(tlb, address, end);
return end;
@@ -2093,7 +2094,7 @@ int vmtruncate_range(struct inode *inode
void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struct *vma)
{
#ifdef CONFIG_NUMA
- struct vm_area_struct *next_vma = vma ? vma->vm_next : NULL;
+ struct vm_area_struct *next_vma = vma ? vma_next(vma) : NULL;
#endif
int i, num;
struct page *new_page;
@@ -2120,14 +2121,14 @@ void swapin_readahead(swp_entry_t entry,
if (vma) {
if (addr >= vma->vm_end) {
vma = next_vma;
- next_vma = vma ? vma->vm_next : NULL;
+ next_vma = vma ? vma_next(vma) : NULL;
}
if (vma && addr < vma->vm_start)
vma = NULL;
} else {
if (next_vma && addr >= next_vma->vm_start) {
vma = next_vma;
- next_vma = vma->vm_next;
+ next_vma = vma_next(vma);
}
}
#endif
--- linux-2.6.23-rc9.orig/mm/mempolicy.c
+++ linux-2.6.23-rc9/mm/mempolicy.c
@@ -344,9 +344,9 @@ check_range(struct mm_struct *mm, unsign
if (!first)
return ERR_PTR(-EFAULT);
prev = NULL;
- for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
+ for (vma = first; vma && vma->vm_start < end; vma = vma_next(vma)) {
if (!(flags & MPOL_MF_DISCONTIG_OK)) {
- if (!vma->vm_next && vma->vm_end < end)
+ if (!vma_next(vma) && vma->vm_end < end)
return ERR_PTR(-EFAULT);
if (prev && prev->vm_end < vma->vm_start)
return ERR_PTR(-EFAULT);
@@ -403,7 +403,7 @@ static int mbind_range(struct vm_area_st
err = 0;
for (; vma && vma->vm_start < end; vma = next) {
- next = vma->vm_next;
+ next = vma_next(vma);
if (vma->vm_start < start)
err = split_vma(vma->vm_mm, vma, start, 1);
if (!err && vma->vm_end > end)
@@ -610,7 +610,7 @@ int migrate_to_node(struct mm_struct *mm
nodes_clear(nmask);
node_set(source, nmask);
- check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
+ check_range(mm, __vma_next(&mm->mm_vmas, NULL)->vm_start, TASK_SIZE, &nmask,
flags | MPOL_MF_DISCONTIG_OK, &pagelist);
if (!list_empty(&pagelist))
@@ -1785,7 +1785,7 @@ void mpol_rebind_mm(struct mm_struct *mm
struct vm_area_struct *vma;
down_write(&mm->mmap_sem);
- for (vma = mm->mmap; vma; vma = vma->vm_next)
+ list_for_each_entry(vma, &mm->mm_vmas, vm_list)
mpol_rebind_policy(vma->vm_policy, new);
up_write(&mm->mmap_sem);
}
--- linux-2.6.23-rc9.orig/mm/migrate.c
+++ linux-2.6.23-rc9/mm/migrate.c
@@ -1030,7 +1030,7 @@ int migrate_vmas(struct mm_struct *mm, c
struct vm_area_struct *vma;
int err = 0;
- for(vma = mm->mmap; vma->vm_next && !err; vma = vma->vm_next) {
+ list_for_each_entry(vma, &mm->mm_vmas, vm_list) {
if (vma->vm_ops && vma->vm_ops->migrate) {
err = vma->vm_ops->migrate(vma, to, from, flags);
if (err)
--- linux-2.6.23-rc9.orig/mm/mlock.c
+++ linux-2.6.23-rc9/mm/mlock.c
@@ -123,7 +123,7 @@ static int do_mlock(unsigned long start,
if (nstart >= end)
break;
- vma = prev->vm_next;
+ vma = vma_next(prev);
if (!vma || vma->vm_start != nstart) {
error = -ENOMEM;
break;
@@ -181,7 +181,7 @@ static int do_mlockall(int flags)
if (flags == MCL_FUTURE)
goto out;
- for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
+ list_for_each_entry(vma, &current->mm->mm_vmas, vm_list) {
unsigned int newflags;
newflags = vma->vm_flags | VM_LOCKED;
@@ -190,6 +190,7 @@ static int do_mlockall(int flags)
/* Ignore errors */
mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
+ vma = prev;
}
out:
return 0;
--- linux-2.6.23-rc9.orig/mm/mmap.c
+++ linux-2.6.23-rc9/mm/mmap.c
@@ -35,7 +35,7 @@
#define arch_mmap_check(addr, len, flags) (0)
#endif
-static void unmap_region(struct mm_struct *mm,
+static void unmap_region(struct mm_struct *mm, struct list_head *vmas,
struct vm_area_struct *vma, struct vm_area_struct *prev,
unsigned long start, unsigned long end);
@@ -220,18 +220,17 @@ void unlink_file_vma(struct vm_area_stru
/*
* Close a vm structure and free it, returning the next.
*/
-static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
+static void remove_vma(struct vm_area_struct *vma)
{
- struct vm_area_struct *next = vma->vm_next;
-
might_sleep();
+ list_del(&vma->vm_list);
if (vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
if (vma->vm_file)
fput(vma->vm_file);
mpol_free(vma_policy(vma));
kmem_cache_free(vm_area_cachep, vma);
- return next;
+ return;
}
asmlinkage unsigned long sys_brk(unsigned long brk)
@@ -316,11 +315,9 @@ void validate_mm(struct mm_struct *mm)
{
int bug = 0;
int i = 0;
- struct vm_area_struct *tmp = mm->mmap;
- while (tmp) {
- tmp = tmp->vm_next;
+ struct vm_area_struct *vma;
+ list_for_each_entry(vma, &mm->mm_vmas, vm_list)
i++;
- }
if (i != mm->map_count)
printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1;
i = browse_rb(&mm->mm_rb);
@@ -374,15 +371,14 @@ __vma_link_list(struct mm_struct *mm, st
struct vm_area_struct *prev, struct rb_node *rb_parent)
{
if (prev) {
- vma->vm_next = prev->vm_next;
- prev->vm_next = vma;
+ list_add(&vma->vm_list, &prev->vm_list);
} else {
- mm->mmap = vma;
- if (rb_parent)
- vma->vm_next = rb_entry(rb_parent,
+ if (rb_parent) {
+ struct vm_area_struct *next = rb_entry(rb_parent,
struct vm_area_struct, vm_rb);
- else
- vma->vm_next = NULL;
+ list_add_tail(&vma->vm_list, &next->vm_list);
+ } else
+ list_add(&vma->vm_list, &mm->mm_vmas);
}
}
@@ -472,7 +468,7 @@ static inline void
__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev)
{
- prev->vm_next = vma->vm_next;
+ list_del(&vma->vm_list);
rb_erase(&vma->vm_rb, &mm->mm_rb);
if (mm->mmap_cache == vma)
mm->mmap_cache = prev;
@@ -489,7 +485,7 @@ void vma_adjust(struct vm_area_struct *v
unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
{
struct mm_struct *mm = vma->vm_mm;
- struct vm_area_struct *next = vma->vm_next;
+ struct vm_area_struct *next = vma_next(vma);
struct vm_area_struct *importer = NULL;
struct address_space *mapping = NULL;
struct prio_tree_root *root = NULL;
@@ -630,7 +626,7 @@ again: remove_next = 1 + (end > next->
* up the code too much to do both in one go.
*/
if (remove_next == 2) {
- next = vma->vm_next;
+ next = vma_next(vma);
goto again;
}
}
@@ -751,13 +747,10 @@ struct vm_area_struct *vma_merge(struct
if (vm_flags & VM_SPECIAL)
return NULL;
- if (prev)
- next = prev->vm_next;
- else
- next = mm->mmap;
+ next = __vma_next(&mm->mm_vmas, prev);
area = next;
if (next && next->vm_end == end) /* cases 6, 7, 8 */
- next = next->vm_next;
+ next = vma_next(next);
/*
* Can it merge with the predecessor?
@@ -816,7 +809,7 @@ struct anon_vma *find_mergeable_anon_vma
struct vm_area_struct *near;
unsigned long vm_flags;
- near = vma->vm_next;
+ near = vma_next(vma);
if (!near)
goto try_prev;
@@ -899,6 +892,7 @@ unsigned long do_mmap_pgoff(struct file
int error;
int accountable = 1;
unsigned long reqprot = prot;
+ LIST_HEAD(vmas);
/*
* Does the application expect PROT_READ to imply PROT_EXEC?
@@ -1075,6 +1069,7 @@ unsigned long mmap_region(struct file *f
struct rb_node **rb_link, *rb_parent;
unsigned long charged = 0;
struct inode *inode = file ? file->f_path.dentry->d_inode : NULL;
+ LIST_HEAD(vmas);
/* Clear old maps */
error = -ENOMEM;
@@ -1210,7 +1205,8 @@ unmap_and_free_vma:
fput(file);
/* Undo any partial mapping done by a device driver. */
- unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
+ list_add(&vma->vm_list, &vmas);
+ unmap_region(mm, &vmas, vma, prev, vma->vm_start, vma->vm_end);
charged = 0;
free_vma:
kmem_cache_free(vm_area_cachep, vma);
@@ -1261,7 +1257,7 @@ arch_get_unmapped_area(struct file *filp
}
full_search:
- for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
+ for (vma = find_vma(mm, addr); ; vma = vma_next(vma)) {
/* At this point: (!vma || addr < vma->vm_end). */
if (TASK_SIZE - len < addr) {
/*
@@ -1472,14 +1468,11 @@ struct vm_area_struct *
find_vma_prev(struct mm_struct *mm, unsigned long addr,
struct vm_area_struct **pprev)
{
- struct vm_area_struct *vma = NULL, *prev = NULL;
+ struct vm_area_struct *prev = NULL, *next;
struct rb_node * rb_node;
if (!mm)
goto out;
- /* Guard against addr being lower than the first VMA */
- vma = mm->mmap;
-
/* Go through the RB tree quickly. */
rb_node = mm->mm_rb.rb_node;
@@ -1491,7 +1484,8 @@ find_vma_prev(struct mm_struct *mm, unsi
rb_node = rb_node->rb_left;
} else {
prev = vma_tmp;
- if (!prev->vm_next || (addr < prev->vm_next->vm_end))
+ next = __vma_next(&mm->mm_vmas, prev);
+ if (!next || (addr < next->vm_end))
break;
rb_node = rb_node->rb_right;
}
@@ -1499,7 +1493,7 @@ find_vma_prev(struct mm_struct *mm, unsi
out:
*pprev = prev;
- return prev ? prev->vm_next : vma;
+ return __vma_next(&mm->mm_vmas, prev);
}
/*
@@ -1707,18 +1701,21 @@ find_extend_vma(struct mm_struct * mm, u
*
* Called with the mm semaphore held.
*/
-static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
+static void remove_vma_list(struct mm_struct *mm, struct list_head *vmas,
+ struct vm_area_struct *vma)
{
/* Update high watermark before we lower total_vm */
update_hiwater_vm(mm);
do {
+ struct vm_area_struct *next = __vma_next(vmas, vma);
long nrpages = vma_pages(vma);
mm->total_vm -= nrpages;
if (vma->vm_flags & VM_LOCKED)
mm->locked_vm -= nrpages;
vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
- vma = remove_vma(vma);
+ remove_vma(vma);
+ vma = next;
} while (vma);
validate_mm(mm);
}
@@ -1728,21 +1725,22 @@ static void remove_vma_list(struct mm_st
*
* Called with the mm semaphore held.
*/
-static void unmap_region(struct mm_struct *mm,
+static void unmap_region(struct mm_struct *mm, struct list_head *vmas,
struct vm_area_struct *vma, struct vm_area_struct *prev,
unsigned long start, unsigned long end)
{
- struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
+ struct vm_area_struct *next = __vma_next(&mm->mm_vmas, prev);
struct mmu_gather *tlb;
unsigned long nr_accounted = 0;
lru_add_drain();
tlb = tlb_gather_mmu(mm, 0);
update_hiwater_rss(mm);
- unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
+ unmap_vmas(&tlb, vmas, vma, start, end, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
- free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
- next? next->vm_start: 0);
+ free_pgtables(&tlb, vmas, vma,
+ prev ? prev->vm_end : FIRST_USER_ADDRESS,
+ next ? next->vm_start : 0);
tlb_finish_mmu(tlb, start, end);
}
@@ -1752,21 +1750,17 @@ static void unmap_region(struct mm_struc
*/
static void
detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
- struct vm_area_struct *prev, unsigned long end)
+ struct vm_area_struct *prev, unsigned long end, struct list_head *vmas)
{
- struct vm_area_struct **insertion_point;
- struct vm_area_struct *tail_vma = NULL;
unsigned long addr;
- insertion_point = (prev ? &prev->vm_next : &mm->mmap);
do {
+ struct vm_area_struct *next = vma_next(vma);
rb_erase(&vma->vm_rb, &mm->mm_rb);
mm->map_count--;
- tail_vma = vma;
- vma = vma->vm_next;
+ list_move_tail(&vma->vm_list, vmas);
+ vma = next;
} while (vma && vma->vm_start < end);
- *insertion_point = vma;
- tail_vma->vm_next = NULL;
if (mm->unmap_area == arch_unmap_area)
addr = prev ? prev->vm_end : mm->mmap_base;
else
@@ -1836,6 +1830,7 @@ int do_munmap(struct mm_struct *mm, unsi
{
unsigned long end;
struct vm_area_struct *vma, *prev, *last;
+ LIST_HEAD(vmas);
if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
return -EINVAL;
@@ -1875,16 +1870,16 @@ int do_munmap(struct mm_struct *mm, unsi
if (error)
return error;
}
- vma = prev? prev->vm_next: mm->mmap;
+ vma = __vma_next(&mm->mm_vmas, prev);
/*
* Remove the vma's, and unmap the actual pages
*/
- detach_vmas_to_be_unmapped(mm, vma, prev, end);
- unmap_region(mm, vma, prev, start, end);
+ detach_vmas_to_be_unmapped(mm, vma, prev, end, &vmas);
+ unmap_region(mm, &vmas, vma, prev, start, end);
/* Fix up all other VM information */
- remove_vma_list(mm, vma);
+ remove_vma_list(mm, &vmas, vma);
return 0;
}
@@ -2021,7 +2016,9 @@ EXPORT_SYMBOL(do_brk);
void exit_mmap(struct mm_struct *mm)
{
struct mmu_gather *tlb;
- struct vm_area_struct *vma = mm->mmap;
+ LIST_HEAD(vmas);
+ struct vm_area_struct *vma = __vma_next(&mm->mm_vmas, NULL);
+ struct vm_area_struct *next;
unsigned long nr_accounted = 0;
unsigned long end;
@@ -2030,22 +2027,23 @@ void exit_mmap(struct mm_struct *mm)
lru_add_drain();
flush_cache_mm(mm);
+ detach_vmas_to_be_unmapped(mm, vma, NULL, -1, &vmas);
tlb = tlb_gather_mmu(mm, 1);
/* Don't update_hiwater_rss(mm) here, do_exit already did */
/* Use -1 here to ensure all VMAs in the mm are unmapped */
- end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
+ end = unmap_vmas(&tlb, &vmas, vma, 0, -1, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
- free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
+ free_pgtables(&tlb, &vmas, vma, FIRST_USER_ADDRESS, 0);
tlb_finish_mmu(tlb, 0, end);
/*
* Walk the list again, actually closing and freeing it,
* with preemption enabled, without holding any MM locks.
*/
- while (vma)
- vma = remove_vma(vma);
+ list_for_each_entry_safe(vma, next, &vmas, vm_list)
+ remove_vma(vma);
- BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
+ /* BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); */
}
/* Insert vm structure into process list sorted by address
--- linux-2.6.23-rc9.orig/mm/mprotect.c
+++ linux-2.6.23-rc9/mm/mprotect.c
@@ -302,7 +302,7 @@ sys_mprotect(unsigned long start, size_t
if (nstart >= end)
goto out;
- vma = prev->vm_next;
+ vma = vma_next(prev);
if (!vma || vma->vm_start != nstart) {
error = -ENOMEM;
goto out;
--- linux-2.6.23-rc9.orig/mm/mremap.c
+++ linux-2.6.23-rc9/mm/mremap.c
@@ -226,7 +226,7 @@ static unsigned long move_vma(struct vm_
if (excess) {
vma->vm_flags |= VM_ACCOUNT;
if (split)
- vma->vm_next->vm_flags |= VM_ACCOUNT;
+ vma_next(vma)->vm_flags |= VM_ACCOUNT;
}
if (vm_flags & VM_LOCKED) {
@@ -360,8 +360,9 @@ unsigned long do_mremap(unsigned long ad
!((flags & MREMAP_FIXED) && (addr != new_addr)) &&
(old_len != new_len || !(flags & MREMAP_MAYMOVE))) {
unsigned long max_addr = TASK_SIZE;
- if (vma->vm_next)
- max_addr = vma->vm_next->vm_start;
+ struct vm_area_struct *next = vma_next(vma);
+ if (next)
+ max_addr = next->vm_start;
/* can we just expand the current mapping? */
if (max_addr - addr >= new_len) {
int pages = (new_len - old_len) >> PAGE_SHIFT;
--- linux-2.6.23-rc9.orig/mm/msync.c
+++ linux-2.6.23-rc9/mm/msync.c
@@ -93,7 +93,7 @@ asmlinkage long sys_msync(unsigned long
error = 0;
goto out_unlock;
}
- vma = vma->vm_next;
+ vma = vma_next(vma);
}
}
out_unlock:
--- linux-2.6.23-rc9.orig/mm/swapfile.c
+++ linux-2.6.23-rc9/mm/swapfile.c
@@ -626,7 +626,7 @@ static int unuse_mm(struct mm_struct *mm
down_read(&mm->mmap_sem);
lock_page(page);
}
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ list_for_each_entry(vma, &mm->mm_vmas, vm_list) {
if (vma->anon_vma && unuse_vma(vma, entry, page))
break;
}
* [PATCH/RFC 2/9] lib: RCU friendly B+tree
2007-10-22 10:45 [PATCH/RFC 0/9] VMA lookup with RCU Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 1/9] Data structure changes Vaidyanathan Srinivasan
@ 2007-10-22 10:45 ` Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 3/9] mm: use the B+tree for vma lookups Vaidyanathan Srinivasan
` (6 subsequent siblings)
8 siblings, 0 replies; 10+ messages in thread
From: Vaidyanathan Srinivasan @ 2007-10-22 10:45 UTC (permalink / raw)
To: linux-mm; +Cc: Alexis Bruemmer, Peter Zijlstra
[-- Attachment #1: 2_btree.patch --]
[-- Type: text/plain, Size: 33151 bytes --]
Introduce an RCU friendly B+tree in lib
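A rough usage sketch based on the interface below (key, ptr and index
are placeholders; the rcu_read_lock() on the reader side is my reading
of the lockless-lookup comments, not code taken from this patch):

	struct btree_root tree = BTREE_INIT(GFP_KERNEL);
	void *data;
	int err;

	/* writer side: reserve node memory up front, then modify */
	if (btree_preload(&tree, GFP_KERNEL))
		return -ENOMEM;
	err = btree_insert(&tree, key, ptr);	/* -EEXIST if key is already present */
	if (err)
		return err;

	/* reader side: btree_lookup() is an exact match,
	 * btree_stab() returns the item with the largest key <= index */
	rcu_read_lock();
	data = btree_stab(&tree, index);
	rcu_read_unlock();

	/* writer side: remove by key */
	ptr = btree_remove(&tree, key);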
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
include/linux/btree.h | 151 ++++++
init/main.c | 2
lib/Makefile | 3
lib/btree.c | 1189 ++++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 1344 insertions(+), 1 deletion(-)
--- /dev/null
+++ linux-2.6.23-rc6/include/linux/btree.h
@@ -0,0 +1,151 @@
+#ifndef _LINUX_BTREE_H
+#define _LINUX_BTREE_H
+
+#define BTREE_DEBUG 1
+
+#include <linux/mempool.h>
+#include <linux/log2.h>
+#include <linux/rcupdate.h>
+#include <asm/cache.h>
+#include <asm/bitops.h>
+
+struct btree_item {
+ unsigned long key;
+ void *data;
+};
+
+#define BTREE_ITEM_CACHELINE (L1_CACHE_BYTES / sizeof(struct btree_item))
+#define BTREE_ITEM_MAX (BTREE_ITEM_CACHELINE < 16 ? 16 : BTREE_ITEM_CACHELINE)
+#define BTREE_ITEM_HALF (BTREE_ITEM_MAX/2)
+
+struct btree_node {
+ struct btree_item item[BTREE_ITEM_MAX];
+};
+
+/*
+ * Max depth of the tree.
+ * Assumes BTREE_ITEM_MAX is power of two.
+ */
+#define BTREE_MAX_DEPTH \
+ (1 + DIV_ROUND_UP(BITS_PER_LONG, ilog2(BTREE_ITEM_MAX)))
+
+/*
+ * Max number of nodes to be RCUed per modification.
+ */
+#define BTREE_NODE_REPLACE (2 * BTREE_MAX_DEPTH)
+
+struct btree_freevec {
+ struct rcu_head rcu_head;
+ int pos;
+ void *slot[BTREE_NODE_REPLACE];
+};
+
+struct btree_root {
+ struct btree_node *root;
+ int height;
+ mempool_t *mempool;
+ struct btree_freevec *freevec;
+ gfp_t gfp_mask;
+ void (*flush)(struct btree_freevec *);
+};
+
+#ifdef BTREE_DEBUG
+void btree_print(struct btree_root *);
+#else
+#define btree_print(root) do { } while (0)
+#endif
+
+static inline void *btree_item_deref(struct btree_item *itemp)
+{
+ void *ptr = rcu_dereference(itemp->data);
+ __clear_bit(0, (void *)&ptr);
+ return ptr;
+}
+
+static inline unsigned long btree_item_key(struct btree_item *itemp)
+{
+ unsigned long key = itemp->key;
+ /*
+ * match the smp_wmb() in btree_item_key_set()
+ */
+ smp_rmb();
+ return key;
+}
+
+extern struct btree_item *btree_search_next(struct btree_root *root,
+ unsigned long index, struct btree_item **nextp);
+
+static inline void *btree_lookup(struct btree_root *root, unsigned long index)
+{
+ struct btree_item *item;
+
+ item = btree_search_next(root, index, NULL);
+ if (!item || btree_item_key(item) != index)
+ return NULL;
+
+ return item->data;
+}
+
+static inline void *btree_lookup_next(struct btree_root *root,
+ unsigned long index, void **nextp)
+{
+ struct btree_item *next = NULL, *item;
+
+ item = btree_search_next(root, index, &next);
+ if (next)
+ *nextp = next->data;
+ if (!item || btree_item_key(item) != index)
+ return NULL;
+
+ return item->data;
+}
+
+static inline void *btree_stab(struct btree_root *root, unsigned long index)
+{
+ struct btree_item *item;
+
+ item = btree_search_next(root, index, NULL);
+ if (!item)
+ return NULL;
+
+ return item->data;
+}
+
+static inline void *btree_stab_next(struct btree_root *root,
+ unsigned long index, void **nextp)
+{
+ struct btree_item *next = NULL, *item;
+
+ item = btree_search_next(root, index, &next);
+ if (next)
+ *nextp = next->data;
+ if (!item)
+ return NULL;
+
+ return item->data;
+}
+
+extern int btree_preload(struct btree_root *, gfp_t);
+extern int btree_insert(struct btree_root *, unsigned long, void *);
+extern int btree_update(struct btree_root *, unsigned long, unsigned long);
+extern void *btree_remove(struct btree_root *, unsigned long);
+
+extern void btree_root_init(struct btree_root *, gfp_t);
+extern void btree_root_destroy(struct btree_root *);
+
+extern void btree_freevec_flush(struct btree_freevec *);
+
+#define BTREE_INIT_FLUSH(gfp, f) (struct btree_root){ \
+ .root = NULL, \
+ .height = 0, \
+ .mempool = NULL, \
+ .freevec = NULL, \
+ .gfp_mask = (gfp), \
+ .flush = (f), \
+}
+
+#define BTREE_INIT(gfp) BTREE_INIT_FLUSH(gfp, btree_freevec_flush)
+
+extern void __init btree_init(void);
+
+#endif /* _LINUX_BTREE_H */
--- linux-2.6.23-rc6.orig/init/main.c
+++ linux-2.6.23-rc6/init/main.c
@@ -55,6 +55,7 @@
#include <linux/pid_namespace.h>
#include <linux/device.h>
#include <linux/kthread.h>
+#include <linux/btree.h>
#include <asm/io.h>
#include <asm/bugs.h>
@@ -613,6 +614,7 @@ asmlinkage void __init start_kernel(void
cpuset_init_early();
mem_init();
kmem_cache_init();
+ btree_init();
setup_per_cpu_pageset();
numa_policy_init();
if (late_time_init)
--- linux-2.6.23-rc6.orig/lib/Makefile
+++ linux-2.6.23-rc6/lib/Makefile
@@ -5,7 +5,8 @@
lib-y := ctype.o string.o vsprintf.o kasprintf.o cmdline.o \
rbtree.o radix-tree.o dump_stack.o \
idr.o int_sqrt.o bitmap.o extable.o prio_tree.o \
- sha1.o irq_regs.o reciprocal_div.o argv_split.o
+ sha1.o irq_regs.o reciprocal_div.o argv_split.o \
+ btree.o
lib-$(CONFIG_MMU) += ioremap.o
lib-$(CONFIG_SMP) += cpumask.o
--- /dev/null
+++ linux-2.6.23-rc6/lib/btree.c
@@ -0,0 +1,1189 @@
+/*
+ * Copyright 2007, Red Hat Inc. Peter Zijlstra <pzijlstr@redhat.com>
+ * GPLv2
+ *
+ * Something that started out as an RCU friendly B+tree.
+ *
+ * Contrary to most B-trees, the nodes have an even number of slots. This can
+ * be done because we carry all the children in the leaves and thus we don't
+ * need to push one up on a split. We can just duplicate the first.
+ *
+ * The inner nodes are basically an N-way space partition.
+ *
+ * Another difference from the typical B+tree is that this implementation does
+ * not thread the leaves; this is left up to the user if so desired.
+ *
+ *
+ * [------------------------------>[------------------------------>
+ * | |
+ * [------------------->[--------->[-------------->[-------------->
+ * | | | |
+ * [..) [..) [..) [..) [..) [..) [..) [..) [..) [..) [..) [..)
+ *
+ *
+ */
+#include <linux/btree.h>
+#include <linux/slab.h>
+#include <linux/log2.h>
+#include <linux/err.h>
+
+#ifdef BTREE_DEBUG
+void btree_print(struct btree_root *root);
+int btree_validate(struct btree_root *root);
+#else
+#define btree_print(root) do { } while (0)
+#define btree_validate(root) (0)
+#endif
+
+static struct kmem_cache *btree_cachep;
+
+static inline void btree_free(struct btree_root *root, void *ptr)
+{
+ root->freevec->slot[root->freevec->pos++] = ptr;
+}
+
+static inline void btree_ptr_flip(struct btree_root *root,
+ void **nodep, void *new)
+{
+ void *old = rcu_dereference(*nodep);
+ rcu_assign_pointer(*nodep, new);
+ __clear_bit(0, (void *)&old);
+ btree_free(root, old);
+}
+
+/*
+ * node pointers have bit 0 set.
+ */
+static inline int btree_item_node(struct btree_item *itemp)
+{
+ return test_bit(0, (void *)&itemp->data);
+}
+
+static inline struct btree_node *btree_node_ptr(struct btree_node *nodep)
+{
+ __set_bit(0, (void *)&nodep);
+ return nodep;
+}
+
+static inline void btree_item_key_set(struct btree_item *itemp,
+ unsigned long index)
+{
+ /*
+ * we can only tighten when the new node is linked in
+ */
+ smp_wmb();
+ itemp->key = index;
+}
+
+static void btree_item_flip(struct btree_root *root,
+ struct btree_item *itemp, struct btree_node *new)
+{
+ unsigned long index;
+
+ btree_ptr_flip(root, &itemp->data, btree_node_ptr(new));
+
+ /* possibly tighten */
+ index = btree_item_key(&new->item[0]);
+ if (btree_item_key(itemp) != index)
+ btree_item_key_set(itemp, index);
+}
+
+static inline void btree_item_copy(void *dst, void *src, int nr)
+{
+ memcpy(dst, src, nr*sizeof(struct btree_item));
+}
+
+static inline int btree_item_count(struct btree_node *node)
+{
+ int j;
+
+ for (j = 0; j < BTREE_ITEM_MAX &&
+ node->item[j].data ; j++)
+ ;
+
+ return j;
+}
+
+static struct btree_node *btree_node_alloc(struct btree_root *root)
+{
+ struct btree_node *node;
+
+ node = mempool_alloc(root->mempool, root->gfp_mask);
+ memset(node, 0, sizeof(struct btree_node));
+
+ return node;
+}
+
+static inline void btree_item_free(struct btree_root *root,
+ struct btree_item *itemp)
+{
+ if (btree_item_node(itemp))
+ btree_free(root, btree_item_deref(itemp));
+}
+
+/*
+ * node storage; expose an interface to claim all the needed memory in advance.
+ * This avoids getting stuck in a situation where backing out is as impossible
+ * as going forward.
+ */
+int btree_preload(struct btree_root *root, gfp_t gfp_mask)
+{
+ int ret = 0;
+
+ if (!root->mempool) {
+ root->mempool = mempool_create_slab_pool(0, btree_cachep);
+ if (!root->mempool)
+ return -ENOMEM;
+ }
+
+ if (!root->freevec) {
+ root->freevec = kzalloc(sizeof(struct btree_freevec), gfp_mask);
+ if (!root->freevec)
+ return -ENOMEM;
+ }
+
+ ret = mempool_resize(root->mempool, 1+2*root->height, gfp_mask);
+
+ return ret;
+}
+
+static int btree_mod_init(struct btree_root *root)
+{
+ return btree_preload(root, root->gfp_mask);
+}
+
+static int btree_mod_finish(struct btree_root *root, unsigned long index,
+ char *op)
+{
+ int ret = 0;
+ if (root->mempool)
+ ret = mempool_resize(root->mempool, 1, root->gfp_mask);
+
+ if (btree_validate(root)) {
+ printk(KERN_DEBUG "modified(%s): %lu\n", op, index);
+ btree_print(root);
+ BUG();
+ }
+
+ root->flush(root->freevec);
+ root->freevec = NULL;
+
+ return ret;
+}
+
+/*
+ * stack, to store traversal path.
+ */
+struct btree_path {
+ struct btree_node *node;
+ int offset;
+};
+
+struct btree_stack {
+ int p;
+ struct btree_path path[BTREE_MAX_DEPTH];
+};
+
+static inline void btree_stack_init(struct btree_stack *stack)
+{
+ memset(stack, 0, sizeof(*stack));
+}
+
+static inline void btree_stack_push(struct btree_stack *stack,
+ struct btree_node *node, int offset)
+{
+ stack->path[stack->p].node = node;
+ stack->path[stack->p].offset = offset;
+ stack->p++;
+
+ BUG_ON(stack->p > ARRAY_SIZE(stack->path));
+}
+
+static inline void btree_stack_pop(struct btree_stack *stack,
+ struct btree_node **nodep, int *offsetp)
+{
+ stack->p--;
+ *nodep = stack->path[stack->p].node;
+ *offsetp = stack->path[stack->p].offset;
+
+ BUG_ON(stack->p < 0);
+}
+
+static inline int btree_stack_size(struct btree_stack *stack)
+{
+ return stack->p;
+}
+
+static struct btree_node *btree_stack_peek_left(struct btree_stack *stack)
+{
+ struct btree_node *node;
+ struct btree_item *item;
+ int offset;
+
+ if (!btree_stack_size(stack))
+ return NULL;
+
+ node = stack->path[stack->p - 1].node;
+ offset = stack->path[stack->p - 1].offset;
+
+ if (offset == 0)
+ return NULL;
+
+ offset--;
+ item = &node->item[offset];
+ return btree_item_deref(item);
+}
+
+static struct btree_node *btree_stack_peek_right(struct btree_stack *stack)
+{
+ struct btree_node *node;
+ struct btree_item *item;
+ int offset;
+
+ if (!btree_stack_size(stack))
+ return NULL;
+
+ node = stack->path[stack->p - 1].node;
+ offset = stack->path[stack->p - 1].offset;
+
+ offset++;
+ if (offset == BTREE_ITEM_MAX)
+ return NULL;
+
+ item = &node->item[offset];
+ return btree_item_deref(item);
+}
+
+/* ------ search ------- */
+
+/*
+ * Since we can re-adjust the space partitioning in delete and update, a
+ * lockless lookup might hit a wrong branch once in a while, hence we might
+ * need to backtrack.
+ *
+ * Because we do a search for a less than or equal index we can only backtrack
+ * to the left. So the split can be off to the left but never to the right.
+ * Hence we need to move splits left on descend and right on ascend of the
+ * tree. See update and remove.
+ *
+ * NOTE the backtrack should be limited to one entry so worst time is:
+ * O(2*log(n)-1)
+ */
+
+/*
+ * find which branch to take
+ */
+static inline
+int __btree_item_search(struct btree_root *root,
+ struct btree_node *node, unsigned long index)
+{
+ int i;
+
+ for (i = 1; i < BTREE_ITEM_MAX; i++)
+ if (node->item[i].data == NULL ||
+ index < btree_item_key(&node->item[i]))
+ break;
+ i--;
+ return i;
+}
+
+/*
+ * find item and the path to it.
+ */
+static struct btree_item *__btree_search(struct btree_root *root,
+ struct btree_stack *stack, unsigned long index)
+{
+ struct btree_node *node = rcu_dereference(root->root);
+ struct btree_item *item;
+ int offset;
+
+ btree_stack_init(stack);
+
+ if (!node)
+ return NULL;
+
+ do {
+ /*
+ * if the split was too low, back-track and take the previous
+ * branch
+ */
+ if (unlikely(btree_item_key(&node->item[0]) > index)) {
+ if (!btree_stack_size(stack))
+ return NULL;
+
+ do {
+ btree_stack_pop(stack, &node, &offset);
+ } while (btree_stack_size(stack) && offset == 0);
+
+ if (!btree_stack_size(stack) && offset == 0)
+ return NULL;
+
+ offset--;
+ } else
+ offset = __btree_item_search(root, node, index);
+
+ btree_stack_push(stack, node, offset);
+
+ item = &node->item[offset];
+ if (!btree_item_node(item))
+ break;
+
+ node = btree_item_deref(item);
+ } while (node);
+
+ return item;
+}
+
+/*
+ * find the left-most item in a sub-tree, and the path to it.
+ */
+static struct btree_item *__btree_find_first(struct btree_root *root,
+ struct btree_stack *stack, struct btree_node *node, int offset)
+{
+ struct btree_item *item;
+
+ do {
+ btree_stack_push(stack, node, offset);
+
+ item = &node->item[offset];
+ if (!btree_item_node(item))
+ break;
+
+ node = btree_item_deref(item);
+ offset = 0;
+ } while (node);
+
+ return item;
+}
+
+/*
+ * find the item with a lesser or equal index
+ * and optionally the next item.
+ */
+struct btree_item *btree_search_next(struct btree_root *root,
+ unsigned long index, struct btree_item **nextp)
+{
+ struct btree_stack stack;
+ struct btree_node *node;
+ struct btree_item *item, *next;
+ int offset;
+
+ item = __btree_search(root, &stack, index);
+
+ if (item && nextp) {
+ do {
+ btree_stack_pop(&stack, &node, &offset);
+ } while (btree_stack_size(&stack) &&
+ offset == BTREE_ITEM_MAX-1);
+
+ if (offset != BTREE_ITEM_MAX-1) {
+ offset++;
+ next = __btree_find_first(root, &stack, node, offset);
+ } else
+ next = NULL;
+
+ *nextp = next;
+ } else if (nextp) {
+ node = rcu_dereference(root->root);
+ offset = 0;
+
+ if (node) {
+ next = __btree_find_first(root, &stack, node, offset);
+ *nextp = next;
+ }
+ }
+
+ return item;
+}
+
+/* ------ insert ------- */
+
+/*
+ * insert item; split nodes when needed
+ */
+int btree_insert(struct btree_root *root,
+ unsigned long index, void *data)
+{
+ struct btree_stack stack;
+ struct btree_node *node = NULL,
+ *old_node,
+ *update,
+ *new = NULL,
+ *split = NULL,
+ *uleft,
+ *uright,
+ *left = NULL,
+ *right = NULL;
+ struct btree_item *item;
+ struct btree_item nitem = {
+ .key = index,
+ .data = data,
+ };
+ unsigned long key;
+ int i, j, n, ret = 0;
+
+ ret = btree_mod_init(root);
+ if (ret)
+ return ret;
+
+ item = __btree_search(root, &stack, index);
+ if (!item) {
+ if (!root->root) {
+ root->root = btree_node_alloc(root);
+ root->height = 1;
+ btree_stack_push(&stack, root->root, 0);
+ } else
+ __btree_find_first(root, &stack, root->root, 0);
+ } else if (btree_item_key(item) == index) {
+ ret = -EEXIST;
+ goto out;
+ }
+
+ do {
+ old_node = node;
+ btree_stack_pop(&stack, &node, &i);
+ item = &node->item[i];
+
+ update = NULL;
+ if (btree_item_node(item)) {
+ /* no change */
+ if (!new && !split && !left && !right) {
+ BUG_ON(btree_item_deref(item) != old_node);
+
+ /* possibly tighten the split on our way back up */
+ key = btree_item_key(&old_node->item[0]);
+ if (btree_item_key(item) != key)
+ btree_item_key_set(item, key);
+ continue;
+ }
+
+ /* single change */
+ if (new && !split && !left && !right) {
+ btree_item_flip(root, item, new);
+ new = NULL;
+ continue;
+ }
+
+ /* merge left */
+ if (new && !split && left && !right) {
+ update = btree_node_alloc(root);
+ btree_item_copy(update->item, node->item,
+ BTREE_ITEM_MAX);
+ update->item[i-1] = (struct btree_item){
+ .key = btree_item_key(&left->item[0]),
+ .data = btree_node_ptr(left),
+ };
+ update->item[i] = (struct btree_item){
+ .key = btree_item_key(&new->item[0]),
+ .data = btree_node_ptr(new),
+ };
+ btree_item_free(root, &node->item[i-1]);
+ btree_item_free(root, &node->item[i]);
+
+ left = NULL;
+ new = update;
+ continue;
+ }
+
+ /* merge right */
+ if (new && !split && !left && right) {
+ update = btree_node_alloc(root);
+ btree_item_copy(update->item, node->item,
+ BTREE_ITEM_MAX);
+ update->item[i] = (struct btree_item){
+ .key = btree_item_key(&new->item[0]),
+ .data = btree_node_ptr(new),
+ };
+ update->item[i+1] = (struct btree_item){
+ .key = btree_item_key(&right->item[0]),
+ .data = btree_node_ptr(right),
+ };
+ btree_item_free(root, &node->item[i]);
+ btree_item_free(root, &node->item[i+1]);
+
+ new = update;
+ right = NULL;
+ continue;
+ }
+
+ /* split */
+ BUG_ON(left || right);
+ update = new;
+ }
+
+ if (btree_item_deref(item) &&
+ btree_item_key(item) < nitem.key)
+ i++;
+
+ split = NULL;
+ left = NULL;
+ right = NULL;
+ new = btree_node_alloc(root);
+ /* insert has room */
+ if (node->item[BTREE_ITEM_MAX-1].data == NULL) {
+ btree_item_copy(new->item, node->item, i);
+ new->item[i] = nitem;
+ btree_item_copy(new->item + i + 1, node->item + i,
+ BTREE_ITEM_MAX - i - 1);
+
+ if (update)
+ btree_item_flip(root, &new->item[i-1], update);
+
+ continue;
+ }
+
+#if 0 /* there be dragons here ! */
+
+ /* insert overflows */
+
+ /* see if left sibling will take some overflow */
+ uleft = btree_stack_peek_left(&stack);
+ if (uleft && uleft->item[BTREE_ITEM_MAX-1].data == NULL) {
+
+ j = btree_item_count(uleft);
+ n = (BTREE_ITEM_MAX - j + 1)/2;
+
+ left = btree_node_alloc(root);
+ /*
+ * LLLLl... CCCCcccc
+ * LLLLlC.. CCCcc*cc
+ */
+ btree_item_copy(left->item, uleft->item, j);
+ if (i < n) {
+ btree_item_copy(left->item+j, node->item, i);
+ left->item[j+i] = nitem;
+ btree_item_copy(left->item+j+i+1, node->item+i,
+ n-i);
+ btree_item_copy(new->item, node->item+n,
+ BTREE_ITEM_MAX-n);
+ } else {
+ btree_item_copy(left->item+j, node->item, n);
+ btree_item_copy(new->item, node->item+n, i-n);
+ new->item[i-n] = nitem;
+ btree_item_copy(new->item+i-n+1, node->item+i,
+ BTREE_ITEM_MAX-i);
+ }
+
+ if (update) {
+ if (i-1 < n)
+ btree_item_flip(root,
+ &left->item[j+i-1],
+ update);
+ else
+ btree_item_flip(root, &new->item[i-n-1],
+ update);
+ }
+
+ continue;
+ }
+
+ /* see if right sibling will take some overflow */
+ uright = btree_stack_peek_right(&stack);
+ if (uright && uright->item[BTREE_ITEM_MAX-1].data == NULL) {
+
+ j = btree_item_count(uright);
+ n = (BTREE_ITEM_MAX - j + 1)/2;
+
+ right = btree_node_alloc(root);
+ /*
+ * CCCCcccc RRRR....
+ * CCCC*cc. ccRRRR..
+ */
+ if (i <= BTREE_ITEM_MAX-n) {
+ btree_item_copy(new->item, node->item, i);
+ new->item[i] = nitem;
+ btree_item_copy(new->item+i+1, node->item+i,
+ BTREE_ITEM_MAX-n-i);
+ btree_item_copy(right->item,
+ node->item+BTREE_ITEM_MAX-n, n);
+ btree_item_copy(right->item+n, uright->item, j);
+ } else {
+ btree_item_copy(new->item, node->item,
+ BTREE_ITEM_MAX-n);
+
+ btree_item_copy(right->item,
+ node->item+(BTREE_ITEM_MAX-n),
+ i-(BTREE_ITEM_MAX-n));
+ right->item[i-(BTREE_ITEM_MAX-n)] = nitem;
+ btree_item_copy(right->item+i-(BTREE_ITEM_MAX-n)+1,
+ node->item+i, BTREE_ITEM_MAX-i);
+ btree_item_copy(right->item+n+1, uright->item,
+ j);
+ }
+
+ if (update) {
+ if (i-1 <= BTREE_ITEM_MAX-n)
+ btree_item_flip(root, &new->item[i-1],
+ update);
+ else
+ btree_item_flip(root,
+ &right->item[i-1-BTREE_ITEM_MAX-n],
+ update);
+ }
+
+ continue;
+ }
+#endif
+
+ /* split node */
+ split = btree_node_alloc(root);
+ if (i < BTREE_ITEM_HALF) {
+ btree_item_copy(new->item, node->item, i);
+ new->item[i] = nitem;
+ btree_item_copy(new->item + i + 1, node->item + i,
+ BTREE_ITEM_HALF - i);
+ btree_item_copy(split->item,
+ node->item + BTREE_ITEM_HALF,
+ BTREE_ITEM_HALF);
+ } else {
+ btree_item_copy(new->item, node->item,
+ BTREE_ITEM_HALF);
+ btree_item_copy(split->item,
+ node->item + BTREE_ITEM_HALF,
+ i - BTREE_ITEM_HALF);
+ split->item[i - BTREE_ITEM_HALF] = nitem;
+ btree_item_copy(split->item + i - BTREE_ITEM_HALF + 1,
+ node->item + i,
+ BTREE_ITEM_MAX - i);
+ }
+
+ if (update) {
+ if (i-1 < BTREE_ITEM_HALF)
+ btree_item_flip(root, &new->item[i-1], update);
+ else
+ btree_item_flip(root,
+ &split->item[i-1-BTREE_ITEM_HALF],
+ update);
+ }
+
+ nitem.key = split->item[0].key;
+ nitem.data = btree_node_ptr(split);
+ } while (btree_stack_size(&stack));
+
+ if (new) {
+ if (!split) {
+ btree_ptr_flip(root, (void **)&root->root, new);
+ } else {
+ update = btree_node_alloc(root);
+ update->item[0] = (struct btree_item){
+ .key = new->item[0].key,
+ .data = btree_node_ptr(new),
+ };
+ update->item[1] = (struct btree_item){
+ .key = split->item[0].key,
+ .data = btree_node_ptr(split),
+ };
+ btree_ptr_flip(root, (void **)&root->root, update);
+ root->height++;
+ }
+ }
+
+out:
+ btree_mod_finish(root, index, "insert");
+ return ret;
+}
+
+/* ------ update ------- */
+
+/*
+ * update the index of an item
+ *
+ * be careful the range:
+ * [min(index, new_index), max(index, new_index)]
+ * _MUST_ be free, otherwise remove and re-insert.
+ *
+ * see the search comment for lockless lookup constraints
+ */
+int btree_update(struct btree_root *root,
+ unsigned long index, unsigned long new_index)
+{
+ struct btree_stack stack;
+ struct btree_node *node = rcu_dereference(root->root);
+ struct btree_item *item;
+ unsigned long key;
+ int offset, ret = 0;
+
+ if (index == new_index)
+ return 0;
+
+ if (!node)
+ return -EINVAL;
+
+ btree_stack_init(&stack);
+
+ do {
+ offset = __btree_item_search(root, node, index);
+ btree_stack_push(&stack, node, offset);
+ item = &node->item[offset];
+ if (btree_item_node(item)) {
+ /* loosen downwards */
+ if (index > new_index && btree_item_key(item) == index)
+ btree_item_key_set(item, new_index);
+ node = btree_item_deref(item);
+ } else
+ node = NULL;
+ } while (node);
+
+ if (btree_item_key(item) == index)
+ btree_item_key_set(item, new_index);
+ else
+ ret = -EINVAL;
+
+ do {
+ btree_stack_pop(&stack, &node, &offset);
+ item = &node->item[offset];
+ if (!btree_item_node(item))
+ continue;
+
+ key = btree_item_key(item);
+ /* undo on error */
+ if (ret && index > new_index && key == new_index)
+ btree_item_key_set(item, index);
+ /* tighten upwards */
+ if (!ret && index < new_index && key == index)
+ btree_item_key_set(item, new_index);
+ } while (btree_stack_size(&stack));
+
+ if (btree_validate(root)) {
+ printk(KERN_DEBUG "btree_update: index: %lu new_index: %lu\n",
+ index, new_index);
+ btree_print(root);
+ BUG();
+ }
+
+ return ret;
+}
+
+/* ------ delete ------- */
+
+/*
+ * delete an item; borrow items or merge nodes when needed.
+ */
+void *btree_remove(struct btree_root *root, unsigned long index)
+{
+ struct btree_stack stack;
+ struct btree_node *node = NULL,
+ *old_node,
+ *left = NULL,
+ *new = NULL,
+ *right = NULL,
+ *update,
+ *uleft,
+ *uright;
+ struct btree_item *item;
+ void *ret = NULL;
+ unsigned long key;
+ int i, j, n;
+ int err;
+
+ if (!root->root)
+ return ERR_PTR(-EINVAL);
+
+ err = btree_mod_init(root);
+ if (err)
+ return ERR_PTR(err);
+
+ item = __btree_search(root, &stack, index);
+ if (!item || btree_item_key(item) != index) {
+ ret = ERR_PTR(-EINVAL);
+ goto out;
+ } else
+ ret = item->data;
+
+ do {
+ old_node = node;
+ btree_stack_pop(&stack, &node, &i);
+ item = &node->item[i];
+
+ update = NULL;
+ if (btree_item_node(item)) {
+ /* no change */
+ if (!new && !left && !right) {
+ BUG_ON(btree_item_deref(item) != old_node);
+
+ /* tighten the split on our way back up */
+ key = btree_item_key(&old_node->item[0]);
+ if (btree_item_key(item) != key)
+ btree_item_key_set(item, key);
+ continue;
+ }
+
+ /* single change */
+ if (!left && new && !right) {
+ btree_item_flip(root, &node->item[i], new);
+ new = NULL;
+ continue;
+ }
+
+ /* borrowed left */
+ if (left && new && !right) {
+ update = btree_node_alloc(root);
+ btree_item_copy(update->item, node->item,
+ BTREE_ITEM_MAX);
+ update->item[i-1] = (struct btree_item){
+ .key = btree_item_key(&left->item[0]),
+ .data = btree_node_ptr(left),
+ };
+ update->item[i] = (struct btree_item){
+ .key = btree_item_key(&new->item[0]),
+ .data = btree_node_ptr(new),
+ };
+ btree_item_free(root, &node->item[i-1]);
+ btree_item_free(root, &node->item[i]);
+
+ left = NULL;
+ new = update;
+ continue;
+ }
+
+ /* borrowed right */
+ if (!left && new && right) {
+ update = btree_node_alloc(root);
+ btree_item_copy(update->item, node->item,
+ BTREE_ITEM_MAX);
+ update->item[i] = (struct btree_item){
+ .key = btree_item_key(&new->item[0]),
+ .data = btree_node_ptr(new),
+ };
+ update->item[i+1] = (struct btree_item){
+ .key = btree_item_key(&right->item[0]),
+ .data = btree_node_ptr(right),
+ };
+ btree_item_free(root, &node->item[i]);
+ btree_item_free(root, &node->item[i+1]);
+
+ new = update;
+ right = NULL;
+ continue;
+ }
+
+ /* merged left */
+ if (left && !new && !right) {
+ update = left;
+ }
+ /* merged right */
+ else if (!left && !new && right) {
+ update = right;
+ i++;
+ }
+ /* impossible */
+ else {
+ printk("%p %p %p\n", left, new, right);
+ BUG();
+ }
+ }
+
+ /* delete */
+ left = NULL;
+ right = NULL;
+ new = btree_node_alloc(root);
+ if (node->item[BTREE_ITEM_HALF].data) {
+delete_one:
+ /*
+ * CCCxcc..
+ * CCCcc...
+ */
+ btree_item_copy(new->item, node->item, i);
+ btree_item_copy(new->item+i, node->item+i+1,
+ BTREE_ITEM_MAX-i-1);
+
+ if (update)
+ btree_item_flip(root, &new->item[i-1], update);
+
+ btree_item_free(root, &node->item[i]);
+ continue;
+ }
+
+ /* delete underflows */
+
+ /* borrow left */
+ uleft = btree_stack_peek_left(&stack);
+ if (uleft && uleft->item[BTREE_ITEM_HALF].data) {
+ left = btree_node_alloc(root);
+
+ j = btree_item_count(uleft);
+ n = (j-BTREE_ITEM_HALF+1)/2;
+
+ /*
+ * LLLLlll. CxCC....
+ * LLLLl... llCCC...
+ */
+ btree_item_copy(left->item, uleft->item, (j-n));
+ btree_item_copy(new->item, uleft->item+(j-n), n);
+ btree_item_copy(new->item+n, node->item, i);
+ btree_item_copy(new->item+n+i, node->item+i+1,
+ BTREE_ITEM_HALF-i);
+
+ if (update)
+ btree_item_flip(root, &new->item[n+i-1],
+ update);
+
+ btree_item_free(root, &node->item[i]);
+
+ continue;
+ }
+
+ /* borrow right */
+ uright = btree_stack_peek_right(&stack);
+ if (uright && uright->item[BTREE_ITEM_HALF].data) {
+ right = btree_node_alloc(root);
+
+ j = btree_item_count(uright);
+ n = (j-BTREE_ITEM_HALF+1)/2;
+
+ /*
+ * CCxC.... RRRRrrr.
+ * CCCRR... RRrrr...
+ */
+ btree_item_copy(new->item, node->item, i);
+ btree_item_copy(new->item+i, node->item+i+1,
+ BTREE_ITEM_HALF-i-1);
+ btree_item_copy(new->item+BTREE_ITEM_HALF-1,
+ uright->item, n);
+ btree_item_copy(right->item, uright->item+n,
+ BTREE_ITEM_MAX-n);
+
+ if (update)
+ btree_item_flip(root, &new->item[i-1], update);
+
+ btree_item_free(root, &node->item[i]);
+
+ continue;
+ }
+
+ /* merge left */
+ if (uleft) {
+ /*
+ * LLLL.... CCCxc...
+ * LLLLCCCc
+ */
+ btree_item_copy(new->item, uleft->item,
+ BTREE_ITEM_HALF);
+ btree_item_copy(new->item+BTREE_ITEM_HALF,
+ node->item, i);
+ btree_item_copy(new->item+BTREE_ITEM_HALF+i,
+ node->item+i+1, BTREE_ITEM_HALF-i-1);
+
+ if (update)
+ btree_item_flip(root,
+ &new->item[BTREE_ITEM_HALF+i-1],
+ update);
+
+ btree_item_free(root, &node->item[i]);
+
+ left = new;
+ new = NULL;
+ continue;
+ }
+
+ /* merge right */
+ if (uright) {
+ /*
+ * CCxCc... RRRR....
+ * CCCcRRRR
+ */
+ btree_item_copy(new->item, node->item, i);
+ btree_item_copy(new->item+i, node->item+i+1,
+ BTREE_ITEM_HALF-i-1);
+ btree_item_copy(new->item+BTREE_ITEM_HALF-1,
+ uright->item, BTREE_ITEM_HALF);
+
+ if (update)
+ btree_item_flip(root, &new->item[i-1], update);
+
+ btree_item_free(root, &node->item[i]);
+
+ /*
+ * use right to carry the new node to avoid having
+ * the same pattern as single change
+ */
+ right = new;
+ new = NULL;
+ continue;
+ }
+
+ /* only the root may underflow */
+ BUG_ON(root->root != node);
+ goto delete_one;
+
+ } while (btree_stack_size(&stack));
+
+ BUG_ON(left || right);
+
+ if (new)
+ btree_ptr_flip(root, (void **)&root->root, new);
+
+ if (!root->root->item[1].data &&
+ btree_item_node(&root->root->item[0])) {
+ btree_ptr_flip(root, (void **)&root->root,
+ btree_item_deref(&root->root->item[0]));
+ root->height--;
+ }
+
+ if (!root->root->item[0].data) {
+ btree_ptr_flip(root, (void **)&root->root, NULL);
+ root->height = 0;
+ }
+
+out:
+ btree_mod_finish(root, index, "remove");
+ return ret;
+}
+
+/* ------ */
+
+void btree_root_init(struct btree_root *root, gfp_t gfp_mask)
+{
+ *root = BTREE_INIT(gfp_mask);
+}
+
+void btree_freevec_flush(struct btree_freevec *freevec)
+{
+ int i;
+ for (i = 0; i < freevec->pos; i++)
+ kmem_cache_free(btree_cachep, freevec->slot[i]);
+ kfree(freevec);
+}
+
+void btree_root_destroy(struct btree_root *root)
+{
+ /* assume the tree is empty */
+ BUG_ON(root->height);
+ BUG_ON(root->freevec);
+ if (root->mempool)
+ mempool_destroy(root->mempool);
+}
+
+void __init btree_init(void)
+{
+ btree_cachep = KMEM_CACHE(btree_node, SLAB_HWCACHE_ALIGN);
+
+}
+
+#ifdef BTREE_DEBUG
+static void __btree_node_print(struct btree_root *root,
+ struct btree_node *node, int recurse)
+{
+ int i, j;
+ for (i = 0; i < BTREE_ITEM_MAX; i++) {
+ if (node->item[i].data) {
+ printk(KERN_DEBUG);
+ for (j=0; j<recurse; j++)
+ printk(" ");
+ if (!btree_item_node(&node->item[i])) {
+
+ printk("-> leaf: %p, item: %d, key: %lu, data: %p\n",
+ node, i, node->item[i].key, node->item[i].data);
+
+ } else {
+
+ printk("node: %p, item: %d, key: %lu, child: %p\n",
+ node, i, node->item[i].key, btree_item_deref(&node->item[i]));
+ if (recurse)
+ __btree_node_print(root, btree_item_deref(&node->item[i]), recurse+1);
+
+ }
+ }
+ }
+}
+
+void btree_node_print(struct btree_root *root, struct btree_node *node)
+{
+ printk(KERN_DEBUG "node: %p\n", node);
+ __btree_node_print(root, node, 0);
+}
+
+void btree_print(struct btree_root *root)
+{
+ printk(KERN_DEBUG "[%p] root: %p, height: %d\n",
+ root, root->root, root->height);
+ if (root->root)
+ __btree_node_print(root, root->root, 1);
+}
+
+static unsigned long __btree_key(struct btree_root *root,
+ struct btree_node *node, int height)
+{
+ if (height == 0)
+ return node->item[0].key;
+
+ return __btree_key(root, btree_item_deref(&node->item[0]), height - 1);
+}
+
+static int __btree_validate(struct btree_root *root, struct btree_node *node,
+ unsigned long *pindex, int height)
+{
+ unsigned long parent_key = *pindex;
+ unsigned long node_key = 0;
+ unsigned long child_key = 0;
+
+ int i;
+ unsigned long key;
+ int nr = 0;
+ int bug = 0;
+
+ for (i = 0; i < BTREE_ITEM_MAX; i++) {
+ struct btree_node *child = btree_item_deref(&node->item[i]);
+ if (!child)
+ continue;
+
+ nr++;
+
+ key = node->item[i].key;
+ if (key < parent_key || (!i && key != parent_key) ||
+ (i && key == parent_key)) {
+ printk(KERN_DEBUG
+ "wrong parent split: key: %lu parent_key: %lu index: %d\n",
+ key, parent_key, i);
+ bug++;
+ }
+
+ if (key < node_key || (i && key == node_key)) {
+ printk(KERN_DEBUG
+ "wrong order: key: %lu node_key: %lu index: %d\n",
+ key, node_key, i);
+ bug++;
+ }
+ node_key = key;
+
+ if (key < child_key || (i && key == child_key)) {
+ printk(KERN_DEBUG
+ "wrong child split: key: %lu child_key: %lu index: %d\n",
+ key, child_key, i);
+ bug++;
+ }
+
+ child_key = max(node_key, child_key);
+
+ if (height)
+ bug += __btree_validate(root, child,
+ &child_key, height - 1);
+
+ *pindex = max(node_key, child_key);
+ }
+
+ if (node != root->root && nr < BTREE_ITEM_HALF) {
+ printk(KERN_DEBUG "node short\n");
+ bug++;
+ }
+
+ if (bug) {
+ printk(KERN_DEBUG "bug in node: %p\n", node);
+ }
+
+ return bug;
+}
+
+int btree_validate(struct btree_root *root)
+{
+ unsigned long key;
+
+ if (root->root) {
+ key = __btree_key(root, root->root, root->height - 1);
+ return __btree_validate(root, root->root, &key,
+ root->height - 1);
+ }
+
+ return 0;
+}
+#endif
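
A minimal usage sketch of the API added above, so reviewers can see the calls in one place. This is not part of the patch: it assumes the declarations from the <linux/btree.h> header introduced by this patch, that btree_init() has already run, process context, and it elides error unwinding; the index values and the dummy payload are made up.

#include <linux/btree.h>
#include <linux/err.h>
#include <linux/gfp.h>

static int btree_usage_sketch(void)
{
	struct btree_root root;
	static int payload;	/* any kernel object; the tree stores the pointer */
	void *data;
	int err;

	btree_root_init(&root, GFP_KERNEL);

	/* fails with -EEXIST if the index is already present */
	err = btree_insert(&root, 0x1000, &payload);
	if (err)
		return err;

	/* move the item; no other key may live in [0x1000, 0x2000] */
	btree_update(&root, 0x1000, 0x2000);

	/* returns the stored pointer, or ERR_PTR(-EINVAL) on a miss */
	data = btree_remove(&root, 0x2000);
	if (IS_ERR(data))
		return PTR_ERR(data);

	/* the tree must be empty again before tearing down the root */
	btree_root_destroy(&root);
	return 0;
}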
* [PATCH/RFC 3/9] mm: use the B+tree for vma lookups
2007-10-22 10:45 [PATCH/RFC 0/9] VMA lookup with RCU Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 1/9] Data structure changes Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 2/9] lib: RCU friendly B+tree Vaidyanathan Srinivasan
@ 2007-10-22 10:45 ` Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 4/9] mm: RCU " Vaidyanathan Srinivasan
` (5 subsequent siblings)
8 siblings, 0 replies; 10+ messages in thread
From: Vaidyanathan Srinivasan @ 2007-10-22 10:45 UTC (permalink / raw)
To: linux-mm; +Cc: Alexis Bruemmer, Peter Zijlstra
[-- Attachment #1: 3_mm-btree.patch --]
[-- Type: text/plain, Size: 16893 bytes --]
---
include/linux/init_task.h | 2
include/linux/mm.h | 5 -
include/linux/sched.h | 4
kernel/fork.c | 17 +--
mm/mmap.c | 230 +++++++++++++++-------------------------------
5 files changed, 92 insertions(+), 166 deletions(-)
--- linux-2.6.23-rc6.orig/include/linux/init_task.h
+++ linux-2.6.23-rc6/include/linux/init_task.h
@@ -48,7 +48,7 @@
#define INIT_MM(name) \
{ \
.mm_vmas = LIST_HEAD_INIT(name.mm_vmas), \
- .mm_rb = RB_ROOT, \
+ .mm_btree = BTREE_INIT(GFP_ATOMIC), \
.pgd = swapper_pg_dir, \
.mm_users = ATOMIC_INIT(2), \
.mm_count = ATOMIC_INIT(1), \
--- linux-2.6.23-rc6.orig/include/linux/mm.h
+++ linux-2.6.23-rc6/include/linux/mm.h
@@ -69,8 +69,6 @@ struct vm_area_struct {
pgprot_t vm_page_prot; /* Access permissions of this VMA. */
unsigned long vm_flags; /* Flags, listed below. */
- struct rb_node vm_rb;
-
/*
* For areas with an address space and backing store,
* linkage into the address_space->i_mmap prio tree, or
@@ -1091,8 +1089,7 @@ extern struct anon_vma *find_mergeable_a
extern int split_vma(struct mm_struct *,
struct vm_area_struct *, unsigned long addr, int new_below);
extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
-extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
- struct rb_node **, struct rb_node *);
+extern void __vma_link_btree(struct mm_struct *, struct vm_area_struct *);
extern void unlink_file_vma(struct vm_area_struct *);
extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
unsigned long addr, unsigned long len, pgoff_t pgoff);
--- linux-2.6.23-rc6.orig/include/linux/sched.h
+++ linux-2.6.23-rc6/include/linux/sched.h
@@ -52,7 +52,7 @@ struct sched_param {
#include <linux/types.h>
#include <linux/timex.h>
#include <linux/jiffies.h>
-#include <linux/rbtree.h>
+#include <linux/btree.h>
#include <linux/thread_info.h>
#include <linux/cpumask.h>
#include <linux/errno.h>
@@ -368,7 +368,7 @@ extern int get_dumpable(struct mm_struct
struct mm_struct {
struct list_head mm_vmas;
- struct rb_root mm_rb;
+ struct btree_root mm_btree;
struct vm_area_struct * mmap_cache; /* last find_vma result */
unsigned long (*get_unmapped_area) (struct file *filp,
unsigned long addr, unsigned long len,
--- linux-2.6.23-rc6.orig/kernel/fork.c
+++ linux-2.6.23-rc6/kernel/fork.c
@@ -198,7 +198,6 @@ static struct task_struct *dup_task_stru
static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
{
struct vm_area_struct *mpnt, *tmp;
- struct rb_node **rb_link, *rb_parent;
int retval;
unsigned long charge;
struct mempolicy *pol;
@@ -216,9 +215,6 @@ static inline int dup_mmap(struct mm_str
mm->cached_hole_size = ~0UL;
mm->map_count = 0;
cpus_clear(mm->cpu_vm_mask);
- mm->mm_rb = RB_ROOT;
- rb_link = &mm->mm_rb.rb_node;
- rb_parent = NULL;
list_for_each_entry(mpnt, &oldmm->mm_vmas, vm_list) {
struct file *file;
@@ -270,9 +266,8 @@ static inline int dup_mmap(struct mm_str
*/
list_add_tail(&tmp->vm_list, &mm->mm_vmas);
- __vma_link_rb(mm, tmp, rb_link, rb_parent);
- rb_link = &tmp->vm_rb.rb_right;
- rb_parent = &tmp->vm_rb;
+ btree_preload(&mm->mm_btree, GFP_KERNEL|__GFP_NOFAIL); // XXX create error path
+ __vma_link_btree(mm, tmp);
mm->map_count++;
retval = copy_page_range(mm, oldmm, mpnt);
@@ -320,13 +315,19 @@ static inline void mm_free_pgd(struct mm
__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
-#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
+
+static void free_mm(struct mm_struct *mm)
+{
+ btree_root_destroy(&mm->mm_btree);
+ kmem_cache_free(mm_cachep, mm);
+}
#include <linux/init_task.h>
static struct mm_struct * mm_init(struct mm_struct * mm)
{
INIT_LIST_HEAD(&mm->mm_vmas);
+ mm->mm_btree = BTREE_INIT(GFP_ATOMIC);
atomic_set(&mm->mm_users, 1);
atomic_set(&mm->mm_count, 1);
init_rwsem(&mm->mmap_sem);
--- linux-2.6.23-rc6.orig/mm/mmap.c
+++ linux-2.6.23-rc6/mm/mmap.c
@@ -4,6 +4,7 @@
* Written by obz.
*
* Address space accounting code <alan@redhat.com>
+ * Btree adaptation <pzijlstr@redhat.com>
*/
#include <linux/slab.h>
@@ -283,35 +284,6 @@ out:
}
#ifdef DEBUG_MM_RB
-static int browse_rb(struct rb_root *root)
-{
- int i = 0, j;
- struct rb_node *nd, *pn = NULL;
- unsigned long prev = 0, pend = 0;
-
- for (nd = rb_first(root); nd; nd = rb_next(nd)) {
- struct vm_area_struct *vma;
- vma = rb_entry(nd, struct vm_area_struct, vm_rb);
- if (vma->vm_start < prev)
- printk("vm_start %lx prev %lx\n", vma->vm_start, prev), i = -1;
- if (vma->vm_start < pend)
- printk("vm_start %lx pend %lx\n", vma->vm_start, pend);
- if (vma->vm_start > vma->vm_end)
- printk("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start);
- i++;
- pn = nd;
- prev = vma->vm_start;
- pend = vma->vm_end;
- }
- j = 0;
- for (nd = pn; nd; nd = rb_prev(nd)) {
- j++;
- }
- if (i != j)
- printk("backwards %d, forwards %d\n", j, i), i = 0;
- return i;
-}
-
void validate_mm(struct mm_struct *mm)
{
int bug = 0;
@@ -321,9 +293,6 @@ void validate_mm(struct mm_struct *mm)
i++;
if (i != mm->map_count)
printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1;
- i = browse_rb(&mm->mm_rb);
- if (i != mm->map_count)
- printk("map_count %d rb %d\n", mm->map_count, i), bug = 1;
BUG_ON(bug);
}
#else
@@ -332,62 +301,36 @@ void validate_mm(struct mm_struct *mm)
static struct vm_area_struct *
find_vma_prepare(struct mm_struct *mm, unsigned long addr,
- struct vm_area_struct **pprev, struct rb_node ***rb_link,
- struct rb_node ** rb_parent)
+ struct vm_area_struct **pprev)
{
- struct vm_area_struct * vma;
- struct rb_node ** __rb_link, * __rb_parent, * rb_prev;
-
- __rb_link = &mm->mm_rb.rb_node;
- rb_prev = __rb_parent = NULL;
- vma = NULL;
-
- while (*__rb_link) {
- struct vm_area_struct *vma_tmp;
-
- __rb_parent = *__rb_link;
- vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
-
- if (vma_tmp->vm_end > addr) {
- vma = vma_tmp;
- if (vma_tmp->vm_start <= addr)
- return vma;
- __rb_link = &__rb_parent->rb_left;
- } else {
- rb_prev = __rb_parent;
- __rb_link = &__rb_parent->rb_right;
- }
- }
-
- *pprev = NULL;
- if (rb_prev)
- *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
- *rb_link = __rb_link;
- *rb_parent = __rb_parent;
+ struct vm_area_struct *vma, *prev = NULL;
+ vma = btree_stab(&mm->mm_btree, addr);
+ if (!vma || addr >= vma->vm_end)
+ vma = __vma_next(&mm->mm_vmas, vma);
+ if (!(vma && addr < vma->vm_end))
+ prev = __vma_prev(&mm->mm_vmas, vma);
+ *pprev = prev;
return vma;
}
static inline void
__vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
- struct vm_area_struct *prev, struct rb_node *rb_parent)
+ struct vm_area_struct *prev)
{
- if (prev) {
+ if (!prev)
+ prev = btree_stab(&mm->mm_btree, vma->vm_start);
+
+ if (prev)
list_add(&vma->vm_list, &prev->vm_list);
- } else {
- if (rb_parent) {
- struct vm_area_struct *next = rb_entry(rb_parent,
- struct vm_area_struct, vm_rb);
- list_add_tail(&vma->vm_list, &next->vm_list);
- } else
- list_add(&vma->vm_list, &mm->mm_vmas);
- }
+ else
+ list_add(&vma->vm_list, &mm->mm_vmas);
}
-void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
- struct rb_node **rb_link, struct rb_node *rb_parent)
+void __vma_link_btree(struct mm_struct *mm, struct vm_area_struct *vma)
{
- rb_link_node(&vma->vm_rb, rb_parent, rb_link);
- rb_insert_color(&vma->vm_rb, &mm->mm_rb);
+ int err;
+ err = btree_insert(&mm->mm_btree, vma->vm_start, vma);
+ BUG_ON(err);
}
static inline void __vma_link_file(struct vm_area_struct *vma)
@@ -414,20 +357,20 @@ static inline void __vma_link_file(struc
static void
__vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
- struct vm_area_struct *prev, struct rb_node **rb_link,
- struct rb_node *rb_parent)
+ struct vm_area_struct *prev)
{
- __vma_link_list(mm, vma, prev, rb_parent);
- __vma_link_rb(mm, vma, rb_link, rb_parent);
+ __vma_link_list(mm, vma, prev);
+ __vma_link_btree(mm, vma);
__anon_vma_link(vma);
}
static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
- struct vm_area_struct *prev, struct rb_node **rb_link,
- struct rb_node *rb_parent)
+ struct vm_area_struct *prev)
{
struct address_space *mapping = NULL;
+ btree_preload(&mm->mm_btree, GFP_KERNEL|__GFP_NOFAIL); // XXX create error path
+
if (vma->vm_file)
mapping = vma->vm_file->f_mapping;
@@ -437,7 +380,7 @@ static void vma_link(struct mm_struct *m
}
anon_vma_lock(vma);
- __vma_link(mm, vma, prev, rb_link, rb_parent);
+ __vma_link(mm, vma, prev);
__vma_link_file(vma);
anon_vma_unlock(vma);
@@ -456,12 +399,7 @@ static void vma_link(struct mm_struct *m
static void
__insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
{
- struct vm_area_struct * __vma, * prev;
- struct rb_node ** rb_link, * rb_parent;
-
- __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent);
- BUG_ON(__vma && __vma->vm_start < vma->vm_end);
- __vma_link(mm, vma, prev, rb_link, rb_parent);
+ __vma_link(mm, vma, NULL);
mm->map_count++;
}
@@ -469,8 +407,21 @@ static inline void
__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev)
{
+ struct vm_area_struct *vma_tmp;
+
list_del(&vma->vm_list);
- rb_erase(&vma->vm_rb, &mm->mm_rb);
+
+ vma_tmp = btree_remove(&mm->mm_btree, vma->vm_start);
+ if (vma_tmp != vma) {
+#ifdef CONFIG_DEBUG_VM
+ printk(KERN_DEBUG
+ "btree_remove(%lu) returned %p instead of %p\n"
+ ,vma->vm_start, vma_tmp, vma);
+ btree_print(&mm->mm_btree);
+#endif
+ BUG();
+ }
+
if (mm->mmap_cache == vma)
mm->mmap_cache = prev;
}
@@ -525,6 +476,8 @@ again: remove_next = 1 + (end > next->
}
}
+ btree_preload(&mm->mm_btree, GFP_KERNEL|__GFP_NOFAIL); // XXX error path
+
if (file) {
mapping = file->f_mapping;
if (!(vma->vm_flags & VM_NONLINEAR))
@@ -576,10 +529,14 @@ again: remove_next = 1 + (end > next->
vma_prio_tree_remove(next, root);
}
+ btree_update(&mm->mm_btree, vma->vm_start, start);
vma->vm_start = start;
vma->vm_end = end;
vma->vm_pgoff = pgoff;
+
if (adjust_next) {
+ btree_update(&mm->mm_btree, next->vm_start,
+ next->vm_start + (adjust_next << PAGE_SHIFT));
next->vm_start += adjust_next << PAGE_SHIFT;
next->vm_pgoff += adjust_next;
}
@@ -1075,7 +1032,7 @@ unsigned long mmap_region(struct file *f
/* Clear old maps */
error = -ENOMEM;
munmap_back:
- vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
+ vma = find_vma_prepare(mm, addr, &prev);
if (vma && vma->vm_start < addr + len) {
if (do_munmap(mm, addr, len))
return -ENOMEM;
@@ -1176,7 +1133,7 @@ munmap_back:
if (!file || !vma_merge(mm, prev, addr, vma->vm_end,
vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) {
file = vma->vm_file;
- vma_link(mm, vma, prev, rb_link, rb_parent);
+ vma_link(mm, vma, prev);
if (correct_wcount)
atomic_inc(&inode->i_writecount);
} else {
@@ -1436,25 +1393,9 @@ struct vm_area_struct * find_vma(struct
/* (Cache hit rate is typically around 35%.) */
vma = mm->mmap_cache;
if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
- struct rb_node * rb_node;
-
- rb_node = mm->mm_rb.rb_node;
- vma = NULL;
-
- while (rb_node) {
- struct vm_area_struct * vma_tmp;
-
- vma_tmp = rb_entry(rb_node,
- struct vm_area_struct, vm_rb);
-
- if (vma_tmp->vm_end > addr) {
- vma = vma_tmp;
- if (vma_tmp->vm_start <= addr)
- break;
- rb_node = rb_node->rb_left;
- } else
- rb_node = rb_node->rb_right;
- }
+ vma = btree_stab(&mm->mm_btree, addr);
+ if (!vma || addr >= vma->vm_end)
+ vma = __vma_next(&mm->mm_vmas, vma);
if (vma)
mm->mmap_cache = vma;
}
@@ -1469,32 +1410,11 @@ struct vm_area_struct *
find_vma_prev(struct mm_struct *mm, unsigned long addr,
struct vm_area_struct **pprev)
{
- struct vm_area_struct *prev = NULL, *next;
- struct rb_node * rb_node;
- if (!mm)
- goto out;
-
- /* Go through the RB tree quickly. */
- rb_node = mm->mm_rb.rb_node;
-
- while (rb_node) {
- struct vm_area_struct *vma_tmp;
- vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
-
- if (addr < vma_tmp->vm_end) {
- rb_node = rb_node->rb_left;
- } else {
- prev = vma_tmp;
- next = __vma_next(&mm->mm_vmas, prev);
- if (!next || (addr < next->vm_end))
- break;
- rb_node = rb_node->rb_right;
- }
- }
+ struct vm_area_struct *vma;
-out:
- *pprev = prev;
- return __vma_next(&mm->mm_vmas, prev);
+ vma = find_vma(mm, addr);
+ *pprev = __vma_prev(&mm->mm_vmas, vma);
+ return vma;
}
/*
@@ -1633,6 +1553,17 @@ static inline int expand_downwards(struc
error = acct_stack_growth(vma, size, grow);
if (!error) {
+ /*
+ * This is safe even under the read lock: anon_vma_lock
+ * guards against concurrent expand_stack calls and the
+ * read lock keeps other structural changes to the
+ * btree away.
+ *
+ * btree_update is carefully constructed to shift the
+ * space partitioning within the btree in a safe manner.
+ */
+ btree_update(&vma->vm_mm->mm_btree,
+ vma->vm_start, address);
vma->vm_start = address;
vma->vm_pgoff -= grow;
}
@@ -1757,9 +1688,9 @@ detach_vmas_to_be_unmapped(struct mm_str
do {
struct vm_area_struct *next = vma_next(vma);
- rb_erase(&vma->vm_rb, &mm->mm_rb);
+ __vma_unlink(mm, vma, NULL);
mm->map_count--;
- list_move_tail(&vma->vm_list, vmas);
+ list_add_tail(&vma->vm_list, vmas);
vma = next;
} while (vma && vma->vm_start < end);
if (mm->unmap_area == arch_unmap_area)
@@ -1917,10 +1848,9 @@ static inline void verify_mm_writelocked
*/
unsigned long do_brk(unsigned long addr, unsigned long len)
{
- struct mm_struct * mm = current->mm;
- struct vm_area_struct * vma, * prev;
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma, *prev;
unsigned long flags;
- struct rb_node ** rb_link, * rb_parent;
pgoff_t pgoff = addr >> PAGE_SHIFT;
int error;
@@ -1963,7 +1893,7 @@ unsigned long do_brk(unsigned long addr,
* Clear old maps. this also does some error checking for us
*/
munmap_back:
- vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
+ vma = find_vma_prepare(mm, addr, &prev);
if (vma && vma->vm_start < addr + len) {
if (do_munmap(mm, addr, len))
return -ENOMEM;
@@ -2001,7 +1931,7 @@ unsigned long do_brk(unsigned long addr,
vma->vm_flags = flags;
vma->vm_page_prot = protection_map[flags &
(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
- vma_link(mm, vma, prev, rb_link, rb_parent);
+ vma_link(mm, vma, prev);
out:
mm->total_vm += len >> PAGE_SHIFT;
if (flags & VM_LOCKED) {
@@ -2053,8 +1983,7 @@ void exit_mmap(struct mm_struct *mm)
*/
int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
{
- struct vm_area_struct * __vma, * prev;
- struct rb_node ** rb_link, * rb_parent;
+ struct vm_area_struct *__vma, *prev;
/*
* The vm_pgoff of a purely anonymous vma should be irrelevant
@@ -2072,13 +2001,13 @@ int insert_vm_struct(struct mm_struct *
BUG_ON(vma->anon_vma);
vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
}
- __vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent);
+ __vma = find_vma_prepare(mm,vma->vm_start,&prev);
if (__vma && __vma->vm_start < vma->vm_end)
return -ENOMEM;
if ((vma->vm_flags & VM_ACCOUNT) &&
security_vm_enough_memory_mm(mm, vma_pages(vma)))
return -ENOMEM;
- vma_link(mm, vma, prev, rb_link, rb_parent);
+ vma_link(mm, vma, prev);
return 0;
}
@@ -2093,7 +2022,6 @@ struct vm_area_struct *copy_vma(struct v
unsigned long vma_start = vma->vm_start;
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *new_vma, *prev;
- struct rb_node **rb_link, *rb_parent;
struct mempolicy *pol;
/*
@@ -2103,7 +2031,7 @@ struct vm_area_struct *copy_vma(struct v
if (!vma->vm_file && !vma->anon_vma)
pgoff = addr >> PAGE_SHIFT;
- find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
+ find_vma_prepare(mm, addr, &prev);
new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
if (new_vma) {
@@ -2130,7 +2058,7 @@ struct vm_area_struct *copy_vma(struct v
get_file(new_vma->vm_file);
if (new_vma->vm_ops && new_vma->vm_ops->open)
new_vma->vm_ops->open(new_vma);
- vma_link(mm, new_vma, prev, rb_link, rb_parent);
+ vma_link(mm, new_vma, prev);
}
}
return new_vma;
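
For reference while reading the find_vma() and find_vma_prepare() hunks above, the lookup after this patch has the shape sketched below (a restatement of the diff, not an addition to it): btree_stab() returns the vma with the highest vm_start at or below addr, and the ordered mm->mm_vmas list supplies the successor when addr falls into a hole.

#include <linux/mm.h>
#include <linux/sched.h>

/* sketch only; mirrors the converted find_vma() body (mmap_sem held) */
static struct vm_area_struct *
vma_lookup_sketch(struct mm_struct *mm, unsigned long addr)
{
	struct vm_area_struct *vma;

	vma = btree_stab(&mm->mm_btree, addr);	/* last vm_start <= addr */
	if (!vma || addr >= vma->vm_end)	/* addr past its end: hole */
		vma = __vma_next(&mm->mm_vmas, vma);

	return vma;	/* may still start above addr; callers check vm_start */
}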
* [PATCH/RFC 4/9] mm: RCU vma lookups
2007-10-22 10:45 [PATCH/RFC 0/9] VMA lookup with RCU Vaidyanathan Srinivasan
` (2 preceding siblings ...)
2007-10-22 10:45 ` [PATCH/RFC 3/9] mm: use the B+tree for vma lookups Vaidyanathan Srinivasan
@ 2007-10-22 10:45 ` Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 5/9] i386: rcu vma lookups for faults Vaidyanathan Srinivasan
` (4 subsequent siblings)
8 siblings, 0 replies; 10+ messages in thread
From: Vaidyanathan Srinivasan @ 2007-10-22 10:45 UTC (permalink / raw)
To: linux-mm; +Cc: Alexis Bruemmer, Peter Zijlstra
[-- Attachment #1: 4_mm-rcu.patch --]
[-- Type: text/plain, Size: 12516 bytes --]
mostly lockless vma lookup using the new b+tree
pin the vma using an atomic refcount
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
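
The intended calling pattern is sketched below as a reading aid; the per-arch fault handler patches later in the series are the real users, and the locals here are illustrative only.

#include <linux/mm.h>
#include <linux/sched.h>

/* sketch of a fault-handler fast/slow path on top of the new API */
static void fault_path_sketch(struct mm_struct *mm, unsigned long address)
{
	struct vm_area_struct *vma;
	int locked = 0;

	/*
	 * Fast path: RCU lookup plus a read trylock on vma->vm_sem.
	 * On contention it falls back to mmap_sem and sets locked.
	 */
	vma = __find_get_vma(mm, address, &locked);
	if (!vma)
		goto bad_area;

	if (locked) {
		/* the slow path pinned the vma under mmap_sem; drop it now */
		up_read(&mm->mmap_sem);
		locked = 0;
	}

	/* ... handle the fault; the vma stays pinned throughout ... */

	put_vma(vma);
	return;

bad_area:
	if (locked)
		up_read(&mm->mmap_sem);
	/* ... deliver the signal or oops ... */
}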
---
include/linux/init_task.h | 2
include/linux/mm.h | 9 +
kernel/fork.c | 2
mm/mmap.c | 216 +++++++++++++++++++++++++++++++++++++++++-----
4 files changed, 206 insertions(+), 23 deletions(-)
--- linux-2.6.23-rc9.orig/include/linux/init_task.h
+++ linux-2.6.23-rc9/include/linux/init_task.h
@@ -48,7 +48,7 @@
#define INIT_MM(name) \
{ \
.mm_vmas = LIST_HEAD_INIT(name.mm_vmas), \
- .mm_btree = BTREE_INIT(GFP_ATOMIC), \
+ .mm_btree = BTREE_INIT_FLUSH(GFP_ATOMIC, btree_rcu_flush), \
.pgd = swapper_pg_dir, \
.mm_users = ATOMIC_INIT(2), \
.mm_count = ATOMIC_INIT(1), \
--- linux-2.6.23-rc9.orig/include/linux/mm.h
+++ linux-2.6.23-rc9/include/linux/mm.h
@@ -104,12 +104,14 @@ struct vm_area_struct {
void * vm_private_data; /* was vm_pte (shared mem) */
unsigned long vm_truncate_count;/* truncate_count or restart_addr */
+ struct rw_semaphore vm_sem;
#ifndef CONFIG_MMU
atomic_t vm_usage; /* refcount (VMAs shared if !MMU) */
#endif
#ifdef CONFIG_NUMA
struct mempolicy *vm_policy; /* NUMA policy for the VMA */
#endif
+ struct rcu_head vm_rcu_head;
};
static inline struct vm_area_struct *
@@ -1078,6 +1080,8 @@ static inline void vma_nonlinear_insert(
}
/* mmap.c */
+extern void btree_rcu_flush(struct btree_freevec *);
+extern void free_vma(struct vm_area_struct *vma);
extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
extern void vma_adjust(struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert);
@@ -1177,6 +1181,11 @@ extern struct vm_area_struct * find_vma(
extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
struct vm_area_struct **pprev);
+extern struct vm_area_struct * __find_get_vma(struct mm_struct *mm, unsigned long addr, int *locked);
+extern struct vm_area_struct * find_get_vma(struct mm_struct *mm, unsigned long addr);
+extern void get_vma(struct vm_area_struct *vma);
+extern void put_vma(struct vm_area_struct *vma);
+
/* Look up the first VMA which intersects the interval start_addr..end_addr-1,
NULL if none. Assume start_addr < end_addr. */
static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr)
--- linux-2.6.23-rc9.orig/kernel/fork.c
+++ linux-2.6.23-rc9/kernel/fork.c
@@ -327,7 +327,7 @@ static void free_mm(struct mm_struct *mm
static struct mm_struct * mm_init(struct mm_struct * mm)
{
INIT_LIST_HEAD(&mm->mm_vmas);
- mm->mm_btree = BTREE_INIT(GFP_ATOMIC);
+ mm->mm_btree = BTREE_INIT_FLUSH(GFP_ATOMIC, btree_rcu_flush);
atomic_set(&mm->mm_users, 1);
atomic_set(&mm->mm_count, 1);
init_rwsem(&mm->mmap_sem);
--- linux-2.6.23-rc9.orig/mm/mmap.c
+++ linux-2.6.23-rc9/mm/mmap.c
@@ -40,6 +40,18 @@ static void unmap_region(struct mm_struc
struct vm_area_struct *vma, struct vm_area_struct *prev,
unsigned long start, unsigned long end);
+static void __btree_rcu_flush(struct rcu_head *head)
+{
+ struct btree_freevec *freevec =
+ container_of(head, struct btree_freevec, rcu_head);
+ btree_freevec_flush(freevec);
+}
+
+void btree_rcu_flush(struct btree_freevec *freevec)
+{
+ call_rcu(&freevec->rcu_head, __btree_rcu_flush);
+}
+
/*
* WARNING: the debugging will use recursive algorithms so never enable this
* unless you know what you are doing.
@@ -218,6 +230,28 @@ void unlink_file_vma(struct vm_area_stru
}
}
+static void __free_vma(struct rcu_head *head)
+{
+ struct vm_area_struct *vma =
+ container_of(head, struct vm_area_struct, vm_rcu_head);
+ kmem_cache_free(vm_area_cachep, vma);
+}
+
+void free_vma(struct vm_area_struct *vma)
+{
+ call_rcu(&vma->vm_rcu_head, __free_vma);
+}
+
+static void lock_vma(struct vm_area_struct *vma)
+{
+ down_write(&vma->vm_sem);
+}
+
+static void unlock_vma(struct vm_area_struct *vma)
+{
+ up_write(&vma->vm_sem);
+}
+
/*
* Close a vm structure and free it, returning the next.
*/
@@ -230,7 +264,7 @@ static void remove_vma(struct vm_area_st
if (vma->vm_file)
fput(vma->vm_file);
mpol_free(vma_policy(vma));
- kmem_cache_free(vm_area_cachep, vma);
+ free_vma(vma);
return;
}
@@ -288,8 +322,15 @@ void validate_mm(struct mm_struct *mm)
int bug = 0;
int i = 0;
struct vm_area_struct *vma;
- list_for_each_entry(vma, &mm->mm_vmas, vm_list)
+ unsigned long addr = 0UL;
+ list_for_each_entry(vma, &mm->mm_vmas, vm_list) {
+ if (addr > vma->vm_start) {
+ printk("vma list unordered: %lu %lu\n",
+ addr, vma->vm_start);
+ }
+ addr = vma->vm_start;
i++;
+ }
if (i != mm->map_count)
printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1;
BUG_ON(bug);
@@ -328,6 +369,7 @@ __vma_link_list(struct mm_struct *mm, st
void __vma_link_btree(struct mm_struct *mm, struct vm_area_struct *vma)
{
int err;
+ init_rwsem(&vma->vm_sem);
err = btree_insert(&mm->mm_btree, vma->vm_start, vma);
BUG_ON(err);
}
@@ -421,8 +463,8 @@ __vma_unlink(struct mm_struct *mm, struc
BUG();
}
- if (mm->mmap_cache == vma)
- mm->mmap_cache = prev;
+ if (rcu_dereference(mm->mmap_cache) == vma)
+ rcu_assign_pointer(mm->mmap_cache, prev);
}
/*
@@ -438,6 +480,7 @@ void vma_adjust(struct vm_area_struct *v
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *next = vma_next(vma);
struct vm_area_struct *importer = NULL;
+ struct vm_area_struct *locked = NULL;
struct address_space *mapping = NULL;
struct prio_tree_root *root = NULL;
struct file *file = vma->vm_file;
@@ -448,6 +491,14 @@ void vma_adjust(struct vm_area_struct *v
if (next && !insert) {
if (end >= next->vm_end) {
/*
+ * We need to lock the vma to force the lockless
+ * lookup into the slow path, because there is a
+ * hole between removing the next vma and updating
+ * the current one.
+ */
+ lock_vma(vma);
+ locked = vma;
+ /*
* vma expands, overlapping all the next, and
* perhaps the one after too (mprotect case 6).
*/
@@ -475,6 +526,25 @@ again: remove_next = 1 + (end > next->
}
}
+ if (insert) {
+ /*
+ * In order to make the adjust + insert look atomic wrt. the
+ * lockless lookups we need to force those into the slow path.
+ */
+ if (insert->vm_start < start) {
+ /*
+ * If the new vma is to be placed in front of the
+ * current one, we must lock the previous.
+ */
+ locked = vma_prev(vma);
+ if (!locked)
+ locked = vma;
+ } else
+ locked = vma;
+
+ lock_vma(locked);
+ }
+
btree_preload(&mm->mm_btree, GFP_KERNEL|__GFP_NOFAIL); // XXX error path
if (file) {
@@ -521,6 +591,23 @@ again: remove_next = 1 + (end > next->
}
}
+ /*
+ * Remove the next vma before updating the address of the current,
+ * because it might end up having the address of next.
+ */
+ if (remove_next) {
+ /*
+ * vma_merge has merged next into vma, and needs
+ * us to remove next before dropping the locks.
+ */
+ lock_vma(next);
+ __vma_unlink(mm, next, vma);
+ if (file)
+ __remove_shared_vm_struct(next, file, mapping);
+ if (next->anon_vma)
+ __anon_vma_merge(vma, next);
+ }
+
if (root) {
flush_dcache_mmap_lock(mapping);
vma_prio_tree_remove(vma, root);
@@ -547,17 +634,11 @@ again: remove_next = 1 + (end > next->
flush_dcache_mmap_unlock(mapping);
}
- if (remove_next) {
- /*
- * vma_merge has merged next into vma, and needs
- * us to remove next before dropping the locks.
- */
- __vma_unlink(mm, next, vma);
- if (file)
- __remove_shared_vm_struct(next, file, mapping);
- if (next->anon_vma)
- __anon_vma_merge(vma, next);
- } else if (insert) {
+ /*
+ * Insert after updating the address of the current vma, because it
+ * might end up having the previous address.
+ */
+ if (insert) {
/*
* split_vma has split insert from vma, and needs
* us to insert it before dropping the locks
@@ -576,7 +657,7 @@ again: remove_next = 1 + (end > next->
fput(file);
mm->map_count--;
mpol_free(vma_policy(next));
- kmem_cache_free(vm_area_cachep, next);
+ free_vma(next);
/*
* In mprotect's case 6 (see comments on vma_merge),
* we must remove another next too. It would clutter
@@ -588,6 +669,13 @@ again: remove_next = 1 + (end > next->
}
}
+ if (locked) {
+ /*
+ * unlock the vma, enabling lockless lookups.
+ */
+ unlock_vma(locked);
+ }
+
validate_mm(mm);
}
@@ -1142,7 +1230,7 @@ munmap_back:
fput(file);
}
mpol_free(vma_policy(vma));
- kmem_cache_free(vm_area_cachep, vma);
+ free_vma(vma);
}
out:
mm->total_vm += len >> PAGE_SHIFT;
@@ -1166,7 +1254,7 @@ unmap_and_free_vma:
unmap_region(mm, &vmas, vma, prev, vma->vm_start, vma->vm_end);
charged = 0;
free_vma:
- kmem_cache_free(vm_area_cachep, vma);
+ free_vma(vma);
unacct_error:
if (charged)
vm_unacct_memory(charged);
@@ -1390,13 +1478,13 @@ struct vm_area_struct * find_vma(struct
if (mm) {
/* Check the cache first. */
/* (Cache hit rate is typically around 35%.) */
- vma = mm->mmap_cache;
+ vma = rcu_dereference(mm->mmap_cache);
if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
vma = btree_stab(&mm->mm_btree, addr);
if (!vma || addr >= vma->vm_end)
vma = __vma_next(&mm->mm_vmas, vma);
if (vma)
- mm->mmap_cache = vma;
+ rcu_assign_pointer(mm->mmap_cache, vma);
}
}
return vma;
@@ -1404,6 +1492,90 @@ struct vm_area_struct * find_vma(struct
EXPORT_SYMBOL(find_vma);
+/*
+ * Differs from the above only in that it uses the slightly more expensive
+ * btree_stab_next() in order to avoid using the mm->mm_vmas list without
+ * locks.
+ */
+static
+struct vm_area_struct *find_vma_rcu(struct mm_struct * mm, unsigned long addr)
+{
+ struct vm_area_struct *vma = NULL, *next;
+
+ if (mm) {
+ /* Check the cache first. */
+ /* (Cache hit rate is typically around 35%.) */
+ vma = rcu_dereference(mm->mmap_cache);
+ if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
+ vma = btree_stab_next(&mm->mm_btree, addr,
+ (void **)&next);
+ if (!vma || addr >= vma->vm_end)
+ vma = next;
+
+ if (vma)
+ rcu_assign_pointer(mm->mmap_cache, vma);
+ }
+ }
+ return vma;
+}
+
+/*
+ * Lockless lookup and pinning of vmas:
+ *
+ * In order to be able to do vma modifications and have them appear atomic
+ * we must sometimes still take the read lock. We do this when we fail to
+ * get a reference on the vma.
+ *
+ */
+struct vm_area_struct *
+__find_get_vma(struct mm_struct *mm, unsigned long addr, int *locked)
+{
+ struct vm_area_struct *vma;
+
+ if (!mm)
+ return NULL;
+
+ rcu_read_lock();
+ vma = find_vma_rcu(mm, addr);
+ if (!vma || !down_read_trylock(&vma->vm_sem))
+ goto slow;
+ rcu_read_unlock();
+ return vma;
+
+slow:
+ rcu_read_unlock();
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, addr);
+ if (vma)
+ down_read(&vma->vm_sem);
+ *locked = 1;
+ return vma;
+}
+
+struct vm_area_struct *
+find_get_vma(struct mm_struct *mm, unsigned long addr)
+{
+ int locked = 0;
+ struct vm_area_struct *vma;
+
+ vma = __find_get_vma(mm, addr, &locked);
+ if (unlikely(locked))
+ up_read(&mm->mmap_sem);
+ return vma;
+}
+
+void get_vma(struct vm_area_struct *vma)
+{
+ if (likely(vma))
+ down_read(&vma->vm_sem);
+}
+
+void put_vma(struct vm_area_struct *vma)
+{
+ if (likely(vma))
+ up_read(&vma->vm_sem);
+}
+
/* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */
struct vm_area_struct *
find_vma_prev(struct mm_struct *mm, unsigned long addr,
@@ -1687,8 +1859,10 @@ detach_vmas_to_be_unmapped(struct mm_str
do {
struct vm_area_struct *next = vma_next(vma);
+ lock_vma(vma);
__vma_unlink(mm, vma, NULL);
mm->map_count--;
+ unlock_vma(vma);
list_add_tail(&vma->vm_list, vmas);
vma = next;
} while (vma && vma->vm_start < end);
@@ -1697,7 +1871,7 @@ detach_vmas_to_be_unmapped(struct mm_str
else
addr = vma ? vma->vm_start : mm->mmap_base;
mm->unmap_area(mm, addr);
- mm->mmap_cache = NULL; /* Kill the cache. */
+ /* mm->mmap_cache = NULL;*/ /* Kill the cache. */
}
/*
* [PATCH/RFC 5/9] i386: rcu vma lookups for faults
2007-10-22 10:45 [PATCH/RFC 0/9] VMA lookup with RCU Vaidyanathan Srinivasan
` (3 preceding siblings ...)
2007-10-22 10:45 ` [PATCH/RFC 4/9] mm: RCU " Vaidyanathan Srinivasan
@ 2007-10-22 10:45 ` Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 6/9] x86_64: " Vaidyanathan Srinivasan
` (3 subsequent siblings)
8 siblings, 0 replies; 10+ messages in thread
From: Vaidyanathan Srinivasan @ 2007-10-22 10:45 UTC (permalink / raw)
To: linux-mm; +Cc: Alexis Bruemmer, Peter Zijlstra
[-- Attachment #1: 5_fault-i386.patch --]
[-- Type: text/plain, Size: 3768 bytes --]
Use the new lockless vma lookup in the i386 fault handler.
This avoids the exclusive cacheline access for the mmap_sem.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
arch/i386/mm/fault.c | 56 +++++++++++++++++++++++++++------------------------
1 file changed, 30 insertions(+), 26 deletions(-)
--- linux-2.6.23-rc6.orig/arch/i386/mm/fault.c
+++ linux-2.6.23-rc6/arch/i386/mm/fault.c
@@ -307,6 +307,7 @@ fastcall void __kprobes do_page_fault(st
unsigned long address;
int write, si_code;
int fault;
+ int locked = 0;
/* get the address */
address = read_cr2();
@@ -357,29 +358,14 @@ fastcall void __kprobes do_page_fault(st
if (in_atomic() || !mm)
goto bad_area_nosemaphore;
- /* When running in the kernel we expect faults to occur only to
- * addresses in user space. All other faults represent errors in the
- * kernel and should generate an OOPS. Unfortunatly, in the case of an
- * erroneous fault occurring in a code path which already holds mmap_sem
- * we will deadlock attempting to validate the fault against the
- * address space. Luckily the kernel only validly references user
- * space from well defined areas of code, which are listed in the
- * exceptions table.
- *
- * As the vast majority of faults will be valid we will only perform
- * the source reference check when there is a possibilty of a deadlock.
- * Attempt to lock the address space, if we cannot we then validate the
- * source. If this is invalid we can skip the address space check,
- * thus avoiding the deadlock.
- */
- if (!down_read_trylock(&mm->mmap_sem)) {
- if ((error_code & 4) == 0 &&
- !search_exception_tables(regs->eip))
- goto bad_area_nosemaphore;
+again:
+ if (likely(!locked)) {
+ vma = __find_get_vma(mm, address, &locked);
+ } else {
down_read(&mm->mmap_sem);
+ vma = find_vma(mm, address);
+ get_vma(vma);
}
-
- vma = find_vma(mm, address);
if (!vma)
goto bad_area;
if (vma->vm_start <= address)
@@ -396,6 +382,15 @@ fastcall void __kprobes do_page_fault(st
if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp)
goto bad_area;
}
+ /*
+ * expand_stack needs the read lock, hence retry the whole thing
+ * read locked.
+ */
+ if (!locked) {
+ put_vma(vma);
+ locked = 1;
+ goto again;
+ }
if (expand_stack(vma, address))
goto bad_area;
/*
@@ -403,6 +398,11 @@ fastcall void __kprobes do_page_fault(st
* we can handle it..
*/
good_area:
+ if (unlikely(locked)) {
+ up_read(&mm->mmap_sem);
+ locked = 0;
+ }
+
si_code = SEGV_ACCERR;
write = 0;
switch (error_code & 3) {
@@ -447,7 +447,7 @@ good_area:
if (bit < 32)
tsk->thread.screen_bitmap |= 1 << bit;
}
- up_read(&mm->mmap_sem);
+ put_vma(vma);
return;
/*
@@ -455,7 +455,11 @@ good_area:
* Fix it, but check if it's kernel or user first..
*/
bad_area:
- up_read(&mm->mmap_sem);
+ if (unlikely(locked)) {
+ up_read(&mm->mmap_sem);
+ locked = 0;
+ }
+ put_vma(vma);
bad_area_nosemaphore:
/* User mode accesses just cause a SIGSEGV */
@@ -590,10 +594,10 @@ no_context:
* us unable to handle the page fault gracefully.
*/
out_of_memory:
- up_read(&mm->mmap_sem);
+ put_vma(vma);
if (is_init(tsk)) {
yield();
- down_read(&mm->mmap_sem);
+ vma = find_get_vma(mm, address);
goto survive;
}
printk("VM: killing process %s\n", tsk->comm);
@@ -602,7 +606,7 @@ out_of_memory:
goto no_context;
do_sigbus:
- up_read(&mm->mmap_sem);
+ put_vma(vma);
/* Kernel mode? Handle exceptions or die */
if (!(error_code & 4))
* [PATCH/RFC 6/9] x86_64: rcu vma lookups for faults
2007-10-22 10:45 [PATCH/RFC 0/9] VMA lookup with RCU Vaidyanathan Srinivasan
` (4 preceding siblings ...)
2007-10-22 10:45 ` [PATCH/RFC 5/9] i386: rcu vma lookups for faults Vaidyanathan Srinivasan
@ 2007-10-22 10:45 ` Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 7/9] Add page fault code for PPC64 path Vaidyanathan Srinivasan
` (2 subsequent siblings)
8 siblings, 0 replies; 10+ messages in thread
From: Vaidyanathan Srinivasan @ 2007-10-22 10:45 UTC (permalink / raw)
To: linux-mm; +Cc: Alexis Bruemmer, Peter Zijlstra
[-- Attachment #1: 6_fault-x86_64.patch --]
[-- Type: text/plain, Size: 2629 bytes --]
---
arch/x86_64/mm/fault.c | 36 +++++++++++++++++++++++++-----------
1 file changed, 25 insertions(+), 11 deletions(-)
--- linux-2.6.23-rc6.orig/arch/x86_64/mm/fault.c
+++ linux-2.6.23-rc6/arch/x86_64/mm/fault.c
@@ -310,10 +310,10 @@ asmlinkage void __kprobes do_page_fault(
int write, fault;
unsigned long flags;
siginfo_t info;
+ int locked = 0;
tsk = current;
mm = tsk->mm;
- prefetchw(&mm->mmap_sem);
/* get the address */
address = read_cr2();
@@ -390,14 +390,14 @@ asmlinkage void __kprobes do_page_fault(
* source. If this is invalid we can skip the address space check,
* thus avoiding the deadlock.
*/
- if (!down_read_trylock(&mm->mmap_sem)) {
- if ((error_code & PF_USER) == 0 &&
- !search_exception_tables(regs->rip))
- goto bad_area_nosemaphore;
+
+ if (likely(!locked)) {
+ vma = __find_get_vma(mm, address, &locked);
+ } else {
down_read(&mm->mmap_sem);
+ vma = find_vma(mm, address);
+ get_vma(vma);
}
-
- vma = find_vma(mm, address);
if (!vma)
goto bad_area;
if (likely(vma->vm_start <= address))
@@ -411,6 +411,11 @@ asmlinkage void __kprobes do_page_fault(
if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp)
goto bad_area;
}
+ if (!locked) {
+ put_vma(vma);
+ locked = 1;
+ goto again;
+ }
if (expand_stack(vma, address))
goto bad_area;
/*
@@ -418,6 +423,11 @@ asmlinkage void __kprobes do_page_fault(
* we can handle it..
*/
good_area:
+ if (locked) {
+ up_read(&mm->mmap_sem);
+ locked = 0;
+ }
+
info.si_code = SEGV_ACCERR;
write = 0;
switch (error_code & (PF_PROT|PF_WRITE)) {
@@ -452,7 +462,7 @@ good_area:
tsk->maj_flt++;
else
tsk->min_flt++;
- up_read(&mm->mmap_sem);
+ put_vma(vma);
return;
/*
@@ -460,7 +470,11 @@ good_area:
* Fix it, but check if it's kernel or user first..
*/
bad_area:
- up_read(&mm->mmap_sem);
+ if (locked) {
+ up_read(&mm->mmap_sem);
+ locked = 0;
+ }
+ put_vma(vma);
bad_area_nosemaphore:
/* User mode accesses just cause a SIGSEGV */
@@ -552,7 +566,7 @@ no_context:
* us unable to handle the page fault gracefully.
*/
out_of_memory:
- up_read(&mm->mmap_sem);
+ put_vma(vma);
if (is_init(current)) {
yield();
goto again;
@@ -563,7 +577,7 @@ out_of_memory:
goto no_context;
do_sigbus:
- up_read(&mm->mmap_sem);
+ put_vma(vma);
/* Kernel mode? Handle exceptions or die */
if (!(error_code & PF_USER))
* [PATCH/RFC 7/9] Add page fault code for PPC64 path
2007-10-22 10:45 [PATCH/RFC 0/9] VMA lookup with RCU Vaidyanathan Srinivasan
` (5 preceding siblings ...)
2007-10-22 10:45 ` [PATCH/RFC 6/9] x86_64: " Vaidyanathan Srinivasan
@ 2007-10-22 10:45 ` Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 8/9] debug: instrument the fault path Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 9/9] mm: nr_ptes needs to be atomic Vaidyanathan Srinivasan
8 siblings, 0 replies; 10+ messages in thread
From: Vaidyanathan Srinivasan @ 2007-10-22 10:45 UTC (permalink / raw)
To: linux-mm; +Cc: Alexis Bruemmer, Peter Zijlstra
[-- Attachment #1: 7_fault-powerpc.patch --]
[-- Type: text/plain, Size: 2828 bytes --]
---
arch/powerpc/mm/fault.c | 44 +++++++++++++++++++++++++++++++++-----------
1 file changed, 33 insertions(+), 11 deletions(-)
--- linux-2.6.23-rc8.orig/arch/powerpc/mm/fault.c
+++ linux-2.6.23-rc8/arch/powerpc/mm/fault.c
@@ -148,6 +148,7 @@ int __kprobes do_page_fault(struct pt_re
int is_write = 0, ret;
int trap = TRAP(regs);
int is_exec = trap == 0x400;
+ int locked = 0;
#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
/*
@@ -211,14 +212,22 @@ int __kprobes do_page_fault(struct pt_re
* source. If this is invalid we can skip the address space check,
* thus avoiding the deadlock.
*/
- if (!down_read_trylock(&mm->mmap_sem)) {
- if (!user_mode(regs) && !search_exception_tables(regs->nip))
- goto bad_area_nosemaphore;
-
+// if (!down_read_trylock(&mm->mmap_sem)) {
+// if (!user_mode(regs) && !search_exception_tables(regs->nip))
+// goto bad_area_nosemaphore;
+//
+// down_read(&mm->mmap_sem);
+// }
+
+again:
+ if (likely(!locked)) {
+ vma = __find_get_vma(mm, address, &locked);
+ } else {
down_read(&mm->mmap_sem);
- }
+ vma = find_vma(mm, address);
+ get_vma(vma);
+ }
- vma = find_vma(mm, address);
if (!vma)
goto bad_area;
if (vma->vm_start <= address)
@@ -257,10 +266,19 @@ int __kprobes do_page_fault(struct pt_re
&& (!user_mode(regs) || !store_updates_sp(regs)))
goto bad_area;
}
+ if (!locked) {
+ put_vma(vma);
+ locked = 1;
+ goto again;
+ }
if (expand_stack(vma, address))
goto bad_area;
good_area:
+ if (locked) {
+ up_read(&mm->mmap_sem);
+ locked = 0;
+ }
code = SEGV_ACCERR;
#if defined(CONFIG_6xx)
if (error_code & 0x95700000)
@@ -311,7 +329,7 @@ good_area:
pte_update(ptep, 0, _PAGE_HWEXEC);
_tlbie(address);
pte_unmap_unlock(ptep, ptl);
- up_read(&mm->mmap_sem);
+ put_vma(vma);
return 0;
}
pte_unmap_unlock(ptep, ptl);
@@ -348,11 +366,15 @@ good_area:
current->maj_flt++;
else
current->min_flt++;
- up_read(&mm->mmap_sem);
+ put_vma(vma);
return 0;
bad_area:
- up_read(&mm->mmap_sem);
+ if (locked) {
+ up_read(&mm->mmap_sem);
+ locked = 0;
+ }
+ put_vma(vma);
bad_area_nosemaphore:
/* User mode accesses cause a SIGSEGV */
@@ -374,7 +396,7 @@ bad_area_nosemaphore:
* us unable to handle the page fault gracefully.
*/
out_of_memory:
- up_read(&mm->mmap_sem);
+ put_vma(vma);
if (is_init(current)) {
yield();
down_read(&mm->mmap_sem);
@@ -386,7 +408,7 @@ out_of_memory:
return SIGKILL;
do_sigbus:
- up_read(&mm->mmap_sem);
+ put_vma(vma);
if (user_mode(regs)) {
info.si_signo = SIGBUS;
info.si_errno = 0;
* [PATCH/RFC 8/9] debug: instrument the fault path
2007-10-22 10:45 [PATCH/RFC 0/9] VMA lookup with RCU Vaidyanathan Srinivasan
` (6 preceding siblings ...)
2007-10-22 10:45 ` [PATCH/RFC 7/9] Add page fault code for PPC64 path Vaidyanathan Srinivasan
@ 2007-10-22 10:45 ` Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 9/9] mm: nr_ptes needs to be atomic Vaidyanathan Srinivasan
8 siblings, 0 replies; 10+ messages in thread
From: Vaidyanathan Srinivasan @ 2007-10-22 10:45 UTC (permalink / raw)
To: linux-mm; +Cc: Alexis Bruemmer, Peter Zijlstra
[-- Attachment #1: 8_stats.patch --]
[-- Type: text/plain, Size: 1856 bytes --]
---
arch/x86_64/mm/fault.c | 8 ++++++++
include/linux/vmstat.h | 1 +
mm/vmstat.c | 5 +++++
3 files changed, 14 insertions(+)
--- linux-2.6.23-rc6.orig/arch/x86_64/mm/fault.c
+++ linux-2.6.23-rc6/arch/x86_64/mm/fault.c
@@ -25,6 +25,7 @@
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
+#include <linux/vmstat.h>
#include <asm/system.h>
#include <asm/pgalloc.h>
@@ -393,10 +394,16 @@ asmlinkage void __kprobes do_page_fault(
if (likely(!locked)) {
vma = __find_get_vma(mm, address, &locked);
+ if (unlikely(locked))
+ count_vm_event(FAULT_RCU_SLOW);
+ else
+ count_vm_event(FAULT_RCU);
+
} else {
down_read(&mm->mmap_sem);
vma = find_vma(mm, address);
get_vma(vma);
+ count_vm_event(FAULT_LOCKED);
}
if (!vma)
goto bad_area;
@@ -416,6 +423,7 @@ asmlinkage void __kprobes do_page_fault(
locked = 1;
goto again;
}
+ count_vm_event(FAULT_STACK);
if (expand_stack(vma, address))
goto bad_area;
/*
--- linux-2.6.23-rc6.orig/include/linux/vmstat.h
+++ linux-2.6.23-rc6/include/linux/vmstat.h
@@ -37,6 +37,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PS
FOR_ALL_ZONES(PGSCAN_DIRECT),
PGINODESTEAL, SLABS_SCANNED, KSWAPD_STEAL, KSWAPD_INODESTEAL,
PAGEOUTRUN, ALLOCSTALL, PGROTATED,
+ FAULT_RCU, FAULT_RCU_SLOW, FAULT_LOCKED, FAULT_STACK,
NR_VM_EVENT_ITEMS
};
--- linux-2.6.23-rc6.orig/mm/vmstat.c
+++ linux-2.6.23-rc6/mm/vmstat.c
@@ -529,6 +529,11 @@ static const char * const vmstat_text[]
"allocstall",
"pgrotated",
+
+ "fault_rcu",
+ "fault_rcu_slow",
+ "fault_locked",
+ "fault_stack",
#endif
};
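
The counters added here show up in /proc/vmstat as fault_rcu,
fault_rcu_slow, fault_locked and fault_stack, so the RCU fast path can be
compared against the locked fallback after a run; FAULT_STACK is bumped
just before expand_stack(), i.e. only on the locked stack-growth retry.
For anyone wiring the same instrumentation into another architecture's
fault handler, the pattern is the one from the x86_64 hunk above. A
minimal sketch follows; fault_lookup_vma() is a hypothetical wrapper used
only for illustration, while __find_get_vma(), get_vma(), count_vm_event()
and the FAULT_* items come from this series.

static struct vm_area_struct *
fault_lookup_vma(struct mm_struct *mm, unsigned long address, int *locked)
{
	struct vm_area_struct *vma;

	if (likely(!*locked)) {
		vma = __find_get_vma(mm, address, locked);
		if (unlikely(*locked))
			count_vm_event(FAULT_RCU_SLOW);	/* lookup had to lock */
		else
			count_vm_event(FAULT_RCU);	/* pure RCU lookup */
	} else {
		down_read(&mm->mmap_sem);
		vma = find_vma(mm, address);
		get_vma(vma);
		count_vm_event(FAULT_LOCKED);		/* retried under mmap_sem */
	}
	return vma;
}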
* [PATCH/RFC 9/9] mm: nr_ptes needs to be atomic
2007-10-22 10:45 [PATCH/RFC 0/9] VMA lookup with RCU Vaidyanathan Srinivasan
` (7 preceding siblings ...)
2007-10-22 10:45 ` [PATCH/RFC 8/9] debug: instrument the fault path Vaidyanathan Srinivasan
@ 2007-10-22 10:45 ` Vaidyanathan Srinivasan
8 siblings, 0 replies; 10+ messages in thread
From: Vaidyanathan Srinivasan @ 2007-10-22 10:45 UTC (permalink / raw)
To: linux-mm; +Cc: Alexis Bruemmer, Peter Zijlstra
[-- Attachment #1: 9_nr_ptes.patch --]
[-- Type: text/plain, Size: 3692 bytes --]
---
arch/powerpc/mm/fault.c | 6 ++++++
arch/sh/mm/cache-sh4.c | 2 +-
arch/um/kernel/skas/mmu.c | 2 +-
fs/proc/task_mmu.c | 2 +-
include/linux/sched.h | 4 +++-
kernel/fork.c | 2 +-
mm/memory.c | 4 ++--
7 files changed, 15 insertions(+), 7 deletions(-)
--- linux-2.6.23-rc8.orig/arch/powerpc/mm/fault.c
+++ linux-2.6.23-rc8/arch/powerpc/mm/fault.c
@@ -235,6 +235,12 @@ again:
if (!(vma->vm_flags & VM_GROWSDOWN))
goto bad_area;
+ if (!locked) {
+ put_vma(vma);
+ locked = 1;
+ goto again;
+ }
+
/*
* N.B. The POWER/Open ABI allows programs to access up to
* 288 bytes below the stack pointer.
--- linux-2.6.23-rc8.orig/arch/sh/mm/cache-sh4.c
+++ linux-2.6.23-rc8/arch/sh/mm/cache-sh4.c
@@ -373,7 +373,7 @@ void flush_cache_mm(struct mm_struct *mm
* Don't bother groveling around the dcache for the VMA ranges
* if there are too many PTEs to make it worthwhile.
*/
- if (mm->nr_ptes >= MAX_DCACHE_PAGES)
+ if (atomic_long_read(&mm->nr_ptes) >= MAX_DCACHE_PAGES)
flush_dcache_all();
else {
struct vm_area_struct *vma;
--- linux-2.6.23-rc8.orig/arch/um/kernel/skas/mmu.c
+++ linux-2.6.23-rc8/arch/um/kernel/skas/mmu.c
@@ -98,7 +98,7 @@ int init_new_context_skas(struct task_st
if(ret)
goto out_free;
- mm->nr_ptes--;
+ atomic_long_dec(&mm->nr_ptes);
}
to_mm->id.stack = stack;
--- linux-2.6.23-rc8.orig/fs/proc/task_mmu.c
+++ linux-2.6.23-rc8/fs/proc/task_mmu.c
@@ -52,7 +52,7 @@ char *task_mem(struct mm_struct *mm, cha
total_rss << (PAGE_SHIFT-10),
data << (PAGE_SHIFT-10),
mm->stack_vm << (PAGE_SHIFT-10), text, lib,
- (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10);
+ (PTRS_PER_PTE*sizeof(pte_t)*atomic_long_read(&mm->nr_ptes)) >> 10);
return buffer;
}
--- linux-2.6.23-rc8.orig/include/linux/sched.h
+++ linux-2.6.23-rc8/include/linux/sched.h
@@ -400,11 +400,13 @@ struct mm_struct {
unsigned long hiwater_vm; /* High-water virtual memory usage */
unsigned long total_vm, locked_vm, shared_vm, exec_vm;
- unsigned long stack_vm, reserved_vm, def_flags, nr_ptes;
+ unsigned long stack_vm, reserved_vm, def_flags;
unsigned long start_code, end_code, start_data, end_data;
unsigned long start_brk, brk, start_stack;
unsigned long arg_start, arg_end, env_start, env_end;
+ atomic_long_t nr_ptes;
+
unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
cpumask_t cpu_vm_mask;
--- linux-2.6.23-rc8.orig/kernel/fork.c
+++ linux-2.6.23-rc8/kernel/fork.c
@@ -335,7 +335,7 @@ static struct mm_struct * mm_init(struct
mm->flags = (current->mm) ? current->mm->flags
: MMF_DUMP_FILTER_DEFAULT;
mm->core_waiters = 0;
- mm->nr_ptes = 0;
+ atomic_long_set(&mm->nr_ptes, 0);
set_mm_counter(mm, file_rss, 0);
set_mm_counter(mm, anon_rss, 0);
spin_lock_init(&mm->page_table_lock);
--- linux-2.6.23-rc8.orig/mm/memory.c
+++ linux-2.6.23-rc8/mm/memory.c
@@ -127,7 +127,7 @@ static void free_pte_range(struct mmu_ga
pte_lock_deinit(page);
pte_free_tlb(tlb, page);
dec_zone_page_state(page, NR_PAGETABLE);
- tlb->mm->nr_ptes--;
+ atomic_long_dec(&tlb->mm->nr_ptes);
}
static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
@@ -310,7 +310,7 @@ int __pte_alloc(struct mm_struct *mm, pm
pte_lock_deinit(new);
pte_free(new);
} else {
- mm->nr_ptes++;
+ atomic_long_inc(&mm->nr_ptes);
inc_zone_page_state(new, NR_PAGETABLE);
pmd_populate(mm, pmd, new);
}
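
This patch is mechanical: presumably because the speculative fault path no
longer gives the plain counter the serialization it used to rely on,
nr_ptes becomes an atomic_long_t and every user switches to the atomic
accessors. Any remaining direct users of mm->nr_ptes (arch code not yet
covered, or out-of-tree code) would need the same treatment; the wrappers
below are hypothetical, not part of this patch, and only make the mapping
from the old accesses explicit.

/* Hypothetical helpers showing the accessor pattern for the new field. */
static inline void mm_inc_nr_ptes(struct mm_struct *mm)
{
	atomic_long_inc(&mm->nr_ptes);		/* was: mm->nr_ptes++ */
}

static inline void mm_dec_nr_ptes(struct mm_struct *mm)
{
	atomic_long_dec(&mm->nr_ptes);		/* was: mm->nr_ptes-- */
}

static inline unsigned long mm_nr_ptes(struct mm_struct *mm)
{
	return atomic_long_read(&mm->nr_ptes);	/* was: mm->nr_ptes */
}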
end of thread, other threads:[~2007-10-22 10:46 UTC | newest]
Thread overview: 10+ messages
2007-10-22 10:45 [PATCH/RFC 0/9] VMA lookup with RCU Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 1/9] Data structure changes Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 2/9] lib: RCU friendly B+tree Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 3/9] mm: use the B+tree for vma lookups Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 4/9] mm: RCU " Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 5/9] i386: rcu vma lookups for faults Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 6/9] x86_64: " Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 7/9] Add page fault code for PPC64 path Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 8/9] debug: instrument the fault path Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 9/9] mm: nr_ptes needs to be atomic Vaidyanathan Srinivasan