* [PATCH/RFC 0/9] VMA lookup with RCU
@ 2007-10-22 10:45 Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 1/9] Data structure changes Vaidyanathan Srinivasan
` (8 more replies)
0 siblings, 9 replies; 10+ messages in thread
From: Vaidyanathan Srinivasan @ 2007-10-22 10:45 UTC (permalink / raw)
To: linux-mm; +Cc: Alexis Bruemmer, Peter Zijlstra
This is forward port of Peter Zijlstra's RCU based
VMA lookup patch set to kernel version 2.6.23.1
Reference:
http://programming.kicks-ass.net/kernel-patches/vma_lookup/
The patch set still needs some cleanup. I am sending this out
for anybody to review and try more experiments.
I have been posting results based on this patch set at linux-mm
http://marc.info/?l=linux-mm&m=119186241210311&w=2
--Vaidy
--
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH/RFC 1/9] Data structure changes
2007-10-22 10:45 [PATCH/RFC 0/9] VMA lookup with RCU Vaidyanathan Srinivasan
@ 2007-10-22 10:45 ` Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 2/9] lib: RCU friendly B+tree Vaidyanathan Srinivasan
` (7 subsequent siblings)
8 siblings, 0 replies; 10+ messages in thread
From: Vaidyanathan Srinivasan @ 2007-10-22 10:45 UTC (permalink / raw)
To: linux-mm; +Cc: Alexis Bruemmer, Peter Zijlstra
[-- Attachment #1: 1_mm_vmas.patch --]
[-- Type: text/plain, Size: 39930 bytes --]
* Replace vm_next with list_head in mm_struct
* Change all instance of mm->vm_next and mm->mmap
* TBD: Not all architectures covered
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
---
arch/alpha/kernel/osf_sys.c | 2
arch/arm/mm/mmap.c | 2
arch/frv/mm/elf-fdpic.c | 4 -
arch/i386/mm/hugetlbpage.c | 2
arch/ia64/kernel/sys_ia64.c | 2
arch/ia64/mm/hugetlbpage.c | 2
arch/mips/kernel/irixelf.c | 19 +++---
arch/mips/kernel/syscall.c | 2
arch/parisc/kernel/sys_parisc.c | 4 -
arch/powerpc/mm/tlb_32.c | 2
arch/ppc/mm/tlb.c | 2
arch/x86_64/ia32/ia32_aout.c | 2
arch/x86_64/kernel/sys_x86_64.c | 2
drivers/char/mem.c | 2
drivers/oprofile/buffer_sync.c | 4 -
fs/binfmt_aout.c | 2
fs/binfmt_elf.c | 4 -
fs/binfmt_elf_fdpic.c | 5 +
fs/exec.c | 4 -
fs/hugetlbfs/inode.c | 2
fs/proc/task_mmu.c | 18 +++---
include/linux/init_task.h | 1
include/linux/mm.h | 44 ++++++++++++++-
include/linux/sched.h | 2
ipc/shm.c | 4 -
kernel/acct.c | 5 -
kernel/auditsc.c | 4 -
kernel/fork.c | 11 +--
mm/madvise.c | 2
mm/memory.c | 21 +++----
mm/mempolicy.c | 10 +--
mm/migrate.c | 2
mm/mlock.c | 5 +
mm/mmap.c | 114 +++++++++++++++++++---------------------
mm/mprotect.c | 2
mm/mremap.c | 7 +-
mm/msync.c | 2
mm/swapfile.c | 2
38 files changed, 180 insertions(+), 146 deletions(-)
--- linux-2.6.23-rc9.orig/arch/alpha/kernel/osf_sys.c
+++ linux-2.6.23-rc9/arch/alpha/kernel/osf_sys.c
@@ -1255,7 +1255,7 @@ arch_get_unmapped_area_1(unsigned long a
if (!vma || addr + len <= vma->vm_start)
return addr;
addr = vma->vm_end;
- vma = vma->vm_next;
+ vma = vma_next(vma);
}
}
--- linux-2.6.23-rc9.orig/arch/arm/mm/mmap.c
+++ linux-2.6.23-rc9/arch/arm/mm/mmap.c
@@ -84,7 +84,7 @@ full_search:
else
addr = PAGE_ALIGN(addr);
- for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
+ for (vma = find_vma(mm, addr); ; vma = vma_next(vma)) {
/* At this point: (!vma || addr < vma->vm_end). */
if (TASK_SIZE - len < addr) {
/*
--- linux-2.6.23-rc9.orig/arch/frv/mm/elf-fdpic.c
+++ linux-2.6.23-rc9/arch/frv/mm/elf-fdpic.c
@@ -86,7 +86,7 @@ unsigned long arch_get_unmapped_area(str
if (addr <= limit) {
vma = find_vma(current->mm, PAGE_SIZE);
- for (; vma; vma = vma->vm_next) {
+ for (; vma; vma = vma_next(vma)) {
if (addr > limit)
break;
if (addr + len <= vma->vm_start)
@@ -101,7 +101,7 @@ unsigned long arch_get_unmapped_area(str
limit = TASK_SIZE - len;
if (addr <= limit) {
vma = find_vma(current->mm, addr);
- for (; vma; vma = vma->vm_next) {
+ for (; vma; vma = vma_next(vma)) {
if (addr > limit)
break;
if (addr + len <= vma->vm_start)
--- linux-2.6.23-rc9.orig/arch/i386/mm/hugetlbpage.c
+++ linux-2.6.23-rc9/arch/i386/mm/hugetlbpage.c
@@ -241,7 +241,7 @@ static unsigned long hugetlb_get_unmappe
full_search:
addr = ALIGN(start_addr, HPAGE_SIZE);
- for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
+ for (vma = find_vma(mm, addr); ; vma = vma_next(vma)) {
/* At this point: (!vma || addr < vma->vm_end). */
if (TASK_SIZE - len < addr) {
/*
--- linux-2.6.23-rc9.orig/arch/ia64/kernel/sys_ia64.c
+++ linux-2.6.23-rc9/arch/ia64/kernel/sys_ia64.c
@@ -58,7 +58,7 @@ arch_get_unmapped_area (struct file *fil
full_search:
start_addr = addr = (addr + align_mask) & ~align_mask;
- for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
+ for (vma = find_vma(mm, addr); ; vma = vma_next(vma)) {
/* At this point: (!vma || addr < vma->vm_end). */
if (TASK_SIZE - len < addr || RGN_MAP_LIMIT - len < REGION_OFFSET(addr)) {
if (start_addr != TASK_UNMAPPED_BASE) {
--- linux-2.6.23-rc9.orig/arch/ia64/mm/hugetlbpage.c
+++ linux-2.6.23-rc9/arch/ia64/mm/hugetlbpage.c
@@ -159,7 +159,7 @@ unsigned long hugetlb_get_unmapped_area(
addr = HPAGE_REGION_BASE;
else
addr = ALIGN(addr, HPAGE_SIZE);
- for (vmm = find_vma(current->mm, addr); ; vmm = vmm->vm_next) {
+ for (vmm = find_vma(current->mm, addr); ; vmm = vmm_next(vma)) {
/* At this point: (!vmm || addr < vmm->vm_end). */
if (REGION_OFFSET(addr) + len > RGN_MAP_LIMIT)
return -ENOMEM;
--- linux-2.6.23-rc9.orig/arch/mips/kernel/irixelf.c
+++ linux-2.6.23-rc9/arch/mips/kernel/irixelf.c
@@ -718,7 +718,7 @@ static int load_irix_binary(struct linux
/* OK, This is the point of no return */
current->mm->end_data = 0;
current->mm->end_code = 0;
- current->mm->mmap = NULL;
+ INIT_LIST_HEAD(¤t->mm->mm_vmas);
current->flags &= ~PF_FORKNOEXEC;
elf_entry = (unsigned int) elf_ex.e_entry;
@@ -1108,7 +1108,7 @@ static int irix_core_dump(long signr, st
/* Count what's needed to dump, up to the limit of coredump size. */
segs = 0;
size = 0;
- for (vma = current->mm->mmap; vma != NULL; vma = vma->vm_next) {
+ for (vma = current->mm->mmap; vma != NULL; vma = vma_next(vma)) {
if (maydump(vma))
{
int sz = vma->vm_end-vma->vm_start;
@@ -1267,12 +1267,13 @@ static int irix_core_dump(long signr, st
dataoff = offset = roundup(offset, PAGE_SIZE);
/* Write program headers for segments dump. */
- for (vma = current->mm->mmap, i = 0;
- i < segs && vma != NULL; vma = vma->vm_next) {
+ i = 0
+ list_for_each_entry(vma, ¤t->mm->mm_vmas, vm_list) {
struct elf_phdr phdr;
size_t sz;
- i++;
+ if (i++ == seg)
+ break;
sz = vma->vm_end - vma->vm_start;
@@ -1301,15 +1302,15 @@ static int irix_core_dump(long signr, st
DUMP_SEEK(dataoff);
- for (i = 0, vma = current->mm->mmap;
- i < segs && vma != NULL;
- vma = vma->vm_next) {
+ i = 0
+ list_for_each_entry(vma, ¤t->mm->mm_vmas, vm_list) {
unsigned long addr = vma->vm_start;
unsigned long len = vma->vm_end - vma->vm_start;
if (!maydump(vma))
continue;
- i++;
+ if (i++ == seg)
+ break;
pr_debug("elf_core_dump: writing %08lx %lx\n", addr, len);
DUMP_WRITE((void __user *)addr, len);
}
--- linux-2.6.23-rc9.orig/arch/mips/kernel/syscall.c
+++ linux-2.6.23-rc9/arch/mips/kernel/syscall.c
@@ -104,7 +104,7 @@ unsigned long arch_get_unmapped_area(str
else
addr = PAGE_ALIGN(addr);
- for (vmm = find_vma(current->mm, addr); ; vmm = vmm->vm_next) {
+ for (vmm = find_vma(current->mm, addr); ; vmm = vmm_next(vmm)) {
/* At this point: (!vmm || addr < vmm->vm_end). */
if (task_size - len < addr)
return -ENOMEM;
--- linux-2.6.23-rc9.orig/arch/parisc/kernel/sys_parisc.c
+++ linux-2.6.23-rc9/arch/parisc/kernel/sys_parisc.c
@@ -52,7 +52,7 @@ static unsigned long get_unshared_area(u
addr = PAGE_ALIGN(addr);
- for (vma = find_vma(current->mm, addr); ; vma = vma->vm_next) {
+ for (vma = find_vma(current->mm, addr); ; vma = vma_next(vma)) {
/* At this point: (!vma || addr < vma->vm_end). */
if (TASK_SIZE - len < addr)
return -ENOMEM;
@@ -88,7 +88,7 @@ static unsigned long get_shared_area(str
addr = DCACHE_ALIGN(addr - offset) + offset;
- for (vma = find_vma(current->mm, addr); ; vma = vma->vm_next) {
+ for (vma = find_vma(current->mm, addr); ; vma = vma_next(vma)) {
/* At this point: (!vma || addr < vma->vm_end). */
if (TASK_SIZE - len < addr)
return -ENOMEM;
--- linux-2.6.23-rc9.orig/arch/powerpc/mm/tlb_32.c
+++ linux-2.6.23-rc9/arch/powerpc/mm/tlb_32.c
@@ -155,7 +155,7 @@ void flush_tlb_mm(struct mm_struct *mm)
* unmap_region or exit_mmap, but not from vmtruncate on SMP -
* but it seems dup_mmap is the only SMP case which gets here.
*/
- for (mp = mm->mmap; mp != NULL; mp = mp->vm_next)
+ list_for_each_entry(mp, &mm->mm_vmas, vm_list)
flush_range(mp->vm_mm, mp->vm_start, mp->vm_end);
FINISH_FLUSH;
}
--- linux-2.6.23-rc9.orig/arch/ppc/mm/tlb.c
+++ linux-2.6.23-rc9/arch/ppc/mm/tlb.c
@@ -149,7 +149,7 @@ void flush_tlb_mm(struct mm_struct *mm)
return;
}
- for (mp = mm->mmap; mp != NULL; mp = mp->vm_next)
+ list_for_each_entry(mp, &mm->mm_vmas, vm_list)
flush_range(mp->vm_mm, mp->vm_start, mp->vm_end);
FINISH_FLUSH;
}
--- linux-2.6.23-rc9.orig/arch/x86_64/ia32/ia32_aout.c
+++ linux-2.6.23-rc9/arch/x86_64/ia32/ia32_aout.c
@@ -311,7 +311,7 @@ static int load_aout_binary(struct linux
current->mm->free_area_cache = TASK_UNMAPPED_BASE;
current->mm->cached_hole_size = 0;
- current->mm->mmap = NULL;
+ INIT_LIST_HEAD(¤t->mm->mm_vmas);
compute_creds(bprm);
current->flags &= ~PF_FORKNOEXEC;
--- linux-2.6.23-rc9.orig/arch/x86_64/kernel/sys_x86_64.c
+++ linux-2.6.23-rc9/arch/x86_64/kernel/sys_x86_64.c
@@ -119,7 +119,7 @@ arch_get_unmapped_area(struct file *filp
start_addr = addr;
full_search:
- for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
+ for (vma = find_vma(mm, addr); ; vma = vma_next(vma)) {
/* At this point: (!vma || addr < vma->vm_end). */
if (end - len < addr) {
/*
--- linux-2.6.23-rc9.orig/drivers/char/mem.c
+++ linux-2.6.23-rc9/drivers/char/mem.c
@@ -640,7 +640,7 @@ static inline size_t read_zero_pagealign
down_read(&mm->mmap_sem);
/* For private mappings, just map in zero pages. */
- for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) {
+ for (vma = find_vma(mm, addr); vma; vma = vma_next(vma)) {
unsigned long count;
if (vma->vm_start > addr || (vma->vm_flags & VM_WRITE) == 0)
--- linux-2.6.23-rc9.orig/drivers/oprofile/buffer_sync.c
+++ linux-2.6.23-rc9/drivers/oprofile/buffer_sync.c
@@ -217,7 +217,7 @@ static unsigned long get_exec_dcookie(st
if (!mm)
goto out;
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ list_for_each_entry(vma, &mm->mm_vmas, vm_list) {
if (!vma->vm_file)
continue;
if (!(vma->vm_flags & VM_EXECUTABLE))
@@ -242,7 +242,7 @@ static unsigned long lookup_dcookie(stru
unsigned long cookie = NO_COOKIE;
struct vm_area_struct * vma;
- for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) {
+ for (vma = find_vma(mm, addr); vma; vma = vma_next(vma)) {
if (addr < vma->vm_start || addr >= vma->vm_end)
continue;
--- linux-2.6.23-rc9.orig/fs/binfmt_aout.c
+++ linux-2.6.23-rc9/fs/binfmt_aout.c
@@ -323,7 +323,7 @@ static int load_aout_binary(struct linux
current->mm->free_area_cache = current->mm->mmap_base;
current->mm->cached_hole_size = 0;
- current->mm->mmap = NULL;
+ INIT_LIST_HEAD(¤t->mm->mm_vmas);
compute_creds(bprm);
current->flags &= ~PF_FORKNOEXEC;
#ifdef __sparc__
--- linux-2.6.23-rc9.orig/fs/binfmt_elf.c
+++ linux-2.6.23-rc9/fs/binfmt_elf.c
@@ -1458,7 +1458,7 @@ static int elf_dump_thread_status(long s
static struct vm_area_struct *first_vma(struct task_struct *tsk,
struct vm_area_struct *gate_vma)
{
- struct vm_area_struct *ret = tsk->mm->mmap;
+ struct vm_area_struct *ret = __vma_next(&tsk->mm->mm_vmas, NULL);
if (ret)
return ret;
@@ -1473,7 +1473,7 @@ static struct vm_area_struct *next_vma(s
{
struct vm_area_struct *ret;
- ret = this_vma->vm_next;
+ ret = vma_next(this_vma);
if (ret)
return ret;
if (this_vma == gate_vma)
--- linux-2.6.23-rc9.orig/fs/binfmt_elf_fdpic.c
+++ linux-2.6.23-rc9/fs/binfmt_elf_fdpic.c
@@ -1471,7 +1471,8 @@ static int elf_fdpic_dump_segments(struc
{
struct vm_area_struct *vma;
- for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
+ list_for_each_entry(vma, ¤t->mm->mm_vmas, vm_list) {
+
unsigned long addr;
if (!maydump(vma, mm_flags))
@@ -1728,7 +1729,7 @@ static int elf_fdpic_core_dump(long sign
/* write program headers for segments dump */
for (
#ifdef CONFIG_MMU
- vma = current->mm->mmap; vma; vma = vma->vm_next
+ vma = __vma_next(¤t->mm->mm_vmas, NULL); vma; vma = vma_next(vma)
#else
vml = current->mm->context.vmlist; vml; vml = vml->next
#endif
--- linux-2.6.23-rc9.orig/fs/exec.c
+++ linux-2.6.23-rc9/fs/exec.c
@@ -555,7 +555,7 @@ static int shift_arg_pages(struct vm_are
* when the old and new regions overlap clear from new_end.
*/
free_pgd_range(&tlb, new_end, old_end, new_end,
- vma->vm_next ? vma->vm_next->vm_start : 0);
+ vma_next(vma) ? vma_next(vma)->vm_start : 0);
} else {
/*
* otherwise, clean from old_start; this is done to not touch
@@ -564,7 +564,7 @@ static int shift_arg_pages(struct vm_are
* for the others its just a little faster.
*/
free_pgd_range(&tlb, old_start, old_end, new_end,
- vma->vm_next ? vma->vm_next->vm_start : 0);
+ vma_next(vma) ? vma_next(vma)->vm_start : 0);
}
tlb_finish_mmu(tlb, new_end, old_end);
--- linux-2.6.23-rc9.orig/fs/hugetlbfs/inode.c
+++ linux-2.6.23-rc9/fs/hugetlbfs/inode.c
@@ -158,7 +158,7 @@ hugetlb_get_unmapped_area(struct file *f
full_search:
addr = ALIGN(start_addr, HPAGE_SIZE);
- for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
+ for (vma = find_vma(mm, addr); ; vma = vma_next(vma)) {
/* At this point: (!vma || addr < vma->vm_end). */
if (TASK_SIZE - len < addr) {
/*
--- linux-2.6.23-rc9.orig/fs/proc/task_mmu.c
+++ linux-2.6.23-rc9/fs/proc/task_mmu.c
@@ -87,11 +87,11 @@ int proc_exe_link(struct inode *inode, s
goto out;
down_read(&mm->mmap_sem);
- vma = mm->mmap;
+ vma = __vma_next(&mm->mm_vmas, NULL);
while (vma) {
if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file)
break;
- vma = vma->vm_next;
+ vma = vma_next(vma);
}
if (vma) {
@@ -364,7 +364,7 @@ void clear_refs_smap(struct mm_struct *m
struct vm_area_struct *vma;
down_read(&mm->mmap_sem);
- for (vma = mm->mmap; vma; vma = vma->vm_next)
+ list_for_each_entry(vma, &mm->mm_vmas, vm_list)
if (vma->vm_mm && !is_vm_hugetlb_page(vma))
walk_page_range(vma, clear_refs_pte_range, NULL);
flush_tlb_mm(mm);
@@ -406,7 +406,7 @@ static void *m_start(struct seq_file *m,
/* Start with last addr hint */
if (last_addr && (vma = find_vma(mm, last_addr))) {
- vma = vma->vm_next;
+ vma = vma_next(vma);
goto out;
}
@@ -416,9 +416,9 @@ static void *m_start(struct seq_file *m,
*/
vma = NULL;
if ((unsigned long)l < mm->map_count) {
- vma = mm->mmap;
+ vma = __vma_next(&mm->mm_vmas, NULL);
while (l-- && vma)
- vma = vma->vm_next;
+ vma = vma_next(vma);
goto out;
}
@@ -448,12 +448,12 @@ static void vma_stop(struct proc_maps_pr
static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
struct proc_maps_private *priv = m->private;
- struct vm_area_struct *vma = v;
+ struct vm_area_struct *vma = v, *next;
struct vm_area_struct *tail_vma = priv->tail_vma;
(*pos)++;
- if (vma && (vma != tail_vma) && vma->vm_next)
- return vma->vm_next;
+ if (vma && (vma != tail_vma) && (next = vma_next(vma)))
+ return next;
vma_stop(priv, vma);
return (vma != tail_vma)? tail_vma: NULL;
}
--- linux-2.6.23-rc9.orig/include/linux/init_task.h
+++ linux-2.6.23-rc9/include/linux/init_task.h
@@ -47,6 +47,7 @@
#define INIT_MM(name) \
{ \
+ .mm_vmas = LIST_HEAD_INIT(name.mm_vmas), \
.mm_rb = RB_ROOT, \
.pgd = swapper_pg_dir, \
.mm_users = ATOMIC_INIT(2), \
--- linux-2.6.23-rc9.orig/include/linux/mm.h
+++ linux-2.6.23-rc9/include/linux/mm.h
@@ -35,6 +35,7 @@ extern int sysctl_legacy_va_layout;
#define sysctl_legacy_va_layout 0
#endif
+#include <linux/sched.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/processor.h>
@@ -63,7 +64,7 @@ struct vm_area_struct {
within vm_mm. */
/* linked list of VM areas per task, sorted by address */
- struct vm_area_struct *vm_next;
+ struct list_head vm_list;
pgprot_t vm_page_prot; /* Access permissions of this VMA. */
unsigned long vm_flags; /* Flags, listed below. */
@@ -113,6 +114,42 @@ struct vm_area_struct {
#endif
};
+static inline struct vm_area_struct *
+__vma_next(struct list_head *head, struct vm_area_struct *vma)
+{
+ if (unlikely(!vma))
+ vma = container_of(head, struct vm_area_struct, vm_list);
+
+ if (vma->vm_list.next == head)
+ return NULL;
+
+ return list_entry(vma->vm_list.next, struct vm_area_struct, vm_list);
+}
+
+static inline struct vm_area_struct *
+vma_next(struct vm_area_struct *vma)
+{
+ return __vma_next(&vma->vm_mm->mm_vmas, vma);
+}
+
+static inline struct vm_area_struct *
+__vma_prev(struct list_head *head, struct vm_area_struct *vma)
+{
+ if (unlikely(!vma))
+ vma = container_of(head, struct vm_area_struct, vm_list);
+
+ if (vma->vm_list.prev == head)
+ return NULL;
+
+ return list_entry(vma->vm_list.prev, struct vm_area_struct, vm_list);
+}
+
+static inline struct vm_area_struct *
+vma_prev(struct vm_area_struct *vma)
+{
+ return __vma_prev(&vma->vm_mm->mm_vmas, vma);
+}
+
extern struct kmem_cache *vm_area_cachep;
/*
@@ -769,13 +806,14 @@ struct zap_details {
struct page *vm_normal_page(struct vm_area_struct *, unsigned long, pte_t);
unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
unsigned long size, struct zap_details *);
-unsigned long unmap_vmas(struct mmu_gather **tlb,
+unsigned long unmap_vmas(struct mmu_gather **tlb, struct list_head *vmas,
struct vm_area_struct *start_vma, unsigned long start_addr,
unsigned long end_addr, unsigned long *nr_accounted,
struct zap_details *);
void free_pgd_range(struct mmu_gather **tlb, unsigned long addr,
unsigned long end, unsigned long floor, unsigned long ceiling);
-void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *start_vma,
+void free_pgtables(struct mmu_gather **tlb, struct list_head *vmas,
+ struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *vma);
--- linux-2.6.23-rc9.orig/include/linux/sched.h
+++ linux-2.6.23-rc9/include/linux/sched.h
@@ -367,7 +367,7 @@ extern int get_dumpable(struct mm_struct
((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED))
struct mm_struct {
- struct vm_area_struct * mmap; /* list of VMAs */
+ struct list_head mm_vmas;
struct rb_root mm_rb;
struct vm_area_struct * mmap_cache; /* last find_vma result */
unsigned long (*get_unmapped_area) (struct file *filp,
--- linux-2.6.23-rc9.orig/ipc/shm.c
+++ linux-2.6.23-rc9/ipc/shm.c
@@ -1034,7 +1034,7 @@ asmlinkage long sys_shmdt(char __user *s
vma = find_vma(mm, addr);
while (vma) {
- next = vma->vm_next;
+ next = vma_next(vma);
/*
* Check if the starting address would match, i.e. it's
@@ -1067,7 +1067,7 @@ asmlinkage long sys_shmdt(char __user *s
*/
size = PAGE_ALIGN(size);
while (vma && (loff_t)(vma->vm_end - addr) <= size) {
- next = vma->vm_next;
+ next = vma_next(vma);
/* finding a matching vma now does not alter retval */
if ((vma->vm_ops == &shm_vm_ops) &&
--- linux-2.6.23-rc9.orig/kernel/acct.c
+++ linux-2.6.23-rc9/kernel/acct.c
@@ -540,11 +540,8 @@ void acct_collect(long exitcode, int gro
if (group_dead && current->mm) {
struct vm_area_struct *vma;
down_read(¤t->mm->mmap_sem);
- vma = current->mm->mmap;
- while (vma) {
+ list_for_each_entry(vma, ¤t->mm->mm_vmas, vm_list)
vsize += vma->vm_end - vma->vm_start;
- vma = vma->vm_next;
- }
up_read(¤t->mm->mmap_sem);
}
--- linux-2.6.23-rc9.orig/kernel/auditsc.c
+++ linux-2.6.23-rc9/kernel/auditsc.c
@@ -780,8 +780,7 @@ static void audit_log_task_info(struct a
if (mm) {
down_read(&mm->mmap_sem);
- vma = mm->mmap;
- while (vma) {
+ list_for_each_entry(vma, &mm->mm_vmas, vm_list) {
if ((vma->vm_flags & VM_EXECUTABLE) &&
vma->vm_file) {
audit_log_d_path(ab, "exe=",
@@ -789,7 +788,6 @@ static void audit_log_task_info(struct a
vma->vm_file->f_path.mnt);
break;
}
- vma = vma->vm_next;
}
up_read(&mm->mmap_sem);
}
--- linux-2.6.23-rc9.orig/kernel/fork.c
+++ linux-2.6.23-rc9/kernel/fork.c
@@ -197,7 +197,7 @@ static struct task_struct *dup_task_stru
#ifdef CONFIG_MMU
static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
{
- struct vm_area_struct *mpnt, *tmp, **pprev;
+ struct vm_area_struct *mpnt, *tmp;
struct rb_node **rb_link, *rb_parent;
int retval;
unsigned long charge;
@@ -211,7 +211,6 @@ static inline int dup_mmap(struct mm_str
down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
mm->locked_vm = 0;
- mm->mmap = NULL;
mm->mmap_cache = NULL;
mm->free_area_cache = oldmm->mmap_base;
mm->cached_hole_size = ~0UL;
@@ -220,9 +219,8 @@ static inline int dup_mmap(struct mm_str
mm->mm_rb = RB_ROOT;
rb_link = &mm->mm_rb.rb_node;
rb_parent = NULL;
- pprev = &mm->mmap;
- for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
+ list_for_each_entry(mpnt, &oldmm->mm_vmas, vm_list) {
struct file *file;
if (mpnt->vm_flags & VM_DONTCOPY) {
@@ -250,7 +248,6 @@ static inline int dup_mmap(struct mm_str
vma_set_policy(tmp, pol);
tmp->vm_flags &= ~VM_LOCKED;
tmp->vm_mm = mm;
- tmp->vm_next = NULL;
anon_vma_link(tmp);
file = tmp->vm_file;
if (file) {
@@ -271,9 +268,8 @@ static inline int dup_mmap(struct mm_str
/*
* Link in the new vma and copy the page table entries.
*/
- *pprev = tmp;
- pprev = &tmp->vm_next;
+ list_add_tail(&tmp->vm_list, &mm->mm_vmas);
__vma_link_rb(mm, tmp, rb_link, rb_parent);
rb_link = &tmp->vm_rb.rb_right;
rb_parent = &tmp->vm_rb;
@@ -330,6 +326,7 @@ static inline void mm_free_pgd(struct mm
static struct mm_struct * mm_init(struct mm_struct * mm)
{
+ INIT_LIST_HEAD(&mm->mm_vmas);
atomic_set(&mm->mm_users, 1);
atomic_set(&mm->mm_count, 1);
init_rwsem(&mm->mmap_sem);
--- linux-2.6.23-rc9.orig/mm/madvise.c
+++ linux-2.6.23-rc9/mm/madvise.c
@@ -351,7 +351,7 @@ asmlinkage long sys_madvise(unsigned lon
if (start >= end)
goto out;
if (prev)
- vma = prev->vm_next;
+ vma = vma_next(prev);
else /* madvise_remove dropped mmap_sem */
vma = find_vma(current->mm, start);
}
--- linux-2.6.23-rc9.orig/mm/memory.c
+++ linux-2.6.23-rc9/mm/memory.c
@@ -264,11 +264,11 @@ void free_pgd_range(struct mmu_gather **
flush_tlb_pgtables((*tlb)->mm, start, end);
}
-void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
- unsigned long floor, unsigned long ceiling)
+void free_pgtables(struct mmu_gather **tlb, struct list_head *vmas,
+ struct vm_area_struct *vma, unsigned long floor, unsigned long ceiling)
{
while (vma) {
- struct vm_area_struct *next = vma->vm_next;
+ struct vm_area_struct *next = __vma_next(vmas, vma);
unsigned long addr = vma->vm_start;
/*
@@ -287,7 +287,7 @@ void free_pgtables(struct mmu_gather **t
while (next && next->vm_start <= vma->vm_end + PMD_SIZE
&& !is_vm_hugetlb_page(next)) {
vma = next;
- next = vma->vm_next;
+ next = __vma_next(vmas, vma);
anon_vma_unlink(vma);
unlink_file_vma(vma);
}
@@ -806,7 +806,7 @@ static unsigned long unmap_page_range(st
* ensure that any thus-far unmapped pages are flushed before unmap_vmas()
* drops the lock and schedules.
*/
-unsigned long unmap_vmas(struct mmu_gather **tlbp,
+unsigned long unmap_vmas(struct mmu_gather **tlbp, struct list_head *vmas,
struct vm_area_struct *vma, unsigned long start_addr,
unsigned long end_addr, unsigned long *nr_accounted,
struct zap_details *details)
@@ -818,7 +818,7 @@ unsigned long unmap_vmas(struct mmu_gath
spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
int fullmm = (*tlbp)->fullmm;
- for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
+ for ( ; vma && vma->vm_start < end_addr; vma = __vma_next(vmas, vma)) {
unsigned long end;
start = max(vma->vm_start, start_addr);
@@ -889,7 +889,8 @@ unsigned long zap_page_range(struct vm_a
lru_add_drain();
tlb = tlb_gather_mmu(mm, 0);
update_hiwater_rss(mm);
- end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
+ end = unmap_vmas(&tlb, &vma->vm_mm->mm_vmas, vma,
+ address, end, &nr_accounted, details);
if (tlb)
tlb_finish_mmu(tlb, address, end);
return end;
@@ -2093,7 +2094,7 @@ int vmtruncate_range(struct inode *inode
void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struct *vma)
{
#ifdef CONFIG_NUMA
- struct vm_area_struct *next_vma = vma ? vma->vm_next : NULL;
+ struct vm_area_struct *next_vma = vma ? vma_next(vma) : NULL;
#endif
int i, num;
struct page *new_page;
@@ -2120,14 +2121,14 @@ void swapin_readahead(swp_entry_t entry,
if (vma) {
if (addr >= vma->vm_end) {
vma = next_vma;
- next_vma = vma ? vma->vm_next : NULL;
+ next_vma = vma ? vma_next(vma) : NULL;
}
if (vma && addr < vma->vm_start)
vma = NULL;
} else {
if (next_vma && addr >= next_vma->vm_start) {
vma = next_vma;
- next_vma = vma->vm_next;
+ next_vma = vma_next(vma);
}
}
#endif
--- linux-2.6.23-rc9.orig/mm/mempolicy.c
+++ linux-2.6.23-rc9/mm/mempolicy.c
@@ -344,9 +344,9 @@ check_range(struct mm_struct *mm, unsign
if (!first)
return ERR_PTR(-EFAULT);
prev = NULL;
- for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
+ for (vma = first; vma && vma->vm_start < end; vma = vma_next(vma)) {
if (!(flags & MPOL_MF_DISCONTIG_OK)) {
- if (!vma->vm_next && vma->vm_end < end)
+ if (!vma_next(vma) && vma->vm_end < end)
return ERR_PTR(-EFAULT);
if (prev && prev->vm_end < vma->vm_start)
return ERR_PTR(-EFAULT);
@@ -403,7 +403,7 @@ static int mbind_range(struct vm_area_st
err = 0;
for (; vma && vma->vm_start < end; vma = next) {
- next = vma->vm_next;
+ next = vma_next(vma);
if (vma->vm_start < start)
err = split_vma(vma->vm_mm, vma, start, 1);
if (!err && vma->vm_end > end)
@@ -610,7 +610,7 @@ int migrate_to_node(struct mm_struct *mm
nodes_clear(nmask);
node_set(source, nmask);
- check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
+ check_range(mm, __vma_next(&mm->mm_vmas, NULL)->vm_start, TASK_SIZE, &nmask,
flags | MPOL_MF_DISCONTIG_OK, &pagelist);
if (!list_empty(&pagelist))
@@ -1785,7 +1785,7 @@ void mpol_rebind_mm(struct mm_struct *mm
struct vm_area_struct *vma;
down_write(&mm->mmap_sem);
- for (vma = mm->mmap; vma; vma = vma->vm_next)
+ list_for_each_entry(vma, &mm->mm_vmas, vm_list)
mpol_rebind_policy(vma->vm_policy, new);
up_write(&mm->mmap_sem);
}
--- linux-2.6.23-rc9.orig/mm/migrate.c
+++ linux-2.6.23-rc9/mm/migrate.c
@@ -1030,7 +1030,7 @@ int migrate_vmas(struct mm_struct *mm, c
struct vm_area_struct *vma;
int err = 0;
- for(vma = mm->mmap; vma->vm_next && !err; vma = vma->vm_next) {
+ list_for_each_entry(vma, &mm->mm_vmas, vm_list) {
if (vma->vm_ops && vma->vm_ops->migrate) {
err = vma->vm_ops->migrate(vma, to, from, flags);
if (err)
--- linux-2.6.23-rc9.orig/mm/mlock.c
+++ linux-2.6.23-rc9/mm/mlock.c
@@ -123,7 +123,7 @@ static int do_mlock(unsigned long start,
if (nstart >= end)
break;
- vma = prev->vm_next;
+ vma = vma_next(prev);
if (!vma || vma->vm_start != nstart) {
error = -ENOMEM;
break;
@@ -181,7 +181,7 @@ static int do_mlockall(int flags)
if (flags == MCL_FUTURE)
goto out;
- for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
+ list_for_each_entry(vma, ¤t->mm->mm_vmas, vm_list) {
unsigned int newflags;
newflags = vma->vm_flags | VM_LOCKED;
@@ -190,6 +190,7 @@ static int do_mlockall(int flags)
/* Ignore errors */
mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
+ vma = prev;
}
out:
return 0;
--- linux-2.6.23-rc9.orig/mm/mmap.c
+++ linux-2.6.23-rc9/mm/mmap.c
@@ -35,7 +35,7 @@
#define arch_mmap_check(addr, len, flags) (0)
#endif
-static void unmap_region(struct mm_struct *mm,
+static void unmap_region(struct mm_struct *mm, struct list_head *vmas,
struct vm_area_struct *vma, struct vm_area_struct *prev,
unsigned long start, unsigned long end);
@@ -220,18 +220,17 @@ void unlink_file_vma(struct vm_area_stru
/*
* Close a vm structure and free it, returning the next.
*/
-static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
+static void remove_vma(struct vm_area_struct *vma)
{
- struct vm_area_struct *next = vma->vm_next;
-
might_sleep();
+ list_del(&vma->vm_list);
if (vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
if (vma->vm_file)
fput(vma->vm_file);
mpol_free(vma_policy(vma));
kmem_cache_free(vm_area_cachep, vma);
- return next;
+ return;
}
asmlinkage unsigned long sys_brk(unsigned long brk)
@@ -316,11 +315,9 @@ void validate_mm(struct mm_struct *mm)
{
int bug = 0;
int i = 0;
- struct vm_area_struct *tmp = mm->mmap;
- while (tmp) {
- tmp = tmp->vm_next;
+ struct vm_area_struct *vma;
+ list_for_each_entry(vma, &mm->mm_vmas, vm_list)
i++;
- }
if (i != mm->map_count)
printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1;
i = browse_rb(&mm->mm_rb);
@@ -374,15 +371,14 @@ __vma_link_list(struct mm_struct *mm, st
struct vm_area_struct *prev, struct rb_node *rb_parent)
{
if (prev) {
- vma->vm_next = prev->vm_next;
- prev->vm_next = vma;
+ list_add(&vma->vm_list, &prev->vm_list);
} else {
- mm->mmap = vma;
- if (rb_parent)
- vma->vm_next = rb_entry(rb_parent,
+ if (rb_parent) {
+ struct vm_area_struct *next = rb_entry(rb_parent,
struct vm_area_struct, vm_rb);
- else
- vma->vm_next = NULL;
+ list_add_tail(&vma->vm_list, &next->vm_list);
+ } else
+ list_add(&vma->vm_list, &mm->mm_vmas);
}
}
@@ -472,7 +468,7 @@ static inline void
__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev)
{
- prev->vm_next = vma->vm_next;
+ list_del(&vma->vm_list);
rb_erase(&vma->vm_rb, &mm->mm_rb);
if (mm->mmap_cache == vma)
mm->mmap_cache = prev;
@@ -489,7 +485,7 @@ void vma_adjust(struct vm_area_struct *v
unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
{
struct mm_struct *mm = vma->vm_mm;
- struct vm_area_struct *next = vma->vm_next;
+ struct vm_area_struct *next = vma_next(vma);
struct vm_area_struct *importer = NULL;
struct address_space *mapping = NULL;
struct prio_tree_root *root = NULL;
@@ -630,7 +626,7 @@ again: remove_next = 1 + (end > next->
* up the code too much to do both in one go.
*/
if (remove_next == 2) {
- next = vma->vm_next;
+ next = vma_next(vma);
goto again;
}
}
@@ -751,13 +747,10 @@ struct vm_area_struct *vma_merge(struct
if (vm_flags & VM_SPECIAL)
return NULL;
- if (prev)
- next = prev->vm_next;
- else
- next = mm->mmap;
+ next = __vma_next(&mm->mm_vmas, prev);
area = next;
if (next && next->vm_end == end) /* cases 6, 7, 8 */
- next = next->vm_next;
+ next = vma_next(next);
/*
* Can it merge with the predecessor?
@@ -816,7 +809,7 @@ struct anon_vma *find_mergeable_anon_vma
struct vm_area_struct *near;
unsigned long vm_flags;
- near = vma->vm_next;
+ near = vma_next(vma);
if (!near)
goto try_prev;
@@ -899,6 +892,7 @@ unsigned long do_mmap_pgoff(struct file
int error;
int accountable = 1;
unsigned long reqprot = prot;
+ LIST_HEAD(vmas);
/*
* Does the application expect PROT_READ to imply PROT_EXEC?
@@ -1075,6 +1069,7 @@ unsigned long mmap_region(struct file *f
struct rb_node **rb_link, *rb_parent;
unsigned long charged = 0;
struct inode *inode = file ? file->f_path.dentry->d_inode : NULL;
+ LIST_HEAD(vmas);
/* Clear old maps */
error = -ENOMEM;
@@ -1210,7 +1205,8 @@ unmap_and_free_vma:
fput(file);
/* Undo any partial mapping done by a device driver. */
- unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
+ list_add(&vma->vm_list, &vmas);
+ unmap_region(mm, &vmas, vma, prev, vma->vm_start, vma->vm_end);
charged = 0;
free_vma:
kmem_cache_free(vm_area_cachep, vma);
@@ -1261,7 +1257,7 @@ arch_get_unmapped_area(struct file *filp
}
full_search:
- for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
+ for (vma = find_vma(mm, addr); ; vma = vma_next(vma)) {
/* At this point: (!vma || addr < vma->vm_end). */
if (TASK_SIZE - len < addr) {
/*
@@ -1472,14 +1468,11 @@ struct vm_area_struct *
find_vma_prev(struct mm_struct *mm, unsigned long addr,
struct vm_area_struct **pprev)
{
- struct vm_area_struct *vma = NULL, *prev = NULL;
+ struct vm_area_struct *prev = NULL, *next;
struct rb_node * rb_node;
if (!mm)
goto out;
- /* Guard against addr being lower than the first VMA */
- vma = mm->mmap;
-
/* Go through the RB tree quickly. */
rb_node = mm->mm_rb.rb_node;
@@ -1491,7 +1484,8 @@ find_vma_prev(struct mm_struct *mm, unsi
rb_node = rb_node->rb_left;
} else {
prev = vma_tmp;
- if (!prev->vm_next || (addr < prev->vm_next->vm_end))
+ next = __vma_next(&mm->mm_vmas, prev);
+ if (!next || (addr < next->vm_end))
break;
rb_node = rb_node->rb_right;
}
@@ -1499,7 +1493,7 @@ find_vma_prev(struct mm_struct *mm, unsi
out:
*pprev = prev;
- return prev ? prev->vm_next : vma;
+ return __vma_next(&mm->mm_vmas, prev);
}
/*
@@ -1707,18 +1701,21 @@ find_extend_vma(struct mm_struct * mm, u
*
* Called with the mm semaphore held.
*/
-static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
+static void remove_vma_list(struct mm_struct *mm, struct list_head *vmas,
+ struct vm_area_struct *vma)
{
/* Update high watermark before we lower total_vm */
update_hiwater_vm(mm);
do {
+ struct vm_area_struct *next = __vma_next(vmas, vma);
long nrpages = vma_pages(vma);
mm->total_vm -= nrpages;
if (vma->vm_flags & VM_LOCKED)
mm->locked_vm -= nrpages;
vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
- vma = remove_vma(vma);
+ remove_vma(vma);
+ vma = next;
} while (vma);
validate_mm(mm);
}
@@ -1728,21 +1725,22 @@ static void remove_vma_list(struct mm_st
*
* Called with the mm semaphore held.
*/
-static void unmap_region(struct mm_struct *mm,
+static void unmap_region(struct mm_struct *mm, struct list_head *vmas,
struct vm_area_struct *vma, struct vm_area_struct *prev,
unsigned long start, unsigned long end)
{
- struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
+ struct vm_area_struct *next = __vma_next(&mm->mm_vmas, prev);
struct mmu_gather *tlb;
unsigned long nr_accounted = 0;
lru_add_drain();
tlb = tlb_gather_mmu(mm, 0);
update_hiwater_rss(mm);
- unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
+ unmap_vmas(&tlb, vmas, vma, start, end, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
- free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
- next? next->vm_start: 0);
+ free_pgtables(&tlb, vmas, vma,
+ prev ? prev->vm_end : FIRST_USER_ADDRESS,
+ next ? next->vm_start : 0);
tlb_finish_mmu(tlb, start, end);
}
@@ -1752,21 +1750,17 @@ static void unmap_region(struct mm_struc
*/
static void
detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
- struct vm_area_struct *prev, unsigned long end)
+ struct vm_area_struct *prev, unsigned long end, struct list_head *vmas)
{
- struct vm_area_struct **insertion_point;
- struct vm_area_struct *tail_vma = NULL;
unsigned long addr;
- insertion_point = (prev ? &prev->vm_next : &mm->mmap);
do {
+ struct vm_area_struct *next = vma_next(vma);
rb_erase(&vma->vm_rb, &mm->mm_rb);
mm->map_count--;
- tail_vma = vma;
- vma = vma->vm_next;
+ list_move_tail(&vma->vm_list, vmas);
+ vma = next;
} while (vma && vma->vm_start < end);
- *insertion_point = vma;
- tail_vma->vm_next = NULL;
if (mm->unmap_area == arch_unmap_area)
addr = prev ? prev->vm_end : mm->mmap_base;
else
@@ -1836,6 +1830,7 @@ int do_munmap(struct mm_struct *mm, unsi
{
unsigned long end;
struct vm_area_struct *vma, *prev, *last;
+ LIST_HEAD(vmas);
if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
return -EINVAL;
@@ -1875,16 +1870,16 @@ int do_munmap(struct mm_struct *mm, unsi
if (error)
return error;
}
- vma = prev? prev->vm_next: mm->mmap;
+ vma = __vma_next(&mm->mm_vmas, prev);
/*
* Remove the vma's, and unmap the actual pages
*/
- detach_vmas_to_be_unmapped(mm, vma, prev, end);
- unmap_region(mm, vma, prev, start, end);
+ detach_vmas_to_be_unmapped(mm, vma, prev, end, &vmas);
+ unmap_region(mm, &vmas, vma, prev, start, end);
/* Fix up all other VM information */
- remove_vma_list(mm, vma);
+ remove_vma_list(mm, &vmas, vma);
return 0;
}
@@ -2021,7 +2016,9 @@ EXPORT_SYMBOL(do_brk);
void exit_mmap(struct mm_struct *mm)
{
struct mmu_gather *tlb;
- struct vm_area_struct *vma = mm->mmap;
+ LIST_HEAD(vmas);
+ struct vm_area_struct *vma = __vma_next(&mm->mm_vmas, NULL);
+ struct vm_area_struct *next;
unsigned long nr_accounted = 0;
unsigned long end;
@@ -2030,22 +2027,23 @@ void exit_mmap(struct mm_struct *mm)
lru_add_drain();
flush_cache_mm(mm);
+ detach_vmas_to_be_unmapped(mm, vma, NULL, -1, &vmas);
tlb = tlb_gather_mmu(mm, 1);
/* Don't update_hiwater_rss(mm) here, do_exit already did */
/* Use -1 here to ensure all VMAs in the mm are unmapped */
- end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
+ end = unmap_vmas(&tlb, &vmas, vma, 0, -1, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
- free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
+ free_pgtables(&tlb, &vmas, vma, FIRST_USER_ADDRESS, 0);
tlb_finish_mmu(tlb, 0, end);
/*
* Walk the list again, actually closing and freeing it,
* with preemption enabled, without holding any MM locks.
*/
- while (vma)
- vma = remove_vma(vma);
+ list_for_each_entry_safe(vma, next, &vmas, vm_list)
+ remove_vma(vma);
- BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
+ /* BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); */
}
/* Insert vm structure into process list sorted by address
--- linux-2.6.23-rc9.orig/mm/mprotect.c
+++ linux-2.6.23-rc9/mm/mprotect.c
@@ -302,7 +302,7 @@ sys_mprotect(unsigned long start, size_t
if (nstart >= end)
goto out;
- vma = prev->vm_next;
+ vma = vma_next(prev);
if (!vma || vma->vm_start != nstart) {
error = -ENOMEM;
goto out;
--- linux-2.6.23-rc9.orig/mm/mremap.c
+++ linux-2.6.23-rc9/mm/mremap.c
@@ -226,7 +226,7 @@ static unsigned long move_vma(struct vm_
if (excess) {
vma->vm_flags |= VM_ACCOUNT;
if (split)
- vma->vm_next->vm_flags |= VM_ACCOUNT;
+ vma_next(vma)->vm_flags |= VM_ACCOUNT;
}
if (vm_flags & VM_LOCKED) {
@@ -360,8 +360,9 @@ unsigned long do_mremap(unsigned long ad
!((flags & MREMAP_FIXED) && (addr != new_addr)) &&
(old_len != new_len || !(flags & MREMAP_MAYMOVE))) {
unsigned long max_addr = TASK_SIZE;
- if (vma->vm_next)
- max_addr = vma->vm_next->vm_start;
+ struct vm_area_struct *next = vma_next(vma);
+ if (next)
+ max_addr = next->vm_start;
/* can we just expand the current mapping? */
if (max_addr - addr >= new_len) {
int pages = (new_len - old_len) >> PAGE_SHIFT;
--- linux-2.6.23-rc9.orig/mm/msync.c
+++ linux-2.6.23-rc9/mm/msync.c
@@ -93,7 +93,7 @@ asmlinkage long sys_msync(unsigned long
error = 0;
goto out_unlock;
}
- vma = vma->vm_next;
+ vma = vma_next(vma);
}
}
out_unlock:
--- linux-2.6.23-rc9.orig/mm/swapfile.c
+++ linux-2.6.23-rc9/mm/swapfile.c
@@ -626,7 +626,7 @@ static int unuse_mm(struct mm_struct *mm
down_read(&mm->mmap_sem);
lock_page(page);
}
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ list_for_each_entry(vma, &mm->mm_vmas, vm_list) {
if (vma->anon_vma && unuse_vma(vma, entry, page))
break;
}
--
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH/RFC 2/9] lib: RCU friendly B+tree
2007-10-22 10:45 [PATCH/RFC 0/9] VMA lookup with RCU Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 1/9] Data structure changes Vaidyanathan Srinivasan
@ 2007-10-22 10:45 ` Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 3/9] mm: use the B+tree for vma lookups Vaidyanathan Srinivasan
` (6 subsequent siblings)
8 siblings, 0 replies; 10+ messages in thread
From: Vaidyanathan Srinivasan @ 2007-10-22 10:45 UTC (permalink / raw)
To: linux-mm; +Cc: Alexis Bruemmer, Peter Zijlstra
[-- Attachment #1: 2_btree.patch --]
[-- Type: text/plain, Size: 33151 bytes --]
Introduce an RCU friendly B+tree in lib
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
include/linux/btree.h | 151 ++++++
init/main.c | 2
lib/Makefile | 3
lib/btree.c | 1189 ++++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 1344 insertions(+), 1 deletion(-)
--- /dev/null
+++ linux-2.6.23-rc6/include/linux/btree.h
@@ -0,0 +1,151 @@
+#ifndef _LINUX_BTREE_H
+#define _LINUX_BTREE_H
+
+#define BTREE_DEBUG 1
+
+#include <linux/mempool.h>
+#include <linux/log2.h>
+#include <linux/rcupdate.h>
+#include <asm/cache.h>
+#include <asm/bitops.h>
+
+struct btree_item {
+ unsigned long key;
+ void *data;
+};
+
+#define BTREE_ITEM_CACHELINE (L1_CACHE_BYTES / sizeof(struct btree_item))
+#define BTREE_ITEM_MAX (BTREE_ITEM_CACHELINE < 16 ? 16 : BTREE_ITEM_CACHELINE)
+#define BTREE_ITEM_HALF (BTREE_ITEM_MAX/2)
+
+struct btree_node {
+ struct btree_item item[BTREE_ITEM_MAX];
+};
+
+/*
+ * Max depth of the tree.
+ * Assumes BTREE_ITEM_MAX is power of two.
+ */
+#define BTREE_MAX_DEPTH \
+ (1 + DIV_ROUND_UP(BITS_PER_LONG, ilog2(BTREE_ITEM_MAX)))
+
+/*
+ * Max number of nodes to be RCUed per modification.
+ */
+#define BTREE_NODE_REPLACE (2 * BTREE_MAX_DEPTH)
+
+struct btree_freevec {
+ struct rcu_head rcu_head;
+ int pos;
+ void *slot[BTREE_NODE_REPLACE];
+};
+
+struct btree_root {
+ struct btree_node *root;
+ int height;
+ mempool_t *mempool;
+ struct btree_freevec *freevec;
+ gfp_t gfp_mask;
+ void (*flush)(struct btree_freevec *);
+};
+
+#ifdef BTREE_DEBUG
+void btree_print(struct btree_root *);
+#else
+#define btree_print(root) do { } while (0)
+#endif
+
+static inline void *btree_item_deref(struct btree_item *itemp)
+{
+ void *ptr = rcu_dereference(itemp->data);
+ __clear_bit(0, (void *)&ptr);
+ return ptr;
+}
+
+static inline unsigned long btree_item_key(struct btree_item *itemp)
+{
+ unsigned long key = itemp->key;
+ /*
+ * match the smp_wmb() in btree_item_key_set()
+ */
+ smp_rmb();
+ return key;
+}
+
+extern struct btree_item *btree_search_next(struct btree_root *root,
+ unsigned long index, struct btree_item **nextp);
+
+static inline void *btree_lookup(struct btree_root *root, unsigned long index)
+{
+ struct btree_item *item;
+
+ item = btree_search_next(root, index, NULL);
+ if (!item || btree_item_key(item) != index)
+ return NULL;
+
+ return item->data;
+}
+
+static inline void *btree_lookup_next(struct btree_root *root,
+ unsigned long index, void **nextp)
+{
+ struct btree_item *next = NULL, *item;
+
+ item = btree_search_next(root, index, &next);
+ if (next)
+ *nextp = next->data;
+ if (!item || btree_item_key(item) != index)
+ return NULL;
+
+ return item->data;
+}
+
+static inline void *btree_stab(struct btree_root *root, unsigned long index)
+{
+ struct btree_item *item;
+
+ item = btree_search_next(root, index, NULL);
+ if (!item)
+ return NULL;
+
+ return item->data;
+}
+
+static inline void *btree_stab_next(struct btree_root *root,
+ unsigned long index, void **nextp)
+{
+ struct btree_item *next = NULL, *item;
+
+ item = btree_search_next(root, index, &next);
+ if (next)
+ *nextp = next->data;
+ if (!item)
+ return NULL;
+
+ return item->data;
+}
+
+extern int btree_preload(struct btree_root *, gfp_t);
+extern int btree_insert(struct btree_root *, unsigned long, void *);
+extern int btree_update(struct btree_root *, unsigned long, unsigned long);
+extern void *btree_remove(struct btree_root *, unsigned long);
+
+extern void btree_root_init(struct btree_root *, gfp_t);
+extern void btree_root_destroy(struct btree_root *);
+
+extern void btree_freevec_flush(struct btree_freevec *);
+
+#define BTREE_INIT_FLUSH(gfp, f) (struct btree_root){ \
+ .root = NULL, \
+ .height = 0, \
+ .mempool = NULL, \
+ .freevec = NULL, \
+ .gfp_mask = (gfp), \
+ .flush = (f), \
+}
+
+#define BTREE_INIT(gfp) BTREE_INIT_FLUSH(gfp, btree_freevec_flush)
+
+extern void __init btree_init(void);
+
+#endif /* _LINUX_BTREE_H */
--- linux-2.6.23-rc6.orig/init/main.c
+++ linux-2.6.23-rc6/init/main.c
@@ -55,6 +55,7 @@
#include <linux/pid_namespace.h>
#include <linux/device.h>
#include <linux/kthread.h>
+#include <linux/btree.h>
#include <asm/io.h>
#include <asm/bugs.h>
@@ -613,6 +614,7 @@ asmlinkage void __init start_kernel(void
cpuset_init_early();
mem_init();
kmem_cache_init();
+ btree_init();
setup_per_cpu_pageset();
numa_policy_init();
if (late_time_init)
--- linux-2.6.23-rc6.orig/lib/Makefile
+++ linux-2.6.23-rc6/lib/Makefile
@@ -5,7 +5,8 @@
lib-y := ctype.o string.o vsprintf.o kasprintf.o cmdline.o \
rbtree.o radix-tree.o dump_stack.o \
idr.o int_sqrt.o bitmap.o extable.o prio_tree.o \
- sha1.o irq_regs.o reciprocal_div.o argv_split.o
+ sha1.o irq_regs.o reciprocal_div.o argv_split.o \
+ btree.o
lib-$(CONFIG_MMU) += ioremap.o
lib-$(CONFIG_SMP) += cpumask.o
--- /dev/null
+++ linux-2.6.23-rc6/lib/btree.c
@@ -0,0 +1,1189 @@
+/*
+ * Copyright 2007, Red Hat Inc. Peter Zijlstra <pzijlstr@redhat.com>
+ * GPLv2
+ *
+ * Something that started out as an RCU friendly B+tree.
+ *
+ * Contrairy to most B-trees the nodes have an even number of slots. This can
+ * be done because we carry all the children in the leafs and thus we don't
+ * need to push one up on a split. We can just duplicate the first.
+ *
+ * The inner nodes are basically an N-way space partition.
+ *
+ * Another difference to the typical B+tree is that this implementation does
+ * not thread the leafs, this is left up to the user if so desired.
+ *
+ *
+ * [------------------------------>[------------------------------>
+ * | |
+ * [------------------->[--------->[-------------->[-------------->
+ * | | | |
+ * [..) [..) [..) [..) [..) [..) [..) [..) [..) [..) [..) [..)
+ *
+ *
+ */
+#include <linux/btree.h>
+#include <linux/slab.h>
+#include <linux/log2.h>
+#include <linux/err.h>
+
+#ifdef BTREE_DEBUG
+void btree_print(struct btree_root *root);
+int btree_validate(struct btree_root *root);
+#else
+#define btree_print(root) do { } while (0)
+#define btree_validate(root) (0)
+#endif
+
+static struct kmem_cache *btree_cachep;
+
+static inline void btree_free(struct btree_root *root, void *ptr)
+{
+ root->freevec->slot[root->freevec->pos++] = ptr;
+}
+
+static inline void btree_ptr_flip(struct btree_root *root,
+ void **nodep, void *new)
+{
+ void *old = rcu_dereference(*nodep);
+ rcu_assign_pointer(*nodep, new);
+ __clear_bit(0, (void *)&old);
+ btree_free(root, old);
+}
+
+/*
+ * node pointers have bit 0 set.
+ */
+static inline int btree_item_node(struct btree_item *itemp)
+{
+ return test_bit(0, (void *)&itemp->data);
+}
+
+static inline struct btree_node *btree_node_ptr(struct btree_node *nodep)
+{
+ __set_bit(0, (void *)&nodep);
+ return nodep;
+}
+
+static inline void btree_item_key_set(struct btree_item *itemp,
+ unsigned long index)
+{
+ /*
+ * we can only tighten when the new node is linked in
+ */
+ smp_wmb();
+ itemp->key = index;
+}
+
+static void btree_item_flip(struct btree_root *root,
+ struct btree_item *itemp, struct btree_node *new)
+{
+ unsigned long index;
+
+ btree_ptr_flip(root, &itemp->data, btree_node_ptr(new));
+
+ /* possibly tighten */
+ index = btree_item_key(&new->item[0]);
+ if (btree_item_key(itemp) != index)
+ btree_item_key_set(itemp, index);
+}
+
+static inline void btree_item_copy(void *dst, void *src, int nr)
+{
+ memcpy(dst, src, nr*sizeof(struct btree_item));
+}
+
+static inline int btree_item_count(struct btree_node *node)
+{
+ int j;
+
+ for (j = 0; j < BTREE_ITEM_MAX &&
+ node->item[j].data ; j++)
+ ;
+
+ return j;
+}
+
+static struct btree_node *btree_node_alloc(struct btree_root *root)
+{
+ struct btree_node *node;
+
+ node = mempool_alloc(root->mempool, root->gfp_mask);
+ memset(node, 0, sizeof(struct btree_node));
+
+ return node;
+}
+
+static inline void btree_item_free(struct btree_root *root,
+ struct btree_item *itemp)
+{
+ if (btree_item_node(itemp))
+ btree_free(root, btree_item_deref(itemp));
+}
+
+/*
+ * node storage; expose interface to claim all the needed memory in advance
+ * this avoids getting stuck in a situation where going bad is as impossible
+ * as going forward.
+ */
+int btree_preload(struct btree_root *root, gfp_t gfp_mask)
+{
+ int ret = 0;
+
+ if (!root->mempool) {
+ root->mempool = mempool_create_slab_pool(0, btree_cachep);
+ if (!root->mempool)
+ return -ENOMEM;
+ }
+
+ if (!root->freevec) {
+ root->freevec = kzalloc(sizeof(struct btree_freevec), gfp_mask);
+ if (!root->freevec)
+ return -ENOMEM;
+ }
+
+ ret = mempool_resize(root->mempool, 1+2*root->height, gfp_mask);
+
+ return ret;
+}
+
+static int btree_mod_init(struct btree_root *root)
+{
+ return btree_preload(root, root->gfp_mask);
+}
+
+static int btree_mod_finish(struct btree_root *root, unsigned long index,
+ char *op)
+{
+ int ret = 0;
+ if (root->mempool)
+ ret = mempool_resize(root->mempool, 1, root->gfp_mask);
+
+ if (btree_validate(root)) {
+ printk(KERN_DEBUG "modified(%s): %lu\n", op, index);
+ btree_print(root);
+ BUG();
+ }
+
+ root->flush(root->freevec);
+ root->freevec = NULL;
+
+ return ret;
+}
+
+/*
+ * stack, to store traversal path.
+ */
+struct btree_path {
+ struct btree_node *node;
+ int offset;
+};
+
+struct btree_stack {
+ int p;
+ struct btree_path path[BTREE_MAX_DEPTH];
+};
+
+static inline void btree_stack_init(struct btree_stack *stack)
+{
+ memset(stack, 0, sizeof(*stack));
+}
+
+static inline void btree_stack_push(struct btree_stack *stack,
+ struct btree_node *node, int offset)
+{
+ stack->path[stack->p].node = node;
+ stack->path[stack->p].offset = offset;
+ stack->p++;
+
+ BUG_ON(stack->p > ARRAY_SIZE(stack->path));
+}
+
+static inline void btree_stack_pop(struct btree_stack *stack,
+ struct btree_node **nodep, int *offsetp)
+{
+ stack->p--;
+ *nodep = stack->path[stack->p].node;
+ *offsetp = stack->path[stack->p].offset;
+
+ BUG_ON(stack->p < 0);
+}
+
+static inline int btree_stack_size(struct btree_stack *stack)
+{
+ return stack->p;
+}
+
+static struct btree_node *btree_stack_peek_left(struct btree_stack *stack)
+{
+ struct btree_node *node;
+ struct btree_item *item;
+ int offset;
+
+ if (!btree_stack_size(stack))
+ return NULL;
+
+ node = stack->path[stack->p - 1].node;
+ offset = stack->path[stack->p - 1].offset;
+
+ if (offset == 0)
+ return NULL;
+
+ offset--;
+ item = &node->item[offset];
+ return btree_item_deref(item);
+}
+
+static struct btree_node *btree_stack_peek_right(struct btree_stack *stack)
+{
+ struct btree_node *node;
+ struct btree_item *item;
+ int offset;
+
+ if (!btree_stack_size(stack))
+ return NULL;
+
+ node = stack->path[stack->p - 1].node;
+ offset = stack->path[stack->p - 1].offset;
+
+ offset++;
+ if (offset == BTREE_ITEM_MAX)
+ return NULL;
+
+ item = &node->item[offset];
+ return btree_item_deref(item);
+}
+
+/* ------ search ------- */
+
+/*
+ * Since we can re-adjust the space partitioning in delete and update a
+ * lockless lookup might hit a wrong branch once in a while, hence we might
+ * need to backtrack.
+ *
+ * Because we do a search for a less than or equal index we can only backtrack
+ * to the left. So the split can be off to the left but never to the right.
+ * Hence we need to move splits left on descend and right on ascend of the
+ * tree. See update and remove.
+ *
+ * NOTE the backtrack should be limited to one entry so worst time is:
+ * O(2*log(n)-1)
+ */
+
+/*
+ * find which branch to take
+ */
+static inline
+int __btree_item_search(struct btree_root *root,
+ struct btree_node *node, unsigned long index)
+{
+ int i;
+
+ for (i = 1; i < BTREE_ITEM_MAX; i++)
+ if (node->item[i].data == NULL ||
+ index < btree_item_key(&node->item[i]))
+ break;
+ i--;
+ return i;
+}
+
+/*
+ * find item and the path to it.
+ */
+static struct btree_item *__btree_search(struct btree_root *root,
+ struct btree_stack *stack, unsigned long index)
+{
+ struct btree_node *node = rcu_dereference(root->root);
+ struct btree_item *item;
+ int offset;
+
+ btree_stack_init(stack);
+
+ if (!node)
+ return NULL;
+
+ do {
+ /*
+ * if the split was too low, back-track and take the previous
+ * branch
+ */
+ if (unlikely(btree_item_key(&node->item[0]) > index)) {
+ if (!btree_stack_size(stack))
+ return NULL;
+
+ do {
+ btree_stack_pop(stack, &node, &offset);
+ } while (btree_stack_size(stack) && offset == 0);
+
+ if (!btree_stack_size(stack) && offset == 0)
+ return NULL;
+
+ offset--;
+ } else
+ offset = __btree_item_search(root, node, index);
+
+ btree_stack_push(stack, node, offset);
+
+ item = &node->item[offset];
+ if (!btree_item_node(item))
+ break;
+
+ node = btree_item_deref(item);
+ } while (node);
+
+ return item;
+}
+
+/*
+ * find the left-most item in a sub-tree, and the path to it.
+ */
+static struct btree_item *__btree_find_first(struct btree_root *root,
+ struct btree_stack *stack, struct btree_node *node, int offset)
+{
+ struct btree_item *item;
+
+ do {
+ btree_stack_push(stack, node, offset);
+
+ item = &node->item[offset];
+ if (!btree_item_node(item))
+ break;
+
+ node = btree_item_deref(item);
+ offset = 0;
+ } while (node);
+
+ return item;
+}
+
+/*
+ * find the item with a lesser or equal index
+ * and optionally the next item.
+ */
+struct btree_item *btree_search_next(struct btree_root *root,
+ unsigned long index, struct btree_item **nextp)
+{
+ struct btree_stack stack;
+ struct btree_node *node;
+ struct btree_item *item, *next;
+ int offset;
+
+ item = __btree_search(root, &stack, index);
+
+ if (item && nextp) {
+ do {
+ btree_stack_pop(&stack, &node, &offset);
+ } while (btree_stack_size(&stack) &&
+ offset == BTREE_ITEM_MAX-1);
+
+ if (offset != BTREE_ITEM_MAX-1) {
+ offset++;
+ next = __btree_find_first(root, &stack, node, offset);
+ } else
+ next = NULL;
+
+ *nextp = next;
+ } else if (nextp) {
+ node = rcu_dereference(root->root);
+ offset = 0;
+
+ if (node) {
+ next = __btree_find_first(root, &stack, node, offset);
+ *nextp = next;
+ }
+ }
+
+ return item;
+}
+
+/* ------ insert ------- */
+
+/*
+ * insert item; split nodes when needed
+ */
+int btree_insert(struct btree_root *root,
+ unsigned long index, void *data)
+{
+ struct btree_stack stack;
+ struct btree_node *node = NULL,
+ *old_node,
+ *update,
+ *new = NULL,
+ *split = NULL,
+ *uleft,
+ *uright,
+ *left = NULL,
+ *right = NULL;
+ struct btree_item *item;
+ struct btree_item nitem = {
+ .key = index,
+ .data = data,
+ };
+ unsigned long key;
+ int i, j, n, ret = 0;
+
+ ret = btree_mod_init(root);
+ if (ret)
+ return ret;
+
+ item = __btree_search(root, &stack, index);
+ if (!item) {
+ if (!root->root) {
+ root->root = btree_node_alloc(root);
+ root->height = 1;
+ btree_stack_push(&stack, root->root, 0);
+ } else
+ __btree_find_first(root, &stack, root->root, 0);
+ } else if (btree_item_key(item) == index) {
+ ret = -EEXIST;
+ goto out;
+ }
+
+ do {
+ old_node = node;
+ btree_stack_pop(&stack, &node, &i);
+ item = &node->item[i];
+
+ update = NULL;
+ if (btree_item_node(item)) {
+ /* no change */
+ if (!new && !split && !left && !right) {
+ BUG_ON(btree_item_deref(item) != old_node);
+
+ /* possibly tighten the split on our way back up */
+ key = btree_item_key(&old_node->item[0]);
+ if (btree_item_key(item) != key)
+ btree_item_key_set(item, key);
+ continue;
+ }
+
+ /* single change */
+ if (new && !split && !left && !right) {
+ btree_item_flip(root, item, new);
+ new = NULL;
+ continue;
+ }
+
+ /* merge left */
+ if (new && !split && left && !right) {
+ update = btree_node_alloc(root);
+ btree_item_copy(update->item, node->item,
+ BTREE_ITEM_MAX);
+ update->item[i-1] = (struct btree_item){
+ .key = btree_item_key(&left->item[0]),
+ .data = btree_node_ptr(left),
+ };
+ update->item[i] = (struct btree_item){
+ .key = btree_item_key(&new->item[0]),
+ .data = btree_node_ptr(new),
+ };
+ btree_item_free(root, &node->item[i-1]);
+ btree_item_free(root, &node->item[i]);
+
+ left = NULL;
+ new = update;
+ continue;
+ }
+
+ /* merge right */
+ if (new && !split && !left && right) {
+ update = btree_node_alloc(root);
+ btree_item_copy(update->item, node->item,
+ BTREE_ITEM_MAX);
+ update->item[i] = (struct btree_item){
+ .key = btree_item_key(&new->item[0]),
+ .data = btree_node_ptr(new),
+ };
+ update->item[i+1] = (struct btree_item){
+ .key = btree_item_key(&right->item[0]),
+ .data = btree_node_ptr(right),
+ };
+ btree_item_free(root, &node->item[i]);
+ btree_item_free(root, &node->item[i+1]);
+
+ new = update;
+ right = NULL;
+ continue;
+ }
+
+ /* split */
+ BUG_ON(left || right);
+ update = new;
+ }
+
+ if (btree_item_deref(item) &&
+ btree_item_key(item) < nitem.key)
+ i++;
+
+ split = NULL;
+ left = NULL;
+ right = NULL;
+ new = btree_node_alloc(root);
+ /* insert has room */
+ if (node->item[BTREE_ITEM_MAX-1].data == NULL) {
+ btree_item_copy(new->item, node->item, i);
+ new->item[i] = nitem;
+ btree_item_copy(new->item + i + 1, node->item + i,
+ BTREE_ITEM_MAX - i - 1);
+
+ if (update)
+ btree_item_flip(root, &new->item[i-1], update);
+
+ continue;
+ }
+
+#if 0 /* there be dragons here ! */
+
+ /* insert overflows */
+
+ /* see if left sibling will take some overflow */
+ uleft = btree_stack_peek_left(&stack);
+ if (uleft && uleft->item[BTREE_ITEM_MAX-1].data == NULL) {
+
+ j = btree_item_count(uleft);
+ n = (BTREE_ITEM_MAX - j + 1)/2;
+
+ left = btree_node_alloc(root);
+ /*
+ * LLLLl... CCCCcccc
+ * LLLLlC.. CCCcc*cc
+ */
+ btree_item_copy(left->item, uleft->item, j);
+ if (i < n) {
+ btree_item_copy(left->item+j, node->item, i);
+ left->item[j+i] = nitem;
+ btree_item_copy(left->item+j+i+1, node->item+i,
+ n-i);
+ btree_item_copy(new->item, node->item+n,
+ BTREE_ITEM_MAX-n);
+ } else {
+ btree_item_copy(left->item+j, node->item, n);
+ btree_item_copy(new->item, node->item+n, i-n);
+ new->item[i-n] = nitem;
+ btree_item_copy(new->item+i-n+1, node->item+i,
+ BTREE_ITEM_MAX-i);
+ }
+
+ if (update) {
+ if (i-1 < n)
+ btree_item_flip(root,
+ &left->item[j+i-1],
+ update);
+ else
+ btree_item_flip(root, &new->item[i-n-1],
+ update);
+ }
+
+ continue;
+ }
+
+ /* see if right sibling will take some overflow */
+ uright = btree_stack_peek_right(&stack);
+ if (uright && uright->item[BTREE_ITEM_MAX-1].data == NULL) {
+
+ j = btree_item_count(uright);
+ n = (BTREE_ITEM_MAX - j + 1)/2;
+
+ right = btree_node_alloc(root);
+ /*
+ * CCCCcccc RRRR....
+ * CCCC*cc. ccRRRR..
+ */
+ if (i <= BTREE_ITEM_MAX-n) {
+ btree_item_copy(new->item, node->item, i);
+ new->item[i] = nitem;
+ btree_item_copy(new->item+i+1, node->item+i,
+ BTREE_ITEM_MAX-n-i);
+ btree_item_copy(right->item,
+ node->item+BTREE_ITEM_MAX-n, n);
+ btree_item_copy(right->item+n, uright->item, j);
+ } else {
+ btree_item_copy(new->item, node->item,
+ BTREE_ITEM_MAX-n);
+
+ btree_item_copy(right->item,
+ node->item+(BTREE_ITEM_MAX-n),
+ i-(BTREE_ITEM_MAX-n));
+ right->item[i-(BTREE_ITEM_MAX-n)] = nitem;
+ btree_item_copy(right->item+i-(BTREE_ITEM_MAX-n)+1,
+ node->item+i, BTREE_ITEM_MAX-i);
+ btree_item_copy(right->item+n+1, uright->item,
+ j);
+ }
+
+ if (update) {
+ if (i-1 <= BTREE_ITEM_MAX-n)
+ btree_item_flip(root, &new->item[i-1],
+ update);
+ else
+ btree_item_flip(root,
+ &right->item[i-1-BTREE_ITEM_MAX-n],
+ update);
+ }
+
+ continue;
+ }
+#endif
+
+ /* split node */
+ split = btree_node_alloc(root);
+ if (i < BTREE_ITEM_HALF) {
+ btree_item_copy(new->item, node->item, i);
+ new->item[i] = nitem;
+ btree_item_copy(new->item + i + 1, node->item + i,
+ BTREE_ITEM_HALF - i);
+ btree_item_copy(split->item,
+ node->item + BTREE_ITEM_HALF,
+ BTREE_ITEM_HALF);
+ } else {
+ btree_item_copy(new->item, node->item,
+ BTREE_ITEM_HALF);
+ btree_item_copy(split->item,
+ node->item + BTREE_ITEM_HALF,
+ i - BTREE_ITEM_HALF);
+ split->item[i - BTREE_ITEM_HALF] = nitem;
+ btree_item_copy(split->item + i - BTREE_ITEM_HALF + 1,
+ node->item + i,
+ BTREE_ITEM_MAX - i);
+ }
+
+ if (update) {
+ if (i-1 < BTREE_ITEM_HALF)
+ btree_item_flip(root, &new->item[i-1], update);
+ else
+ btree_item_flip(root,
+ &split->item[i-1-BTREE_ITEM_HALF],
+ update);
+ }
+
+ nitem.key = split->item[0].key;
+ nitem.data = btree_node_ptr(split);
+ } while (btree_stack_size(&stack));
+
+ if (new) {
+ if (!split) {
+ btree_ptr_flip(root, (void **)&root->root, new);
+ } else {
+ update = btree_node_alloc(root);
+ update->item[0] = (struct btree_item){
+ .key = new->item[0].key,
+ .data = btree_node_ptr(new),
+ };
+ update->item[1] = (struct btree_item){
+ .key = split->item[0].key,
+ .data = btree_node_ptr(split),
+ };
+ btree_ptr_flip(root, (void **)&root->root, update);
+ root->height++;
+ }
+ }
+
+out:
+ btree_mod_finish(root, index, "insert");
+ return ret;
+}
+
+/* ------ update ------- */
+
+/*
+ * update the index of an item
+ *
+ * be careful the range:
+ * [min(index, new_index), max(index, new_index]
+ * _MUST_ be free, otherwise remove and re-insert.
+ *
+ * see the search comment for lockless lookup constraints
+ */
+int btree_update(struct btree_root *root,
+ unsigned long index, unsigned long new_index)
+{
+ struct btree_stack stack;
+ struct btree_node *node = rcu_dereference(root->root);
+ struct btree_item *item;
+ unsigned long key;
+ int offset, ret = 0;
+
+ if (index == new_index)
+ return 0;
+
+ if (!node)
+ return -EINVAL;
+
+ btree_stack_init(&stack);
+
+ do {
+ offset = __btree_item_search(root, node, index);
+ btree_stack_push(&stack, node, offset);
+ item = &node->item[offset];
+ if (btree_item_node(item)) {
+ /* loosen downwards */
+ if (index > new_index && btree_item_key(item) == index)
+ btree_item_key_set(item, new_index);
+ node = btree_item_deref(item);
+ } else
+ node = NULL;
+ } while (node);
+
+ if (btree_item_key(item) == index)
+ btree_item_key_set(item, new_index);
+ else
+ ret = -EINVAL;
+
+ do {
+ btree_stack_pop(&stack, &node, &offset);
+ item = &node->item[offset];
+ if (!btree_item_node(item))
+ continue;
+
+ key = btree_item_key(item);
+ /* undo on error */
+ if (ret && index > new_index && key == new_index)
+ btree_item_key_set(item, index);
+ /* tighten upwards */
+ if (!ret && index < new_index && key == index)
+ btree_item_key_set(item, new_index);
+ } while (btree_stack_size(&stack));
+
+ if (btree_validate(root)) {
+ printk(KERN_DEBUG "btree_update: index: %lu new_index: %lu\n",
+ index, new_index);
+ btree_print(root);
+ BUG();
+ }
+
+ return 0;
+}
+
+/* ------ delete ------- */
+
+/*
+ * delete an item; borrow items or merge nodes when needed.
+ */
+void *btree_remove(struct btree_root *root, unsigned long index)
+{
+ struct btree_stack stack;
+ struct btree_node *node = NULL,
+ *old_node,
+ *left = NULL,
+ *new = NULL,
+ *right = NULL,
+ *update,
+ *uleft,
+ *uright;
+ struct btree_item *item;
+ void *ret = NULL;
+ unsigned long key;
+ int i, j, n;
+ int err;
+
+ if (!root->root)
+ return ERR_PTR(-EINVAL);
+
+ err = btree_mod_init(root);
+ if (err)
+ return ERR_PTR(err);
+
+ item = __btree_search(root, &stack, index);
+ if (!item || btree_item_key(item) != index) {
+ ret = ERR_PTR(-EINVAL);
+ goto out;
+ } else
+ ret = item->data;
+
+ do {
+ old_node = node;
+ btree_stack_pop(&stack, &node, &i);
+ item = &node->item[i];
+
+ update = NULL;
+ if (btree_item_node(item)) {
+ /* no change */
+ if (!new && !left && !right) {
+ BUG_ON(btree_item_deref(item) != old_node);
+
+ /* tighten the split on our way back up */
+ key = btree_item_key(&old_node->item[0]);
+ if (btree_item_key(item) != key)
+ btree_item_key_set(item, key);
+ continue;
+ }
+
+ /* single change */
+ if (!left && new && !right) {
+ btree_item_flip(root, &node->item[i], new);
+ new = NULL;
+ continue;
+ }
+
+ /* borrowed left */
+ if (left && new && !right) {
+ update = btree_node_alloc(root);
+ btree_item_copy(update->item, node->item,
+ BTREE_ITEM_MAX);
+ update->item[i-1] = (struct btree_item){
+ .key = btree_item_key(&left->item[0]),
+ .data = btree_node_ptr(left),
+ };
+ update->item[i] = (struct btree_item){
+ .key = btree_item_key(&new->item[0]),
+ .data = btree_node_ptr(new),
+ };
+ btree_item_free(root, &node->item[i-1]);
+ btree_item_free(root, &node->item[i]);
+
+ left = NULL;
+ new = update;
+ continue;
+ }
+
+ /* borrowed right */
+ if (!left && new && right) {
+ update = btree_node_alloc(root);
+ btree_item_copy(update->item, node->item,
+ BTREE_ITEM_MAX);
+ update->item[i] = (struct btree_item){
+ .key = btree_item_key(&new->item[0]),
+ .data = btree_node_ptr(new),
+ };
+ update->item[i+1] = (struct btree_item){
+ .key = btree_item_key(&right->item[0]),
+ .data = btree_node_ptr(right),
+ };
+ btree_item_free(root, &node->item[i]);
+ btree_item_free(root, &node->item[i+1]);
+
+ new = update;
+ right = NULL;
+ continue;
+ }
+
+ /* merged left */
+ if (left && !new && !right) {
+ update = left;
+ }
+ /* merged right */
+ else if (!left && !new && right) {
+ update = right;
+ i++;
+ }
+ /* impossible */
+ else {
+ printk("%p %p %p\n", left, new, right);
+ BUG();
+ }
+ }
+
+ /* delete */
+ left = NULL;
+ right = NULL;
+ new = btree_node_alloc(root);
+ if (node->item[BTREE_ITEM_HALF].data) {
+delete_one:
+ /*
+ * CCCxcc..
+ * CCCcc...
+ */
+ btree_item_copy(new->item, node->item, i);
+ btree_item_copy(new->item+i, node->item+i+1,
+ BTREE_ITEM_MAX-i-1);
+
+ if (update)
+ btree_item_flip(root, &new->item[i-1], update);
+
+ btree_item_free(root, &node->item[i]);
+ continue;
+ }
+
+ /* delete underflows */
+
+ /* borrow left */
+ uleft = btree_stack_peek_left(&stack);
+ if (uleft && uleft->item[BTREE_ITEM_HALF].data) {
+ left = btree_node_alloc(root);
+
+ j = btree_item_count(uleft);
+ n = (j-BTREE_ITEM_HALF+1)/2;
+
+ /*
+ * LLLLlll. CxCC....
+ * LLLLl... llCCC...
+ */
+ btree_item_copy(left->item, uleft->item, (j-n));
+ btree_item_copy(new->item, uleft->item+(j-n), n);
+ btree_item_copy(new->item+n, node->item, i);
+ btree_item_copy(new->item+n+i, node->item+i+1,
+ BTREE_ITEM_HALF-i);
+
+ if (update)
+ btree_item_flip(root, &new->item[n+i-1],
+ update);
+
+ btree_item_free(root, &node->item[i]);
+
+ continue;
+ }
+
+ /* borrow right */
+ uright = btree_stack_peek_right(&stack);
+ if (uright && uright->item[BTREE_ITEM_HALF].data) {
+ right = btree_node_alloc(root);
+
+ j = btree_item_count(uright);
+ n = (j-BTREE_ITEM_HALF+1)/2;
+
+ /*
+ * CCxC.... RRRRrrr.
+ * CCCRR... RRrrr...
+ */
+ btree_item_copy(new->item, node->item, i);
+ btree_item_copy(new->item+i, node->item+i+1,
+ BTREE_ITEM_HALF-i-1);
+ btree_item_copy(new->item+BTREE_ITEM_HALF-1,
+ uright->item, n);
+ btree_item_copy(right->item, uright->item+n,
+ BTREE_ITEM_MAX-n);
+
+ if (update)
+ btree_item_flip(root, &new->item[i-1], update);
+
+ btree_item_free(root, &node->item[i]);
+
+ continue;
+ }
+
+ /* merge left */
+ if (uleft) {
+ /*
+ * LLLL.... CCCxc...
+ * LLLLCCCc
+ */
+ btree_item_copy(new->item, uleft->item,
+ BTREE_ITEM_HALF);
+ btree_item_copy(new->item+BTREE_ITEM_HALF,
+ node->item, i);
+ btree_item_copy(new->item+BTREE_ITEM_HALF+i,
+ node->item+i+1, BTREE_ITEM_HALF-i-1);
+
+ if (update)
+ btree_item_flip(root,
+ &new->item[BTREE_ITEM_HALF+i-1],
+ update);
+
+ btree_item_free(root, &node->item[i]);
+
+ left = new;
+ new = NULL;
+ continue;
+ }
+
+ /* merge right */
+ if (uright) {
+ /*
+ * CCxCc... RRRR....
+ * CCCcRRRR
+ */
+ btree_item_copy(new->item, node->item, i);
+ btree_item_copy(new->item+i, node->item+i+1,
+ BTREE_ITEM_HALF-i-1);
+ btree_item_copy(new->item+BTREE_ITEM_HALF-1,
+ uright->item, BTREE_ITEM_HALF);
+
+ if (update)
+ btree_item_flip(root, &new->item[i-1], update);
+
+ btree_item_free(root, &node->item[i]);
+
+ /*
+ * use right to carry the new node to avoid having
+ * the same pattern as single change
+ */
+ right = new;
+ new = NULL;
+ continue;
+ }
+
+ /* only the root may underflow */
+ BUG_ON(root->root != node);
+ goto delete_one;
+
+ } while (btree_stack_size(&stack));
+
+ BUG_ON(left || right);
+
+ if (new)
+ btree_ptr_flip(root, (void **)&root->root, new);
+
+ if (!root->root->item[1].data &&
+ btree_item_node(&root->root->item[0])) {
+ btree_ptr_flip(root, (void **)&root->root,
+ btree_item_deref(&root->root->item[0]));
+ root->height--;
+ }
+
+ if (!root->root->item[0].data) {
+ btree_ptr_flip(root, (void **)&root->root, NULL);
+ root->height = 0;
+ }
+
+out:
+ btree_mod_finish(root, index, "remove");
+ return ret;
+}
+
+/* ------ */
+
+void btree_root_init(struct btree_root *root, gfp_t gfp_mask)
+{
+ *root = BTREE_INIT(gfp_mask);
+}
+
+void btree_freevec_flush(struct btree_freevec *freevec)
+{
+ int i;
+ for (i = 0; i < freevec->pos; i++)
+ kmem_cache_free(btree_cachep, freevec->slot[i]);
+ kfree(freevec);
+}
+
+void btree_root_destroy(struct btree_root *root)
+{
+ /* assume the tree is empty */
+ BUG_ON(root->height);
+ BUG_ON(root->freevec);
+ if (root->mempool)
+ mempool_destroy(root->mempool);
+}
+
+void __init btree_init(void)
+{
+ btree_cachep = KMEM_CACHE(btree_node, SLAB_HWCACHE_ALIGN);
+
+}
+
+#ifdef BTREE_DEBUG
+static void __btree_node_print(struct btree_root *root,
+ struct btree_node *node, int recurse)
+{
+ int i, j;
+ for (i = 0; i < BTREE_ITEM_MAX; i++) {
+ if (node->item[i].data) {
+ printk(KERN_DEBUG);
+ for (j=0; j<recurse; j++)
+ printk(" ");
+ if (!btree_item_node(&node->item[i])) {
+
+ printk("-> leaf: %p, item: %d, key: %lu, data: %p\n",
+ node, i, node->item[i].key, node->item[i].data);
+
+ } else {
+
+ printk("node: %p, item: %d, key: %lu, child: %p\n",
+ node, i, node->item[i].key, btree_item_deref(&node->item[i]));
+ if (recurse)
+ __btree_node_print(root, btree_item_deref(&node->item[i]), recurse+1);
+
+ }
+ }
+ }
+}
+
+void btree_node_print(struct btree_root *root, struct btree_node *node)
+{
+ printk(KERN_DEBUG "node: %p\n", node);
+ __btree_node_print(root, node, 0);
+}
+
+void btree_print(struct btree_root *root)
+{
+ printk(KERN_DEBUG "[%p] root: %p, height: %d\n",
+ root, root->root, root->height);
+ if (root->root)
+ __btree_node_print(root, root->root, 1);
+}
+
+static unsigned long __btree_key(struct btree_root *root,
+ struct btree_node *node, int height)
+{
+ if (height == 0)
+ return node->item[0].key;
+
+ return __btree_key(root, btree_item_deref(&node->item[0]), height - 1);
+}
+
+static int __btree_validate(struct btree_root *root, struct btree_node *node,
+ unsigned long *pindex, int height)
+{
+ unsigned long parent_key = *pindex;
+ unsigned long node_key = 0;
+ unsigned long child_key = 0;
+
+ int i;
+ unsigned long key;
+ int nr = 0;
+ int bug = 0;
+
+ for (i = 0; i < BTREE_ITEM_MAX; i++) {
+ struct btree_node *child = btree_item_deref(&node->item[i]);
+ if (!child)
+ continue;
+
+ nr++;
+
+ key = node->item[i].key;
+ if (key < parent_key || (!i && key != parent_key) ||
+ (i && key == parent_key)) {
+ printk(KERN_DEBUG
+ "wrong parent split: key: %lu parent_key: %lu index: %d\n",
+ key, parent_key, i);
+ bug++;
+ }
+
+ if (key < node_key || (i && key == node_key)) {
+ printk(KERN_DEBUG
+ "wrong order: key: %lu node_key: %lu index: %d\n",
+ key, node_key, i);
+ bug++;
+ }
+ node_key = key;
+
+ if (key < child_key || (i && key == child_key)) {
+ printk(KERN_DEBUG
+ "wrong child split: key: %lu child_key: %lu index: %d\n",
+ key, child_key, i);
+ bug++;
+ }
+
+ child_key = max(node_key, child_key);
+
+ if (height)
+ bug += __btree_validate(root, child,
+ &child_key, height - 1);
+
+ *pindex = max(node_key, child_key);
+ }
+
+ if (node != root->root && nr < BTREE_ITEM_HALF) {
+ printk(KERN_DEBUG "node short\n");
+ bug++;
+ }
+
+ if (bug) {
+ printk(KERN_DEBUG "bug in node: %p\n", node);
+ }
+
+ return bug;
+}
+
+int btree_validate(struct btree_root *root)
+{
+ unsigned long key;
+
+ if (root->root) {
+ key = __btree_key(root, root->root, root->height - 1);
+ return __btree_validate(root, root->root, &key,
+ root->height - 1);
+ }
+
+ return 0;
+}
+#endif
--
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH/RFC 3/9] mm: use the B+tree for vma lookups
2007-10-22 10:45 [PATCH/RFC 0/9] VMA lookup with RCU Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 1/9] Data structure changes Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 2/9] lib: RCU friendly B+tree Vaidyanathan Srinivasan
@ 2007-10-22 10:45 ` Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 4/9] mm: RCU " Vaidyanathan Srinivasan
` (5 subsequent siblings)
8 siblings, 0 replies; 10+ messages in thread
From: Vaidyanathan Srinivasan @ 2007-10-22 10:45 UTC (permalink / raw)
To: linux-mm; +Cc: Alexis Bruemmer, Peter Zijlstra
[-- Attachment #1: 3_mm-btree.patch --]
[-- Type: text/plain, Size: 16893 bytes --]
---
include/linux/init_task.h | 2
include/linux/mm.h | 5 -
include/linux/sched.h | 4
kernel/fork.c | 17 +--
mm/mmap.c | 230 +++++++++++++++-------------------------------
5 files changed, 92 insertions(+), 166 deletions(-)
--- linux-2.6.23-rc6.orig/include/linux/init_task.h
+++ linux-2.6.23-rc6/include/linux/init_task.h
@@ -48,7 +48,7 @@
#define INIT_MM(name) \
{ \
.mm_vmas = LIST_HEAD_INIT(name.mm_vmas), \
- .mm_rb = RB_ROOT, \
+ .mm_btree = BTREE_INIT(GFP_ATOMIC), \
.pgd = swapper_pg_dir, \
.mm_users = ATOMIC_INIT(2), \
.mm_count = ATOMIC_INIT(1), \
--- linux-2.6.23-rc6.orig/include/linux/mm.h
+++ linux-2.6.23-rc6/include/linux/mm.h
@@ -69,8 +69,6 @@ struct vm_area_struct {
pgprot_t vm_page_prot; /* Access permissions of this VMA. */
unsigned long vm_flags; /* Flags, listed below. */
- struct rb_node vm_rb;
-
/*
* For areas with an address space and backing store,
* linkage into the address_space->i_mmap prio tree, or
@@ -1091,8 +1089,7 @@ extern struct anon_vma *find_mergeable_a
extern int split_vma(struct mm_struct *,
struct vm_area_struct *, unsigned long addr, int new_below);
extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
-extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
- struct rb_node **, struct rb_node *);
+extern void __vma_link_btree(struct mm_struct *, struct vm_area_struct *);
extern void unlink_file_vma(struct vm_area_struct *);
extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
unsigned long addr, unsigned long len, pgoff_t pgoff);
--- linux-2.6.23-rc6.orig/include/linux/sched.h
+++ linux-2.6.23-rc6/include/linux/sched.h
@@ -52,7 +52,7 @@ struct sched_param {
#include <linux/types.h>
#include <linux/timex.h>
#include <linux/jiffies.h>
-#include <linux/rbtree.h>
+#include <linux/btree.h>
#include <linux/thread_info.h>
#include <linux/cpumask.h>
#include <linux/errno.h>
@@ -368,7 +368,7 @@ extern int get_dumpable(struct mm_struct
struct mm_struct {
struct list_head mm_vmas;
- struct rb_root mm_rb;
+ struct btree_root mm_btree;
struct vm_area_struct * mmap_cache; /* last find_vma result */
unsigned long (*get_unmapped_area) (struct file *filp,
unsigned long addr, unsigned long len,
--- linux-2.6.23-rc6.orig/kernel/fork.c
+++ linux-2.6.23-rc6/kernel/fork.c
@@ -198,7 +198,6 @@ static struct task_struct *dup_task_stru
static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
{
struct vm_area_struct *mpnt, *tmp;
- struct rb_node **rb_link, *rb_parent;
int retval;
unsigned long charge;
struct mempolicy *pol;
@@ -216,9 +215,6 @@ static inline int dup_mmap(struct mm_str
mm->cached_hole_size = ~0UL;
mm->map_count = 0;
cpus_clear(mm->cpu_vm_mask);
- mm->mm_rb = RB_ROOT;
- rb_link = &mm->mm_rb.rb_node;
- rb_parent = NULL;
list_for_each_entry(mpnt, &oldmm->mm_vmas, vm_list) {
struct file *file;
@@ -270,9 +266,8 @@ static inline int dup_mmap(struct mm_str
*/
list_add_tail(&tmp->vm_list, &mm->mm_vmas);
- __vma_link_rb(mm, tmp, rb_link, rb_parent);
- rb_link = &tmp->vm_rb.rb_right;
- rb_parent = &tmp->vm_rb;
+ btree_preload(&mm->mm_btree, GFP_KERNEL|__GFP_NOFAIL); // XXX create error path
+ __vma_link_btree(mm, tmp);
mm->map_count++;
retval = copy_page_range(mm, oldmm, mpnt);
@@ -320,13 +315,19 @@ static inline void mm_free_pgd(struct mm
__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
-#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
+
+static void free_mm(struct mm_struct *mm)
+{
+ btree_root_destroy(&mm->mm_btree);
+ kmem_cache_free(mm_cachep, mm);
+}
#include <linux/init_task.h>
static struct mm_struct * mm_init(struct mm_struct * mm)
{
INIT_LIST_HEAD(&mm->mm_vmas);
+ mm->mm_btree = BTREE_INIT(GFP_ATOMIC);
atomic_set(&mm->mm_users, 1);
atomic_set(&mm->mm_count, 1);
init_rwsem(&mm->mmap_sem);
--- linux-2.6.23-rc6.orig/mm/mmap.c
+++ linux-2.6.23-rc6/mm/mmap.c
@@ -4,6 +4,7 @@
* Written by obz.
*
* Address space accounting code <alan@redhat.com>
+ * Btree adaption <pzijlstr@redhat.com>
*/
#include <linux/slab.h>
@@ -283,35 +284,6 @@ out:
}
#ifdef DEBUG_MM_RB
-static int browse_rb(struct rb_root *root)
-{
- int i = 0, j;
- struct rb_node *nd, *pn = NULL;
- unsigned long prev = 0, pend = 0;
-
- for (nd = rb_first(root); nd; nd = rb_next(nd)) {
- struct vm_area_struct *vma;
- vma = rb_entry(nd, struct vm_area_struct, vm_rb);
- if (vma->vm_start < prev)
- printk("vm_start %lx prev %lx\n", vma->vm_start, prev), i = -1;
- if (vma->vm_start < pend)
- printk("vm_start %lx pend %lx\n", vma->vm_start, pend);
- if (vma->vm_start > vma->vm_end)
- printk("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start);
- i++;
- pn = nd;
- prev = vma->vm_start;
- pend = vma->vm_end;
- }
- j = 0;
- for (nd = pn; nd; nd = rb_prev(nd)) {
- j++;
- }
- if (i != j)
- printk("backwards %d, forwards %d\n", j, i), i = 0;
- return i;
-}
-
void validate_mm(struct mm_struct *mm)
{
int bug = 0;
@@ -321,9 +293,6 @@ void validate_mm(struct mm_struct *mm)
i++;
if (i != mm->map_count)
printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1;
- i = browse_rb(&mm->mm_rb);
- if (i != mm->map_count)
- printk("map_count %d rb %d\n", mm->map_count, i), bug = 1;
BUG_ON(bug);
}
#else
@@ -332,62 +301,36 @@ void validate_mm(struct mm_struct *mm)
static struct vm_area_struct *
find_vma_prepare(struct mm_struct *mm, unsigned long addr,
- struct vm_area_struct **pprev, struct rb_node ***rb_link,
- struct rb_node ** rb_parent)
+ struct vm_area_struct **pprev)
{
- struct vm_area_struct * vma;
- struct rb_node ** __rb_link, * __rb_parent, * rb_prev;
-
- __rb_link = &mm->mm_rb.rb_node;
- rb_prev = __rb_parent = NULL;
- vma = NULL;
-
- while (*__rb_link) {
- struct vm_area_struct *vma_tmp;
-
- __rb_parent = *__rb_link;
- vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
-
- if (vma_tmp->vm_end > addr) {
- vma = vma_tmp;
- if (vma_tmp->vm_start <= addr)
- return vma;
- __rb_link = &__rb_parent->rb_left;
- } else {
- rb_prev = __rb_parent;
- __rb_link = &__rb_parent->rb_right;
- }
- }
-
- *pprev = NULL;
- if (rb_prev)
- *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
- *rb_link = __rb_link;
- *rb_parent = __rb_parent;
+ struct vm_area_struct *vma, *prev = NULL;
+ vma = btree_stab(&mm->mm_btree, addr);
+ if (!vma || addr >= vma->vm_end)
+ vma = __vma_next(&mm->mm_vmas, vma);
+ if (!(vma && addr < vma->vm_end))
+ prev = __vma_prev(&mm->mm_vmas, vma);
+ *pprev = prev;
return vma;
}
static inline void
__vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
- struct vm_area_struct *prev, struct rb_node *rb_parent)
+ struct vm_area_struct *prev)
{
- if (prev) {
+ if (!prev)
+ prev = btree_stab(&mm->mm_btree, vma->vm_start);
+
+ if (prev)
list_add(&vma->vm_list, &prev->vm_list);
- } else {
- if (rb_parent) {
- struct vm_area_struct *next = rb_entry(rb_parent,
- struct vm_area_struct, vm_rb);
- list_add_tail(&vma->vm_list, &next->vm_list);
- } else
- list_add(&vma->vm_list, &mm->mm_vmas);
- }
+ else
+ list_add(&vma->vm_list, &mm->mm_vmas);
}
-void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
- struct rb_node **rb_link, struct rb_node *rb_parent)
+void __vma_link_btree(struct mm_struct *mm, struct vm_area_struct *vma)
{
- rb_link_node(&vma->vm_rb, rb_parent, rb_link);
- rb_insert_color(&vma->vm_rb, &mm->mm_rb);
+ int err;
+ err = btree_insert(&mm->mm_btree, vma->vm_start, vma);
+ BUG_ON(err);
}
static inline void __vma_link_file(struct vm_area_struct *vma)
@@ -414,20 +357,20 @@ static inline void __vma_link_file(struc
static void
__vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
- struct vm_area_struct *prev, struct rb_node **rb_link,
- struct rb_node *rb_parent)
+ struct vm_area_struct *prev)
{
- __vma_link_list(mm, vma, prev, rb_parent);
- __vma_link_rb(mm, vma, rb_link, rb_parent);
+ __vma_link_list(mm, vma, prev);
+ __vma_link_btree(mm, vma);
__anon_vma_link(vma);
}
static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
- struct vm_area_struct *prev, struct rb_node **rb_link,
- struct rb_node *rb_parent)
+ struct vm_area_struct *prev)
{
struct address_space *mapping = NULL;
+ btree_preload(&mm->mm_btree, GFP_KERNEL|__GFP_NOFAIL); // XXX create error path
+
if (vma->vm_file)
mapping = vma->vm_file->f_mapping;
@@ -437,7 +380,7 @@ static void vma_link(struct mm_struct *m
}
anon_vma_lock(vma);
- __vma_link(mm, vma, prev, rb_link, rb_parent);
+ __vma_link(mm, vma, prev);
__vma_link_file(vma);
anon_vma_unlock(vma);
@@ -456,12 +399,7 @@ static void vma_link(struct mm_struct *m
static void
__insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
{
- struct vm_area_struct * __vma, * prev;
- struct rb_node ** rb_link, * rb_parent;
-
- __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent);
- BUG_ON(__vma && __vma->vm_start < vma->vm_end);
- __vma_link(mm, vma, prev, rb_link, rb_parent);
+ __vma_link(mm, vma, NULL);
mm->map_count++;
}
@@ -469,8 +407,21 @@ static inline void
__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev)
{
+ struct vm_area_struct *vma_tmp;
+
list_del(&vma->vm_list);
- rb_erase(&vma->vm_rb, &mm->mm_rb);
+
+ vma_tmp = btree_remove(&mm->mm_btree, vma->vm_start);
+ if (vma_tmp != vma) {
+#ifdef CONFIG_DEBUG_VM
+ printk(KERN_DEBUG
+ "btree_remove(%lu) returned %p instead of %p\n"
+ ,vma->vm_start, vma_tmp, vma);
+ btree_print(&mm->mm_btree);
+#endif
+ BUG();
+ }
+
if (mm->mmap_cache == vma)
mm->mmap_cache = prev;
}
@@ -525,6 +476,8 @@ again: remove_next = 1 + (end > next->
}
}
+ btree_preload(&mm->mm_btree, GFP_KERNEL|__GFP_NOFAIL); // XXX error path
+
if (file) {
mapping = file->f_mapping;
if (!(vma->vm_flags & VM_NONLINEAR))
@@ -576,10 +529,14 @@ again: remove_next = 1 + (end > next->
vma_prio_tree_remove(next, root);
}
+ btree_update(&mm->mm_btree, vma->vm_start, start);
vma->vm_start = start;
vma->vm_end = end;
vma->vm_pgoff = pgoff;
+
if (adjust_next) {
+ btree_update(&mm->mm_btree, next->vm_start,
+ next->vm_start + (adjust_next << PAGE_SHIFT));
next->vm_start += adjust_next << PAGE_SHIFT;
next->vm_pgoff += adjust_next;
}
@@ -1075,7 +1032,7 @@ unsigned long mmap_region(struct file *f
/* Clear old maps */
error = -ENOMEM;
munmap_back:
- vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
+ vma = find_vma_prepare(mm, addr, &prev);
if (vma && vma->vm_start < addr + len) {
if (do_munmap(mm, addr, len))
return -ENOMEM;
@@ -1176,7 +1133,7 @@ munmap_back:
if (!file || !vma_merge(mm, prev, addr, vma->vm_end,
vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) {
file = vma->vm_file;
- vma_link(mm, vma, prev, rb_link, rb_parent);
+ vma_link(mm, vma, prev);
if (correct_wcount)
atomic_inc(&inode->i_writecount);
} else {
@@ -1436,25 +1393,9 @@ struct vm_area_struct * find_vma(struct
/* (Cache hit rate is typically around 35%.) */
vma = mm->mmap_cache;
if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
- struct rb_node * rb_node;
-
- rb_node = mm->mm_rb.rb_node;
- vma = NULL;
-
- while (rb_node) {
- struct vm_area_struct * vma_tmp;
-
- vma_tmp = rb_entry(rb_node,
- struct vm_area_struct, vm_rb);
-
- if (vma_tmp->vm_end > addr) {
- vma = vma_tmp;
- if (vma_tmp->vm_start <= addr)
- break;
- rb_node = rb_node->rb_left;
- } else
- rb_node = rb_node->rb_right;
- }
+ vma = btree_stab(&mm->mm_btree, addr);
+ if (!vma || addr >= vma->vm_end)
+ vma = __vma_next(&mm->mm_vmas, vma);
if (vma)
mm->mmap_cache = vma;
}
@@ -1469,32 +1410,11 @@ struct vm_area_struct *
find_vma_prev(struct mm_struct *mm, unsigned long addr,
struct vm_area_struct **pprev)
{
- struct vm_area_struct *prev = NULL, *next;
- struct rb_node * rb_node;
- if (!mm)
- goto out;
-
- /* Go through the RB tree quickly. */
- rb_node = mm->mm_rb.rb_node;
-
- while (rb_node) {
- struct vm_area_struct *vma_tmp;
- vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
-
- if (addr < vma_tmp->vm_end) {
- rb_node = rb_node->rb_left;
- } else {
- prev = vma_tmp;
- next = __vma_next(&mm->mm_vmas, prev);
- if (!next || (addr < next->vm_end))
- break;
- rb_node = rb_node->rb_right;
- }
- }
+ struct vm_area_struct *vma;
-out:
- *pprev = prev;
- return __vma_next(&mm->mm_vmas, prev);
+ vma = find_vma(mm, addr);
+ *pprev = __vma_prev(&mm->mm_vmas, vma);
+ return vma;
}
/*
@@ -1633,6 +1553,17 @@ static inline int expand_downwards(struc
error = acct_stack_growth(vma, size, grow);
if (!error) {
+ /*
+ * This is safe even under the read lock, anon_vma_lock
+ * guards against concurrent expand_stack calls and the
+ * read lock keeps other structural changes to the
+ * btree away.
+ *
+ * btree_update is carfully constructed to shift the
+ * space partitioning within the btree in a safe manner.
+ */
+ btree_update(&vma->vm_mm->mm_btree,
+ vma->vm_start, address);
vma->vm_start = address;
vma->vm_pgoff -= grow;
}
@@ -1757,9 +1688,9 @@ detach_vmas_to_be_unmapped(struct mm_str
do {
struct vm_area_struct *next = vma_next(vma);
- rb_erase(&vma->vm_rb, &mm->mm_rb);
+ __vma_unlink(mm, vma, NULL);
mm->map_count--;
- list_move_tail(&vma->vm_list, vmas);
+ list_add_tail(&vma->vm_list, vmas);
vma = next;
} while (vma && vma->vm_start < end);
if (mm->unmap_area == arch_unmap_area)
@@ -1917,10 +1848,9 @@ static inline void verify_mm_writelocked
*/
unsigned long do_brk(unsigned long addr, unsigned long len)
{
- struct mm_struct * mm = current->mm;
- struct vm_area_struct * vma, * prev;
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma, *prev;
unsigned long flags;
- struct rb_node ** rb_link, * rb_parent;
pgoff_t pgoff = addr >> PAGE_SHIFT;
int error;
@@ -1963,7 +1893,7 @@ unsigned long do_brk(unsigned long addr,
* Clear old maps. this also does some error checking for us
*/
munmap_back:
- vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
+ vma = find_vma_prepare(mm, addr, &prev);
if (vma && vma->vm_start < addr + len) {
if (do_munmap(mm, addr, len))
return -ENOMEM;
@@ -2001,7 +1931,7 @@ unsigned long do_brk(unsigned long addr,
vma->vm_flags = flags;
vma->vm_page_prot = protection_map[flags &
(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
- vma_link(mm, vma, prev, rb_link, rb_parent);
+ vma_link(mm, vma, prev);
out:
mm->total_vm += len >> PAGE_SHIFT;
if (flags & VM_LOCKED) {
@@ -2053,8 +1983,7 @@ void exit_mmap(struct mm_struct *mm)
*/
int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
{
- struct vm_area_struct * __vma, * prev;
- struct rb_node ** rb_link, * rb_parent;
+ struct vm_area_struct *__vma, *prev;
/*
* The vm_pgoff of a purely anonymous vma should be irrelevant
@@ -2072,13 +2001,13 @@ int insert_vm_struct(struct mm_struct *
BUG_ON(vma->anon_vma);
vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
}
- __vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent);
+ __vma = find_vma_prepare(mm,vma->vm_start,&prev);
if (__vma && __vma->vm_start < vma->vm_end)
return -ENOMEM;
if ((vma->vm_flags & VM_ACCOUNT) &&
security_vm_enough_memory_mm(mm, vma_pages(vma)))
return -ENOMEM;
- vma_link(mm, vma, prev, rb_link, rb_parent);
+ vma_link(mm, vma, prev);
return 0;
}
@@ -2093,7 +2022,6 @@ struct vm_area_struct *copy_vma(struct v
unsigned long vma_start = vma->vm_start;
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *new_vma, *prev;
- struct rb_node **rb_link, *rb_parent;
struct mempolicy *pol;
/*
@@ -2103,7 +2031,7 @@ struct vm_area_struct *copy_vma(struct v
if (!vma->vm_file && !vma->anon_vma)
pgoff = addr >> PAGE_SHIFT;
- find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
+ find_vma_prepare(mm, addr, &prev);
new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
if (new_vma) {
@@ -2130,7 +2058,7 @@ struct vm_area_struct *copy_vma(struct v
get_file(new_vma->vm_file);
if (new_vma->vm_ops && new_vma->vm_ops->open)
new_vma->vm_ops->open(new_vma);
- vma_link(mm, new_vma, prev, rb_link, rb_parent);
+ vma_link(mm, new_vma, prev);
}
}
return new_vma;
--
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH/RFC 4/9] mm: RCU vma lookups
2007-10-22 10:45 [PATCH/RFC 0/9] VMA lookup with RCU Vaidyanathan Srinivasan
` (2 preceding siblings ...)
2007-10-22 10:45 ` [PATCH/RFC 3/9] mm: use the B+tree for vma lookups Vaidyanathan Srinivasan
@ 2007-10-22 10:45 ` Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 5/9] i386: rcu vma lookups for faults Vaidyanathan Srinivasan
` (4 subsequent siblings)
8 siblings, 0 replies; 10+ messages in thread
From: Vaidyanathan Srinivasan @ 2007-10-22 10:45 UTC (permalink / raw)
To: linux-mm; +Cc: Alexis Bruemmer, Peter Zijlstra
[-- Attachment #1: 4_mm-rcu.patch --]
[-- Type: text/plain, Size: 12516 bytes --]
mostly lockless vma lookup using the new b+tree
pin the vma using an atomic refcount
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
---
include/linux/init_task.h | 2
include/linux/mm.h | 9 +
kernel/fork.c | 2
mm/mmap.c | 216 +++++++++++++++++++++++++++++++++++++++++-----
4 files changed, 206 insertions(+), 23 deletions(-)
--- linux-2.6.23-rc9.orig/include/linux/init_task.h
+++ linux-2.6.23-rc9/include/linux/init_task.h
@@ -48,7 +48,7 @@
#define INIT_MM(name) \
{ \
.mm_vmas = LIST_HEAD_INIT(name.mm_vmas), \
- .mm_btree = BTREE_INIT(GFP_ATOMIC), \
+ .mm_btree = BTREE_INIT_FLUSH(GFP_ATOMIC, btree_rcu_flush), \
.pgd = swapper_pg_dir, \
.mm_users = ATOMIC_INIT(2), \
.mm_count = ATOMIC_INIT(1), \
--- linux-2.6.23-rc9.orig/include/linux/mm.h
+++ linux-2.6.23-rc9/include/linux/mm.h
@@ -104,12 +104,14 @@ struct vm_area_struct {
void * vm_private_data; /* was vm_pte (shared mem) */
unsigned long vm_truncate_count;/* truncate_count or restart_addr */
+ struct rw_semaphore vm_sem;
#ifndef CONFIG_MMU
atomic_t vm_usage; /* refcount (VMAs shared if !MMU) */
#endif
#ifdef CONFIG_NUMA
struct mempolicy *vm_policy; /* NUMA policy for the VMA */
#endif
+ struct rcu_head vm_rcu_head;
};
static inline struct vm_area_struct *
@@ -1078,6 +1080,8 @@ static inline void vma_nonlinear_insert(
}
/* mmap.c */
+extern void btree_rcu_flush(struct btree_freevec *);
+extern void free_vma(struct vm_area_struct *vma);
extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
extern void vma_adjust(struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert);
@@ -1177,6 +1181,11 @@ extern struct vm_area_struct * find_vma(
extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
struct vm_area_struct **pprev);
+extern struct vm_area_struct * __find_get_vma(struct mm_struct *mm, unsigned long addr, int *locked);
+extern struct vm_area_struct * find_get_vma(struct mm_struct *mm, unsigned long addr);
+extern void get_vma(struct vm_area_struct *vma);
+extern void put_vma(struct vm_area_struct *vma);
+
/* Look up the first VMA which intersects the interval start_addr..end_addr-1,
NULL if none. Assume start_addr < end_addr. */
static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr)
--- linux-2.6.23-rc9.orig/kernel/fork.c
+++ linux-2.6.23-rc9/kernel/fork.c
@@ -327,7 +327,7 @@ static void free_mm(struct mm_struct *mm
static struct mm_struct * mm_init(struct mm_struct * mm)
{
INIT_LIST_HEAD(&mm->mm_vmas);
- mm->mm_btree = BTREE_INIT(GFP_ATOMIC);
+ mm->mm_btree = BTREE_INIT_FLUSH(GFP_ATOMIC, btree_rcu_flush);
atomic_set(&mm->mm_users, 1);
atomic_set(&mm->mm_count, 1);
init_rwsem(&mm->mmap_sem);
--- linux-2.6.23-rc9.orig/mm/mmap.c
+++ linux-2.6.23-rc9/mm/mmap.c
@@ -40,6 +40,18 @@ static void unmap_region(struct mm_struc
struct vm_area_struct *vma, struct vm_area_struct *prev,
unsigned long start, unsigned long end);
+static void __btree_rcu_flush(struct rcu_head *head)
+{
+ struct btree_freevec *freevec =
+ container_of(head, struct btree_freevec, rcu_head);
+ btree_freevec_flush(freevec);
+}
+
+void btree_rcu_flush(struct btree_freevec *freevec)
+{
+ call_rcu(&freevec->rcu_head, __btree_rcu_flush);
+}
+
/*
* WARNING: the debugging will use recursive algorithms so never enable this
* unless you know what you are doing.
@@ -218,6 +230,28 @@ void unlink_file_vma(struct vm_area_stru
}
}
+static void __free_vma(struct rcu_head *head)
+{
+ struct vm_area_struct *vma =
+ container_of(head, struct vm_area_struct, vm_rcu_head);
+ kmem_cache_free(vm_area_cachep, vma);
+}
+
+void free_vma(struct vm_area_struct *vma)
+{
+ call_rcu(&vma->vm_rcu_head, __free_vma);
+}
+
+static void lock_vma(struct vm_area_struct *vma)
+{
+ down_write(&vma->vm_sem);
+}
+
+static void unlock_vma(struct vm_area_struct *vma)
+{
+ up_write(&vma->vm_sem);
+}
+
/*
* Close a vm structure and free it, returning the next.
*/
@@ -230,7 +264,7 @@ static void remove_vma(struct vm_area_st
if (vma->vm_file)
fput(vma->vm_file);
mpol_free(vma_policy(vma));
- kmem_cache_free(vm_area_cachep, vma);
+ free_vma(vma);
return;
}
@@ -288,8 +322,15 @@ void validate_mm(struct mm_struct *mm)
int bug = 0;
int i = 0;
struct vm_area_struct *vma;
- list_for_each_entry(vma, &mm->mm_vmas, vm_list)
+ unsigned long addr = 0UL;
+ list_for_each_entry(vma, &mm->mm_vmas, vm_list) {
+ if (addr > vma->vm_start) {
+ printk("vma list unordered: %lu %lu\n",
+ addr, vma->vm_start);
+ }
+ addr = vma->vm_start;
i++;
+ }
if (i != mm->map_count)
printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1;
BUG_ON(bug);
@@ -328,6 +369,7 @@ __vma_link_list(struct mm_struct *mm, st
void __vma_link_btree(struct mm_struct *mm, struct vm_area_struct *vma)
{
int err;
+ init_rwsem(&vma->vm_sem);
err = btree_insert(&mm->mm_btree, vma->vm_start, vma);
BUG_ON(err);
}
@@ -421,8 +463,8 @@ __vma_unlink(struct mm_struct *mm, struc
BUG();
}
- if (mm->mmap_cache == vma)
- mm->mmap_cache = prev;
+ if (rcu_dereference(mm->mmap_cache) == vma)
+ rcu_assign_pointer(mm->mmap_cache, prev);
}
/*
@@ -438,6 +480,7 @@ void vma_adjust(struct vm_area_struct *v
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *next = vma_next(vma);
struct vm_area_struct *importer = NULL;
+ struct vm_area_struct *locked = NULL;
struct address_space *mapping = NULL;
struct prio_tree_root *root = NULL;
struct file *file = vma->vm_file;
@@ -448,6 +491,14 @@ void vma_adjust(struct vm_area_struct *v
if (next && !insert) {
if (end >= next->vm_end) {
/*
+ * We need to lock the vma to force the lockless
+ * lookup into the slow path. Because there is a
+ * hole between removing the next vma and updating
+ * the current.
+ */
+ lock_vma(vma);
+ locked = vma;
+ /*
* vma expands, overlapping all the next, and
* perhaps the one after too (mprotect case 6).
*/
@@ -475,6 +526,25 @@ again: remove_next = 1 + (end > next->
}
}
+ if (insert) {
+ /*
+ * In order to make the adjust + insert look atomic wrt. the
+ * lockless lookups we need to force those into the slow path.
+ */
+ if (insert->vm_start < start) {
+ /*
+ * If the new vma is to be placed in front of the
+ * current one, we must lock the previous.
+ */
+ locked = vma_prev(vma);
+ if (!locked)
+ locked = vma;
+ } else
+ locked = vma;
+
+ lock_vma(locked);
+ }
+
btree_preload(&mm->mm_btree, GFP_KERNEL|__GFP_NOFAIL); // XXX error path
if (file) {
@@ -521,6 +591,23 @@ again: remove_next = 1 + (end > next->
}
}
+ /*
+ * Remove the next vma before updating the address of the current,
+ * because it might end up having the address of next.
+ */
+ if (remove_next) {
+ /*
+ * vma_merge has merged next into vma, and needs
+ * us to remove next before dropping the locks.
+ */
+ lock_vma(next);
+ __vma_unlink(mm, next, vma);
+ if (file)
+ __remove_shared_vm_struct(next, file, mapping);
+ if (next->anon_vma)
+ __anon_vma_merge(vma, next);
+ }
+
if (root) {
flush_dcache_mmap_lock(mapping);
vma_prio_tree_remove(vma, root);
@@ -547,17 +634,11 @@ again: remove_next = 1 + (end > next->
flush_dcache_mmap_unlock(mapping);
}
- if (remove_next) {
- /*
- * vma_merge has merged next into vma, and needs
- * us to remove next before dropping the locks.
- */
- __vma_unlink(mm, next, vma);
- if (file)
- __remove_shared_vm_struct(next, file, mapping);
- if (next->anon_vma)
- __anon_vma_merge(vma, next);
- } else if (insert) {
+ /*
+ * Insert after updating the address of the current vma, because it
+ * might end up having the previous address.
+ */
+ if (insert) {
/*
* split_vma has split insert from vma, and needs
* us to insert it before dropping the locks
@@ -576,7 +657,7 @@ again: remove_next = 1 + (end > next->
fput(file);
mm->map_count--;
mpol_free(vma_policy(next));
- kmem_cache_free(vm_area_cachep, next);
+ free_vma(next);
/*
* In mprotect's case 6 (see comments on vma_merge),
* we must remove another next too. It would clutter
@@ -588,6 +669,13 @@ again: remove_next = 1 + (end > next->
}
}
+ if (locked) {
+ /*
+ * unlock the vma, enabling lockless lookups.
+ */
+ unlock_vma(locked);
+ }
+
validate_mm(mm);
}
@@ -1142,7 +1230,7 @@ munmap_back:
fput(file);
}
mpol_free(vma_policy(vma));
- kmem_cache_free(vm_area_cachep, vma);
+ free_vma(vma);
}
out:
mm->total_vm += len >> PAGE_SHIFT;
@@ -1166,7 +1254,7 @@ unmap_and_free_vma:
unmap_region(mm, &vmas, vma, prev, vma->vm_start, vma->vm_end);
charged = 0;
free_vma:
- kmem_cache_free(vm_area_cachep, vma);
+ free_vma(vma);
unacct_error:
if (charged)
vm_unacct_memory(charged);
@@ -1390,13 +1478,13 @@ struct vm_area_struct * find_vma(struct
if (mm) {
/* Check the cache first. */
/* (Cache hit rate is typically around 35%.) */
- vma = mm->mmap_cache;
+ vma = rcu_dereference(mm->mmap_cache);
if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
vma = btree_stab(&mm->mm_btree, addr);
if (!vma || addr >= vma->vm_end)
vma = __vma_next(&mm->mm_vmas, vma);
if (vma)
- mm->mmap_cache = vma;
+ rcu_assign_pointer(mm->mmap_cache, vma);
}
}
return vma;
@@ -1404,6 +1492,90 @@ struct vm_area_struct * find_vma(struct
EXPORT_SYMBOL(find_vma);
+/*
+ * Differs only from the above in that it uses the slightly more expensive
+ * btree_stab_next() in order to avoid using the mm->mm_vmas list without
+ * locks.
+ */
+static
+struct vm_area_struct *find_vma_rcu(struct mm_struct * mm, unsigned long addr)
+{
+ struct vm_area_struct *vma = NULL, *next;
+
+ if (mm) {
+ /* Check the cache first. */
+ /* (Cache hit rate is typically around 35%.) */
+ vma = rcu_dereference(mm->mmap_cache);
+ if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
+ vma = btree_stab_next(&mm->mm_btree, addr,
+ (void **)&next);
+ if (!vma || addr >= vma->vm_end)
+ vma = next;
+
+ if (vma)
+ rcu_assign_pointer(mm->mmap_cache, vma);
+ }
+ }
+ return vma;
+}
+
+/*
+ * Lockless lookup and pinning of vmas:
+ *
+ * In order to be able to do vma modifications and have them appear atomic
+ * we must sometimes still take the read lock. We do this when we fail to
+ * get a reference on the vma.
+ *
+ */
+struct vm_area_struct *
+__find_get_vma(struct mm_struct *mm, unsigned long addr, int *locked)
+{
+ struct vm_area_struct *vma;
+
+ if (!mm)
+ return NULL;
+
+ rcu_read_lock();
+ vma = find_vma_rcu(mm, addr);
+ if (!vma || !down_read_trylock(&vma->vm_sem))
+ goto slow;
+ rcu_read_unlock();
+ return vma;
+
+slow:
+ rcu_read_unlock();
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, addr);
+ if (vma)
+ down_read(&vma->vm_sem);
+ *locked = 1;
+ return vma;
+}
+
+struct vm_area_struct *
+find_get_vma(struct mm_struct *mm, unsigned long addr)
+{
+ int locked = 0;
+ struct vm_area_struct *vma;
+
+ vma = __find_get_vma(mm, addr, &locked);
+ if (unlikely(locked))
+ up_read(&mm->mmap_sem);
+ return vma;
+}
+
+void get_vma(struct vm_area_struct *vma)
+{
+ if (likely(vma))
+ down_read(&vma->vm_sem);
+}
+
+void put_vma(struct vm_area_struct *vma)
+{
+ if (likely(vma))
+ up_read(&vma->vm_sem);
+}
+
/* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */
struct vm_area_struct *
find_vma_prev(struct mm_struct *mm, unsigned long addr,
@@ -1687,8 +1859,10 @@ detach_vmas_to_be_unmapped(struct mm_str
do {
struct vm_area_struct *next = vma_next(vma);
+ lock_vma(vma);
__vma_unlink(mm, vma, NULL);
mm->map_count--;
+ unlock_vma(vma);
list_add_tail(&vma->vm_list, vmas);
vma = next;
} while (vma && vma->vm_start < end);
@@ -1697,7 +1871,7 @@ detach_vmas_to_be_unmapped(struct mm_str
else
addr = vma ? vma->vm_start : mm->mmap_base;
mm->unmap_area(mm, addr);
- mm->mmap_cache = NULL; /* Kill the cache. */
+ /* mm->mmap_cache = NULL;*/ /* Kill the cache. */
}
/*
--
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH/RFC 5/9] i386: rcu vma lookups for faults
2007-10-22 10:45 [PATCH/RFC 0/9] VMA lookup with RCU Vaidyanathan Srinivasan
` (3 preceding siblings ...)
2007-10-22 10:45 ` [PATCH/RFC 4/9] mm: RCU " Vaidyanathan Srinivasan
@ 2007-10-22 10:45 ` Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 6/9] x86_64: " Vaidyanathan Srinivasan
` (3 subsequent siblings)
8 siblings, 0 replies; 10+ messages in thread
From: Vaidyanathan Srinivasan @ 2007-10-22 10:45 UTC (permalink / raw)
To: linux-mm; +Cc: Alexis Bruemmer, Peter Zijlstra
[-- Attachment #1: 5_fault-i386.patch --]
[-- Type: text/plain, Size: 3768 bytes --]
Use the new lockless vma lookup in the i386 fault handler.
This avoids the exclusive cacheline access for the mmap_sem.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
arch/i386/mm/fault.c | 56 +++++++++++++++++++++++++++------------------------
1 file changed, 30 insertions(+), 26 deletions(-)
--- linux-2.6.23-rc6.orig/arch/i386/mm/fault.c
+++ linux-2.6.23-rc6/arch/i386/mm/fault.c
@@ -307,6 +307,7 @@ fastcall void __kprobes do_page_fault(st
unsigned long address;
int write, si_code;
int fault;
+ int locked = 0;
/* get the address */
address = read_cr2();
@@ -357,29 +358,14 @@ fastcall void __kprobes do_page_fault(st
if (in_atomic() || !mm)
goto bad_area_nosemaphore;
- /* When running in the kernel we expect faults to occur only to
- * addresses in user space. All other faults represent errors in the
- * kernel and should generate an OOPS. Unfortunatly, in the case of an
- * erroneous fault occurring in a code path which already holds mmap_sem
- * we will deadlock attempting to validate the fault against the
- * address space. Luckily the kernel only validly references user
- * space from well defined areas of code, which are listed in the
- * exceptions table.
- *
- * As the vast majority of faults will be valid we will only perform
- * the source reference check when there is a possibilty of a deadlock.
- * Attempt to lock the address space, if we cannot we then validate the
- * source. If this is invalid we can skip the address space check,
- * thus avoiding the deadlock.
- */
- if (!down_read_trylock(&mm->mmap_sem)) {
- if ((error_code & 4) == 0 &&
- !search_exception_tables(regs->eip))
- goto bad_area_nosemaphore;
+again:
+ if (likely(!locked)) {
+ vma = __find_get_vma(mm, address, &locked);
+ } else {
down_read(&mm->mmap_sem);
+ vma = find_vma(mm, address);
+ get_vma(vma);
}
-
- vma = find_vma(mm, address);
if (!vma)
goto bad_area;
if (vma->vm_start <= address)
@@ -396,6 +382,15 @@ fastcall void __kprobes do_page_fault(st
if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp)
goto bad_area;
}
+ /*
+ * expand_stack needs the read lock, hence retry the whole thing
+ * read locked.
+ */
+ if (!locked) {
+ put_vma(vma);
+ locked = 1;
+ goto again;
+ }
if (expand_stack(vma, address))
goto bad_area;
/*
@@ -403,6 +398,11 @@ fastcall void __kprobes do_page_fault(st
* we can handle it..
*/
good_area:
+ if (unlikely(locked)) {
+ up_read(&mm->mmap_sem);
+ locked = 0;
+ }
+
si_code = SEGV_ACCERR;
write = 0;
switch (error_code & 3) {
@@ -447,7 +447,7 @@ good_area:
if (bit < 32)
tsk->thread.screen_bitmap |= 1 << bit;
}
- up_read(&mm->mmap_sem);
+ put_vma(vma);
return;
/*
@@ -455,7 +455,11 @@ good_area:
* Fix it, but check if it's kernel or user first..
*/
bad_area:
- up_read(&mm->mmap_sem);
+ if (unlikely(locked)) {
+ up_read(&mm->mmap_sem);
+ locked = 0;
+ }
+ put_vma(vma);
bad_area_nosemaphore:
/* User mode accesses just cause a SIGSEGV */
@@ -590,10 +594,10 @@ no_context:
* us unable to handle the page fault gracefully.
*/
out_of_memory:
- up_read(&mm->mmap_sem);
+ put_vma(vma);
if (is_init(tsk)) {
yield();
- down_read(&mm->mmap_sem);
+ vma = find_get_vma(mm, address);
goto survive;
}
printk("VM: killing process %s\n", tsk->comm);
@@ -602,7 +606,7 @@ out_of_memory:
goto no_context;
do_sigbus:
- up_read(&mm->mmap_sem);
+ put_vma(vma);
/* Kernel mode? Handle exceptions or die */
if (!(error_code & 4))
--
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH/RFC 6/9] x86_64: rcu vma lookups for faults
2007-10-22 10:45 [PATCH/RFC 0/9] VMA lookup with RCU Vaidyanathan Srinivasan
` (4 preceding siblings ...)
2007-10-22 10:45 ` [PATCH/RFC 5/9] i386: rcu vma lookups for faults Vaidyanathan Srinivasan
@ 2007-10-22 10:45 ` Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 7/9] Add page fault code for PPC64 path Vaidyanathan Srinivasan
` (2 subsequent siblings)
8 siblings, 0 replies; 10+ messages in thread
From: Vaidyanathan Srinivasan @ 2007-10-22 10:45 UTC (permalink / raw)
To: linux-mm; +Cc: Alexis Bruemmer, Peter Zijlstra
[-- Attachment #1: 6_fault-x86_64.patch --]
[-- Type: text/plain, Size: 2629 bytes --]
---
arch/x86_64/mm/fault.c | 36 +++++++++++++++++++++++++-----------
1 file changed, 25 insertions(+), 11 deletions(-)
--- linux-2.6.23-rc6.orig/arch/x86_64/mm/fault.c
+++ linux-2.6.23-rc6/arch/x86_64/mm/fault.c
@@ -310,10 +310,10 @@ asmlinkage void __kprobes do_page_fault(
int write, fault;
unsigned long flags;
siginfo_t info;
+ int locked = 0;
tsk = current;
mm = tsk->mm;
- prefetchw(&mm->mmap_sem);
/* get the address */
address = read_cr2();
@@ -390,14 +390,14 @@ asmlinkage void __kprobes do_page_fault(
* source. If this is invalid we can skip the address space check,
* thus avoiding the deadlock.
*/
- if (!down_read_trylock(&mm->mmap_sem)) {
- if ((error_code & PF_USER) == 0 &&
- !search_exception_tables(regs->rip))
- goto bad_area_nosemaphore;
+
+ if (likely(!locked)) {
+ vma = __find_get_vma(mm, address, &locked);
+ } else {
down_read(&mm->mmap_sem);
+ vma = find_vma(mm, address);
+ get_vma(vma);
}
-
- vma = find_vma(mm, address);
if (!vma)
goto bad_area;
if (likely(vma->vm_start <= address))
@@ -411,6 +411,11 @@ asmlinkage void __kprobes do_page_fault(
if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp)
goto bad_area;
}
+ if (!locked) {
+ put_vma(vma);
+ locked = 1;
+ goto again;
+ }
if (expand_stack(vma, address))
goto bad_area;
/*
@@ -418,6 +423,11 @@ asmlinkage void __kprobes do_page_fault(
* we can handle it..
*/
good_area:
+ if (locked) {
+ up_read(&mm->mmap_sem);
+ locked = 0;
+ }
+
info.si_code = SEGV_ACCERR;
write = 0;
switch (error_code & (PF_PROT|PF_WRITE)) {
@@ -452,7 +462,7 @@ good_area:
tsk->maj_flt++;
else
tsk->min_flt++;
- up_read(&mm->mmap_sem);
+ put_vma(vma);
return;
/*
@@ -460,7 +470,11 @@ good_area:
* Fix it, but check if it's kernel or user first..
*/
bad_area:
- up_read(&mm->mmap_sem);
+ if (locked) {
+ up_read(&mm->mmap_sem);
+ locked = 0;
+ }
+ put_vma(vma);
bad_area_nosemaphore:
/* User mode accesses just cause a SIGSEGV */
@@ -552,7 +566,7 @@ no_context:
* us unable to handle the page fault gracefully.
*/
out_of_memory:
- up_read(&mm->mmap_sem);
+ put_vma(vma);
if (is_init(current)) {
yield();
goto again;
@@ -563,7 +577,7 @@ out_of_memory:
goto no_context;
do_sigbus:
- up_read(&mm->mmap_sem);
+ put_vma(vma);
/* Kernel mode? Handle exceptions or die */
if (!(error_code & PF_USER))
--
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH/RFC 7/9] Add page fault code for PPC64 path
2007-10-22 10:45 [PATCH/RFC 0/9] VMA lookup with RCU Vaidyanathan Srinivasan
` (5 preceding siblings ...)
2007-10-22 10:45 ` [PATCH/RFC 6/9] x86_64: " Vaidyanathan Srinivasan
@ 2007-10-22 10:45 ` Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 8/9] debug: instrument the fault path Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 9/9] mm: nr_ptes needs to be atomic Vaidyanathan Srinivasan
8 siblings, 0 replies; 10+ messages in thread
From: Vaidyanathan Srinivasan @ 2007-10-22 10:45 UTC (permalink / raw)
To: linux-mm; +Cc: Alexis Bruemmer, Peter Zijlstra
[-- Attachment #1: 7_fault-powerpc.patch --]
[-- Type: text/plain, Size: 2828 bytes --]
---
arch/powerpc/mm/fault.c | 44 +++++++++++++++++++++++++++++++++-----------
1 file changed, 33 insertions(+), 11 deletions(-)
--- linux-2.6.23-rc8.orig/arch/powerpc/mm/fault.c
+++ linux-2.6.23-rc8/arch/powerpc/mm/fault.c
@@ -148,6 +148,7 @@ int __kprobes do_page_fault(struct pt_re
int is_write = 0, ret;
int trap = TRAP(regs);
int is_exec = trap == 0x400;
+ int locked = 0;
#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
/*
@@ -211,14 +212,22 @@ int __kprobes do_page_fault(struct pt_re
* source. If this is invalid we can skip the address space check,
* thus avoiding the deadlock.
*/
- if (!down_read_trylock(&mm->mmap_sem)) {
- if (!user_mode(regs) && !search_exception_tables(regs->nip))
- goto bad_area_nosemaphore;
-
+// if (!down_read_trylock(&mm->mmap_sem)) {
+// if (!user_mode(regs) && !search_exception_tables(regs->nip))
+// goto bad_area_nosemaphore;
+//
+// down_read(&mm->mmap_sem);
+// }
+
+again:
+ if (likely(!locked)) {
+ vma = __find_get_vma(mm, address, &locked);
+ } else {
down_read(&mm->mmap_sem);
- }
+ vma = find_vma(mm, address);
+ get_vma(vma);
+ }
- vma = find_vma(mm, address);
if (!vma)
goto bad_area;
if (vma->vm_start <= address)
@@ -257,10 +266,19 @@ int __kprobes do_page_fault(struct pt_re
&& (!user_mode(regs) || !store_updates_sp(regs)))
goto bad_area;
}
+ if (!locked) {
+ put_vma(vma);
+ locked = 1;
+ goto again;
+ }
if (expand_stack(vma, address))
goto bad_area;
good_area:
+ if (locked) {
+ up_read(&mm->mmap_sem);
+ locked = 0;
+ }
code = SEGV_ACCERR;
#if defined(CONFIG_6xx)
if (error_code & 0x95700000)
@@ -311,7 +329,7 @@ good_area:
pte_update(ptep, 0, _PAGE_HWEXEC);
_tlbie(address);
pte_unmap_unlock(ptep, ptl);
- up_read(&mm->mmap_sem);
+ put_vma(vma);
return 0;
}
pte_unmap_unlock(ptep, ptl);
@@ -348,11 +366,15 @@ good_area:
current->maj_flt++;
else
current->min_flt++;
- up_read(&mm->mmap_sem);
+ put_vma(vma);
return 0;
bad_area:
- up_read(&mm->mmap_sem);
+ if (locked) {
+ up_read(&mm->mmap_sem);
+ locked = 0;
+ }
+ put_vma(vma);
bad_area_nosemaphore:
/* User mode accesses cause a SIGSEGV */
@@ -374,7 +396,7 @@ bad_area_nosemaphore:
* us unable to handle the page fault gracefully.
*/
out_of_memory:
- up_read(&mm->mmap_sem);
+ put_vma(vma);
if (is_init(current)) {
yield();
down_read(&mm->mmap_sem);
@@ -386,7 +408,7 @@ out_of_memory:
return SIGKILL;
do_sigbus:
- up_read(&mm->mmap_sem);
+ put_vma(vma);
if (user_mode(regs)) {
info.si_signo = SIGBUS;
info.si_errno = 0;
--
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH/RFC 8/9] debug: instrument the fault path
2007-10-22 10:45 [PATCH/RFC 0/9] VMA lookup with RCU Vaidyanathan Srinivasan
` (6 preceding siblings ...)
2007-10-22 10:45 ` [PATCH/RFC 7/9] Add page fault code for PPC64 path Vaidyanathan Srinivasan
@ 2007-10-22 10:45 ` Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 9/9] mm: nr_ptes needs to be atomic Vaidyanathan Srinivasan
8 siblings, 0 replies; 10+ messages in thread
From: Vaidyanathan Srinivasan @ 2007-10-22 10:45 UTC (permalink / raw)
To: linux-mm; +Cc: Alexis Bruemmer, Peter Zijlstra
[-- Attachment #1: 8_stats.patch --]
[-- Type: text/plain, Size: 1856 bytes --]
---
arch/x86_64/mm/fault.c | 8 ++++++++
include/linux/vmstat.h | 1 +
mm/vmstat.c | 5 +++++
3 files changed, 14 insertions(+)
--- linux-2.6.23-rc6.orig/arch/x86_64/mm/fault.c
+++ linux-2.6.23-rc6/arch/x86_64/mm/fault.c
@@ -25,6 +25,7 @@
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
+#include <linux/vmstat.h>
#include <asm/system.h>
#include <asm/pgalloc.h>
@@ -393,10 +394,16 @@ asmlinkage void __kprobes do_page_fault(
if (likely(!locked)) {
vma = __find_get_vma(mm, address, &locked);
+ if (unlikely(locked))
+ count_vm_event(FAULT_RCU_SLOW);
+ else
+ count_vm_event(FAULT_RCU);
+
} else {
down_read(&mm->mmap_sem);
vma = find_vma(mm, address);
get_vma(vma);
+ count_vm_event(FAULT_LOCKED);
}
if (!vma)
goto bad_area;
@@ -416,6 +423,7 @@ asmlinkage void __kprobes do_page_fault(
locked = 1;
goto again;
}
+ count_vm_event(FAULT_STACK);
if (expand_stack(vma, address))
goto bad_area;
/*
--- linux-2.6.23-rc6.orig/include/linux/vmstat.h
+++ linux-2.6.23-rc6/include/linux/vmstat.h
@@ -37,6 +37,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PS
FOR_ALL_ZONES(PGSCAN_DIRECT),
PGINODESTEAL, SLABS_SCANNED, KSWAPD_STEAL, KSWAPD_INODESTEAL,
PAGEOUTRUN, ALLOCSTALL, PGROTATED,
+ FAULT_RCU, FAULT_RCU_SLOW, FAULT_LOCKED, FAULT_STACK,
NR_VM_EVENT_ITEMS
};
--- linux-2.6.23-rc6.orig/mm/vmstat.c
+++ linux-2.6.23-rc6/mm/vmstat.c
@@ -529,6 +529,11 @@ static const char * const vmstat_text[]
"allocstall",
"pgrotated",
+
+ "fault_rcu",
+ "fault_rcu_slow",
+ "fault_locked",
+ "fault_stack",
#endif
};
--
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH/RFC 9/9] mm: nr_ptes needs to be atomic
2007-10-22 10:45 [PATCH/RFC 0/9] VMA lookup with RCU Vaidyanathan Srinivasan
` (7 preceding siblings ...)
2007-10-22 10:45 ` [PATCH/RFC 8/9] debug: instrument the fault path Vaidyanathan Srinivasan
@ 2007-10-22 10:45 ` Vaidyanathan Srinivasan
8 siblings, 0 replies; 10+ messages in thread
From: Vaidyanathan Srinivasan @ 2007-10-22 10:45 UTC (permalink / raw)
To: linux-mm; +Cc: Alexis Bruemmer, Peter Zijlstra
[-- Attachment #1: 9_nr_ptes.patch --]
[-- Type: text/plain, Size: 3692 bytes --]
---
arch/powerpc/mm/fault.c | 6 ++++++
arch/sh/mm/cache-sh4.c | 2 +-
arch/um/kernel/skas/mmu.c | 2 +-
fs/proc/task_mmu.c | 2 +-
include/linux/sched.h | 4 +++-
kernel/fork.c | 2 +-
mm/memory.c | 4 ++--
7 files changed, 15 insertions(+), 7 deletions(-)
--- linux-2.6.23-rc8.orig/arch/powerpc/mm/fault.c
+++ linux-2.6.23-rc8/arch/powerpc/mm/fault.c
@@ -235,6 +235,12 @@ again:
if (!(vma->vm_flags & VM_GROWSDOWN))
goto bad_area;
+ if (!locked) {
+ put_vma(vma);
+ locked = 1;
+ goto again;
+ }
+
/*
* N.B. The POWER/Open ABI allows programs to access up to
* 288 bytes below the stack pointer.
--- linux-2.6.23-rc8.orig/arch/sh/mm/cache-sh4.c
+++ linux-2.6.23-rc8/arch/sh/mm/cache-sh4.c
@@ -373,7 +373,7 @@ void flush_cache_mm(struct mm_struct *mm
* Don't bother groveling around the dcache for the VMA ranges
* if there are too many PTEs to make it worthwhile.
*/
- if (mm->nr_ptes >= MAX_DCACHE_PAGES)
+ if (atomic_long_read(&mm->nr_ptes) >= MAX_DCACHE_PAGES)
flush_dcache_all();
else {
struct vm_area_struct *vma;
--- linux-2.6.23-rc8.orig/arch/um/kernel/skas/mmu.c
+++ linux-2.6.23-rc8/arch/um/kernel/skas/mmu.c
@@ -98,7 +98,7 @@ int init_new_context_skas(struct task_st
if(ret)
goto out_free;
- mm->nr_ptes--;
+ atomic_long_dec(&mm->nr_ptes);
}
to_mm->id.stack = stack;
--- linux-2.6.23-rc8.orig/fs/proc/task_mmu.c
+++ linux-2.6.23-rc8/fs/proc/task_mmu.c
@@ -52,7 +52,7 @@ char *task_mem(struct mm_struct *mm, cha
total_rss << (PAGE_SHIFT-10),
data << (PAGE_SHIFT-10),
mm->stack_vm << (PAGE_SHIFT-10), text, lib,
- (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10);
+ (PTRS_PER_PTE*sizeof(pte_t)*atomic_long_read(&mm->nr_ptes)) >> 10);
return buffer;
}
--- linux-2.6.23-rc8.orig/include/linux/sched.h
+++ linux-2.6.23-rc8/include/linux/sched.h
@@ -400,11 +400,13 @@ struct mm_struct {
unsigned long hiwater_vm; /* High-water virtual memory usage */
unsigned long total_vm, locked_vm, shared_vm, exec_vm;
- unsigned long stack_vm, reserved_vm, def_flags, nr_ptes;
+ unsigned long stack_vm, reserved_vm, def_flags;
unsigned long start_code, end_code, start_data, end_data;
unsigned long start_brk, brk, start_stack;
unsigned long arg_start, arg_end, env_start, env_end;
+ atomic_long_t nr_ptes;
+
unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
cpumask_t cpu_vm_mask;
--- linux-2.6.23-rc8.orig/kernel/fork.c
+++ linux-2.6.23-rc8/kernel/fork.c
@@ -335,7 +335,7 @@ static struct mm_struct * mm_init(struct
mm->flags = (current->mm) ? current->mm->flags
: MMF_DUMP_FILTER_DEFAULT;
mm->core_waiters = 0;
- mm->nr_ptes = 0;
+ atomic_long_set(&mm->nr_ptes, 0);
set_mm_counter(mm, file_rss, 0);
set_mm_counter(mm, anon_rss, 0);
spin_lock_init(&mm->page_table_lock);
--- linux-2.6.23-rc8.orig/mm/memory.c
+++ linux-2.6.23-rc8/mm/memory.c
@@ -127,7 +127,7 @@ static void free_pte_range(struct mmu_ga
pte_lock_deinit(page);
pte_free_tlb(tlb, page);
dec_zone_page_state(page, NR_PAGETABLE);
- tlb->mm->nr_ptes--;
+ atomic_long_dec(&tlb->mm->nr_ptes);
}
static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
@@ -310,7 +310,7 @@ int __pte_alloc(struct mm_struct *mm, pm
pte_lock_deinit(new);
pte_free(new);
} else {
- mm->nr_ptes++;
+ atomic_long_inc(&mm->nr_ptes);
inc_zone_page_state(new, NR_PAGETABLE);
pmd_populate(mm, pmd, new);
}
--
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 10+ messages in thread
end of thread, other threads:[~2007-10-22 10:46 UTC | newest]
Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-10-22 10:45 [PATCH/RFC 0/9] VMA lookup with RCU Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 1/9] Data structure changes Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 2/9] lib: RCU friendly B+tree Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 3/9] mm: use the B+tree for vma lookups Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 4/9] mm: RCU " Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 5/9] i386: rcu vma lookups for faults Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 6/9] x86_64: " Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 7/9] Add page fault code for PPC64 path Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 8/9] debug: instrument the fault path Vaidyanathan Srinivasan
2007-10-22 10:45 ` [PATCH/RFC 9/9] mm: nr_ptes needs to be atomic Vaidyanathan Srinivasan
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.