* [PATCH] anobjrmap 9 priority mjb tree
@ 2004-04-04 12:33 Hugh Dickins
2004-04-09 20:39 ` Martin J. Bligh
0 siblings, 1 reply; 38+ messages in thread
From: Hugh Dickins @ 2004-04-04 12:33 UTC (permalink / raw)
To: linux-kernel; +Cc: Rajesh Venkatasubramanian, Martin J. Bligh
This anobjrmap 9 (or anon_mm9) patch adds Rajesh's radix priority search
tree on top of Martin's 2.6.5-rc3-mjb2 tree, making a priority mjb tree!
Approximately equivalent to Andrea's 2.6.5-aa1, but using anonmm instead
of anon_vma, and of course each tree has its own additional features.
arch/arm/mm/fault-armv.c | 80 ++---
arch/mips/mm/cache.c | 9
arch/parisc/kernel/cache.c | 86 ++---
arch/parisc/kernel/sys_parisc.c | 14
arch/s390/kernel/compat_exec.c | 2
arch/sparc64/mm/init.c | 8
arch/x86_64/ia32/ia32_binfmt.c | 2
fs/exec.c | 2
fs/hugetlbfs/inode.c | 14
fs/inode.c | 5
fs/locks.c | 8
fs/proc/task_mmu.c | 2
fs/xfs/linux/xfs_vnode.h | 5
include/asm-arm/cacheflush.h | 8
include/asm-parisc/cacheflush.h | 10
include/asm-sh/pgalloc.h | 5
include/linux/fs.h | 6
include/linux/mm.h | 167 +++++++++++
include/linux/prio_tree.h | 78 +++++
init/main.c | 2
kernel/fork.c | 4
kernel/kexec.c | 2
mm/Makefile | 3
mm/filemap.c | 3
mm/fremap.c | 14
mm/memory.c | 15 -
mm/mmap.c | 100 +++---
mm/mremap.c | 42 ++
mm/page_io.c | 4
mm/prio_tree.c | 577 ++++++++++++++++++++++++++++++++++++++++
mm/rmap.c | 164 ++++++-----
mm/shmem.c | 3
mm/vmscan.c | 6
33 files changed, 1172 insertions(+), 278 deletions(-)
--- 2.6.5-rc3-mjb2/arch/arm/mm/fault-armv.c 2004-04-02 21:01:43.725406016 +0100
+++ anobjrmap9/arch/arm/mm/fault-armv.c 2004-04-04 13:05:41.369476056 +0100
@@ -16,6 +16,7 @@
#include <linux/bitops.h>
#include <linux/vmalloc.h>
#include <linux/init.h>
+#include <linux/pagemap.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
@@ -186,47 +187,47 @@ no_pmd:
void __flush_dcache_page(struct page *page)
{
+ struct address_space *mapping = page_mapping(page);
struct mm_struct *mm = current->active_mm;
- struct list_head *l;
+ struct vm_area_struct *mpnt;
+ struct prio_tree_iter iter;
+ unsigned long offset;
+ pgoff_t pgoff;
__cpuc_flush_dcache_page(page_address(page));
- if (!page_mapping(page))
+ if (!mapping)
return;
/*
* With a VIVT cache, we need to also write back
* and invalidate any user data.
*/
- list_for_each(l, &page->mapping->i_mmap_shared) {
- struct vm_area_struct *mpnt;
- unsigned long off;
-
- mpnt = list_entry(l, struct vm_area_struct, shared);
-
+ pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ mpnt = __vma_prio_tree_first(&mapping->i_mmap_shared,
+ &iter, pgoff, pgoff);
+ while (mpnt) {
/*
* If this VMA is not in our MM, we can ignore it.
*/
- if (mpnt->vm_mm != mm)
- continue;
-
- if (page->index < mpnt->vm_pgoff)
- continue;
-
- off = page->index - mpnt->vm_pgoff;
- if (off >= (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT)
- continue;
-
- flush_cache_page(mpnt, mpnt->vm_start + (off << PAGE_SHIFT));
+ if (mpnt->vm_mm == mm) {
+ offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT;
+ flush_cache_page(mpnt, mpnt->vm_start + offset);
+ }
+ mpnt = __vma_prio_tree_next(mpnt, &mapping->i_mmap_shared,
+ &iter, pgoff, pgoff);
}
}
static void
make_coherent(struct vm_area_struct *vma, unsigned long addr, struct page *page, int dirty)
{
- struct list_head *l;
+ struct address_space *mapping = page->mapping;
struct mm_struct *mm = vma->vm_mm;
- unsigned long pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
+ struct vm_area_struct *mpnt;
+ struct prio_tree_iter iter;
+ unsigned long offset;
+ pgoff_t pgoff;
int aliases = 0;
/*
@@ -234,36 +235,21 @@ make_coherent(struct vm_area_struct *vma
* space, then we need to handle them specially to maintain
* cache coherency.
*/
- list_for_each(l, &page->mapping->i_mmap_shared) {
- struct vm_area_struct *mpnt;
- unsigned long off;
-
- mpnt = list_entry(l, struct vm_area_struct, shared);
-
+ pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
+ mpnt = __vma_prio_tree_first(&mapping->i_mmap_shared,
+ &iter, pgoff, pgoff);
+ while (mpnt) {
/*
* If this VMA is not in our MM, we can ignore it.
- * Note that we intentionally don't mask out the VMA
+ * Note that we intentionally mask out the VMA
* that we are fixing up.
*/
- if (mpnt->vm_mm != mm || mpnt == vma)
- continue;
-
- /*
- * If the page isn't in this VMA, we can also ignore it.
- */
- if (pgoff < mpnt->vm_pgoff)
- continue;
-
- off = pgoff - mpnt->vm_pgoff;
- if (off >= (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT)
- continue;
-
- off = mpnt->vm_start + (off << PAGE_SHIFT);
-
- /*
- * Ok, it is within mpnt. Fix it up.
- */
- aliases += adjust_pte(mpnt, off);
+ if (mpnt->vm_mm == mm && mpnt != vma) {
+ offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT;
+ aliases += adjust_pte(mpnt, mpnt->vm_start + offset);
+ }
+ mpnt = __vma_prio_tree_next(mpnt, &mapping->i_mmap_shared,
+ &iter, pgoff, pgoff);
}
if (aliases)
adjust_pte(vma, addr);
--- 2.6.5-rc3-mjb2/arch/mips/mm/cache.c 2004-04-02 21:01:44.347311472 +0100
+++ anobjrmap9/arch/mips/mm/cache.c 2004-04-04 13:05:41.370475904 +0100
@@ -55,13 +55,14 @@ asmlinkage int sys_cacheflush(void *addr
void flush_dcache_page(struct page *page)
{
+ struct address_space *mapping = page_mapping(page);
unsigned long addr;
- if (page_mapping(page) &&
- list_empty(&page->mapping->i_mmap) &&
- list_empty(&page->mapping->i_mmap_shared)) {
+ if (mapping &&
+ prio_tree_empty(&mapping->i_mmap) &&
+ prio_tree_empty(&mapping->i_mmap_shared) &&
+ list_empty(&mapping->i_mmap_nonlinear)) {
SetPageDcacheDirty(page);
-
return;
}
--- 2.6.5-rc3-mjb2/arch/parisc/kernel/cache.c 2004-04-02 21:01:44.356310104 +0100
+++ anobjrmap9/arch/parisc/kernel/cache.c 2004-04-04 13:05:41.371475752 +0100
@@ -17,6 +17,7 @@
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/seq_file.h>
+#include <linux/pagemap.h>
#include <asm/pdc.h>
#include <asm/cache.h>
@@ -229,67 +230,60 @@ void disable_sr_hashing(void)
void __flush_dcache_page(struct page *page)
{
+ struct address_space *mapping = page_mapping(page);
struct mm_struct *mm = current->active_mm;
- struct list_head *l;
+ struct vm_area_struct *mpnt;
+ struct prio_tree_iter iter;
+ unsigned long offset;
+ pgoff_t pgoff;
flush_kernel_dcache_page(page_address(page));
- if (!page_mapping(page))
+ if (!mapping)
return;
- /* check shared list first if it's not empty...it's usually
- * the shortest */
- list_for_each(l, &page->mapping->i_mmap_shared) {
- struct vm_area_struct *mpnt;
- unsigned long off;
- mpnt = list_entry(l, struct vm_area_struct, shared);
+ pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ /* check shared list first if it's not empty...it's usually
+ * the shortest */
+ mpnt = __vma_prio_tree_first(&mapping->i_mmap_shared,
+ &iter, pgoff, pgoff);
+ while (mpnt) {
/*
* If this VMA is not in our MM, we can ignore it.
*/
- if (mpnt->vm_mm != mm)
- continue;
-
- if (page->index < mpnt->vm_pgoff)
- continue;
-
- off = page->index - mpnt->vm_pgoff;
- if (off >= (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT)
- continue;
-
- flush_cache_page(mpnt, mpnt->vm_start + (off << PAGE_SHIFT));
-
- /* All user shared mappings should be equivalently mapped,
- * so once we've flushed one we should be ok
- */
- return;
+ if (mpnt->vm_mm == mm) {
+ offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT;
+ flush_cache_page(mpnt, mpnt->vm_start + offset);
+
+ /* All user shared mappings should be equivalently
+ * mapped, so once we've flushed one we should be ok
+ */
+ return;
+ }
+ mpnt = __vma_prio_tree_next(mpnt, &mapping->i_mmap_shared,
+ &iter, pgoff, pgoff);
}
/* then check private mapping list for read only shared mappings
* which are flagged by VM_MAYSHARE */
- list_for_each(l, &page->mapping->i_mmap) {
- struct vm_area_struct *mpnt;
- unsigned long off;
-
- mpnt = list_entry(l, struct vm_area_struct, shared);
-
-
- if (mpnt->vm_mm != mm || !(mpnt->vm_flags & VM_MAYSHARE))
- continue;
-
- if (page->index < mpnt->vm_pgoff)
- continue;
-
- off = page->index - mpnt->vm_pgoff;
- if (off >= (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT)
- continue;
-
- flush_cache_page(mpnt, mpnt->vm_start + (off << PAGE_SHIFT));
-
- /* All user shared mappings should be equivalently mapped,
- * so once we've flushed one we should be ok
+ mpnt = __vma_prio_tree_first(&mapping->i_mmap,
+ &iter, pgoff, pgoff);
+ while (mpnt) {
+		/*
+		 * If this VMA is not in our MM, or is not a read-only
+		 * shared (VM_MAYSHARE) mapping, we can ignore it.
+		 */
- break;
+ if (mpnt->vm_mm == mm && (mpnt->vm_flags & VM_MAYSHARE)) {
+ offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT;
+ flush_cache_page(mpnt, mpnt->vm_start + offset);
+
+ /* All user shared mappings should be equivalently
+ * mapped, so once we've flushed one we should be ok
+ */
+ return;
+ }
+		mpnt = __vma_prio_tree_next(mpnt, &mapping->i_mmap,
+ &iter, pgoff, pgoff);
}
}
EXPORT_SYMBOL(__flush_dcache_page);
--- 2.6.5-rc3-mjb2/arch/parisc/kernel/sys_parisc.c 2004-03-30 13:04:00.000000000 +0100
+++ anobjrmap9/arch/parisc/kernel/sys_parisc.c 2004-04-04 13:05:41.372475600 +0100
@@ -68,17 +68,8 @@ static unsigned long get_unshared_area(u
* existing mapping and use the same offset. New scheme is to use the
* address of the kernel data structure as the seed for the offset.
* We'll see how that works...
- */
-#if 0
-static int get_offset(struct address_space *mapping)
-{
- struct vm_area_struct *vma = list_entry(mapping->i_mmap_shared.next,
- struct vm_area_struct, shared);
- return (vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT)) &
- (SHMLBA - 1);
-}
-#else
-/* The mapping is cacheline aligned, so there's no information in the bottom
+ *
+ * The mapping is cacheline aligned, so there's no information in the bottom
* few bits of the address. We're looking for 10 bits (4MB / 4k), so let's
* drop the bottom 8 bits and use bits 8-17.
*/
@@ -87,7 +78,6 @@ static int get_offset(struct address_spa
int offset = (unsigned long) mapping << (PAGE_SHIFT - 8);
return offset & 0x3FF000;
}
-#endif
static unsigned long get_shared_area(struct address_space *mapping,
unsigned long addr, unsigned long len, unsigned long pgoff)
--- 2.6.5-rc3-mjb2/arch/s390/kernel/compat_exec.c 2003-07-10 21:16:28.000000000 +0100
+++ anobjrmap9/arch/s390/kernel/compat_exec.c 2004-04-04 13:05:41.372475600 +0100
@@ -71,7 +71,7 @@ int setup_arg_pages32(struct linux_binpr
mpnt->vm_ops = NULL;
mpnt->vm_pgoff = 0;
mpnt->vm_file = NULL;
- INIT_LIST_HEAD(&mpnt->shared);
+ INIT_VMA_SHARED(mpnt);
mpnt->vm_private_data = (void *) 0;
insert_vm_struct(mm, mpnt);
mm->total_vm = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
--- 2.6.5-rc3-mjb2/arch/sparc64/mm/init.c 2004-04-02 21:01:44.535282896 +0100
+++ anobjrmap9/arch/sparc64/mm/init.c 2004-04-04 13:05:41.375475144 +0100
@@ -224,12 +224,14 @@ void update_mmu_cache(struct vm_area_str
void flush_dcache_page(struct page *page)
{
+ struct address_space *mapping = page_mapping(page);
int dirty = test_bit(PG_dcache_dirty, &page->flags);
int dirty_cpu = dcache_dirty_cpu(page);
- if (page_mapping(page) &&
- list_empty(&page->mapping->i_mmap) &&
- list_empty(&page->mapping->i_mmap_shared)) {
+ if (mapping &&
+ prio_tree_empty(&mapping->i_mmap) &&
+ prio_tree_empty(&mapping->i_mmap_shared) &&
+ list_empty(&mapping->i_mmap_nonlinear)) {
if (dirty) {
if (dirty_cpu == smp_processor_id())
return;
--- 2.6.5-rc3-mjb2/arch/x86_64/ia32/ia32_binfmt.c 2004-03-30 13:04:03.000000000 +0100
+++ anobjrmap9/arch/x86_64/ia32/ia32_binfmt.c 2004-04-04 13:05:41.375475144 +0100
@@ -360,7 +360,7 @@ int setup_arg_pages(struct linux_binprm
mpnt->vm_ops = NULL;
mpnt->vm_pgoff = 0;
mpnt->vm_file = NULL;
- INIT_LIST_HEAD(&mpnt->shared);
+ INIT_VMA_SHARED(mpnt);
mpnt->vm_private_data = (void *) 0;
insert_vm_struct(mm, mpnt);
mm->total_vm = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
--- 2.6.5-rc3-mjb2/fs/exec.c 2004-04-02 21:01:45.785092896 +0100
+++ anobjrmap9/fs/exec.c 2004-04-04 13:05:41.377474840 +0100
@@ -423,7 +423,7 @@ int setup_arg_pages(struct linux_binprm
mpnt->vm_ops = NULL;
mpnt->vm_pgoff = 0;
mpnt->vm_file = NULL;
- INIT_LIST_HEAD(&mpnt->shared);
+ INIT_VMA_SHARED(mpnt);
mpnt->vm_private_data = (void *) 0;
insert_vm_struct(mm, mpnt);
mm->total_vm = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
--- 2.6.5-rc3-mjb2/fs/hugetlbfs/inode.c 2004-04-02 21:01:45.794091528 +0100
+++ anobjrmap9/fs/hugetlbfs/inode.c 2004-04-04 13:05:41.378474688 +0100
@@ -270,11 +270,13 @@ static void hugetlbfs_drop_inode(struct
* vma->vm_pgoff is in PAGE_SIZE units.
*/
static void
-hugetlb_vmtruncate_list(struct list_head *list, unsigned long h_pgoff)
+hugetlb_vmtruncate_list(struct prio_tree_root *root, unsigned long h_pgoff)
{
struct vm_area_struct *vma;
+ struct prio_tree_iter iter;
- list_for_each_entry(vma, list, shared) {
+ vma = __vma_prio_tree_first(root, &iter, h_pgoff, ULONG_MAX);
+ while (vma) {
unsigned long h_vm_pgoff;
unsigned long v_length;
unsigned long h_length;
@@ -306,6 +308,8 @@ hugetlb_vmtruncate_list(struct list_head
zap_hugepage_range(vma,
vma->vm_start + v_offset,
v_length - v_offset);
+
+ vma = __vma_prio_tree_next(vma, root, &iter, h_pgoff, ULONG_MAX);
}
}
@@ -325,9 +329,11 @@ static int hugetlb_vmtruncate(struct ino
inode->i_size = offset;
down(&mapping->i_shared_sem);
- if (!list_empty(&mapping->i_mmap))
+ /* Protect against page fault */
+ atomic_inc(&mapping->truncate_count);
+ if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
- if (!list_empty(&mapping->i_mmap_shared))
+ if (unlikely(!prio_tree_empty(&mapping->i_mmap_shared)))
hugetlb_vmtruncate_list(&mapping->i_mmap_shared, pgoff);
up(&mapping->i_shared_sem);
truncate_hugepages(mapping, offset);
--- 2.6.5-rc3-mjb2/fs/inode.c 2004-03-30 13:04:15.000000000 +0100
+++ anobjrmap9/fs/inode.c 2004-04-04 13:05:41.380474384 +0100
@@ -189,8 +189,9 @@ void inode_init_once(struct inode *inode
atomic_set(&inode->i_data.truncate_count, 0);
INIT_LIST_HEAD(&inode->i_data.private_list);
spin_lock_init(&inode->i_data.private_lock);
- INIT_LIST_HEAD(&inode->i_data.i_mmap);
- INIT_LIST_HEAD(&inode->i_data.i_mmap_shared);
+ INIT_PRIO_TREE_ROOT(&inode->i_data.i_mmap);
+ INIT_PRIO_TREE_ROOT(&inode->i_data.i_mmap_shared);
+ INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear);
spin_lock_init(&inode->i_lock);
i_size_ordered_init(inode);
}
--- 2.6.5-rc3-mjb2/fs/locks.c 2004-03-11 01:56:12.000000000 +0000
+++ anobjrmap9/fs/locks.c 2004-04-04 13:05:41.382474080 +0100
@@ -1455,8 +1455,8 @@ int fcntl_setlk(struct file *filp, unsig
if (IS_MANDLOCK(inode) &&
(inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID) {
struct address_space *mapping = filp->f_mapping;
-
- if (!list_empty(&mapping->i_mmap_shared)) {
+ if (!prio_tree_empty(&mapping->i_mmap_shared) ||
+ !list_empty(&mapping->i_mmap_nonlinear)) {
error = -EAGAIN;
goto out;
}
@@ -1593,8 +1593,8 @@ int fcntl_setlk64(struct file *filp, uns
if (IS_MANDLOCK(inode) &&
(inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID) {
struct address_space *mapping = filp->f_mapping;
-
- if (!list_empty(&mapping->i_mmap_shared)) {
+ if (!prio_tree_empty(&mapping->i_mmap_shared) ||
+ !list_empty(&mapping->i_mmap_nonlinear)) {
error = -EAGAIN;
goto out;
}
--- 2.6.5-rc3-mjb2/fs/proc/task_mmu.c 2004-04-02 21:01:45.864080888 +0100
+++ anobjrmap9/fs/proc/task_mmu.c 2004-04-04 13:05:41.383473928 +0100
@@ -65,7 +65,7 @@ int task_statm(struct mm_struct *mm, int
*shared += pages;
continue;
}
- if (vma->vm_flags & VM_SHARED || !list_empty(&vma->shared))
+ if (vma->vm_flags & VM_SHARED || !vma_shared_empty(vma))
*shared += pages;
if (vma->vm_flags & VM_EXECUTABLE)
*text += pages;
--- 2.6.5-rc3-mjb2/fs/xfs/linux/xfs_vnode.h 2004-02-04 02:45:43.000000000 +0000
+++ anobjrmap9/fs/xfs/linux/xfs_vnode.h 2004-04-04 13:05:41.384473776 +0100
@@ -597,8 +597,9 @@ static __inline__ void vn_flagclr(struct
* Some useful predicates.
*/
#define VN_MAPPED(vp) \
- (!list_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap)) || \
- (!list_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap_shared))))
+ (!prio_tree_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap)) || \
+ !prio_tree_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap_shared)) || \
+ !list_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap_nonlinear)))
#define VN_CACHED(vp) (LINVFS_GET_IP(vp)->i_mapping->nrpages)
#define VN_DIRTY(vp) (!list_empty(&(LINVFS_GET_IP(vp)->i_mapping->dirty_pages)))
#define VMODIFY(vp) VN_FLAGSET(vp, VMODIFIED)
--- 2.6.5-rc3-mjb2/include/asm-arm/cacheflush.h 2004-04-02 21:01:45.998060520 +0100
+++ anobjrmap9/include/asm-arm/cacheflush.h 2004-04-04 13:05:41.385473624 +0100
@@ -292,8 +292,12 @@ flush_cache_page(struct vm_area_struct *
* about to change to user space. This is the same method as used on SPARC64.
* See update_mmu_cache for the user space part.
*/
-#define mapping_mapped(map) (!list_empty(&(map)->i_mmap) || \
- !list_empty(&(map)->i_mmap_shared))
+static inline int mapping_mapped(struct address_space *mapping)
+{
+ return !prio_tree_empty(&mapping->i_mmap) ||
+ !prio_tree_empty(&mapping->i_mmap_shared) ||
+ !list_empty(&mapping->i_mmap_nonlinear);
+}
extern void __flush_dcache_page(struct page *);
--- 2.6.5-rc3-mjb2/include/asm-parisc/cacheflush.h 2004-04-02 21:01:46.794939376 +0100
+++ anobjrmap9/include/asm-parisc/cacheflush.h 2004-04-04 13:05:41.385473624 +0100
@@ -65,12 +65,18 @@ flush_user_icache_range(unsigned long st
#endif
}
+static inline int mapping_mapped(struct address_space *mapping)
+{
+ return !prio_tree_empty(&mapping->i_mmap) ||
+ !prio_tree_empty(&mapping->i_mmap_shared) ||
+ !list_empty(&mapping->i_mmap_nonlinear);
+}
+
extern void __flush_dcache_page(struct page *page);
static inline void flush_dcache_page(struct page *page)
{
- if (page_mapping(page) && list_empty(&page->mapping->i_mmap) &&
- list_empty(&page->mapping->i_mmap_shared)) {
+ if (page_mapping(page) && !mapping_mapped(page->mapping)) {
set_bit(PG_dcache_dirty, &page->flags);
} else {
__flush_dcache_page(page);
--- 2.6.5-rc3-mjb2/include/asm-sh/pgalloc.h 2004-04-02 21:01:46.963913688 +0100
+++ anobjrmap9/include/asm-sh/pgalloc.h 2004-04-04 13:05:41.386473472 +0100
@@ -101,8 +101,9 @@ static inline pte_t ptep_get_and_clear(p
unsigned long pfn = pte_pfn(pte);
if (pfn_valid(pfn)) {
page = pfn_to_page(pfn);
- if (!page_mapping(page)
- || list_empty(&page->mapping->i_mmap_shared))
+ if (!page_mapping(page) ||
+ (prio_tree_empty(&page->mapping->i_mmap_shared) &&
+ list_empty(&page->mapping->i_mmap_nonlinear)))
__clear_bit(PG_mapped, &page->flags);
}
}
--- 2.6.5-rc3-mjb2/include/linux/fs.h 2004-03-30 13:04:18.000000000 +0100
+++ anobjrmap9/include/linux/fs.h 2004-04-04 13:05:41.388473168 +0100
@@ -18,6 +18,7 @@
#include <linux/stat.h>
#include <linux/cache.h>
#include <linux/radix-tree.h>
+#include <linux/prio_tree.h>
#include <linux/kobject.h>
#include <asm/atomic.h>
@@ -329,8 +330,9 @@ struct address_space {
struct list_head io_pages; /* being prepared for I/O */
unsigned long nrpages; /* number of total pages */
struct address_space_operations *a_ops; /* methods */
- struct list_head i_mmap; /* list of private mappings */
- struct list_head i_mmap_shared; /* list of shared mappings */
+ struct prio_tree_root i_mmap; /* tree of private mappings */
+ struct prio_tree_root i_mmap_shared; /* tree of shared mappings */
+ struct list_head i_mmap_nonlinear;/*list of nonlinear mappings */
struct semaphore i_shared_sem; /* protect both above lists */
atomic_t truncate_count; /* Cover race condition with truncate */
unsigned long flags; /* error bits/gfp mask */
--- 2.6.5-rc3-mjb2/include/linux/mm.h 2004-04-02 21:01:47.316860032 +0100
+++ anobjrmap9/include/linux/mm.h 2004-04-04 13:05:41.390472864 +0100
@@ -11,6 +11,7 @@
#include <linux/list.h>
#include <linux/mmzone.h>
#include <linux/rbtree.h>
+#include <linux/prio_tree.h>
#include <linux/fs.h>
#ifndef CONFIG_DISCONTIGMEM /* Don't use mapnrs, do it properly */
@@ -68,7 +69,26 @@ struct vm_area_struct {
* one of the address_space->i_mmap{,shared} lists,
* for shm areas, the list of attaches, otherwise unused.
*/
- struct list_head shared;
+ union {
+ struct {
+ struct list_head list;
+ void *parent;
+ } vm_set;
+
+ struct prio_tree_node prio_tree_node;
+
+ struct {
+ void *first;
+ void *second;
+ void *parent;
+ } both;
+ } shared;
+
+ /*
+ * shared.vm_set : list of vmas that map exactly the same set of pages
+ * vm_set_head : head of the vm_set list
+ */
+ struct vm_area_struct *vm_set_head;
/* Function pointers to deal with this struct. */
struct vm_operations_struct * vm_ops;
@@ -130,6 +150,150 @@ struct vm_area_struct {
#define VM_RandomReadHint(v) ((v)->vm_flags & VM_RAND_READ)
/*
+ * The following macros are used for implementing prio_tree for i_mmap{_shared}
+ */
+
+#define RADIX_INDEX(vma) ((vma)->vm_pgoff)
+#define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT)
+/* avoid overflow */
+#define HEAP_INDEX(vma) ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1))
+
+#define GET_INDEX_VMA(vma, radix, heap) \
+do { \
+ radix = RADIX_INDEX(vma); \
+ heap = HEAP_INDEX(vma); \
+} while (0)
+
+#define GET_INDEX(node, radix, heap) \
+do { \
+ struct vm_area_struct *__tmp = \
+ prio_tree_entry(node, struct vm_area_struct, shared.prio_tree_node);\
+ GET_INDEX_VMA(__tmp, radix, heap); \
+} while (0)
+
+#define INIT_VMA_SHARED_LIST(vma) \
+do { \
+ INIT_LIST_HEAD(&(vma)->shared.vm_set.list); \
+ (vma)->shared.vm_set.parent = NULL; \
+ (vma)->vm_set_head = NULL; \
+} while (0)
+
+#define INIT_VMA_SHARED(vma) \
+do { \
+ (vma)->shared.both.first = NULL; \
+ (vma)->shared.both.second = NULL; \
+ (vma)->shared.both.parent = NULL; \
+ (vma)->vm_set_head = NULL; \
+} while (0)
+
+extern void __vma_prio_tree_insert(struct prio_tree_root *,
+ struct vm_area_struct *);
+
+extern void __vma_prio_tree_remove(struct prio_tree_root *,
+ struct vm_area_struct *);
+
+static inline int vma_shared_empty(struct vm_area_struct *vma)
+{
+ return vma->shared.both.first == NULL;
+}
+
+/*
+ * Helps to add a new vma that maps the same (identical) set of pages as the
+ * old vma to an i_mmap tree.
+ */
+static inline void __vma_prio_tree_add(struct vm_area_struct *vma,
+ struct vm_area_struct *old)
+{
+ INIT_VMA_SHARED_LIST(vma);
+
+ /* Leave these BUG_ONs till prio_tree patch stabilizes */
+ BUG_ON(RADIX_INDEX(vma) != RADIX_INDEX(old));
+ BUG_ON(HEAP_INDEX(vma) != HEAP_INDEX(old));
+
+ if (old->shared.both.parent) {
+ if (old->vm_set_head) {
+ list_add_tail(&vma->shared.vm_set.list,
+ &old->vm_set_head->shared.vm_set.list);
+ return;
+ }
+ else {
+ old->vm_set_head = vma;
+ vma->vm_set_head = old;
+ }
+ }
+ else
+ list_add(&vma->shared.vm_set.list, &old->shared.vm_set.list);
+}
+
+/*
+ * We cannot modify vm_start, vm_end, vm_pgoff fields of a vma that has been
+ * already present in an i_mmap{_shared} tree without modifying the tree. The
+ * following helper function should be used when such modifications are
+ * necessary. We should hold the mapping's i_shared_sem.
+ *
+ * This function can be (micro)optimized for some special cases (maybe later).
+ */
+static inline void __vma_modify(struct prio_tree_root *root,
+ struct vm_area_struct *vma, unsigned long start, unsigned long end,
+ unsigned long pgoff)
+{
+ if (root)
+ __vma_prio_tree_remove(root, vma);
+ vma->vm_start = start;
+ vma->vm_end = end;
+ vma->vm_pgoff = pgoff;
+ if (root)
+ __vma_prio_tree_insert(root, vma);
+}
+
+/*
+ * Helper functions to enumerate vmas that map a given file page or a set of
+ * contiguous file pages. The functions return vmas that at least map a single
+ * page in the given range of contiguous file pages.
+ */
+static inline struct vm_area_struct *__vma_prio_tree_first(
+ struct prio_tree_root *root, struct prio_tree_iter *iter,
+ unsigned long begin, unsigned long end)
+{
+ struct prio_tree_node *ptr;
+
+ ptr = prio_tree_first(root, iter, begin, end);
+
+ if (ptr)
+ return prio_tree_entry(ptr, struct vm_area_struct,
+ shared.prio_tree_node);
+ else
+ return NULL;
+}
+
+static inline struct vm_area_struct *__vma_prio_tree_next(
+ struct vm_area_struct *vma, struct prio_tree_root *root,
+ struct prio_tree_iter *iter, unsigned long begin, unsigned long end)
+{
+ struct prio_tree_node *ptr;
+ struct vm_area_struct *next;
+
+ if (vma->shared.both.parent) {
+ if (vma->vm_set_head)
+ return vma->vm_set_head;
+ }
+ else {
+ next = list_entry(vma->shared.vm_set.list.next,
+ struct vm_area_struct, shared.vm_set.list);
+ if (!(next->vm_set_head))
+ return next;
+ }
+
+ ptr = prio_tree_next(root, iter, begin, end);
+
+ if (ptr)
+ return prio_tree_entry(ptr, struct vm_area_struct,
+ shared.prio_tree_node);
+ else
+ return NULL;
+}
+
+/*
* mapping from the currently active vm_flags protection bits (the
* low four bits) to a page protection mask..
*/
@@ -520,7 +684,6 @@ extern void __vma_link_rb(struct mm_stru
struct rb_node **, struct rb_node *);
extern struct vm_area_struct *copy_vma(struct vm_area_struct *,
unsigned long addr, unsigned long len, unsigned long pgoff);
-extern void vma_relink_file(struct vm_area_struct *, struct vm_area_struct *);
extern void exit_mmap(struct mm_struct *);
extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
--- 2.6.5-rc3-mjb2/include/linux/prio_tree.h 1970-01-01 01:00:00.000000000 +0100
+++ anobjrmap9/include/linux/prio_tree.h 2004-04-04 13:05:41.391472712 +0100
@@ -0,0 +1,78 @@
+#ifndef _LINUX_PRIO_TREE_H
+#define _LINUX_PRIO_TREE_H
+
+struct prio_tree_node {
+ struct prio_tree_node *left;
+ struct prio_tree_node *right;
+ struct prio_tree_node *parent;
+};
+
+struct prio_tree_root {
+ struct prio_tree_node *prio_tree_node;
+ unsigned int index_bits;
+};
+
+struct prio_tree_iter {
+ struct prio_tree_node *cur;
+ unsigned long mask;
+ unsigned long value;
+ int size_level;
+};
+
+#define PRIO_TREE_ROOT (struct prio_tree_root) {NULL, 1}
+
+#define PRIO_TREE_ROOT_INIT {NULL, 1}
+
+#define INIT_PRIO_TREE_ROOT(ptr) \
+do { \
+ (ptr)->prio_tree_node = NULL; \
+ (ptr)->index_bits = 1; \
+} while (0)
+
+#define PRIO_TREE_NODE_INIT(name) {&(name), &(name), &(name)}
+
+#define PRIO_TREE_NODE(name) \
+ struct prio_tree_node name = PRIO_TREE_NODE_INIT(name)
+
+#define INIT_PRIO_TREE_NODE(ptr) \
+do { \
+ (ptr)->left = (ptr)->right = (ptr)->parent = (ptr); \
+} while (0)
+
+#define prio_tree_entry(ptr, type, member) \
+ ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
+
+#define PRIO_TREE_ITER (struct prio_tree_iter) {NULL, 0UL, 0UL, 0}
+
+static inline int prio_tree_empty(const struct prio_tree_root *root)
+{
+ return root->prio_tree_node == NULL;
+}
+
+static inline int prio_tree_root(const struct prio_tree_node *node)
+{
+ return node->parent == node;
+}
+
+static inline int prio_tree_left_empty(const struct prio_tree_node *node)
+{
+ return node->left == node;
+}
+
+static inline int prio_tree_right_empty(const struct prio_tree_node *node)
+{
+ return node->right == node;
+}
+
+extern struct prio_tree_node *prio_tree_insert(struct prio_tree_root *,
+ struct prio_tree_node *);
+
+extern void prio_tree_remove(struct prio_tree_root *, struct prio_tree_node *);
+
+extern struct prio_tree_node *prio_tree_first(struct prio_tree_root *,
+ struct prio_tree_iter *, unsigned long, unsigned long);
+
+extern struct prio_tree_node *prio_tree_next(struct prio_tree_root *,
+ struct prio_tree_iter *, unsigned long, unsigned long);
+
+#endif
--- 2.6.5-rc3-mjb2/init/main.c 2004-04-02 21:01:47.345855624 +0100
+++ anobjrmap9/init/main.c 2004-04-04 13:05:41.392472560 +0100
@@ -85,6 +85,7 @@ extern void buffer_init(void);
extern void pidhash_init(void);
extern void pidmap_init(void);
extern void radix_tree_init(void);
+extern void prio_tree_init(void);
extern void free_initmem(void);
extern void populate_rootfs(void);
extern void driver_init(void);
@@ -466,6 +467,7 @@ asmlinkage void __init start_kernel(void
calibrate_delay();
pidmap_init();
pgtable_cache_init();
+ prio_tree_init();
#ifdef CONFIG_X86
if (efi_enabled)
efi_enter_virtual_mode();
--- 2.6.5-rc3-mjb2/kernel/fork.c 2004-04-02 21:01:47.350854864 +0100
+++ anobjrmap9/kernel/fork.c 2004-04-04 13:05:41.394472256 +0100
@@ -314,7 +314,7 @@ static inline int dup_mmap(struct mm_str
tmp->vm_mm = mm;
tmp->vm_next = NULL;
file = tmp->vm_file;
- INIT_LIST_HEAD(&tmp->shared);
+ INIT_VMA_SHARED(tmp);
if (file) {
struct inode *inode = file->f_dentry->d_inode;
get_file(file);
@@ -323,7 +323,7 @@ static inline int dup_mmap(struct mm_str
/* insert tmp into the share list, just after mpnt */
down(&file->f_mapping->i_shared_sem);
- list_add(&tmp->shared, &mpnt->shared);
+ __vma_prio_tree_add(tmp, mpnt);
up(&file->f_mapping->i_shared_sem);
}
--- 2.6.5-rc3-mjb2/kernel/kexec.c 2004-04-02 21:01:47.354854256 +0100
+++ anobjrmap9/kernel/kexec.c 2004-04-04 13:05:41.395472104 +0100
@@ -191,7 +191,7 @@ static int identity_map_pages(struct pag
vma->vm_page_prot = protection_map[vma->vm_flags & 0xf];
vma->vm_file = NULL;
vma->vm_private_data = NULL;
- INIT_LIST_HEAD(&vma->shared);
+ INIT_VMA_SHARED(vma);
insert_vm_struct(mm, vma);
error = remap_page_range(vma, vma->vm_start, vma->vm_start,
--- 2.6.5-rc3-mjb2/mm/Makefile 2004-04-02 21:01:47.392848480 +0100
+++ anobjrmap9/mm/Makefile 2004-04-04 13:05:41.395472104 +0100
@@ -9,7 +9,8 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o
obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
page_alloc.o page-writeback.o pdflush.o readahead.o \
- slab.o swap.o truncate.o vmscan.o $(mmu-y)
+ slab.o swap.o truncate.o vmscan.o prio_tree.o \
+ $(mmu-y)
obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
obj-$(CONFIG_X86_4G) += usercopy.o
--- 2.6.5-rc3-mjb2/mm/filemap.c 2004-04-02 21:01:47.395848024 +0100
+++ anobjrmap9/mm/filemap.c 2004-04-04 13:05:41.397471800 +0100
@@ -632,7 +632,8 @@ page_ok:
* virtual addresses, take care about potential aliasing
* before reading the page on the kernel side.
*/
- if (!list_empty(&mapping->i_mmap_shared))
+ if (!prio_tree_empty(&mapping->i_mmap_shared) ||
+ !list_empty(&mapping->i_mmap_nonlinear))
flush_dcache_page(page);
/*
--- 2.6.5-rc3-mjb2/mm/fremap.c 2004-04-02 21:01:47.397847720 +0100
+++ anobjrmap9/mm/fremap.c 2004-04-04 13:05:41.398471648 +0100
@@ -157,6 +157,8 @@ asmlinkage long sys_remap_file_pages(uns
unsigned long __prot, unsigned long pgoff, unsigned long flags)
{
struct mm_struct *mm = current->mm;
+ struct address_space *mapping;
+ unsigned long linear_pgoff;
unsigned long end = start + size;
struct vm_area_struct *vma;
int err = -EINVAL;
@@ -197,8 +199,18 @@ asmlinkage long sys_remap_file_pages(uns
end <= vma->vm_end) {
/* Must set VM_NONLINEAR before any pages are populated. */
- if (pgoff != ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff)
+ linear_pgoff = vma->vm_pgoff;
+ linear_pgoff += ((start - vma->vm_start) >> PAGE_SHIFT);
+ if (pgoff != linear_pgoff && !(vma->vm_flags & VM_NONLINEAR)) {
+ mapping = vma->vm_file->f_mapping;
+ down(&mapping->i_shared_sem);
vma->vm_flags |= VM_NONLINEAR;
+ __vma_prio_tree_remove(&mapping->i_mmap_shared, vma);
+ INIT_VMA_SHARED_LIST(vma);
+ list_add_tail(&vma->shared.vm_set.list,
+ &mapping->i_mmap_nonlinear);
+ up(&mapping->i_shared_sem);
+ }
/* ->populate can take a long time, so downgrade the lock. */
downgrade_write(&mm->mmap_sem);
--- 2.6.5-rc3-mjb2/mm/memory.c 2004-04-02 21:01:47.402846960 +0100
+++ anobjrmap9/mm/memory.c 2004-04-04 13:05:41.400471344 +0100
@@ -1077,11 +1077,11 @@ no_new_page:
* An hlen of zero blows away the entire portion file after hba.
*/
static void
-invalidate_mmap_range_list(struct list_head *head,
+invalidate_mmap_range_list(struct prio_tree_root *root,
unsigned long const hba,
unsigned long const hlen)
{
- struct list_head *curr;
+ struct prio_tree_iter iter;
unsigned long hea; /* last page of hole. */
unsigned long vba;
unsigned long vea; /* last page of corresponding uva hole. */
@@ -1092,17 +1092,16 @@ invalidate_mmap_range_list(struct list_h
hea = hba + hlen - 1; /* avoid overflow. */
if (hea < hba)
hea = ULONG_MAX;
- list_for_each(curr, head) {
- vp = list_entry(curr, struct vm_area_struct, shared);
+ vp = __vma_prio_tree_first(root, &iter, hba, hea);
+ while(vp) {
vba = vp->vm_pgoff;
vea = vba + ((vp->vm_end - vp->vm_start) >> PAGE_SHIFT) - 1;
- if (hea < vba || vea < hba)
- continue; /* Mapping disjoint from hole. */
zba = (hba <= vba) ? vba : hba;
zea = (vea <= hea) ? vea : hea;
zap_page_range(vp,
((zba - vba) << PAGE_SHIFT) + vp->vm_start,
(zea - zba + 1) << PAGE_SHIFT);
+ vp = __vma_prio_tree_next(vp, root, &iter, hba, hea);
}
}
@@ -1137,9 +1136,9 @@ void invalidate_mmap_range(struct addres
down(&mapping->i_shared_sem);
/* Protect against page fault */
atomic_inc(&mapping->truncate_count);
- if (unlikely(!list_empty(&mapping->i_mmap)))
+ if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
invalidate_mmap_range_list(&mapping->i_mmap, hba, hlen);
- if (unlikely(!list_empty(&mapping->i_mmap_shared)))
+ if (unlikely(!prio_tree_empty(&mapping->i_mmap_shared)))
invalidate_mmap_range_list(&mapping->i_mmap_shared, hba, hlen);
up(&mapping->i_shared_sem);
}
--- 2.6.5-rc3-mjb2/mm/mmap.c 2004-04-02 21:01:47.406846352 +0100
+++ anobjrmap9/mm/mmap.c 2004-04-04 13:05:41.403470888 +0100
@@ -68,12 +68,20 @@ int mmap_hugepages_map_sz = 256;
* Requires inode->i_mapping->i_shared_sem
*/
static inline void
-__remove_shared_vm_struct(struct vm_area_struct *vma, struct inode *inode)
+__remove_shared_vm_struct(struct vm_area_struct *vma, struct inode *inode,
+ struct address_space * mapping)
{
if (inode) {
if (vma->vm_flags & VM_DENYWRITE)
atomic_inc(&inode->i_writecount);
- list_del_init(&vma->shared);
+ if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
+ list_del_init(&vma->shared.vm_set.list);
+ INIT_VMA_SHARED(vma);
+ }
+ else if (vma->vm_flags & VM_SHARED)
+ __vma_prio_tree_remove(&mapping->i_mmap_shared, vma);
+ else
+ __vma_prio_tree_remove(&mapping->i_mmap, vma);
}
}
@@ -87,7 +95,8 @@ static void remove_shared_vm_struct(stru
if (file) {
struct address_space *mapping = file->f_mapping;
down(&mapping->i_shared_sem);
- __remove_shared_vm_struct(vma, file->f_dentry->d_inode);
+ __remove_shared_vm_struct(vma, file->f_dentry->d_inode,
+ mapping);
up(&mapping->i_shared_sem);
}
}
@@ -261,10 +270,15 @@ static inline void __vma_link_file(struc
if (vma->vm_flags & VM_DENYWRITE)
atomic_dec(&file->f_dentry->d_inode->i_writecount);
- if (vma->vm_flags & VM_SHARED)
- list_add_tail(&vma->shared, &mapping->i_mmap_shared);
+ if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
+ INIT_VMA_SHARED_LIST(vma);
+ list_add_tail(&vma->shared.vm_set.list,
+ &mapping->i_mmap_nonlinear);
+ }
+ else if (vma->vm_flags & VM_SHARED)
+ __vma_prio_tree_insert(&mapping->i_mmap_shared, vma);
else
- list_add_tail(&vma->shared, &mapping->i_mmap);
+ __vma_prio_tree_insert(&mapping->i_mmap, vma);
}
}
@@ -393,7 +407,9 @@ static struct vm_area_struct *vma_merge(
{
spinlock_t *lock = &mm->page_table_lock;
struct inode *inode = file ? file->f_dentry->d_inode : NULL;
+ struct address_space *mapping = file ? file->f_mapping : NULL;
struct semaphore *i_shared_sem;
+ struct prio_tree_root *root = NULL;
/*
* We later require that vma->vm_flags == vm_flags, so this tests
@@ -404,6 +420,15 @@ static struct vm_area_struct *vma_merge(
i_shared_sem = file ? &file->f_mapping->i_shared_sem : NULL;
+ if (mapping) {
+ if (vm_flags & VM_SHARED) {
+ if (likely(!(vm_flags & VM_NONLINEAR)))
+ root = &mapping->i_mmap_shared;
+ }
+ else
+ root = &mapping->i_mmap;
+ }
+
if (!prev) {
prev = rb_entry(rb_parent, struct vm_area_struct, vm_rb);
goto merge_next;
@@ -423,18 +448,18 @@ static struct vm_area_struct *vma_merge(
need_up = 1;
}
spin_lock(lock);
- prev->vm_end = end;
/*
* OK, it did. Can we now merge in the successor as well?
*/
next = prev->vm_next;
- if (next && prev->vm_end == next->vm_start &&
+ if (next && end == next->vm_start &&
can_vma_merge_before(next, vm_flags, file,
pgoff, (end - addr) >> PAGE_SHIFT)) {
- prev->vm_end = next->vm_end;
__vma_unlink(mm, next, prev);
- __remove_shared_vm_struct(next, inode);
+ __vma_modify(root, prev, prev->vm_start,
+ next->vm_end, prev->vm_pgoff);
+ __remove_shared_vm_struct(next, inode, mapping);
spin_unlock(lock);
if (need_up)
up(i_shared_sem);
@@ -445,6 +470,8 @@ static struct vm_area_struct *vma_merge(
kmem_cache_free(vm_area_cachep, next);
return prev;
}
+
+ __vma_modify(root, prev, prev->vm_start, end, prev->vm_pgoff);
spin_unlock(lock);
if (need_up)
up(i_shared_sem);
@@ -464,8 +491,8 @@ static struct vm_area_struct *vma_merge(
if (file)
down(i_shared_sem);
spin_lock(lock);
- prev->vm_start = addr;
- prev->vm_pgoff -= (end - addr) >> PAGE_SHIFT;
+ __vma_modify(root, prev, addr, prev->vm_end,
+ prev->vm_pgoff - ((end - addr) >> PAGE_SHIFT));
spin_unlock(lock);
if (file)
up(i_shared_sem);
@@ -698,7 +725,7 @@ munmap_back:
vma->vm_file = NULL;
vma->vm_private_data = NULL;
vma->vm_next = NULL;
- INIT_LIST_HEAD(&vma->shared);
+ INIT_VMA_SHARED(vma);
if (file) {
error = -EINVAL;
@@ -1289,6 +1316,7 @@ int split_vma(struct mm_struct * mm, str
{
struct vm_area_struct *new;
struct address_space *mapping = NULL;
+ struct prio_tree_root *root = NULL;
if (mm->map_count >= MAX_MAP_COUNT)
return -ENOMEM;
@@ -1300,7 +1328,7 @@ int split_vma(struct mm_struct * mm, str
/* most fields are the same, copy all, and then fixup */
*new = *vma;
- INIT_LIST_HEAD(&new->shared);
+ INIT_VMA_SHARED(new);
if (new_below)
new->vm_end = addr;
@@ -1315,18 +1343,25 @@ int split_vma(struct mm_struct * mm, str
if (new->vm_ops && new->vm_ops->open)
new->vm_ops->open(new);
- if (vma->vm_file)
+ if (vma->vm_file) {
mapping = vma->vm_file->f_mapping;
+ if (vma->vm_flags & VM_SHARED) {
+ if (likely(!(vma->vm_flags & VM_NONLINEAR)))
+ root = &mapping->i_mmap_shared;
+ }
+ else
+ root = &mapping->i_mmap;
+ }
if (mapping)
down(&mapping->i_shared_sem);
spin_lock(&mm->page_table_lock);
- if (new_below) {
- vma->vm_start = addr;
- vma->vm_pgoff += ((addr - new->vm_start) >> PAGE_SHIFT);
- } else
- vma->vm_end = addr;
+ if (new_below)
+ __vma_modify(root, vma, addr, vma->vm_end,
+ vma->vm_pgoff + ((addr - new->vm_start) >> PAGE_SHIFT));
+ else
+ __vma_modify(root, vma, vma->vm_start, addr, vma->vm_pgoff);
__insert_vm_struct(mm, new);
@@ -1499,7 +1534,7 @@ unsigned long do_brk(unsigned long addr,
vma->vm_pgoff = 0;
vma->vm_file = NULL;
vma->vm_private_data = NULL;
- INIT_LIST_HEAD(&vma->shared);
+ INIT_VMA_SHARED(vma);
vma_link(mm, vma, prev, rb_link, rb_parent);
@@ -1597,7 +1632,7 @@ struct vm_area_struct *copy_vma(struct v
new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
if (new_vma) {
*new_vma = *vma;
- INIT_LIST_HEAD(&new_vma->shared);
+ INIT_VMA_SHARED(new_vma);
new_vma->vm_start = addr;
new_vma->vm_end = addr + len;
new_vma->vm_pgoff = pgoff;
@@ -1610,24 +1645,3 @@ struct vm_area_struct *copy_vma(struct v
}
return new_vma;
}
-
-/*
- * Position vma after prev in shared file list:
- * for mremap move error recovery racing against vmtruncate.
- */
-void vma_relink_file(struct vm_area_struct *vma, struct vm_area_struct *prev)
-{
- struct mm_struct *mm = vma->vm_mm;
- struct address_space *mapping;
-
- if (vma->vm_file) {
- mapping = vma->vm_file->f_mapping;
- if (mapping) {
- down(&mapping->i_shared_sem);
- spin_lock(&mm->page_table_lock);
- list_move(&vma->shared, &prev->shared);
- spin_unlock(&mm->page_table_lock);
- up(&mapping->i_shared_sem);
- }
- }
-}
--- 2.6.5-rc3-mjb2/mm/mremap.c 2004-04-02 21:01:47.409845896 +0100
+++ anobjrmap9/mm/mremap.c 2004-04-04 13:05:41.405470584 +0100
@@ -237,6 +237,7 @@ static int move_page_tables(struct vm_ar
* only a few pages.. This also makes error recovery easier.
*/
while (offset < len) {
+ cond_resched();
ret = move_one_page(vma, old_addr+offset, new_addr+offset);
if (!ret) {
offset += PAGE_SIZE;
@@ -266,6 +267,7 @@ static unsigned long move_vma(struct vm_
unsigned long new_len, unsigned long new_addr)
{
struct mm_struct *mm = vma->vm_mm;
+ struct address_space *mapping = NULL;
struct vm_area_struct *new_vma;
unsigned long vm_flags = vma->vm_flags;
unsigned long new_pgoff;
@@ -285,26 +287,31 @@ static unsigned long move_vma(struct vm_
if (!new_vma)
return -ENOMEM;
+ if (vma->vm_file) {
+ /*
+ * Subtle point from Rajesh Venkatasubramanian: before
+ * moving file-based ptes, we must lock vmtruncate out,
+ * since it might clean the dst vma before the src vma,
+ * and we propagate stale pages into the dst afterward.
+ */
+ mapping = vma->vm_file->f_mapping;
+ down(&mapping->i_shared_sem);
+ }
moved_len = move_page_tables(vma, new_addr, old_addr, old_len);
if (moved_len < old_len) {
/*
* On error, move entries back from new area to old,
* which will succeed since page tables still there,
* and then proceed to unmap new area instead of old.
- *
- * Subtle point from Rajesh Venkatasubramanian: before
- * moving file-based ptes, move new_vma before old vma
- * in the i_mmap or i_mmap_shared list, so when racing
- * against vmtruncate we cannot propagate pages to be
- * truncated back from new_vma into just cleaned old.
*/
- vma_relink_file(vma, new_vma);
move_page_tables(new_vma, old_addr, new_addr, moved_len);
vma = new_vma;
old_len = new_len;
old_addr = new_addr;
new_addr = -ENOMEM;
}
+ if (mapping)
+ up(&mapping->i_shared_sem);
/* Conceal VM_ACCOUNT so old reservation is not undone */
if (vm_flags & VM_ACCOUNT) {
@@ -351,6 +358,8 @@ unsigned long do_mremap(unsigned long ad
unsigned long flags, unsigned long new_addr)
{
struct vm_area_struct *vma;
+ struct address_space *mapping = NULL;
+ struct prio_tree_root *root = NULL;
unsigned long ret = -EINVAL;
unsigned long charged = 0;
@@ -458,9 +467,26 @@ unsigned long do_mremap(unsigned long ad
/* can we just expand the current mapping? */
if (max_addr - addr >= new_len) {
int pages = (new_len - old_len) >> PAGE_SHIFT;
+
+ if (vma->vm_file) {
+ mapping = vma->vm_file->f_mapping;
+ if (vma->vm_flags & VM_SHARED) {
+ if (likely(!(vma->vm_flags & VM_NONLINEAR)))
+ root = &mapping->i_mmap_shared;
+ }
+ else
+ root = &mapping->i_mmap;
+ down(&mapping->i_shared_sem);
+ }
+
spin_lock(&vma->vm_mm->page_table_lock);
- vma->vm_end = addr + new_len;
+ __vma_modify(root, vma, vma->vm_start,
+ addr + new_len, vma->vm_pgoff);
spin_unlock(&vma->vm_mm->page_table_lock);
+
+ if(mapping)
+ up(&mapping->i_shared_sem);
+
current->mm->total_vm += pages;
if (vma->vm_flags & VM_LOCKED) {
current->mm->locked_vm += pages;
--- 2.6.5-rc3-mjb2/mm/page_io.c 2004-04-02 21:01:47.416844832 +0100
+++ anobjrmap9/mm/page_io.c 2004-04-04 13:05:41.405470584 +0100
@@ -135,12 +135,14 @@ out:
int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page)
{
int ret;
+ unsigned long save_private;
struct writeback_control swap_wbc = {
.sync_mode = WB_SYNC_ALL,
};
lock_page(page);
SetPageSwapCache(page);
+ save_private = page->private;
page->private = entry.val;
if (rw == READ) {
@@ -150,7 +152,9 @@ int rw_swap_page_sync(int rw, swp_entry_
ret = swap_writepage(page, &swap_wbc);
wait_on_page_writeback(page);
}
+
ClearPageSwapCache(page);
+ page->private = save_private;
if (ret == 0 && (!PageUptodate(page) || PageError(page)))
ret = -EIO;
return ret;
--- 2.6.5-rc3-mjb2/mm/prio_tree.c 1970-01-01 01:00:00.000000000 +0100
+++ anobjrmap9/mm/prio_tree.c 2004-04-04 13:05:41.408470128 +0100
@@ -0,0 +1,577 @@
+/*
+ * mm/prio_tree.c - priority search tree for mapping->i_mmap{,_shared}
+ *
+ * Copyright (C) 2004, Rajesh Venkatasubramanian <vrajesh@umich.edu>
+ *
+ * Based on the radix priority search tree proposed by Edward M. McCreight
+ * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985
+ *
+ * 02Feb2004 Initial version
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/prio_tree.h>
+
+/*
+ * A clever mix of heap and radix trees forms a radix priority search tree (PST)
+ * which is useful for storing intervals, e.g., we can consider a vma as a closed
+ * interval of file pages [offset_begin, offset_end], and store all vmas that
+ * map a file in a PST. Then, using the PST, we can answer a stabbing query,
+ * i.e., selecting a set of stored intervals (vmas) that overlap with (map) a
+ * given input interval X (a set of consecutive file pages), in "O(log n + m)"
+ * time where 'log n' is the height of the PST, and 'm' is the number of stored
+ * intervals (vmas) that overlap (map) with the input interval X (the set of
+ * consecutive file pages).
+ *
+ * In our implementation, we store closed intervals of the form [radix_index,
+ * heap_index]. We assume that always radix_index <= heap_index. McCreight's PST
+ * is designed for storing intervals with unique radix indices, i.e., each
+ * interval has a different radix_index. However, this limitation can be easily
+ * overcome by using the size, i.e., heap_index - radix_index, as part of the
+ * index, so we index the tree using [(radix_index,size), heap_index].
+ *
+ * When the above-mentioned indexing scheme is used, theoretically, in a 32 bit
+ * machine, the maximum height of a PST can be 64. We can use a balanced version
+ * of the priority search tree to optimize the tree height, but the balanced
+ * tree proposed by McCreight is too complex and memory-hungry for our purpose.
+ */
+
+static unsigned long index_bits_to_maxindex[BITS_PER_LONG];
+
+/*
+ * Maximum heap_index that can be stored in a PST with index_bits bits
+ */
+static inline unsigned long prio_tree_maxindex(unsigned int bits)
+{
+ return index_bits_to_maxindex[bits - 1];
+}
+
+/*
+ * Extend a priority search tree so that it can store a node with heap_index
+ * max_heap_index. In the worst case, this algorithm takes O((log n)^2).
+ * However, this function is used rarely and the common case performance is
+ * not bad.
+ */
+static struct prio_tree_node *prio_tree_expand(struct prio_tree_root *root,
+ struct prio_tree_node *node, unsigned long max_heap_index)
+{
+ struct prio_tree_node *first = NULL, *prev, *last = NULL;
+
+ if (max_heap_index > prio_tree_maxindex(root->index_bits))
+ root->index_bits++;
+
+ while (max_heap_index > prio_tree_maxindex(root->index_bits)) {
+ root->index_bits++;
+
+ if (prio_tree_empty(root))
+ continue;
+
+ if (first == NULL) {
+ first = root->prio_tree_node;
+ prio_tree_remove(root, root->prio_tree_node);
+ INIT_PRIO_TREE_NODE(first);
+ last = first;
+ }
+ else {
+ prev = last;
+ last = root->prio_tree_node;
+ prio_tree_remove(root, root->prio_tree_node);
+ INIT_PRIO_TREE_NODE(last);
+ prev->left = last;
+ last->parent = prev;
+ }
+ }
+
+ INIT_PRIO_TREE_NODE(node);
+
+ if (first) {
+ node->left = first;
+ first->parent = node;
+ }
+ else
+ last = node;
+
+ if (!prio_tree_empty(root)) {
+ last->left = root->prio_tree_node;
+ last->left->parent = last;
+ }
+
+ root->prio_tree_node = node;
+ return node;
+}
+
+/*
+ * Replace a prio_tree_node with a new node and return the old node
+ */
+static inline struct prio_tree_node *prio_tree_replace(
+ struct prio_tree_root *root, struct prio_tree_node *old,
+ struct prio_tree_node *node)
+{
+ INIT_PRIO_TREE_NODE(node);
+
+ if (prio_tree_root(old)) {
+ BUG_ON(root->prio_tree_node != old);
+ /*
+ * We can reduce root->index_bits here. However, it is complex
+ * and does not help much to improve performance (IMO).
+ */
+ node->parent = node;
+ root->prio_tree_node = node;
+ }
+ else {
+ node->parent = old->parent;
+ if (old->parent->left == old)
+ old->parent->left = node;
+ else {
+ BUG_ON(old->parent->right != old);
+ old->parent->right = node;
+ }
+ }
+
+ if (!prio_tree_left_empty(old)) {
+ node->left = old->left;
+ old->left->parent = node;
+ }
+
+ if (!prio_tree_right_empty(old)) {
+ node->right = old->right;
+ old->right->parent = node;
+ }
+
+ return old;
+}
+
+#undef swap
+#define swap(x,y,z) do {z = x; x = y; y = z; } while (0)
+
+/*
+ * Insert a prio_tree_node @node into a radix priority search tree @root. The
+ * algorithm typically takes O(log n) time where 'log n' is the number of bits
+ * required to represent the maximum heap_index. In the worst case, the algo
+ * can take O((log n)^2) - check prio_tree_expand.
+ *
+ * If a prior node with same radix_index and heap_index is already found in
+ * the tree, then returns the address of the prior node. Otherwise, inserts
+ * @node into the tree and returns @node.
+ */
+
+struct prio_tree_node *prio_tree_insert(struct prio_tree_root *root,
+ struct prio_tree_node *node)
+{
+ struct prio_tree_node *cur, *res = node;
+ unsigned long radix_index, heap_index;
+ unsigned long r_index, h_index, index, mask;
+ int size_flag = 0;
+
+ GET_INDEX(node, radix_index, heap_index);
+
+ if (prio_tree_empty(root) ||
+ heap_index > prio_tree_maxindex(root->index_bits))
+ return prio_tree_expand(root, node, heap_index);
+
+ cur = root->prio_tree_node;
+ mask = 1UL << (root->index_bits - 1);
+
+ while (mask) {
+ GET_INDEX(cur, r_index, h_index);
+
+ if (r_index == radix_index && h_index == heap_index)
+ return cur;
+
+ if (h_index < heap_index || (h_index == heap_index &&
+ r_index > radix_index))
+ {
+ struct prio_tree_node *tmp = node;
+ node = prio_tree_replace(root, cur, node);
+ cur = tmp;
+ swap(r_index, radix_index, index);
+ swap(h_index, heap_index, index);
+ }
+
+ if (size_flag)
+ index = heap_index - radix_index;
+ else
+ index = radix_index;
+
+ if (index & mask) {
+ if (prio_tree_right_empty(cur)) {
+ INIT_PRIO_TREE_NODE(node);
+ cur->right = node;
+ node->parent = cur;
+ return res;
+ }
+ else
+ cur = cur->right;
+ }
+ else {
+ if (prio_tree_left_empty(cur)) {
+ INIT_PRIO_TREE_NODE(node);
+ cur->left = node;
+ node->parent = cur;
+ return res;
+ }
+ else
+ cur = cur->left;
+ }
+
+ mask >>= 1;
+
+ if (!mask) {
+ mask = 1UL << (root->index_bits - 1);
+ size_flag = 1;
+ }
+ }
+ /* Should not reach here */
+ BUG();
+ return NULL;
+}
+
+/*
+ * Remove a prio_tree_node @node from a radix priority search tree @root. The
+ * algorithm takes O(log n) time where 'log n' is the number of bits required
+ * to represent the maximum heap_index.
+ */
+
+void prio_tree_remove(struct prio_tree_root *root, struct prio_tree_node *node)
+{
+ struct prio_tree_node *cur;
+ unsigned long r_index, h_index_right, h_index_left;
+
+ cur = node;
+
+ while (!prio_tree_left_empty(cur) || !prio_tree_right_empty(cur)) {
+ if (!prio_tree_left_empty(cur))
+ GET_INDEX(cur->left, r_index, h_index_left);
+ else {
+ cur = cur->right;
+ continue;
+ }
+
+ if (!prio_tree_right_empty(cur))
+ GET_INDEX(cur->right, r_index, h_index_right);
+ else {
+ cur = cur->left;
+ continue;
+ }
+
+ /* both h_index_left and h_index_right cannot be 0 */
+ if (h_index_left >= h_index_right)
+ cur = cur->left;
+ else
+ cur = cur->right;
+ }
+
+ if (prio_tree_root(cur)) {
+ BUG_ON(root->prio_tree_node != cur);
+ *root = PRIO_TREE_ROOT;
+ return;
+ }
+
+ if (cur->parent->right == cur)
+ cur->parent->right = cur->parent;
+ else {
+ BUG_ON(cur->parent->left != cur);
+ cur->parent->left = cur->parent;
+ }
+
+ while (cur != node)
+ cur = prio_tree_replace(root, cur->parent, cur);
+
+ return;
+}
+
+/*
+ * The following functions enumerate all prio_tree_nodes in the tree that
+ * overlap with the input interval X [radix_index, heap_index]. The enumeration
+ * takes O(log n + m) time where 'log n' is the height of the tree (which is
+ * proportional to # of bits required to represent the maximum heap_index) and
+ * 'm' is the number of prio_tree_nodes that overlap the interval X.
+ */
+
+static inline struct prio_tree_node *__prio_tree_left(
+ struct prio_tree_root *root, struct prio_tree_iter *iter,
+ unsigned long radix_index, unsigned long heap_index,
+ unsigned long *r_index, unsigned long *h_index)
+{
+ if (prio_tree_left_empty(iter->cur))
+ return NULL;
+
+ GET_INDEX(iter->cur->left, *r_index, *h_index);
+
+ if (radix_index <= *h_index) {
+ iter->cur = iter->cur->left;
+ iter->mask >>= 1;
+ if (iter->mask) {
+ if (iter->size_level)
+ iter->size_level++;
+ }
+ else {
+ iter->size_level = 1;
+ iter->mask = 1UL << (root->index_bits - 1);
+ }
+ return iter->cur;
+ }
+
+ return NULL;
+}
+
+
+static inline struct prio_tree_node *__prio_tree_right(
+ struct prio_tree_root *root, struct prio_tree_iter *iter,
+ unsigned long radix_index, unsigned long heap_index,
+ unsigned long *r_index, unsigned long *h_index)
+{
+ unsigned long value;
+
+ if (prio_tree_right_empty(iter->cur))
+ return NULL;
+
+ if (iter->size_level)
+ value = iter->value;
+ else
+ value = iter->value | iter->mask;
+
+ if (heap_index < value)
+ return NULL;
+
+ GET_INDEX(iter->cur->right, *r_index, *h_index);
+
+ if (radix_index <= *h_index) {
+ iter->cur = iter->cur->right;
+ iter->mask >>= 1;
+ iter->value = value;
+ if (iter->mask) {
+ if (iter->size_level)
+ iter->size_level++;
+ }
+ else {
+ iter->size_level = 1;
+ iter->mask = 1UL << (root->index_bits - 1);
+ }
+ return iter->cur;
+ }
+
+ return NULL;
+}
+
+static inline struct prio_tree_node *__prio_tree_parent(
+ struct prio_tree_iter *iter)
+{
+ iter->cur = iter->cur->parent;
+ iter->mask <<= 1;
+ if (iter->size_level) {
+ if (iter->size_level == 1)
+ iter->mask = 1UL;
+ iter->size_level--;
+ }
+ else if (iter->value & iter->mask)
+ iter->value ^= iter->mask;
+ return iter->cur;
+}
+
+static inline int overlap(unsigned long radix_index, unsigned long heap_index,
+ unsigned long r_index, unsigned long h_index)
+{
+ if (heap_index < r_index || radix_index > h_index)
+ return 0;
+
+ return 1;
+}
+
+/*
+ * prio_tree_first:
+ *
+ * Get the first prio_tree_node that overlaps with the interval [radix_index,
+ * heap_index]. Note that always radix_index <= heap_index. We do a pre-order
+ * traversal of the tree.
+ */
+struct prio_tree_node *prio_tree_first(struct prio_tree_root *root,
+ struct prio_tree_iter *iter, unsigned long radix_index,
+ unsigned long heap_index)
+{
+ unsigned long r_index, h_index;
+
+ *iter = PRIO_TREE_ITER;
+
+ if (prio_tree_empty(root))
+ return NULL;
+
+ GET_INDEX(root->prio_tree_node, r_index, h_index);
+
+ if (radix_index > h_index)
+ return NULL;
+
+ iter->mask = 1UL << (root->index_bits - 1);
+ iter->cur = root->prio_tree_node;
+
+ while (1) {
+ if (overlap(radix_index, heap_index, r_index, h_index))
+ return iter->cur;
+
+ if (__prio_tree_left(root, iter, radix_index, heap_index,
+ &r_index, &h_index))
+ continue;
+
+ if (__prio_tree_right(root, iter, radix_index, heap_index,
+ &r_index, &h_index))
+ continue;
+
+ break;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(prio_tree_first);
+
+/* Get the next prio_tree_node that overlaps with the input interval in iter */
+struct prio_tree_node *prio_tree_next(struct prio_tree_root *root,
+ struct prio_tree_iter *iter, unsigned long radix_index,
+ unsigned long heap_index)
+{
+ unsigned long r_index, h_index;
+
+repeat:
+ while (__prio_tree_left(root, iter, radix_index, heap_index,
+ &r_index, &h_index))
+ if (overlap(radix_index, heap_index, r_index, h_index))
+ return iter->cur;
+
+ while (!__prio_tree_right(root, iter, radix_index, heap_index,
+ &r_index, &h_index)) {
+ while (!prio_tree_root(iter->cur) &&
+ iter->cur->parent->right == iter->cur)
+ __prio_tree_parent(iter);
+
+ if (prio_tree_root(iter->cur))
+ return NULL;
+
+ __prio_tree_parent(iter);
+ }
+
+ if (overlap(radix_index, heap_index, r_index, h_index))
+ return iter->cur;
+
+ goto repeat;
+}
+EXPORT_SYMBOL(prio_tree_next);
+
+/*
+ * Radix priority search tree for address_space->i_mmap{,_shared}
+ *
+ * For each vma that map a unique set of file pages i.e., unique [radix_index,
+ * heap_index] value, we have a corresponding priority search tree node. If
+ * multiple vmas have identical [radix_index, heap_index] value, then one of
+ * them is used as a tree node and others are stored in a vm_set list. The tree
+ * node points to the first vma (head) of the list using vm_set_head.
+ *
+ * prio_tree_root
+ * |
+ * A vm_set_head
+ * / \ /
+ * L R -> H-I-J-K-M-N-O-P-Q-S
+ * ^ ^ <-- vm_set.list -->
+ * tree nodes
+ *
+ * We need some way to identify whether a vma is a tree node, head of a vm_set
+ * list, or just a member of a vm_set list. We cannot use vm_flags to store
+ * such information. The reason is, in the above figure, it is possible that
+ * vm_flags' of R and H are covered by the different mmap_sems. When R is
+ * removed under R->mmap_sem, H replaces R as a tree node. Since we do not hold
+ * H->mmap_sem, we cannot use H->vm_flags for marking that H is a tree node now.
+ * That's why some trick involving shared.both.parent is used for identifying
+ * tree nodes and list head nodes. We can possibly use the least significant
+ * bit of the vm_set_head field to mark tree and list head nodes. I was worried
+ * about the alignment of vm_area_struct in various architectures.
+ *
+ * vma radix priority search tree node rules:
+ *
+ * vma->shared.both.parent != NULL ==> a tree node
+ *
+ * vma->shared.both.parent == NULL
+ * vma->vm_set_head != NULL ==> list head of vmas that map same pages
+ * vma->vm_set_head == NULL ==> a list node
+ */
+
+void __vma_prio_tree_insert(struct prio_tree_root *root,
+ struct vm_area_struct *vma)
+{
+ struct prio_tree_node *ptr;
+ struct vm_area_struct *old;
+
+ ptr = prio_tree_insert(root, &vma->shared.prio_tree_node);
+
+ if (ptr == &vma->shared.prio_tree_node) {
+ vma->vm_set_head = NULL;
+ return;
+ }
+
+ old = prio_tree_entry(ptr, struct vm_area_struct,
+ shared.prio_tree_node);
+
+ __vma_prio_tree_add(vma, old);
+}
+
+void __vma_prio_tree_remove(struct prio_tree_root *root,
+ struct vm_area_struct *vma)
+{
+ struct vm_area_struct *node, *head, *new_head;
+
+ if (vma->shared.both.parent == NULL && vma->vm_set_head == NULL) {
+ list_del_init(&vma->shared.vm_set.list);
+ INIT_VMA_SHARED(vma);
+ return;
+ }
+
+ if (vma->vm_set_head) {
+ /* Leave this BUG_ON till prio_tree patch stabilizes */
+ BUG_ON(vma->vm_set_head->vm_set_head != vma);
+ if (vma->shared.both.parent) {
+ head = vma->vm_set_head;
+ if (!list_empty(&head->shared.vm_set.list)) {
+ new_head = list_entry(
+ head->shared.vm_set.list.next,
+ struct vm_area_struct,
+ shared.vm_set.list);
+ list_del_init(&head->shared.vm_set.list);
+ }
+ else
+ new_head = NULL;
+
+ prio_tree_replace(root, &vma->shared.prio_tree_node,
+ &head->shared.prio_tree_node);
+ head->vm_set_head = new_head;
+ if (new_head)
+ new_head->vm_set_head = head;
+
+ }
+ else {
+ node = vma->vm_set_head;
+ if (!list_empty(&vma->shared.vm_set.list)) {
+ new_head = list_entry(
+ vma->shared.vm_set.list.next,
+ struct vm_area_struct,
+ shared.vm_set.list);
+ list_del_init(&vma->shared.vm_set.list);
+ node->vm_set_head = new_head;
+ new_head->vm_set_head = node;
+ }
+ else
+ node->vm_set_head = NULL;
+ }
+ INIT_VMA_SHARED(vma);
+ return;
+ }
+
+ prio_tree_remove(root, &vma->shared.prio_tree_node);
+ INIT_VMA_SHARED(vma);
+}
+
+void __init prio_tree_init(void)
+{
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(index_bits_to_maxindex) - 1; i++)
+ index_bits_to_maxindex[i] = (1UL << (i + 1)) - 1;
+ index_bits_to_maxindex[ARRAY_SIZE(index_bits_to_maxindex) - 1] = ~0UL;
+}
--- 2.6.5-rc3-mjb2/mm/rmap.c 2004-04-02 21:01:47.423843768 +0100
+++ anobjrmap9/mm/rmap.c 2004-04-04 13:05:41.411469672 +0100
@@ -154,19 +154,16 @@ static inline void clear_page_anon(struc
**/
/*
- * At what user virtual address is page expected in file-backed vma?
+ * At what user virtual address is pgoff expected in file-backed vma?
*/
-#define NOADDR (~0UL) /* impossible user virtual address */
static inline unsigned long
-vma_address(struct page *page, struct vm_area_struct *vma)
+vma_address(struct vm_area_struct *vma, unsigned long pgoff)
{
- unsigned long pgoff;
unsigned long address;
- pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
- return (address >= vma->vm_start && address < vma->vm_end)?
- address: NOADDR;
+ BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+ return address;
}
/**
@@ -182,8 +179,14 @@ static int page_referenced_one(struct pa
pte_t *pte;
int referenced = 0;
- if (!spin_trylock(&mm->page_table_lock))
+ if (!spin_trylock(&mm->page_table_lock)) {
+ /*
+ * For debug we're currently warning if not all found,
+ * but in this case that's expected: suppress warning.
+ */
+ *mapcount = -1;
return 0;
+ }
pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
@@ -246,6 +249,8 @@ static inline int page_referenced_anon(s
if (!*mapcount)
goto out;
}
+
+ WARN_ON(*mapcount > 0);
out:
spin_unlock(&anonhd->lock);
return referenced;
@@ -268,45 +273,54 @@ out:
static inline int page_referenced_obj(struct page *page, int *mapcount)
{
struct address_space *mapping = page->mapping;
+ unsigned long pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
struct vm_area_struct *vma;
+ struct prio_tree_iter iter;
unsigned long address;
int referenced = 0;
if (down_trylock(&mapping->i_shared_sem))
return 0;
- list_for_each_entry(vma, &mapping->i_mmap, shared) {
- if (!vma->vm_mm->rss)
- continue;
- address = vma_address(page, vma);
- if (address == NOADDR)
- continue;
- if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE)) ==
- (VM_LOCKED|VM_MAYSHARE)) {
+ vma = __vma_prio_tree_first(&mapping->i_mmap,
+ &iter, pgoff, pgoff);
+ while (vma) {
+ if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
+ == (VM_LOCKED|VM_MAYSHARE)) {
referenced++;
goto out;
}
- referenced += page_referenced_one(
- page, vma->vm_mm, address, mapcount);
- if (!*mapcount)
- goto out;
+ if (vma->vm_mm->rss) {
+ address = vma_address(vma, pgoff);
+ referenced += page_referenced_one(
+ page, vma->vm_mm, address, mapcount);
+ if (!*mapcount)
+ goto out;
+ }
+ vma = __vma_prio_tree_next(vma, &mapping->i_mmap,
+ &iter, pgoff, pgoff);
}
- list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
- if (!vma->vm_mm->rss || (vma->vm_flags & VM_NONLINEAR))
- continue;
- address = vma_address(page, vma);
- if (address == NOADDR)
- continue;
+ vma = __vma_prio_tree_first(&mapping->i_mmap_shared,
+ &iter, pgoff, pgoff);
+ while (vma) {
if (vma->vm_flags & (VM_LOCKED|VM_RESERVED)) {
referenced++;
goto out;
}
- referenced += page_referenced_one(
- page, vma->vm_mm, address, mapcount);
- if (!*mapcount)
- goto out;
+ if (vma->vm_mm->rss) {
+ address = vma_address(vma, pgoff);
+ referenced += page_referenced_one(
+ page, vma->vm_mm, address, mapcount);
+ if (!*mapcount)
+ goto out;
+ }
+ vma = __vma_prio_tree_next(vma, &mapping->i_mmap_shared,
+ &iter, pgoff, pgoff);
}
+
+ if (list_empty(&mapping->i_mmap_nonlinear))
+ WARN_ON(*mapcount > 0);
out:
up(&mapping->i_shared_sem);
return referenced;
@@ -688,7 +702,9 @@ out:
static inline int try_to_unmap_obj(struct page *page, int *mapcount)
{
struct address_space *mapping = page->mapping;
+ unsigned long pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
struct vm_area_struct *vma;
+ struct prio_tree_iter iter;
unsigned long address;
int ret = SWAP_AGAIN;
unsigned long cursor;
@@ -698,47 +714,50 @@ static inline int try_to_unmap_obj(struc
if (down_trylock(&mapping->i_shared_sem))
return ret;
- list_for_each_entry(vma, &mapping->i_mmap, shared) {
- if (!vma->vm_mm->rss)
- continue;
- address = vma_address(page, vma);
- if (address == NOADDR)
- continue;
- ret = try_to_unmap_one(
- page, vma->vm_mm, address, mapcount, vma);
- if (ret == SWAP_FAIL || !*mapcount)
- goto out;
+ vma = __vma_prio_tree_first(&mapping->i_mmap,
+ &iter, pgoff, pgoff);
+ while (vma) {
+ if (vma->vm_mm->rss) {
+ address = vma_address(vma, pgoff);
+ ret = try_to_unmap_one(
+ page, vma->vm_mm, address, mapcount, vma);
+ if (ret == SWAP_FAIL || !*mapcount)
+ goto out;
+ }
+ vma = __vma_prio_tree_next(vma, &mapping->i_mmap,
+ &iter, pgoff, pgoff);
}
- list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
- if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
- /*
- * Defer unmapping nonlinear to the next loop,
- * but take notes while we're here e.g. don't
- * want to loop again when no nonlinear vmas.
- */
- if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
- continue;
- cursor = (unsigned long) vma->vm_private_data;
- if (cursor > max_nl_cursor)
- max_nl_cursor = cursor;
- cursor = vma->vm_end - vma->vm_start;
- if (cursor > max_nl_size)
- max_nl_size = cursor;
- continue;
+ vma = __vma_prio_tree_first(&mapping->i_mmap_shared,
+ &iter, pgoff, pgoff);
+ while (vma) {
+ if (vma->vm_mm->rss) {
+ address = vma_address(vma, pgoff);
+ ret = try_to_unmap_one(
+ page, vma->vm_mm, address, mapcount, vma);
+ if (ret == SWAP_FAIL || !*mapcount)
+ goto out;
}
- if (!vma->vm_mm->rss)
- continue;
- address = vma_address(page, vma);
- if (address == NOADDR)
- continue;
- ret = try_to_unmap_one(
- page, vma->vm_mm, address, mapcount, vma);
- if (ret == SWAP_FAIL || !*mapcount)
- goto out;
+ vma = __vma_prio_tree_next(vma, &mapping->i_mmap_shared,
+ &iter, pgoff, pgoff);
}
- if (max_nl_size == 0) /* no nonlinear vmas of this file */
+ if (list_empty(&mapping->i_mmap_nonlinear))
+ goto out;
+
+ list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
+ shared.vm_set.list) {
+ if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
+ continue;
+ cursor = (unsigned long) vma->vm_private_data;
+ if (cursor > max_nl_cursor)
+ max_nl_cursor = cursor;
+ cursor = vma->vm_end - vma->vm_start;
+ if (cursor > max_nl_size)
+ max_nl_size = cursor;
+ }
+
+ if (max_nl_size == 0)
goto out;
/*
@@ -755,9 +774,9 @@ static inline int try_to_unmap_obj(struc
max_nl_cursor = CLUSTER_SIZE;
do {
- list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
- if (VM_NONLINEAR != (vma->vm_flags &
- (VM_NONLINEAR|VM_LOCKED|VM_RESERVED)))
+ list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
+ shared.vm_set.list) {
+ if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
continue;
cursor = (unsigned long) vma->vm_private_data;
while (vma->vm_mm->rss &&
@@ -771,6 +790,7 @@ static inline int try_to_unmap_obj(struc
vma->vm_private_data = (void *) cursor;
if (*mapcount <= 0)
goto relock;
+ cond_resched();
}
if (ret != SWAP_FAIL)
vma->vm_private_data =
@@ -785,9 +805,9 @@ static inline int try_to_unmap_obj(struc
* in locked vmas). Reset cursor on all unreserved nonlinear
* vmas, now forgetting on which ones it had fallen behind.
*/
- list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
- if ((vma->vm_flags & (VM_NONLINEAR|VM_RESERVED)) ==
- VM_NONLINEAR)
+ list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
+ shared.vm_set.list) {
+ if (!(vma->vm_flags & VM_RESERVED))
vma->vm_private_data = 0;
}
relock:
--- 2.6.5-rc3-mjb2/mm/shmem.c 2004-04-02 21:01:47.427843160 +0100
+++ anobjrmap9/mm/shmem.c 2004-04-04 13:05:41.413469368 +0100
@@ -1351,7 +1351,8 @@ static void do_shmem_file_read(struct fi
* virtual addresses, take care about potential aliasing
* before reading the page on the kernel side.
*/
- if (!list_empty(&mapping->i_mmap_shared))
+ if (!prio_tree_empty(&mapping->i_mmap_shared) ||
+ !list_empty(&mapping->i_mmap_nonlinear))
flush_dcache_page(page);
/*
* Mark the page accessed if we read the beginning.
--- 2.6.5-rc3-mjb2/mm/vmscan.c 2004-04-02 21:01:47.443840728 +0100
+++ anobjrmap9/mm/vmscan.c 2004-04-04 13:05:41.415469064 +0100
@@ -191,9 +191,11 @@ static inline int page_mapping_inuse(str
return 0;
/* File is mmap'd by somebody. */
- if (!list_empty(&mapping->i_mmap))
+ if (!prio_tree_empty(&mapping->i_mmap))
return 1;
- if (!list_empty(&mapping->i_mmap_shared))
+ if (!prio_tree_empty(&mapping->i_mmap_shared))
+ return 1;
+ if (!list_empty(&mapping->i_mmap_nonlinear))
return 1;
return 0;
^ permalink raw reply [flat|nested] 38+ messages in thread* Re: [PATCH] anobjrmap 9 priority mjb tree
2004-04-04 12:33 [PATCH] anobjrmap 9 priority mjb tree Hugh Dickins
@ 2004-04-09 20:39 ` Martin J. Bligh
2004-04-09 21:31 ` Rajesh Venkatasubramanian
2004-04-09 21:51 ` Hugh Dickins
0 siblings, 2 replies; 38+ messages in thread
From: Martin J. Bligh @ 2004-04-09 20:39 UTC (permalink / raw)
To: Hugh Dickins, linux-kernel, Andrew Morton; +Cc: Rajesh Venkatasubramanian
> This anobjrmap 9 (or anon_mm9) patch adds Rajesh's radix priority search
> tree on top of Martin's 2.6.5-rc3-mjb2 tree, making a priority mjb tree!
> Approximately equivalent to Andrea's 2.6.5-aa1, but using anonmm instead
> of anon_vma, and of course each tree has its own additional features.
This slows down kernel compile a little, but worse, it slows down SDET
by about 25% (on the 16x). I think you did something horrible to sem
contention ... presumably i_shared_sem, which SDET was fighting with
as it was anyway ;-(
Diffprofile shows:
122626 15.7% total
44129 790.0% __down
20988 4.1% default_idle
12101 550.3% __wake_up
11723 489.1% finish_task_switch
6988 77.4% do_wp_page
3983 21.7% copy_page_range
2683 19.2% zap_pte_range
2325 54.3% do_anonymous_page
2293 73.1% copy_mm
1787 68.3% remove_shared_vm_struct
1768 101.6% pte_alloc_one
1564 40.0% do_no_page
1520 50.8% do_page_fault
1376 39.2% clear_page_tables
1282 63.4% __copy_user_intel
926 9.4% page_remove_rmap
878 13.1% __copy_to_user_ll
835 46.8% __block_prepare_write
788 35.8% copy_process
777 0.0% __vma_prio_tree_remove
761 48.8% buffered_rmqueue
740 48.6% free_hot_cold_page
674 128.4% vma_link
641 0.0% __vma_prio_tree_insert
612 941.5% sched_clock
585 0.0% prio_tree_insert
563 60.4% exit_notify
547 225.1% split_vma
539 6.4% release_pages
534 464.3% schedule
495 32.0% release_task
422 148.1% flush_signal_handlers
421 66.6% find_vma
420 79.5% set_page_dirty
409 60.1% fput
359 44.5% __copy_from_user_ll
319 47.6% do_mmap_pgoff
290 254.4% find_vma_prepare
270 167.7% rb_insert_color
254 61.7% pte_alloc_map
251 91.3% exit_mmap
229 23.2% __read_lock_failed
228 9.9% filemap_nopage
...
-100 -29.3% group_reserve_blocks
-107 -53.5% .text.lock.namespace
-107 -18.4% render_sigset_t
-126 -18.7% mmgrab
-146 -10.9% generic_file_open
-166 -9.5% ext2_new_inode
-166 -38.1% d_path
-166 -20.1% __find_get_block_slow
-173 -20.7% proc_pid_status
-182 -19.3% update_atime
-185 -25.8% fd_install
-202 -13.8% .text.lock.highmem
-221 -14.5% __fput
-225 -14.3% number
-257 -14.2% proc_pid_stat
-284 -21.6% file_kill
-290 -35.3% proc_root_link
-300 -36.5% ext2_new_block
-349 -61.7% .text.lock.base
-382 -48.0% proc_check_root
-412 -19.4% path_release
-454 -20.0% file_move
-462 -32.2% lookup_mnt
-515 -4.5% find_get_page
-547 -34.5% .text.lock.dcache
-689 -31.2% follow_mount
-940 -33.8% .text.lock.dec_and_lock
-1043 -51.6% .text.lock.file_table
-1115 -9.9% __d_lookup
-1226 -20.1% path_lookup
-1305 -61.5% grab_block
-2101 -29.8% atomic_dec_and_lock
-2554 -40.3% .text.lock.filemap
^ permalink raw reply [flat|nested] 38+ messages in thread* Re: [PATCH] anobjrmap 9 priority mjb tree
2004-04-09 20:39 ` Martin J. Bligh
@ 2004-04-09 21:31 ` Rajesh Venkatasubramanian
2004-04-09 21:40 ` Martin J. Bligh
2004-04-09 21:51 ` Hugh Dickins
1 sibling, 1 reply; 38+ messages in thread
From: Rajesh Venkatasubramanian @ 2004-04-09 21:31 UTC (permalink / raw)
To: Martin J. Bligh; +Cc: Hugh Dickins, linux-kernel, Andrew Morton
Does SDET use mremap a lot ? Hugh added i_shared_sem in mremap -> move_vma
-> move_page_tables path to avoid orphaned ptes due to mremap vs. truncate
race. That may be the reason for the slowdown, I am not sure.
Rajesh
On Fri, 9 Apr 2004, Martin J. Bligh wrote:
> > This anobjrmap 9 (or anon_mm9) patch adds Rajesh's radix priority search
> > tree on top of Martin's 2.6.5-rc3-mjb2 tree, making a priority mjb tree!
> > Approximately equivalent to Andrea's 2.6.5-aa1, but using anonmm instead
> > of anon_vma, and of course each tree has its own additional features.
>
> This slows down kernel compile a little, but worse, it slows down SDET
> by about 25% (on the 16x). I think you did something horrible to sem
> contention ... presumably i_shared_sem, which SDET was fighting with
> as it was anyway ;-(
>
> Diffprofile shows:
>
>
> 122626 15.7% total
> 44129 790.0% __down
> 20988 4.1% default_idle
> 12101 550.3% __wake_up
> 11723 489.1% finish_task_switch
> 6988 77.4% do_wp_page
> 3983 21.7% copy_page_range
> 2683 19.2% zap_pte_range
> 2325 54.3% do_anonymous_page
> 2293 73.1% copy_mm
> 1787 68.3% remove_shared_vm_struct
> 1768 101.6% pte_alloc_one
> 1564 40.0% do_no_page
> 1520 50.8% do_page_fault
> 1376 39.2% clear_page_tables
> 1282 63.4% __copy_user_intel
> 926 9.4% page_remove_rmap
> 878 13.1% __copy_to_user_ll
> 835 46.8% __block_prepare_write
> 788 35.8% copy_process
> 777 0.0% __vma_prio_tree_remove
> 761 48.8% buffered_rmqueue
> 740 48.6% free_hot_cold_page
> 674 128.4% vma_link
> 641 0.0% __vma_prio_tree_insert
> 612 941.5% sched_clock
> 585 0.0% prio_tree_insert
> 563 60.4% exit_notify
> 547 225.1% split_vma
> 539 6.4% release_pages
> 534 464.3% schedule
> 495 32.0% release_task
> 422 148.1% flush_signal_handlers
> 421 66.6% find_vma
> 420 79.5% set_page_dirty
> 409 60.1% fput
> 359 44.5% __copy_from_user_ll
> 319 47.6% do_mmap_pgoff
> 290 254.4% find_vma_prepare
> 270 167.7% rb_insert_color
> 254 61.7% pte_alloc_map
> 251 91.3% exit_mmap
> 229 23.2% __read_lock_failed
> 228 9.9% filemap_nopage
> ...
> -100 -29.3% group_reserve_blocks
> -107 -53.5% .text.lock.namespace
> -107 -18.4% render_sigset_t
> -126 -18.7% mmgrab
> -146 -10.9% generic_file_open
> -166 -9.5% ext2_new_inode
> -166 -38.1% d_path
> -166 -20.1% __find_get_block_slow
> -173 -20.7% proc_pid_status
> -182 -19.3% update_atime
> -185 -25.8% fd_install
> -202 -13.8% .text.lock.highmem
> -221 -14.5% __fput
> -225 -14.3% number
> -257 -14.2% proc_pid_stat
> -284 -21.6% file_kill
> -290 -35.3% proc_root_link
> -300 -36.5% ext2_new_block
> -349 -61.7% .text.lock.base
> -382 -48.0% proc_check_root
> -412 -19.4% path_release
> -454 -20.0% file_move
> -462 -32.2% lookup_mnt
> -515 -4.5% find_get_page
> -547 -34.5% .text.lock.dcache
> -689 -31.2% follow_mount
> -940 -33.8% .text.lock.dec_and_lock
> -1043 -51.6% .text.lock.file_table
> -1115 -9.9% __d_lookup
> -1226 -20.1% path_lookup
> -1305 -61.5% grab_block
> -2101 -29.8% atomic_dec_and_lock
> -2554 -40.3% .text.lock.filemap
>
>
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH] anobjrmap 9 priority mjb tree
2004-04-09 21:31 ` Rajesh Venkatasubramanian
@ 2004-04-09 21:40 ` Martin J. Bligh
2004-04-09 23:17 ` Rajesh Venkatasubramanian
0 siblings, 1 reply; 38+ messages in thread
From: Martin J. Bligh @ 2004-04-09 21:40 UTC (permalink / raw)
To: Rajesh Venkatasubramanian; +Cc: Hugh Dickins, linux-kernel, Andrew Morton
> Does SDET use mremap a lot ? Hugh added i_shared_sem in mremap -> move_vma
> -> move_page_tables path to avoid orphaned ptes due to mremap vs. truncate
> race. That may be the reason for the slowdown, I am not sure.
I don't think so .... I presume you're just holding the sem for longer
during normal operations due to the more complex data structure.
This was just with vs without the prio tree patch, so Hughs's changes
were in both sets ...
M.
> Rajesh
>
> On Fri, 9 Apr 2004, Martin J. Bligh wrote:
>
>> > This anobjrmap 9 (or anon_mm9) patch adds Rajesh's radix priority search
>> > tree on top of Martin's 2.6.5-rc3-mjb2 tree, making a priority mjb tree!
>> > Approximately equivalent to Andrea's 2.6.5-aa1, but using anonmm instead
>> > of anon_vma, and of course each tree has its own additional features.
>>
>> This slows down kernel compile a little, but worse, it slows down SDET
>> by about 25% (on the 16x). I think you did something horrible to sem
>> contention ... presumably i_shared_sem, which SDET was fighting with
>> as it was anyway ;-(
>>
>> Diffprofile shows:
>>
>>
>> 122626 15.7% total
>> 44129 790.0% __down
>> 20988 4.1% default_idle
>> 12101 550.3% __wake_up
>> 11723 489.1% finish_task_switch
>> 6988 77.4% do_wp_page
>> 3983 21.7% copy_page_range
>> 2683 19.2% zap_pte_range
>> 2325 54.3% do_anonymous_page
>> 2293 73.1% copy_mm
>> 1787 68.3% remove_shared_vm_struct
>> 1768 101.6% pte_alloc_one
>> 1564 40.0% do_no_page
>> 1520 50.8% do_page_fault
>> 1376 39.2% clear_page_tables
>> 1282 63.4% __copy_user_intel
>> 926 9.4% page_remove_rmap
>> 878 13.1% __copy_to_user_ll
>> 835 46.8% __block_prepare_write
>> 788 35.8% copy_process
>> 777 0.0% __vma_prio_tree_remove
>> 761 48.8% buffered_rmqueue
>> 740 48.6% free_hot_cold_page
>> 674 128.4% vma_link
>> 641 0.0% __vma_prio_tree_insert
>> 612 941.5% sched_clock
>> 585 0.0% prio_tree_insert
>> 563 60.4% exit_notify
>> 547 225.1% split_vma
>> 539 6.4% release_pages
>> 534 464.3% schedule
>> 495 32.0% release_task
>> 422 148.1% flush_signal_handlers
>> 421 66.6% find_vma
>> 420 79.5% set_page_dirty
>> 409 60.1% fput
>> 359 44.5% __copy_from_user_ll
>> 319 47.6% do_mmap_pgoff
>> 290 254.4% find_vma_prepare
>> 270 167.7% rb_insert_color
>> 254 61.7% pte_alloc_map
>> 251 91.3% exit_mmap
>> 229 23.2% __read_lock_failed
>> 228 9.9% filemap_nopage
>> ...
>> -100 -29.3% group_reserve_blocks
>> -107 -53.5% .text.lock.namespace
>> -107 -18.4% render_sigset_t
>> -126 -18.7% mmgrab
>> -146 -10.9% generic_file_open
>> -166 -9.5% ext2_new_inode
>> -166 -38.1% d_path
>> -166 -20.1% __find_get_block_slow
>> -173 -20.7% proc_pid_status
>> -182 -19.3% update_atime
>> -185 -25.8% fd_install
>> -202 -13.8% .text.lock.highmem
>> -221 -14.5% __fput
>> -225 -14.3% number
>> -257 -14.2% proc_pid_stat
>> -284 -21.6% file_kill
>> -290 -35.3% proc_root_link
>> -300 -36.5% ext2_new_block
>> -349 -61.7% .text.lock.base
>> -382 -48.0% proc_check_root
>> -412 -19.4% path_release
>> -454 -20.0% file_move
>> -462 -32.2% lookup_mnt
>> -515 -4.5% find_get_page
>> -547 -34.5% .text.lock.dcache
>> -689 -31.2% follow_mount
>> -940 -33.8% .text.lock.dec_and_lock
>> -1043 -51.6% .text.lock.file_table
>> -1115 -9.9% __d_lookup
>> -1226 -20.1% path_lookup
>> -1305 -61.5% grab_block
>> -2101 -29.8% atomic_dec_and_lock
>> -2554 -40.3% .text.lock.filemap
>>
>>
>
>
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH] anobjrmap 9 priority mjb tree
2004-04-09 21:40 ` Martin J. Bligh
@ 2004-04-09 23:17 ` Rajesh Venkatasubramanian
0 siblings, 0 replies; 38+ messages in thread
From: Rajesh Venkatasubramanian @ 2004-04-09 23:17 UTC (permalink / raw)
To: Martin J. Bligh; +Cc: Hugh Dickins, linux-kernel, Andrew Morton
> > Does SDET use mremap a lot ? Hugh added i_shared_sem in mremap -> move_vma
> > -> move_page_tables path to avoid orphaned ptes due to mremap vs. truncate
> > race. That may be the reason for the slowdown, I am not sure.
>
> I don't think so .... I presume you're just holding the sem for longer
> during normal operations due to the more complex data structure.
I haven't done any benchmarks with prio_tree. I just tried kernel compile
which was not bad. I tried rmap-test.c and test-mmap3.c. The results were
not bad on UP. I didn't try them on SMP, so you can be right.
> This was just with vs without the prio tree patch, so Hughs's changes
> were in both sets ...
The change I mentioned above was added only in Hugh's prio_tree patch,
it was not there before. However, I was just guessing that the mremap
change can be a reason. It can very well be due to prio_tree complexity.
I don't have any data to prove or disprove either way.
Rajesh
> > On Fri, 9 Apr 2004, Martin J. Bligh wrote:
> >
> >> > This anobjrmap 9 (or anon_mm9) patch adds Rajesh's radix priority search
> >> > tree on top of Martin's 2.6.5-rc3-mjb2 tree, making a priority mjb tree!
> >> > Approximately equivalent to Andrea's 2.6.5-aa1, but using anonmm instead
> >> > of anon_vma, and of course each tree has its own additional features.
> >>
> >> This slows down kernel compile a little, but worse, it slows down SDET
> >> by about 25% (on the 16x). I think you did something horrible to sem
> >> contention ... presumably i_shared_sem, which SDET was fighting with
> >> as it was anyway ;-(
> >>
> >> Diffprofile shows:
> >>
> >>
> >> 122626 15.7% total
> >> 44129 790.0% __down
> >> 20988 4.1% default_idle
> >> 12101 550.3% __wake_up
> >> 11723 489.1% finish_task_switch
> >> 6988 77.4% do_wp_page
> >> 3983 21.7% copy_page_range
> >> 2683 19.2% zap_pte_range
> >> 2325 54.3% do_anonymous_page
> >> 2293 73.1% copy_mm
> >> 1787 68.3% remove_shared_vm_struct
> >> 1768 101.6% pte_alloc_one
> >> 1564 40.0% do_no_page
> >> 1520 50.8% do_page_fault
> >> 1376 39.2% clear_page_tables
> >> 1282 63.4% __copy_user_intel
> >> 926 9.4% page_remove_rmap
> >> 878 13.1% __copy_to_user_ll
> >> 835 46.8% __block_prepare_write
> >> 788 35.8% copy_process
> >> 777 0.0% __vma_prio_tree_remove
> >> 761 48.8% buffered_rmqueue
> >> 740 48.6% free_hot_cold_page
> >> 674 128.4% vma_link
> >> 641 0.0% __vma_prio_tree_insert
> >> 612 941.5% sched_clock
> >> 585 0.0% prio_tree_insert
> >> 563 60.4% exit_notify
> >> 547 225.1% split_vma
> >> 539 6.4% release_pages
> >> 534 464.3% schedule
> >> 495 32.0% release_task
> >> 422 148.1% flush_signal_handlers
> >> 421 66.6% find_vma
> >> 420 79.5% set_page_dirty
> >> 409 60.1% fput
> >> 359 44.5% __copy_from_user_ll
> >> 319 47.6% do_mmap_pgoff
> >> 290 254.4% find_vma_prepare
> >> 270 167.7% rb_insert_color
> >> 254 61.7% pte_alloc_map
> >> 251 91.3% exit_mmap
> >> 229 23.2% __read_lock_failed
> >> 228 9.9% filemap_nopage
> >> ...
> >> -100 -29.3% group_reserve_blocks
> >> -107 -53.5% .text.lock.namespace
> >> -107 -18.4% render_sigset_t
> >> -126 -18.7% mmgrab
> >> -146 -10.9% generic_file_open
> >> -166 -9.5% ext2_new_inode
> >> -166 -38.1% d_path
> >> -166 -20.1% __find_get_block_slow
> >> -173 -20.7% proc_pid_status
> >> -182 -19.3% update_atime
> >> -185 -25.8% fd_install
> >> -202 -13.8% .text.lock.highmem
> >> -221 -14.5% __fput
> >> -225 -14.3% number
> >> -257 -14.2% proc_pid_stat
> >> -284 -21.6% file_kill
> >> -290 -35.3% proc_root_link
> >> -300 -36.5% ext2_new_block
> >> -349 -61.7% .text.lock.base
> >> -382 -48.0% proc_check_root
> >> -412 -19.4% path_release
> >> -454 -20.0% file_move
> >> -462 -32.2% lookup_mnt
> >> -515 -4.5% find_get_page
> >> -547 -34.5% .text.lock.dcache
> >> -689 -31.2% follow_mount
> >> -940 -33.8% .text.lock.dec_and_lock
> >> -1043 -51.6% .text.lock.file_table
> >> -1115 -9.9% __d_lookup
> >> -1226 -20.1% path_lookup
> >> -1305 -61.5% grab_block
> >> -2101 -29.8% atomic_dec_and_lock
> >> -2554 -40.3% .text.lock.filemap
> >>
> >>
> >
> >
>
>
>
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH] anobjrmap 9 priority mjb tree
2004-04-09 20:39 ` Martin J. Bligh
2004-04-09 21:31 ` Rajesh Venkatasubramanian
@ 2004-04-09 21:51 ` Hugh Dickins
2004-04-09 22:01 ` Martin J. Bligh
2004-04-09 22:56 ` Martin J. Bligh
1 sibling, 2 replies; 38+ messages in thread
From: Hugh Dickins @ 2004-04-09 21:51 UTC (permalink / raw)
To: Martin J. Bligh; +Cc: linux-kernel, Andrew Morton, Rajesh Venkatasubramanian
On Fri, 9 Apr 2004, Martin J. Bligh wrote:
> > This anobjrmap 9 (or anon_mm9) patch adds Rajesh's radix priority search
> > tree on top of Martin's 2.6.5-rc3-mjb2 tree, making a priority mjb tree!
> > Approximately equivalent to Andrea's 2.6.5-aa1, but using anonmm instead
> > of anon_vma, and of course each tree has its own additional features.
>
> This slows down kernel compile a little, but worse, it slows down SDET
> by about 25% (on the 16x). I think you did something horrible to sem
> contention ... presumably i_shared_sem, which SDET was fighting with
> as it was anyway ;-(
>
> Diffprofile shows:
>
> 122626 15.7% total
> 44129 790.0% __down
> 20988 4.1% default_idle
Many thanks for the good news, Martin ;)
Looks like I've done something very stupid, perhaps a mismerge.
Not found it yet, I'll carry on looking tomorrow.
Hugh
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH] anobjrmap 9 priority mjb tree
2004-04-09 21:51 ` Hugh Dickins
@ 2004-04-09 22:01 ` Martin J. Bligh
2004-04-09 22:56 ` Martin J. Bligh
1 sibling, 0 replies; 38+ messages in thread
From: Martin J. Bligh @ 2004-04-09 22:01 UTC (permalink / raw)
To: Hugh Dickins; +Cc: linux-kernel, Andrew Morton, Rajesh Venkatasubramanian
--Hugh Dickins <hugh@veritas.com> wrote (on Friday, April 09, 2004 22:51:03 +0100):
> On Fri, 9 Apr 2004, Martin J. Bligh wrote:
>> > This anobjrmap 9 (or anon_mm9) patch adds Rajesh's radix priority search
>> > tree on top of Martin's 2.6.5-rc3-mjb2 tree, making a priority mjb tree!
>> > Approximately equivalent to Andrea's 2.6.5-aa1, but using anonmm instead
>> > of anon_vma, and of course each tree has its own additional features.
>>
>> This slows down kernel compile a little, but worse, it slows down SDET
>> by about 25% (on the 16x). I think you did something horrible to sem
>> contention ... presumably i_shared_sem, which SDET was fighting with
>> as it was anyway ;-(
>>
>> Diffprofile shows:
>>
>> 122626 15.7% total
>> 44129 790.0% __down
>> 20988 4.1% default_idle
>
> Many thanks for the good news, Martin ;)
> Looks like I've done something very stupid, perhaps a mismerge.
> Not found it yet, I'll carry on looking tomorrow.
I don't think so ... I ran across a similar problems when I tried to convert
the vma list to a list-of-lists for what you're trying to use prio_tree
for, I think. The sorry fact is that the that thing is heavily contended,
and doing complex manipulations under it makes it worse ;-(
I vaguely considered being a smarty-pants and doing RCU list-of-lists, but
never got around to it. I'll shove in some backtraces and check out the
sem activity to be sure.
M.
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH] anobjrmap 9 priority mjb tree
2004-04-09 21:51 ` Hugh Dickins
2004-04-09 22:01 ` Martin J. Bligh
@ 2004-04-09 22:56 ` Martin J. Bligh
2004-04-11 16:09 ` Hugh Dickins
1 sibling, 1 reply; 38+ messages in thread
From: Martin J. Bligh @ 2004-04-09 22:56 UTC (permalink / raw)
To: Hugh Dickins; +Cc: linux-kernel, Andrew Morton, Rajesh Venkatasubramanian
>> > This anobjrmap 9 (or anon_mm9) patch adds Rajesh's radix priority search
>> > tree on top of Martin's 2.6.5-rc3-mjb2 tree, making a priority mjb tree!
>> > Approximately equivalent to Andrea's 2.6.5-aa1, but using anonmm instead
>> > of anon_vma, and of course each tree has its own additional features.
>>
>> This slows down kernel compile a little, but worse, it slows down SDET
>> by about 25% (on the 16x). I think you did something horrible to sem
>> contention ... presumably i_shared_sem, which SDET was fighting with
>> as it was anyway ;-(
>>
>> Diffprofile shows:
>>
>> 122626 15.7% total
>> 44129 790.0% __down
>> 20988 4.1% default_idle
>
> Many thanks for the good news, Martin ;)
> Looks like I've done something very stupid, perhaps a mismerge.
> Not found it yet, I'll carry on looking tomorrow.
I applied Andrew's high sophisticated proprietary semtrace technology.
The common ones are:
Call Trace:
[<c0105aee>] __down+0x96/0x10c
[<c0118fb8>] default_wake_function+0x0/0x1c
[<c0105cb8>] __down_failed+0x8/0xc
[<c014314f>] .text.lock.mmap+0x39/0x12a
[<c01421a7>] do_mmap_pgoff+0x4cf/0x60c
[<c010cb74>] old_mmap+0x108/0x144
[<c025c753>] syscall_call+0x7/0xb
Which is the vma_link call from do_mmap_pgoff here:
/* Can addr have changed??
*
* Answer: Yes, several device drivers can do it in their
* f_op->mmap method. -DaveM
*/
addr = vma->vm_start;
if (!file || !rb_parent || !vma_merge(mm, prev, rb_parent, addr,
addr + len, vma->vm_flags, file, pgoff)) {
vma_link(mm, vma, prev, rb_link, rb_parent);
if (correct_wcount)
atomic_inc(&inode->i_writecount);
vma_link takes i_shared_sem.
Call Trace:
[<c0105aee>] __down+0x96/0x10c
[<c0118fb8>] default_wake_function+0x0/0x1c
[<c0105cb8>] __down_failed+0x8/0xc
[<c01431dd>] .text.lock.mmap+0xc7/0x12a
[<c0142b4c>] do_munmap+0xbc/0x128
[<c0141f91>] do_mmap_pgoff+0x2b9/0x60c
[<c010cb74>] old_mmap+0x108/0x144
[<c025c753>] syscall_call+0x7/0xb
Is the call to split_vma from do_munmap here:
/*
* If we need to split any vma, do it now to save pain later.
*
* Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
* unmapped vm_area_struct will remain in use: so lower split_vma
* places tmp vma above, and higher split_vma places tmp vma below.
*/
if (start > mpnt->vm_start) {
if (split_vma(mm, mpnt, start, 0))
return -ENOMEM;
prev = mpnt;
}
split_vma takes i_shared_sem, then takes page_table_lock inside it
(which probably isn't helping either ;-)).
if (mapping)
down(&mapping->i_shared_sem);
spin_lock(&mm->page_table_lock);
Call Trace:
[<c0105aee>] __down+0x96/0x10c
[<c0118fb8>] default_wake_function+0x0/0x1c
[<c0105cb8>] __down_failed+0x8/0xc
[<c014311b>] .text.lock.mmap+0x5/0x12a
[<c0142f79>] exit_mmap+0x191/0x1d0
[<c011ae84>] mmput+0x50/0x70
[<c011ec6d>] do_exit+0x1b9/0x330
[<c011eefa>] do_group_exit+0x9e/0xa0
[<c011ef0a>] sys_exit_group+0xe/0x14
[<c025c753>] syscall_call+0x7/0xb
That's remove_shared_vm_struct calling i_shared_sem
Call Trace:
[<c0105aee>] __down+0x96/0x10c
[<c0118fb8>] default_wake_function+0x0/0x1c
[<c0105cb8>] __down_failed+0x8/0xc
[<c014311b>] .text.lock.mmap+0x5/0x12a
[<c0142694>] unmap_vma+0x44/0x78
[<c01426dc>] unmap_vma_list+0x14/0x20
[<c0142ba5>] do_munmap+0x115/0x128
[<c0141f91>] do_mmap_pgoff+0x2b9/0x60c
[<c010cb74>] old_mmap+0x108/0x144
[<c025c753>] syscall_call+0x7/0xb
That's remove_shared_vm_struct again, but called from unmap_vma this time
Call Trace:
[<c0105aee>] __down+0x96/0x10c
[<c0118fb8>] default_wake_function+0x0/0x1c
[<c0105cb8>] __down_failed+0x8/0xc
[<c011c494>] .text.lock.fork+0x79/0x125
[<c011be5c>] copy_process+0x61c/0xa6c
[<c011c322>] do_fork+0x76/0x16f
[<c0105619>] sys_clone+0x29/0x30
[<c025c753>] syscall_call+0x7/0xb
That's dup_mmap taking i_shared_sem here:
/* insert tmp into the share list, just after mpnt */
down(&file->f_mapping->i_shared_sem);
__vma_prio_tree_add(tmp, mpnt);
up(&file->f_mapping->i_shared_sem);
^ permalink raw reply [flat|nested] 38+ messages in thread* Re: [PATCH] anobjrmap 9 priority mjb tree
2004-04-09 22:56 ` Martin J. Bligh
@ 2004-04-11 16:09 ` Hugh Dickins
2004-04-11 17:28 ` Martin J. Bligh
0 siblings, 1 reply; 38+ messages in thread
From: Hugh Dickins @ 2004-04-11 16:09 UTC (permalink / raw)
To: Martin J. Bligh; +Cc: linux-kernel, Andrew Morton, Rajesh Venkatasubramanian
On Fri, 9 Apr 2004, Martin J. Bligh wrote:
> >> This slows down kernel compile a little, but worse, it slows down SDET
> >> by about 25% (on the 16x). I think you did something horrible to sem
> >> contention ... presumably i_shared_sem, which SDET was fighting with
> >> as it was anyway ;-(
> >>
> >> Diffprofile shows:
> >>
> >> 122626 15.7% total
> >> 44129 790.0% __down
> >> 20988 4.1% default_idle
>
> I applied Andrew's high sophisticated proprietary semtrace technology.
Thanks a lot, Martin, this seems pretty important.
So, i_shared_sem, as you supposed.
Do you still have the two profiles input to diffprofile?
I wonder if they'd have clues to help us understand it better.
Any chance of you doing the same comparison between 2.6.5-aa5 and
2.6.5-aa5 minus prio-tree? (Well, needn't be -aa5, whatever comes to
hand. Looks like "patch -p1 -R < prio-tree" mostly works, just some
rejects in mm/mmap.c itself, let me know if I can help out on that.)
If -aa is okay, I hope so, then it's surely some stupidity from me.
We're not at all surprised that vma linking and unlinking should take
rather longer; but the rise in __down, __wake_up, finish_task_switch
is horrifying. Or is that how it usually looks, when a semaphore is
well contended - thundering herd?
Hugh
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH] anobjrmap 9 priority mjb tree
2004-04-11 16:09 ` Hugh Dickins
@ 2004-04-11 17:28 ` Martin J. Bligh
2004-04-12 4:32 ` Rajesh Venkatasubramanian
2004-04-12 15:46 ` Martin J. Bligh
0 siblings, 2 replies; 38+ messages in thread
From: Martin J. Bligh @ 2004-04-11 17:28 UTC (permalink / raw)
To: Hugh Dickins; +Cc: linux-kernel, Andrew Morton, Rajesh Venkatasubramanian
[-- Attachment #1: Type: text/plain, Size: 1805 bytes --]
>> I applied Andrew's high sophisticated proprietary semtrace technology.
>
> Thanks a lot, Martin, this seems pretty important.
>
> So, i_shared_sem, as you supposed.
>
> Do you still have the two profiles input to diffprofile?
> I wonder if they'd have clues to help us understand it better.
Yup. Attatched.
> Any chance of you doing the same comparison between 2.6.5-aa5
> 2.6.5-aa5 minus prio-tree? (Well, needn't be -aa5, whatever comes to
> hand. Looks like "patch -p1 -R < prio-tree" mostly works, just some
> rejects in mm/mmap.c itself, let me know if I can help out on that.)
>
> If -aa is okay, I hope so, then it's surely some stupidity from me.
Good idea. Not sure how easy it'll be to back prio_tree out, but I can
surely do aa5, which would give us a good clue still. Might not be until
this time tommorow though.
> We're not at all surprised that vma linking and unlinking should take
> rather longer; but the rise in __down, __wake_up, finish_task_switch
> is horrifying. Or is that how it usually looks, when a semaphore is
> well contended - thundering herd?
I think there's just a locking cliff you fall off after a certain level
of contention ... i_shared_sem has always been bad for SDET, to be fair.
But I hate making it worse ;-) I did more investigation of it a year or
so ago ... something does horrible things to it (which is why akpm turned
it into a sem in the first place) ... maybe holding it over process teardown
for eons or something. Bah, I want lockmeter for sems ;-)
Maybe I can dig out my old analysis ... I'll take a look. I suppose I could
always turn it back into a spinlock to look at it ;-) On the other hand, if
we can fix that problem, I think my "list of lists" was simpler than prio
tree (and is probably much more susceptible to RCU).
M.
[-- Attachment #2: mjb1 --]
[-- Type: application/octet-stream, Size: 17226 bytes --]
783079 total
519748 default_idle
18229 copy_page_range
13878 zap_pte_range
11235 __d_lookup
11135 find_get_page
9775 page_remove_rmap
8904 do_wp_page
8460 release_pages
7232 atomic_dec_and_lock
6488 __copy_to_user_ll
6212 path_lookup
6143 .text.lock.filemap
4651 __down
4264 do_anonymous_page
3921 do_no_page
3439 clear_page_tables
3056 free_pages_and_swap_cache
3055 copy_mm
2961 do_page_fault
2926 .text.lock.dec_and_lock
2548 remove_shared_vm_struct
2302 link_path_walk
2288 filemap_nopage
2286 follow_mount
2242 file_move
2208 copy_process
2205 path_release
2191 finish_task_switch
2082 grab_block
1975 __copy_user_intel
1887 __wake_up
1801 proc_pid_stat
1772 kmem_cache_free
1765 .text.lock.file_table
1752 ext2_new_inode
1746 __block_prepare_write
1689 pte_alloc_one
1652 .text.lock.dcache
1584 __fput
1573 __find_get_block
1571 lookup_mnt
1563 buffered_rmqueue
1556 number
1543 page_add_obj_rmap
1491 free_hot_cold_page
1485 release_task
1413 .text.lock.highmem
1324 generic_file_open
1314 page_address
1295 current_kernel_time
1295 __brelse
1276 file_kill
1212 task_mem
1083 kmap_high
1079 do_generic_mapping_read
1034 alloc_inode
1014 __mark_inode_dirty
990 system_call
988 d_alloc
979 vsnprintf
958 update_atime
957 kmap_atomic
940 __read_lock_failed
910 exit_notify
891 proc_check_root
874 handle_mm_fault
861 ext2_new_block
826 dput
819 proc_root_link
816 proc_pid_status
795 __copy_from_user_ll
788 generic_fillattr
774 dnotify_parent
770 new_inode
726 __find_get_block_slow
716 fd_install
694 .text.lock.base
671 __generic_file_aio_write_nolock
663 do_mmap_pgoff
659 fput
650 mmgrab
641 radix_tree_lookup
610 ext2_update_inode
602 find_vma
586 vfs_read
584 kunmap_high
567 d_instantiate
560 render_sigset_t
541 set_page_address
530 vma_link
516 generic_delete_inode
506 del_timer_sync
503 read_block_bitmap
503 do_page_cache_readahead
499 dentry_open
498 set_page_dirty
486 pid_revalidate
472 prune_dcache
440 block_invalidatepage
436 d_path
420 deny_write_access
415 pte_alloc_map
414 may_open
396 mark_page_accessed
394 kmem_ptr_validate
379 copy_files
373 ext2_free_blocks
352 __blk_queue_bounce
351 .text.lock.inode
350 __alloc_pages
342 group_reserve_blocks
342 get_tgid_list
338 exec_mmap
337 __generic_file_aio_read
332 select_parent
327 ext2_find_entry
325 real_lookup
325 page_add_anon_rmap
322 ext2_get_group_desc
316 ext2_add_link
313 find_group_other
307 file_ra_state_init
307 ext2_preread_inode
294 filp_close
293 unlock_page
292 ext2_get_block
283 dup_task_struct
279 __pagevec_lru_add_active
269 open_namei
267 exit_mmap
265 d_delete
264 ext2_get_inode
261 task_dumpable
261 flush_signal_handlers
256 proc_fd_link
256 do_lookup
256 __insert_inode_hash
248 .text.lock.namespace
244 copy_strings
239 direct_strncpy_from_user
233 group_release_blocks
231 vfs_unlink
230 is_subdir
230 flush_tlb_page
229 kunmap_atomic
228 split_vma
226 get_empty_filp
223 find_get_pages
222 direct_strnlen_user
221 tid_fd_revalidate
218 do_sigaction
217 proc_pid_readlink
217 do_exit
215 create_buffers
214 in_group_p
206 truncate_inode_pages
191 file_read_actor
185 vfs_getattr
185 generic_file_write
184 vfs_write
183 unmap_vmas
178 ext2_discard_prealloc
175 get_unused_fd
173 read_inode_bitmap
172 iput
168 rb_insert_color
168 load_elf_binary
168 d_lookup
167 sys_wait4
167 flush_tlb_mm
166 __block_commit_write
163 percpu_counter_mod
160 dnotify_flush
159 ext2_block_to_path
155 profile_exit_mmap
155 generic_file_read
155 fget_light
155 __d_path
154 get_wchan
151 syscall_exit
147 fget
147 ext2_truncate
146 proc_delete_inode
145 locks_remove_flock
145 __free_pages
141 sys_close
137 proc_info_read
134 sys_unlink
134 reap_timer_fnc
134 __lookup
131 try_to_free_buffers
130 call_rcu
129 sigprocmask
128 __pagevec_lru_add
127 ext2_inode_by_name
125 bad_range
123 inode_times_differ
123 ext2_commit_chunk
120 schedule
120 lru_cache_add_active
119 rcu_do_batch
117 page_update_anon_rmap
116 ext2_delete_entry
114 vfs_permission
113 vm_acct_memory
112 page_cache_readahead
111 ext2_readdir
109 find_vma_prepare
108 cp_new_stat64
106 put_files_struct
106 ext2_free_inode
105 pid_alive
102 alloc_pidmap
101 try_to_unmap_one
100 sys_read
100 find_lock_page
99 flush_old_exec
97 profile_exec_unmap
97 generic_file_llseek
96 set_fs_pwd
95 locks_remove_posix
94 kmem_cache_alloc
92 mm_alloc
92 get_signal_to_deliver
89 proc_lookup
89 inode_update_time
86 wake_up_inode
85 getname
85 .text.lock.readahead
82 wait_task_zombie
79 vma_merge
79 search_binary_handler
79 inode_init_once
78 __vma_link
77 permission
76 unlock_buffer
76 generic_file_mmap
76 find_vma_prev
75 sys_newuname
75 setup_arg_pages
73 get_offset_pit
73 ext2_release_inode
73 cap_vm_enough_memory
72 d_rehash
72 .text.lock.namei
71 generic_commit_write
70 __copy_user_zeroing_intel
68 kpmd_ctor
67 create_empty_buffers
66 mark_buffer_dirty
66 inode_sub_bytes
65 vfs_readdir
65 sys_brk
65 load_elf_interp
64 sched_clock
63 copy_namespace
62 show_stat
62 lru_cache_add
61 unmap_vma
61 proc_pid_lookup
61 old_mmap
61 eligible_child
60 find_trylock_page
60 ext2_free_branches
60 chown_common
59 do_munmap
58 prep_new_page
58 add_to_page_cache
57 ret_from_intr
56 __remove_from_page_cache
55 prepare_binprm
54 inode_has_buffers
53 drop_buffers
53 do_brk
53 __put_task_struct
52 wake_up_forked_process
51 pgd_alloc
50 find_group_orlov
48 error_code
48 do_gettimeofday
48 __vma_link_rb
46 zap_pmd_range
46 sched_fork
46 pipe_aio_readv
46 open_exec
46 ext2_get_page
46 ext2_alloc_branch
46 __get_page_state
46 .text.lock.generic
45 try_to_wake_up
45 task_vsize
45 create_elf_tables
45 .text.lock.fs_writeback
45 .text.lock.balloc
44 sys_utime
44 profile_exit_task
44 mmput
44 get_unmapped_area
44 exit_rmap
44 __clear_page_buffers
43 pipe_aio_writev
43 filldir64
43 .text.lock.fork
42 unmap_page_range
42 proc_pid_unhash
42 page_waitqueue
42 __user_walk
42 __set_page_dirty_nobuffers
41 sys_open
41 direct_clear_user
41 .text.lock.ioctl
40 unmap_region
40 mm_init
39 sys_chmod
38 sys_ioctl
38 free_percpu
38 filp_open
38 complete
38 clear_inode
38 __rb_rotate_left
37 truncate_complete_page
37 get_jiffies_64
37 eventpoll_release_file
37 __sync_single_inode
37 .text.lock.swap
36 setup_sigcontext
36 proc_pid_readdir
36 destroy_inode
36 cached_lookup
35 ext2_alloc_block
35 __rb_erase_color
33 proc_pid_make_inode
33 exit_aio
33 dup_rmap
33 __generic_file_write_nolock
32 proc_lookupfd
32 find_pid
32 .text.lock.root
31 sys_write
31 skip_atoi
31 radix_tree_insert
31 mm_release
31 ext2_create
31 do_fork
31 default_wake_function
31 cache_grow
31 __get_free_pages
30 do_group_exit
30 d_free
29 wake_up_buffer
29 vfs_follow_link
29 ext2_count_free_blocks
29 count_open_files
29 block_prepare_write
28 sys_rt_sigprocmask
28 sys_mmap2
28 rb_erase
28 notify_change
28 elf_map
28 d_invalidate
27 remove_suid
27 radix_tree_preload
27 __do_softirq
26 copy_thread
26 add_wait_queue
26 __rb_rotate_right
25 unshare_files
25 setup_frame
25 radix_tree_delete
25 proc_root_lookup
25 proc_readfd
25 proc_pident_lookup
25 lru_add_drain
25 get_write_access
25 ext2_empty_dir
24 inode_change_ok
24 __pagevec_free
23 sys_rt_sigaction
22 vfs_stat
22 test_clear_page_dirty
22 kmap_atomic_to_page
22 inode_setattr
22 inode_add_bytes
22 flush_tlb_others
22 do_pipe
22 compute_creds
22 __lookup_hash
22 .text.lock.dnotify
21 task_nice
21 reserve_blocks
21 read_cache_page
21 __kmalloc
20 scsi_request_fn
20 pipe_wait
20 mpage_writepage
20 ext2_group_sparse
20 ext2_get_branch
20 can_share_swap_page
20 bh_lru_install
20 __getblk
20 .text.lock.ialloc
19 try_hugetlb_get_unmapped_area
19 sys_stat64
19 ll_rw_block
19 chrdev_open
19 balance_dirty_pages_ratelimited
19 bad_page
19 __make_request
18 sys_llseek
18 set_bh_page
18 get_zone_counts
18 follow_down
18 flush_thread
18 cap_bprm_compute_creds
18 can_vma_merge_after
17 write_inode
17 sys_access
17 sched_balance_exec
17 pid_delete_dentry
17 fasync_helper
17 ext2_check_page
17 end_page_writeback
17 d_unhash
17 cpu_idle
16 vsprintf
16 scsi_end_request
16 recalc_bh_state
16 next_thread
16 init_new_context
16 flush_all_zero_pkmaps
16 ext2_prepare_write
16 __bread
15 vfs_create
15 sys_dup2
15 put_dirty_page
15 pid_base_iput
15 nr_blockdev_pages
15 ext2_lookup
15 ext2_delete_inode
15 __page_cache_release
14 generic_forget_inode
14 ext2_setattr
14 eventpoll_init_file
13 sys_set_thread_area
13 sync_sb_inodes
13 proc_root_readdir
13 kunmap
13 inode_needs_sync
13 free_pgtables
13 expand_stack
13 __get_user_4
12 sys_munmap
12 sys_lstat64
12 sprintf
12 remove_from_page_cache
12 posix_block_lock
12 memcpy
12 invalidate_inode_buffers
12 free_task
12 fcntl_dirnotify
12 do_signal
12 do_execve
12 can_vma_merge_before
12 __set_page_buffers
12 __insert_vm_struct
11 setattr_mask
11 save_i387
11 prepare_to_wait
11 mpage_writepages
11 free_pages
11 ext2_unlink
11 convert_fxsr_to_user
11 alloc_buffer_head
10 vfs_rmdir
10 sys_mkdir
10 sys_fstat64
10 sys_execve
10 kfree
10 groups_search
10 expand_fd_array
10 exit_itimers
10 detach_vmas_to_be_unmapped
9 vfs_lstat
9 release_blocks
9 put_filp
9 mempool_alloc
9 is_bad_inode
9 ext2_set_inode_flags
9 destroy_context
9 .text.lock.commoncap
8 vfs_fstat
8 syscall_call
8 sys_time
8 sys_readlink
8 resume_userspace
8 pgd_free
8 get_vmalloc_info
8 get_new_inode_fast
8 ext2_put_inode
8 ext2_last_byte
8 ext2_find_near
8 count
8 cache_init_objs
8 block_truncate_page
8 block_commit_write
8 __mmdrop
8 __iget
8 __down_failed_interruptible
7 write_profile
7 pipe_new
7 pagevec_lookup
7 kill_fasync
7 get_pipe_inode
7 finish_wait
7 ext2_release_file
7 ext2_alloc_inode
7 do_proc_readlink
7 do_mpage_readpage
7 check_tty_count
7 bad_get_user
7 autoremove_wake_function
7 .text.lock.locks
6 unmap_vma_list
6 unmap_underlying_metadata
6 unix_create1
6 scsi_dispatch_cmd
6 release_x86_irqs
6 radix_tree_gang_lookup
6 pipe_release
6 page_slot
6 kmap
6 init_fpu
6 init_buffer_head
6 group_send_sig_info
6 free_buffer_head
6 ext2_make_empty
6 dget_locked
6 __posix_lock_file
6 .text.lock.exec
6 .text.lock.char_dev
6 .text.lock.buffer
5 try_to_release_page
5 sys_sigreturn
5 si_swapinfo
5 rwsem_wake
5 rwsem_down_read_failed
5 restore_sigcontext
5 restore_all
5 put_unused_fd
5 pte_alloc_kernel
5 insert_vm_struct
5 iget_locked
5 filemap_getpage
5 ext2_statfs
5 exit_sem
5 do_sync_write
5 do_fcntl
5 de_put
5 cdev_get
5 bio_alloc
5 __pmd_alloc
5 __block_write_full_page
5 .text.lock.tty_io
4 writeback_inodes
4 up_tty_sem
4 sys_getdents64
4 sys_chown
4 sock_alloc
4 smp_call_function
4 slab_destroy
4 set_close_on_exec
4 send_IPI_mask_sequence
4 schedule_tail
4 proc_get_inode
4 proc_alloc_inode
4 pipe_write_fasync
4 path_walk
4 padzero
4 nr_free_pages
4 lock_rename
4 kobject_get
4 generic_drop_inode
4 free_hot_page
4 do_invalidatepage
4 __pagevec_release
4 __mod_timer
3 work_resched
3 tty_open
3 sys_setpgid
3 sys_getcwd
3 sys_fcntl64
3 sock_map_fd
3 sock_create
3 set_brk
3 remove_inode_buffers
3 proc_file_read
3 proc_destroy_inode
3 nr_iowait
3 notifier_call_chain
3 name_to_int
3 locate_fd
3 kernel_read
3 get_request
3 get_node
3 get_free_idx
3 find_or_create_page
3 fcntl_setlk
3 ext2_writepages
3 ext2_mkdir
3 ext2_ioctl
3 ext2_destroy_inode
3 exec_rmap
3 del_timer
3 d_alloc_root
3 copy_semundo
3 check_mnt
3 bounce_copy_vec
3 block_read_full_page
3 background_writeout
3 __put_ioctx
2 writeback_acquire
2 wake_up_process
2 wait_for_completion
2 vsscanf
2 vfs_statfs
2 vfs_mkdir
2 unix_release_sock
2 sys_waitpid
2 sys_vhangup
2 sys_setrlimit
2 sys_getrlimit
2 sys_getpid
2 sys_creat
2 sys_chdir
2 steal_locks
2 sk_alloc
2 scsi_single_lun_run
2 restore_fpu
2 remove_exclusive_swap_page
2 release_thread
2 release_dev
2 rb_first
2 radix_tree_extend
2 pty_open
2 proc_tgid_base_lookup
2 proc_permission
2 kobject_put
2 i_waitq_head
2 handle_signal
2 handle_ra_miss
2 grow_dev_page
2 generic_make_request
2 free_uid
2 find_inode_fast
2 file_ioctl
2 ext2_set_link
2 ext2_rmdir
2 ext2_follow_link
2 ext2_count_free_inodes
2 ext2_bg_num_gdb
2 expand_files
2 exit_thread
2 exit_fs
2 down_tty_sem
2 do_writepages
2 do_flush_tlb_all
2 dispose_list
2 d_move
2 copy_strings_kernel
2 check_ttfb_buffer
2 bounce_end_io
2 as_set_request
2 activate_page
2 __up
2 __rmqueue
2 __cond_resched
2 __breadahead
2 .text.lock.open
2 .text.lock.array
1 wait_on_page_bit_wq
1 vscnprintf
1 vfs_statfs_native
1 unix_stream_connect
1 unix_sock_destructor
1 task_rq_lock
1 task_prio
1 sys_statfs
1 sys_socketcall
1 sys_set_tid_address
1 sys_rmdir
1 sys_gettimeofday
1 sys_getppid
1 sys_fchdir
1 sys_exit_group
1 sys_dup
1 sys_clone
1 sock_init_data
1 sock_fasync
1 shrink_dcache_parent
1 show_tty_driver
1 set_binfmt
1 seq_read
1 seq_printf
1 scsi_put_command
1 scsi_io_completion
1 save_i387_fxsave
1 remove_wait_queue
1 read_pages
1 radix_tree_node_alloc
1 put_io_context
1 put_fs_struct
1 proc_readdir
1 proc_read_inode
1 pipe_write
1 pipe_read
1 nr_running
1 mempool_free
1 memory_open
1 meminfo_read_proc
1 may_ptrace_attach
1 math_state_restore
1 lookup_hash
1 lookup_create
1 locks_wake_up_blocks
1 ksoftirqd
1 invalidate_mmap_range
1 invalidate_complete_page
1 init_special_inode
1 init_dev
1 getrusage
1 flock_to_posix_lock
1 ext2_nobh_prepare_write
1 ext2_mknod
1 ext2_get_parent
1 ext2_bg_has_super
1 exit_files
1 end_buffer_read_sync
1 end_buffer_read_nobh
1 dupfd
1 do_setitimer
1 do_getitimer
1 do_file_page
1 device_not_available
1 d_genocide
1 chroot_fs_refs
1 cap_bprm_set_security
1 cap_bprm_secureexec
1 bmap
1 bio_put
1 bh_waitq_head
1 alloc_skb
1 add_to_page_cache_lru
1 __end_that_request_first
1 __down_failed
1 __bforget
1 .text.lock.time
1 .text.lock.mmap
1 .text.lock.exit
0 zap_page_range
0 yield
0 writeback_in_progress
0 wb_kupdate
0 vmtruncate
0 vfs_rename_other
0 vfs_rename
0 vfs_link
0 uptime_read_proc
0 unlock_rename
0 unix_stream_recvmsg
0 unix_poll
0 unix_mkname
0 unix_find_other
0 tty_write
0 tty_release
0 tty_ioctl
0 tty_fasync
0 t_next
0 sys_vfork
0 sys_umask
0 sys_times
0 sys_stime
0 sys_socket
0 sys_sethostname
0 sys_pread64
0 sys_prctl
0 sys_pipe
0 sys_old_getrlimit
0 sys_mprotect
0 sys_getrusage
0 sys_getpgrp
0 sys_getpgid
0 sys_geteuid
0 sys_fcntl
0 sys_connect
0 sys_clock_gettime
0 sys_alarm
0 sync_dirty_buffer
0 submit_bio
0 submit_bh
0 sockfd_lookup
0 sock_wmalloc
0 sock_wfree
0 sock_release
0 sock_destroy_inode
0 sock_close
0 sock_alloc_inode
0 skb_release_data
0 sk_free
0 single_open
0 show_vfsmnt
0 show_tty_range
0 setup_rt_frame
0 set_current_groups
0 seq_open
0 seq_escape
0 sd_rw_intr
0 scsi_softirq
0 scsi_init_cmd_errh
0 scsi_free_sgtable
0 scsi_finish_command
0 scsi_decide_disposition
0 scsi_add_timer
0 schedule_timeout
0 rwsem_down_write_failed
0 run_timer_softirq
0 ret_from_fork
0 restore_i387_fxsave
0 restore_i387
0 remove_inode_hash
0 remove_arg_zero
0 remote_llseek
0 release_sock
0 register_reboot_notifier
0 rcu_process_callbacks
0 rcu_check_quiescent_state
0 pty_unthrottle
0 pty_close
0 process_backlog
0 proc_self_follow_link
0 proc_pid_statm
0 proc_pid_flush
0 proc_file_lseek
0 proc_exe_link
0 proc_calc_metrics
0 prepare_to_wait_exclusive
0 prepare_to_copy
0 pmd_ctor
0 pipefs_delete_dentry
0 pipe_write_release
0 pipe_read_release
0 pipe_read_fasync
0 pagevec_strip
0 page_pool_free
0 page_pool_alloc
0 osync_buffers_list
0 operate_on_page_range
0 open_private_file
0 nr_free_highpages
0 netif_rx
0 mwait_idle
0 mprotect_fixup
0 mpage_readpages
0 mpage_end_io_write
0 mpage_alloc
0 move_addr_to_kernel
0 mmap_hugetlb_implicit
0 migration_thread
0 memset
0 mempool_alloc_slab
0 mark_offset_tsc
0 mark_buffer_dirty_inode
0 maps_open
0 lookup_swap_cache
0 lookup_one_len
0 locks_insert_lock
0 locks_delete_lock
0 locks_copy_lock
0 locks_alloc_lock
0 lock_sock
0 local_bh_enable
0 load_script
0 kmem_flagcheck
0 kfree_skbmem
0 ip_route_input_slow
0 io_schedule_timeout
0 invalidate_mmap_range_list
0 invalidate_bh_lru
0 intr_handler
0 interruptible_sleep_on
0 inode_get_bytes
0 init_page_buffers
0 init_once
0 in_egroup_p
0 idle_cpu
0 handle_IRQ_event
0 grab_cache_page_nowait
0 get_page_state
0 get_io_context
0 get_dirty_limits
0 generic_file_aio_write_nolock
0 fs_may_remount_ro
0 free_fd_array
0 free_cold_page
0 free_anonmm
0 follow_up
0 find_task_by_pid
0 filp_ctor
0 file_send_actor
0 ext3_count_free_blocks
0 ext2_write_inode
0 ext2_rename
0 ext2_dotdot
0 end_buffer_async_write
0 double_lock_balance
0 do_vm86_irq_handling
0 do_truncate
0 do_softirq
0 do_sigaltstack
0 do_proc_dointvec_conv
0 do_proc_dointvec
0 do_posix_gettime
0 do_posix_clock_monotonic_gettime_parts
0 do_notify_resume
0 devpts_get_tty
0 copy_io_context
0 convert_fxsr_from_user
0 check_kill_permission
0 change_protection
0 cascade
0 cap_inode_setxattr
0 buffer_insert_list
0 bounce_end_io_write
0 block_sync_page
0 blk_run_queues
0 blk_recount_segments
0 blk_queue_bounce
0 blk_congestion_wait
0 bio_get_nr_vecs
0 bio_endio
0 bio_destructor
0 balance_dirty_pages
0 bad_follow_link
0 arp_process
0 alloc_uid
0 alloc_fd_array
0 add_wait_queue_exclusive
0 add_disk_randomness
0 __writeback_single_inode
0 __write_lock_failed
0 __wake_up_sync
0 __wait_on_buffer
0 __up_wakeup
0 __unix_remove_socket
0 __unix_insert_socket
0 __lock_page_wq
0 __kfree_skb
0 __getblk_slow
0 __bio_add_page
0 .text.lock.timer_pit
0 .text.lock.smp
0 .text.lock.signal
0 .text.lock.readdir
0 .text.lock.page_writeback
0 .text.lock.fcntl
0 .text.lock.block_dev
0 .text.lock.binfmt_elf
[-- Attachment #3: mjb1-prio --]
[-- Type: application/octet-stream, Size: 17871 bytes --]
901551 total
534689 default_idle
49715 __down
22305 copy_page_range
16684 zap_pte_range
16014 do_wp_page
14300 __wake_up
14120 finish_task_switch
10865 find_get_page
10741 page_remove_rmap
10175 __d_lookup
8971 release_pages
7559 __copy_to_user_ll
6609 do_anonymous_page
5477 do_no_page
5428 copy_mm
4940 atomic_dec_and_lock
4887 clear_page_tables
4886 path_lookup
4510 do_page_fault
4404 remove_shared_vm_struct
3789 .text.lock.filemap
3509 pte_alloc_one
3303 __copy_user_intel
3160 free_pages_and_swap_cache
2988 copy_process
2620 __block_prepare_write
2533 filemap_nopage
2319 buffered_rmqueue
2264 free_hot_cold_page
2206 link_path_walk
2042 release_task
1844 .text.lock.dec_and_lock
1820 file_move
1715 path_release
1651 kmem_cache_free
1624 __find_get_block
1573 ext2_new_inode
1555 page_add_obj_rmap
1554 proc_pid_stat
1516 page_address
1516 follow_mount
1495 exit_notify
1417 current_kernel_time
1347 number
1303 __fput
1276 task_mem
1266 .text.lock.highmem
1236 __brelse
1216 __read_lock_failed
1199 vma_link
1198 generic_file_open
1165 __copy_from_user_ll
1115 kmap_atomic
1100 system_call
1089 fput
1081 do_generic_mapping_read
1053 find_vma
1046 kmap_high
1037 .text.lock.dcache
1031 file_kill
1023 alloc_inode
996 __mark_inode_dirty
989 do_mmap_pgoff
988 d_alloc
980 .text.lock.file_table
972 lookup_mnt
949 handle_mm_fault
948 set_page_dirty
888 vsnprintf
824 dnotify_parent
817 grab_block
815 new_inode
790 split_vma
786 generic_fillattr
777 __vma_prio_tree_remove
775 dput
761 update_atime
730 ext2_update_inode
707 flush_signal_handlers
677 sched_clock
666 pte_alloc_map
661 proc_pid_status
661 __find_get_block_slow
649 schedule
643 set_page_address
641 __vma_prio_tree_insert
630 block_invalidatepage
619 radix_tree_lookup
614 del_timer_sync
610 generic_file_aio_write_nolock
585 prio_tree_insert
566 vfs_read
562 copy_files
548 mmgrab
543 kunmap_high
543 dentry_open
533 fd_install
532 proc_root_link
526 exit_mmap
522 ext2_new_block
511 generic_delete_inode
494 do_page_cache_readahead
485 get_tgid_list
474 render_sigset_t
468 dup_task_struct
467 mark_page_accessed
465 __generic_file_aio_read
460 d_instantiate
460 __blk_queue_bounce
431 rb_insert_color
428 __alloc_pages
427 deny_write_access
423 prune_dcache
414 proc_check_root
412 read_block_bitmap
410 pid_revalidate
407 kmem_ptr_validate
404 find_vma_prepare
401 page_add_anon_rmap
400 filp_close
396 flush_tlb_page
378 may_open
371 vfs_unlink
369 direct_strnlen_user
365 do_sigaction
350 get_empty_filp
349 exec_mmap
348 __pagevec_lru_add_active
343 file_ra_state_init
338 ext2_find_entry
334 real_lookup
333 find_get_pages
329 sys_close
323 unlock_page
318 truncate_inode_pages
313 unmap_vmas
311 copy_strings
305 ext2_add_link
304 ext2_free_blocks
300 select_parent
297 do_exit
292 load_elf_binary
291 dnotify_flush
284 sigprocmask
282 ext2_get_group_desc
271 create_buffers
270 d_path
269 task_dumpable
268 .text.lock.inode
263 ext2_preread_inode
259 ext2_get_inode
255 ext2_get_block
250 sys_wait4
247 find_group_other
246 group_release_blocks
241 group_reserve_blocks
237 syscall_exit
233 kunmap_atomic
233 direct_strncpy_from_user
232 try_to_wake_up
232 get_unused_fd
231 d_delete
228 proc_fd_link
225 open_namei
222 put_files_struct
222 fget
220 page_update_anon_rmap
218 __insert_inode_hash
217 __do_softirq
217 .text.lock.base
215 vfs_getattr
213 default_wake_function
211 try_to_free_buffers
207 old_mmap
204 call_rcu
197 exit_rmap
195 profile_exit_mmap
195 lru_cache_add_active
194 ext2_discard_prealloc
194 __vma_link
193 ext2_truncate
192 vfs_write
192 file_read_actor
187 tid_fd_revalidate
187 page_cache_readahead
187 fget_light
185 do_lookup
183 flush_old_exec
182 get_wchan
182 generic_file_write
181 in_group_p
177 sys_unlink
174 locks_remove_flock
170 iput
170 do_munmap
169 flush_tlb_mm
166 find_vma_prev
165 pid_alive
165 __pagevec_lru_add
164 __block_commit_write
163 ext2_commit_chunk
163 bad_range
160 mm_alloc
157 is_subdir
157 __vma_link_rb
153 d_lookup
152 vm_acct_memory
151 percpu_counter_mod
151 ext2_readdir
148 locks_remove_posix
147 __lookup
143 alloc_pidmap
141 unmap_vma
141 reap_timer_fnc
139 get_signal_to_deliver
137 proc_delete_inode
134 ext2_block_to_path
133 rcu_do_batch
133 proc_pid_readlink
133 pgd_alloc
132 proc_lookup
129 __free_pages
128 setup_arg_pages
128 __insert_vm_struct
126 profile_exec_unmap
125 read_inode_bitmap
123 generic_file_mmap
123 __d_path
122 ext2_delete_entry
121 wait_task_zombie
114 cap_vm_enough_memory
113 proc_info_read
113 generic_file_read
112 vfs_permission
111 find_lock_page
104 cp_new_stat64
103 try_to_unmap_one
102 kmem_cache_alloc
100 generic_file_llseek
100 ext2_free_inode
99 wake_up_forked_process
98 unlock_buffer
98 inode_times_differ
97 getname
93 .text.lock.namespace
91 eligible_child
89 sys_read
89 search_binary_handler
89 inode_sub_bytes
88 inode_update_time
87 cpu_idle
86 get_offset_pit
86 copy_namespace
85 vfs_readdir
85 unmap_region
85 __rb_rotate_left
84 ext2_inode_by_name
83 ext2_free_branches
82 inode_init_once
81 mmput
80 vma_merge
80 permission
78 set_fs_pwd
78 create_elf_tables
75 truncate_complete_page
75 proc_pid_lookup
74 sys_newuname
74 load_elf_interp
74 __copy_user_zeroing_intel
73 pipe_writev
73 pipe_readv
71 sched_fork
71 do_brk
70 sys_brk
70 eventpoll_release_file
70 create_empty_buffers
69 add_to_page_cache
69 __put_task_struct
68 __remove_from_page_cache
67 inode_has_buffers
67 generic_commit_write
67 __rb_erase_color
66 setup_sigcontext
65 show_stat
65 prep_new_page
65 drop_buffers
65 copy_thread
64 mm_init
62 prepare_binprm
62 exit_aio
62 do_group_exit
61 dup_rmap
60 wake_up_inode
60 __rb_rotate_right
59 kpmd_ctor
58 __clear_page_buffers
57 profile_exit_task
57 mark_buffer_dirty
56 do_fork
56 can_vma_merge_after
56 add_wait_queue
55 .text.lock.fork
54 rb_erase
54 d_rehash
53 page_waitqueue
53 lru_add_drain
53 flush_tlb_others
53 ext2_release_inode
52 ret_from_intr
52 proc_pid_make_inode
52 ext2_get_page
51 flush_thread
51 find_group_orlov
50 get_unmapped_area
50 get_jiffies_64
50 find_trylock_page
50 do_gettimeofday
49 find_pid
49 complete
49 __set_page_dirty_nobuffers
49 .text.lock.namei
48 proc_pid_readdir
48 filldir64
47 __get_page_state
46 try_hugetlb_get_unmapped_area
46 sys_ioctl
46 mm_release
46 lru_cache_add
46 ext2_count_free_blocks
46 __sync_single_inode
45 ext2_alloc_branch
45 ext2_alloc_block
45 count_open_files
45 can_vma_merge_before
45 .text.lock.readahead
44 proc_readfd
44 filp_open
44 error_code
43 setup_frame
43 d_free
43 cap_bprm_compute_creds
43 bad_page
43 __page_cache_release
43 __down_failed_interruptible
42 sys_set_thread_area
42 proc_pid_unhash
42 free_percpu
42 ext2_empty_dir
41 direct_clear_user
40 zap_pmd_range
40 chown_common
39 sys_mmap2
39 sched_balance_exec
39 destroy_context
39 __user_walk
38 unshare_files
38 unmap_page_range
38 sys_open
38 open_exec
37 compute_creds
37 cached_lookup
37 .text.lock.balloc
36 rwsem_wake
36 radix_tree_insert
36 proc_root_lookup
36 .text.lock.generic
36 .text.lock.fs_writeback
35 task_vsize
35 destroy_inode
35 clear_inode
34 sys_utime
34 scsi_request_fn
34 radix_tree_preload
34 next_thread
34 do_execve
33 inode_add_bytes
33 get_write_access
32 elf_map
32 cache_grow
31 wake_up_buffer
31 remove_suid
31 put_dirty_page
31 ext2_lookup
31 __make_request
31 .text.lock.mmap
30 proc_lookupfd
30 __get_free_pages
29 vfs_follow_link
29 sys_access
29 block_prepare_write
28 skip_atoi
28 proc_pident_lookup
28 exit_itimers
27 rwsem_down_read_failed
27 prio_tree_remove
27 __kmalloc
26 work_resched
26 sys_write
26 sys_chmod
26 pgd_free
26 notify_change
26 inode_setattr
26 .text.lock.ialloc
25 test_clear_page_dirty
25 sys_dup2
25 memcpy
25 ext2_create
25 exit_sem
25 __lookup_hash
24 sys_rt_sigaction
24 inode_change_ok
24 ext2_group_sparse
24 .text.lock.dnotify
23 vfs_stat
23 sys_rt_sigprocmask
23 ext2_get_branch
23 __getblk
22 read_cache_page
22 radix_tree_delete
22 mpage_writepage
22 generic_file_write_nolock
22 fasync_helper
22 can_share_swap_page
21 vfs_rmdir
21 kmap_atomic_to_page
21 bh_lru_install
20 reserve_blocks
20 invalidate_inode_buffers
20 ext2_prepare_write
20 do_pipe
20 __pagevec_free
19 vfs_create
19 sys_llseek
19 init_new_context
19 end_page_writeback
19 d_invalidate
18 recalc_bh_state
18 pid_delete_dentry
18 pid_base_iput
18 get_zone_counts
18 free_pgtables
18 follow_down
18 expand_stack
18 chrdev_open
17 write_inode
17 __set_page_buffers
16 sys_stat64
16 set_bh_page
16 save_i387
16 ll_rw_block
16 generic_forget_inode
16 free_task
16 flush_all_zero_pkmaps
16 ext2_check_page
16 balance_dirty_pages_ratelimited
16 bad_get_user
16 alloc_buffer_head
15 scsi_end_request
15 ext2_delete_inode
15 convert_fxsr_to_user
15 __get_user_4
14 write_profile
14 sys_readlink
14 sys_munmap
14 smp_call_function
14 init_fpu
14 expand_fd_array
14 eventpoll_init_file
14 do_signal
14 .text.lock.ioctl
13 proc_root_readdir
13 kunmap
13 free_pages
13 finish_wait
13 ext2_setattr
13 __bread
12 vsprintf
12 sync_sb_inodes
12 steal_locks
12 run_timer_softirq
12 prepare_to_wait
12 pipe_wait
12 insert_vm_struct
12 detach_vmas_to_be_unmapped
12 d_alloc_root
11 setattr_mask
11 remove_from_page_cache
11 page_slot
11 kmap
11 kfree
11 is_bad_inode
11 ext2_unlink
10 pipe_release
10 nr_blockdev_pages
10 kill_fasync
10 groups_search
10 ext2_release_file
10 ext2_find_near
10 bio_alloc
10 .text.lock.commoncap
9 sys_mkdir
9 sys_lstat64
9 sys_execve
9 posix_block_lock
9 mpage_writepages
9 free_buffer_head
9 ext2_set_inode_flags
9 ext2_put_inode
9 ext2_alloc_inode
9 dget_locked
9 d_unhash
9 __mmdrop
8 vfs_fstat
8 try_to_release_page
8 sys_sigreturn
8 sys_fstat64
8 sprintf
8 resume_userspace
8 restore_all
8 release_x86_irqs
8 put_filp
8 padzero
8 inode_needs_sync
8 do_mpage_readpage
8 block_truncate_page
8 .text.lock.root
7 wait_for_completion
7 unmap_vma_list
7 unmap_underlying_metadata
7 syscall_call
7 sys_getdents64
7 sys_chown
7 restore_fpu
7 prepare_to_wait_exclusive
7 pagevec_lookup
7 init_buffer_head
7 group_send_sig_info
7 get_vmalloc_info
7 get_pipe_inode
7 ext2_last_byte
7 do_sync_write
7 count
7 cache_init_objs
7 __cond_resched
7 .text.lock.swap
7 .text.lock.array
6 vfs_lstat
6 sys_prctl
6 slab_destroy
6 scsi_dispatch_cmd
6 rwsem_down_write_failed
6 release_blocks
6 mempool_alloc
6 lock_rename
6 kernel_read
6 get_new_inode_fast
6 block_commit_write
6 __posix_lock_file
6 __iget
6 __down_failed
6 __block_write_full_page
5 writeback_inodes
5 up_tty_sem
5 unix_create1
5 sys_time
5 sock_map_fd
5 si_swapinfo
5 schedule_tail
5 restore_sigcontext
5 rcu_check_quiescent_state
5 radix_tree_gang_lookup
5 put_unused_fd
5 pte_alloc_kernel
5 proc_get_inode
5 proc_alloc_inode
5 pipe_new
5 nr_iowait
5 generic_drop_inode
5 free_hot_page
5 filemap_getpage
5 ext2_mkdir
5 ext2_make_empty
5 ext2_destroy_inode
5 do_proc_readlink
5 do_fcntl
5 del_timer
5 check_tty_count
5 __put_ioctx
5 __pmd_alloc
5 .text.lock.exec
5 .text.lock.buffer
4 sys_fcntl64
4 sys_chdir
4 proc_tgid_base_lookup
4 pipe_write_fasync
4 pipe_read_fasync
4 path_walk
4 kobject_get
4 iget_locked
4 handle_signal
4 get_node
4 fcntl_dirnotify
4 exec_rmap
4 de_put
4 check_mnt
4 cdev_get
4 bounce_end_io
4 block_read_full_page
4 background_writeout
4 .text.lock.open
4 .text.lock.locks
3 vfs_mkdir
3 sys_exit_group
3 sock_create
3 sock_alloc
3 save_i387_fxsave
3 rt_run_flush
3 put_fs_struct
3 proc_file_read
3 proc_destroy_inode
3 notifier_call_chain
3 name_to_int
3 kobject_put
3 grow_dev_page
3 find_or_create_page
3 find_inode_fast
3 ext2_writepages
3 ext2_statfs
3 ext2_rmdir
3 ext2_follow_link
3 ext2_count_free_inodes
3 ext2_bg_has_super
3 d_move
3 __write_lock_failed
3 __rmqueue
3 .text.lock.exit
2 wake_up_process
2 wait_on_page_bit
2 vsscanf
2 vfs_statfs_native
2 vfs_statfs
2 task_rq_lock
2 task_nice
2 sys_waitpid
2 sys_vhangup
2 sys_vfork
2 sys_setrlimit
2 sys_rmdir
2 sys_getrlimit
2 sys_getcwd
2 sys_fchdir
2 sys_creat
2 sys_clone
2 sk_alloc
2 set_close_on_exec
2 set_brk
2 set_binfmt
2 send_IPI_mask_sequence
2 scsi_single_lun_run
2 remove_wait_queue
2 remove_inode_buffers
2 release_thread
2 release_dev
2 rb_first
2 radix_tree_node_alloc
2 radix_tree_extend
2 proc_read_inode
2 proc_permission
2 nr_free_pages
2 mempool_free
2 lookup_create
2 locate_fd
2 invalidate_mmap_range
2 invalidate_complete_page
2 invalidate_bh_lru
2 i_waitq_head
2 handle_ra_miss
2 getrusage
2 get_request
2 get_free_idx
2 file_ioctl
2 fcntl_setlk
2 ext2_set_link
2 ext2_get_parent
2 ext2_bg_num_gdb
2 exit_thread
2 exit_fs
2 do_truncate
2 do_invalidatepage
2 do_flush_tlb_all
2 dispose_list
2 copy_strings_kernel
2 copy_semundo
2 cap_bprm_set_security
2 bounce_copy_vec
2 bmap
2 bh_waitq_head
2 as_set_request
2 __up_wakeup
2 __pagevec_release
2 __mod_timer
2 __breadahead
2 .text.lock.char_dev
1 writeback_acquire
1 unlock_rename
1 unix_release_sock
1 tty_open
1 tty_fasync
1 task_prio
1 sys_statfs
1 sys_setpgid
1 sys_gettimeofday
1 sys_getrusage
1 sys_getppid
1 sys_getpid
1 sys_fcntl
1 submit_bio
1 sock_init_data
1 scsi_run_queue
1 scsi_put_command
1 scsi_init_cmd_errh
1 scsi_finish_command
1 schedule_timeout
1 remove_exclusive_swap_page
1 read_zero
1 rcu_process_callbacks
1 proc_readdir
1 proc_pid_flush
1 prio_tree_expand
1 prepare_to_copy
1 pipe_write
1 open_private_file
1 nr_running
1 memory_open
1 meminfo_read_proc
1 math_state_restore
1 lookup_hash
1 locks_init_lock
1 local_bh_enable
1 kmem_flagcheck
1 inode_get_bytes
1 init_dev
1 grab_cache_page_nowait
1 get_io_context
1 generic_make_request
1 free_uid
1 follow_up
1 flock_to_posix_lock
1 ext2_nobh_prepare_write
1 ext2_ioctl
1 expand_files
1 dupfd
1 down_tty_sem
1 do_writepages
1 do_sigaltstack
1 do_setitimer
1 do_getitimer
1 convert_fxsr_from_user
1 chroot_fs_refs
1 check_ttfb_buffer
1 change_protection
1 buffer_insert_list
1 blk_run_queues
1 bio_endio
1 bad_follow_link
1 alloc_skb
1 add_disk_randomness
1 activate_page
1 __up
1 __getblk_slow
1 __end_that_request_first
1 __bforget
1 .text.lock.tty_io
1 .text.lock.task_mmu
1 .text.lock.sys_i386
1 .text.lock.page_writeback
0 zap_page_range
0 yield
0 writeback_release
0 writeback_in_progress
0 wb_kupdate
0 vscnprintf
0 vmtruncate
0 vfs_symlink
0 vfs_rename_other
0 vfs_rename
0 vfs_mknod
0 uptime_read_proc
0 unlock_new_inode
0 unix_stream_recvmsg
0 unix_stream_connect
0 unix_sock_destructor
0 unix_mkname
0 unix_find_other
0 unix_create
0 udp_rcv
0 tty_release
0 tty_ioctl
0 tty_drivers_open
0 test_perm
0 tcp_transmit_skb
0 tasklet_action
0 t_start
0 t_next
0 sys_umask
0 sys_times
0 sys_stime
0 sys_socketcall
0 sys_set_tid_address
0 sys_select
0 sys_rt_sigreturn
0 sys_rename
0 sys_pread64
0 sys_pipe
0 sys_pause
0 sys_old_getrlimit
0 sys_kill
0 sys_getpgrp
0 sys_geteuid
0 sys_dup
0 sys_connect
0 sys_clock_gettime
0 sys_alarm
0 sync_dirty_buffer
0 submit_bh
0 stat_open
0 sock_wmalloc
0 sock_release
0 sock_poll
0 sock_fasync
0 sock_alloc_inode
0 skb_release_data
0 single_open
0 si_meminfo
0 shrink_dcache_parent
0 show_tty_range
0 show_tty_driver
0 setfl
0 seq_read
0 seq_printf
0 seq_open
0 send_sigio_to_task
0 sd_rw_intr
0 scsi_softirq
0 scsi_next_command
0 scsi_io_completion
0 scsi_decide_disposition
0 scsi_add_timer
0 run_local_timers
0 rt_set_nexthop
0 ret_from_fork
0 restore_i387_fxsave
0 restore_i387
0 remove_inode_hash
0 remove_arg_zero
0 remote_llseek
0 release_sock
0 register_reboot_notifier
0 read_cache_pages
0 raise_softirq
0 put_io_context
0 put_device
0 pty_unthrottle
0 pty_open
0 pty_close
0 profile_event_register
0 proc_self_follow_link
0 proc_readsys
0 proc_opensys
0 proc_file_lseek
0 proc_dointvec
0 proc_calc_metrics
0 prio_tree_first
0 pmd_ctor
0 pipefs_delete_dentry
0 pipe_write_release
0 pipe_read_release
0 pipe_read
0 pagevec_strip
0 page_pool_alloc
0 page_getlink
0 osync_buffers_list
0 nr_free_highpages
0 nr_context_switches
0 netif_rx
0 mprotect_fixup
0 mpage_readpages
0 mpage_end_io_write
0 mpage_alloc
0 move_addr_to_kernel
0 mounts_open
0 mod_timer
0 mmap_hugetlb_implicit
0 migration_thread
0 memset
0 mempool_free_slab
0 mempool_alloc_slab
0 mem_open
0 media_not_present
0 math_emulate
0 mark_offset_tsc
0 maps_open
0 lookup_swap_cache
0 lookup_one_len
0 locks_wake_up_blocks
0 locks_insert_lock
0 locks_delete_lock
0 load_script
0 kthread_should_stop
0 ksoftirqd
0 kill_something_info
0 kill_proc_info
0 kfree_skbmem
0 io_schedule_timeout
0 invalidate_mmap_range_list
0 intr_handler
0 interruptible_sleep_on
0 init_special_inode
0 init_page_buffers
0 init_once
0 inet_addr_type
0 in_egroup_p
0 idle_cpu
0 hash_futex
0 get_zeroed_page
0 get_page_state
0 get_dirty_limits
0 generic_file_readv
0 fs_may_remount_ro
0 free_fd_array
0 free_cold_page
0 free_anonmm
0 format_corename
0 find_task_by_pid
0 filp_ctor
0 ext3_bg_has_super
0 ext2_writepage
0 ext2_write_inode
0 ext2_rename
0 ext2_read_inode
0 ext2_mknod
0 ext2_find_shared
0 exit_files
0 end_buffer_write_sync
0 end_buffer_read_sync
0 end_buffer_read_nobh
0 end_buffer_async_write
0 end_bio_bh_io_sync
0 do_select
0 do_rw_proc
0 do_proc_dointvec_conv
0 do_proc_dointvec
0 do_posix_gettime
0 do_posix_clock_monotonic_gettime_parts
0 do_file_page
0 devpts_get_tty
0 device_not_available
0 d_genocide
0 collect_sigign_sigcatch
0 check_kill_permission
0 cdev_put
0 cascade
0 cap_inode_setxattr
0 cap_bprm_secureexec
0 bounce_end_io_write
0 block_write_full_page
0 block_sync_page
0 blk_run_queue
0 blk_recount_segments
0 blk_congestion_wait
0 bio_put
0 bio_get_nr_vecs
0 bio_destructor
0 bio_add_page
0 balance_dirty_pages
0 bad_pipe_r
0 autoremove_wake_function
0 assign_type
0 arp_process
0 arch_get_unmapped_area
0 alloc_uid
0 alloc_fd_array
0 add_wait_queue_exclusive
0 add_to_page_cache_lru
0 __writeback_single_inode
0 __wake_up_locked
0 __wait_on_buffer
0 __unix_insert_socket
0 __netdev_rx
0 __kfree_skb
0 __down_interruptible
0 __direct_clear_user
0 __bread_slow
0 __bio_add_page
0 __alloc_percpu
0 .text.lock.timer_pit
0 .text.lock.time
0 .text.lock.signal
0 .text.lock.rwsem
0 .text.lock.rmap
0 .text.lock.readdir
0 .text.lock.rcupdate
0 .text.lock.fcntl
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH] anobjrmap 9 priority mjb tree
2004-04-11 17:28 ` Martin J. Bligh
@ 2004-04-12 4:32 ` Rajesh Venkatasubramanian
2004-04-12 5:24 ` Martin J. Bligh
2004-04-12 15:46 ` Martin J. Bligh
1 sibling, 1 reply; 38+ messages in thread
From: Rajesh Venkatasubramanian @ 2004-04-12 4:32 UTC (permalink / raw)
To: Martin J. Bligh; +Cc: Hugh Dickins, linux-kernel, Andrew Morton
This patch is an attempt at reducing the contention on i_shared_sem
by introducing a new semaphore i_mmap_sem. The i_shared_sem covers
i_mmap_shared tree and i_mmap_nonlinear list now, whereas i_mmap_sem
covers i_mmap tree. This may help to reduce the contention on
i_shared_sem if a file is mapped both private and shared. Kernel
compile time with and without this patch did not change much, though.
This patch is on top of 2.6.5-mjb1+anobjrmap9_prio. Compiled and
tested.
Martin! Are you interested in testing SDET with this patch?
fs/hugetlbfs/inode.c | 6 +++++-
fs/inode.c | 1 +
include/linux/fs.h | 3 ++-
include/linux/mm.h | 24 +++++++++++++++++++++++-
kernel/fork.c | 4 ++--
mm/filemap.c | 6 +++---
mm/memory.c | 6 +++++-
mm/mmap.c | 51 +++++++++++++++------------------------------------
mm/mremap.c | 29 ++++++++++++-----------------
mm/rmap.c | 32 ++++++++++++++++++++++++--------
10 files changed, 92 insertions(+), 70 deletions(-)
diff -puN include/linux/fs.h~010_sem_contention include/linux/fs.h
--- mmlinux-2.6/include/linux/fs.h~010_sem_contention 2004-04-11 22:07:40.000000000 -0400
+++ mmlinux-2.6-jaya/include/linux/fs.h 2004-04-11 22:07:40.000000000 -0400
@@ -333,7 +333,8 @@ struct address_space {
struct prio_tree_root i_mmap; /* tree of private mappings */
struct prio_tree_root i_mmap_shared; /* tree of shared mappings */
struct list_head i_mmap_nonlinear;/*list of nonlinear mappings */
- struct semaphore i_shared_sem; /* protect both above lists */
+ struct semaphore i_mmap_sem; /* protect i_mmap prio_tree */
+ struct semaphore i_shared_sem; /* protect shared and nonlinear */
atomic_t truncate_count; /* Cover race condition with truncate */
unsigned long flags; /* error bits/gfp mask */
struct backing_dev_info *backing_dev_info; /* device readahead, etc */
diff -puN include/linux/mm.h~010_sem_contention include/linux/mm.h
--- mmlinux-2.6/include/linux/mm.h~010_sem_contention 2004-04-11 22:07:40.000000000 -0400
+++ mmlinux-2.6-jaya/include/linux/mm.h 2004-04-11 22:08:13.000000000 -0400
@@ -232,7 +232,7 @@ static inline void __vma_prio_tree_add(s
* We cannot modify vm_start, vm_end, vm_pgoff fields of a vma that has been
* already present in an i_mmap{_shared} tree without modifying the tree. The
* following helper function should be used when such modifications are
- * necessary. We should hold the mapping's i_shared_sem.
+ * necessary. We should hold the mapping's i_shared_sem or i_mmap_sem.
*
* This function can be (micro)optimized for some special cases (maybe later).
*/
@@ -296,6 +296,28 @@ static inline struct vm_area_struct *__v
return NULL;
}
+/* Caller should hold mmap_sem */
+static inline void vma_mapping_lock(struct vm_area_struct *vma)
+{
+ if (vma->vm_file) {
+ if (vma->vm_flags & VM_SHARED)
+ down(&vma->vm_file->f_mapping->i_shared_sem);
+ else
+ down(&vma->vm_file->f_mapping->i_mmap_sem);
+ }
+}
+
+/* Caller should hold mmap_sem */
+static inline void vma_mapping_unlock(struct vm_area_struct *vma)
+{
+ if (vma->vm_file) {
+ if (vma->vm_flags & VM_SHARED)
+ up(&vma->vm_file->f_mapping->i_shared_sem);
+ else
+ up(&vma->vm_file->f_mapping->i_mmap_sem);
+ }
+}
+
/*
* mapping from the currently active vm_flags protection bits (the
* low four bits) to a page protection mask..
diff -puN kernel/fork.c~010_sem_contention kernel/fork.c
--- mmlinux-2.6/kernel/fork.c~010_sem_contention 2004-04-11 22:07:40.000000000 -0400
+++ mmlinux-2.6-jaya/kernel/fork.c 2004-04-11 22:07:40.000000000 -0400
@@ -332,9 +332,9 @@ static inline int dup_mmap(struct mm_str
atomic_dec(&inode->i_writecount);
/* insert tmp into the share list, just after mpnt */
- down(&file->f_mapping->i_shared_sem);
+ vma_mapping_lock(tmp);
__vma_prio_tree_add(tmp, mpnt);
- up(&file->f_mapping->i_shared_sem);
+ vma_mapping_unlock(tmp);
}
/*
diff -puN mm/mmap.c~010_sem_contention mm/mmap.c
--- mmlinux-2.6/mm/mmap.c~010_sem_contention 2004-04-11 22:07:40.000000000 -0400
+++ mmlinux-2.6-jaya/mm/mmap.c 2004-04-11 22:07:40.000000000 -0400
@@ -67,7 +67,7 @@ int mmap_use_hugepages = 0;
int mmap_hugepages_map_sz = 256;
/*
- * Requires inode->i_mapping->i_shared_sem
+ * Requires inode->i_mapping->i_shared_sem or i_mmap_sem
*/
static inline void
__remove_shared_vm_struct(struct vm_area_struct *vma, struct inode *inode,
@@ -96,10 +96,10 @@ static void remove_shared_vm_struct(stru
if (file) {
struct address_space *mapping = file->f_mapping;
- down(&mapping->i_shared_sem);
+ vma_mapping_lock(vma);
__remove_shared_vm_struct(vma, file->f_dentry->d_inode,
mapping);
- up(&mapping->i_shared_sem);
+ vma_mapping_unlock(vma);
}
}
@@ -298,18 +298,11 @@ static void vma_link(struct mm_struct *m
struct vm_area_struct *prev, struct rb_node **rb_link,
struct rb_node *rb_parent)
{
- struct address_space *mapping = NULL;
-
- if (vma->vm_file)
- mapping = vma->vm_file->f_mapping;
-
- if (mapping)
- down(&mapping->i_shared_sem);
+ vma_mapping_lock(vma);
spin_lock(&mm->page_table_lock);
__vma_link(mm, vma, prev, rb_link, rb_parent);
spin_unlock(&mm->page_table_lock);
- if (mapping)
- up(&mapping->i_shared_sem);
+ vma_mapping_unlock(vma);
mark_mm_hugetlb(mm, vma);
mm->map_count++;
@@ -318,8 +311,8 @@ static void vma_link(struct mm_struct *m
/*
* Insert vm structure into process list sorted by address and into the inode's
- * i_mmap ring. The caller should hold mm->page_table_lock and
- * ->f_mappping->i_shared_sem if vm_file is non-NULL.
+ * i_mmap ring. The caller should hold mm->page_table_lock and i_shared_sem or
+ * i_mmap_sem if vm_file is non-NULL.
*/
static void
__insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
@@ -410,7 +403,6 @@ static struct vm_area_struct *vma_merge(
spinlock_t *lock = &mm->page_table_lock;
struct inode *inode = file ? file->f_dentry->d_inode : NULL;
struct address_space *mapping = file ? file->f_mapping : NULL;
- struct semaphore *i_shared_sem;
struct prio_tree_root *root = NULL;
/*
@@ -420,8 +412,6 @@ static struct vm_area_struct *vma_merge(
if (vm_flags & VM_SPECIAL)
return NULL;
- i_shared_sem = file ? &file->f_mapping->i_shared_sem : NULL;
-
if (mapping) {
if (vm_flags & VM_SHARED) {
if (likely(!(vm_flags & VM_NONLINEAR)))
@@ -442,13 +432,8 @@ static struct vm_area_struct *vma_merge(
if (prev->vm_end == addr &&
can_vma_merge_after(prev, vm_flags, file, pgoff)) {
struct vm_area_struct *next;
- int need_up = 0;
- if (unlikely(file && prev->vm_next &&
- prev->vm_next->vm_file == file)) {
- down(i_shared_sem);
- need_up = 1;
- }
+ vma_mapping_lock(prev);
spin_lock(lock);
/*
@@ -463,8 +448,7 @@ static struct vm_area_struct *vma_merge(
next->vm_end, prev->vm_pgoff);
__remove_shared_vm_struct(next, inode, mapping);
spin_unlock(lock);
- if (need_up)
- up(i_shared_sem);
+ vma_mapping_unlock(prev);
if (file)
fput(file);
@@ -475,8 +459,7 @@ static struct vm_area_struct *vma_merge(
__vma_modify(root, prev, prev->vm_start, end, prev->vm_pgoff);
spin_unlock(lock);
- if (need_up)
- up(i_shared_sem);
+ vma_mapping_unlock(prev);
return prev;
}
@@ -490,14 +473,12 @@ static struct vm_area_struct *vma_merge(
pgoff, (end - addr) >> PAGE_SHIFT))
return NULL;
if (end == prev->vm_start) {
- if (file)
- down(i_shared_sem);
+ vma_mapping_lock(prev);
spin_lock(lock);
__vma_modify(root, prev, addr, prev->vm_end,
prev->vm_pgoff - ((end - addr) >> PAGE_SHIFT));
spin_unlock(lock);
- if (file)
- up(i_shared_sem);
+ vma_mapping_unlock(prev);
return prev;
}
}
@@ -1361,8 +1342,7 @@ int split_vma(struct mm_struct * mm, str
root = &mapping->i_mmap;
}
- if (mapping)
- down(&mapping->i_shared_sem);
+ vma_mapping_lock(vma);
spin_lock(&mm->page_table_lock);
if (new_below)
@@ -1374,8 +1354,7 @@ int split_vma(struct mm_struct * mm, str
__insert_vm_struct(mm, new);
spin_unlock(&mm->page_table_lock);
- if (mapping)
- up(&mapping->i_shared_sem);
+ vma_mapping_unlock(vma);
return 0;
}
@@ -1609,7 +1588,7 @@ void exit_mmap(struct mm_struct *mm)
/* Insert vm structure into process list sorted by address
* and into the inode's i_mmap ring. If vm_file is non-NULL
- * then i_shared_sem is taken here.
+ * then i_shared_sem or i_mmap_sem is taken here.
*/
void insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
{
diff -puN mm/memory.c~010_sem_contention mm/memory.c
--- mmlinux-2.6/mm/memory.c~010_sem_contention 2004-04-11 22:07:40.000000000 -0400
+++ mmlinux-2.6-jaya/mm/memory.c 2004-04-11 22:07:40.000000000 -0400
@@ -1133,11 +1133,15 @@ void invalidate_mmap_range(struct addres
if (holeend & ~(long long)ULONG_MAX)
hlen = ULONG_MAX - hba + 1;
}
- down(&mapping->i_shared_sem);
+ down(&mapping->i_mmap_sem);
/* Protect against page fault */
atomic_inc(&mapping->truncate_count);
if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
invalidate_mmap_range_list(&mapping->i_mmap, hba, hlen);
+ up(&mapping->i_mmap_sem);
+ down(&mapping->i_shared_sem);
+ /* Protect against page fault -- not sure this is required */
+ atomic_inc(&mapping->truncate_count);
if (unlikely(!prio_tree_empty(&mapping->i_mmap_shared)))
invalidate_mmap_range_list(&mapping->i_mmap_shared, hba, hlen);
up(&mapping->i_shared_sem);
diff -puN mm/filemap.c~010_sem_contention mm/filemap.c
--- mmlinux-2.6/mm/filemap.c~010_sem_contention 2004-04-11 22:07:40.000000000 -0400
+++ mmlinux-2.6-jaya/mm/filemap.c 2004-04-11 22:07:40.000000000 -0400
@@ -55,17 +55,17 @@
/*
* Lock ordering:
*
- * ->i_shared_sem (vmtruncate)
+ * ->i_shared{_mmap}_sem (vmtruncate)
* ->private_lock (__free_pte->__set_page_dirty_buffers)
* ->swap_list_lock
* ->swap_device_lock (exclusive_swap_page, others)
* ->mapping->page_lock
*
* ->i_sem
- * ->i_shared_sem (truncate->invalidate_mmap_range)
+ * ->i_shared{_mmap}_sem (truncate->invalidate_mmap_range)
*
* ->mmap_sem
- * ->i_shared_sem (various places)
+ * ->i_shared{_mmap}_sem (various places)
*
* ->mmap_sem
* ->lock_page (access_process_vm)
diff -puN mm/mremap.c~010_sem_contention mm/mremap.c
--- mmlinux-2.6/mm/mremap.c~010_sem_contention 2004-04-11 22:07:40.000000000 -0400
+++ mmlinux-2.6-jaya/mm/mremap.c 2004-04-11 22:07:40.000000000 -0400
@@ -267,7 +267,6 @@ static unsigned long move_vma(struct vm_
unsigned long new_len, unsigned long new_addr)
{
struct mm_struct *mm = vma->vm_mm;
- struct address_space *mapping = NULL;
struct vm_area_struct *new_vma;
unsigned long vm_flags = vma->vm_flags;
unsigned long new_pgoff;
@@ -287,16 +286,14 @@ static unsigned long move_vma(struct vm_
if (!new_vma)
return -ENOMEM;
- if (vma->vm_file) {
- /*
- * Subtle point from Rajesh Venkatasubramanian: before
- * moving file-based ptes, we must lock vmtruncate out,
- * since it might clean the dst vma before the src vma,
- * and we propagate stale pages into the dst afterward.
- */
- mapping = vma->vm_file->f_mapping;
- down(&mapping->i_shared_sem);
- }
+ /*
+ * Subtle point from Rajesh Venkatasubramanian: before
+ * moving file-based ptes, we must lock vmtruncate out,
+ * since it might clean the dst vma before the src vma,
+ * and we propagate stale pages into the dst afterward.
+ */
+ vma_mapping_lock(vma);
+
moved_len = move_page_tables(vma, new_addr, old_addr, old_len);
if (moved_len < old_len) {
/*
@@ -310,8 +307,8 @@ static unsigned long move_vma(struct vm_
old_addr = new_addr;
new_addr = -ENOMEM;
}
- if (mapping)
- up(&mapping->i_shared_sem);
+
+ vma_mapping_unlock(vma);
/* Conceal VM_ACCOUNT so old reservation is not undone */
if (vm_flags & VM_ACCOUNT) {
@@ -476,16 +473,14 @@ unsigned long do_mremap(unsigned long ad
}
else
root = &mapping->i_mmap;
- down(&mapping->i_shared_sem);
}
+ vma_mapping_lock(vma);
spin_lock(&vma->vm_mm->page_table_lock);
__vma_modify(root, vma, vma->vm_start,
addr + new_len, vma->vm_pgoff);
spin_unlock(&vma->vm_mm->page_table_lock);
-
- if(mapping)
- up(&mapping->i_shared_sem);
+ vma_mapping_unlock(vma);
current->mm->total_vm += pages;
if (vma->vm_flags & VM_LOCKED) {
diff -puN mm/rmap.c~010_sem_contention mm/rmap.c
--- mmlinux-2.6/mm/rmap.c~010_sem_contention 2004-04-11 22:07:40.000000000 -0400
+++ mmlinux-2.6-jaya/mm/rmap.c 2004-04-11 22:07:40.000000000 -0400
@@ -267,8 +267,8 @@ out:
*
* This function is only called from page_referenced for object-based pages.
*
- * The semaphore address_space->i_shared_sem is tried. If it can't be gotten,
- * assume a reference count of 0, so try_to_unmap will then have a go.
+ * The semaphores ->i_mmap_sem and ->i_shared_sem are tried. If they can't be
+ * gotten, assume a reference count of 0, so try_to_unmap will then have a go.
*/
static inline int page_referenced_obj(struct page *page, int *mapcount)
{
@@ -276,12 +276,14 @@ static inline int page_referenced_obj(st
unsigned long pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
struct vm_area_struct *vma;
struct prio_tree_iter iter;
+ struct semaphore *semaphore;
unsigned long address;
int referenced = 0;
- if (down_trylock(&mapping->i_shared_sem))
+ if (down_trylock(&mapping->i_mmap_sem))
return 0;
+ semaphore = &mapping->i_mmap_sem;
vma = __vma_prio_tree_first(&mapping->i_mmap,
&iter, pgoff, pgoff);
while (vma) {
@@ -301,6 +303,12 @@ static inline int page_referenced_obj(st
&iter, pgoff, pgoff);
}
+ up(&mapping->i_mmap_sem);
+
+ if (down_trylock(&mapping->i_shared_sem))
+ return 0;
+
+ semaphore = &mapping->i_shared_sem;
vma = __vma_prio_tree_first(&mapping->i_mmap_shared,
&iter, pgoff, pgoff);
while (vma) {
@@ -322,7 +330,7 @@ static inline int page_referenced_obj(st
if (list_empty(&mapping->i_mmap_nonlinear))
WARN_ON(*mapcount > 0);
out:
- up(&mapping->i_shared_sem);
+ up(semaphore);
return referenced;
}
@@ -696,8 +704,8 @@ out:
*
* This function is only called from try_to_unmap for object-based pages.
*
- * The semaphore address_space->i_shared_sem is tried. If it can't be gotten,
- * return a temporary error.
+ * The semaphores ->i_mmap_sem and ->i_shared_sem are tried. If they can't be
+ * gotten, return a temporary error.
*/
static inline int try_to_unmap_obj(struct page *page, int *mapcount)
{
@@ -705,15 +713,17 @@ static inline int try_to_unmap_obj(struc
unsigned long pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
struct vm_area_struct *vma;
struct prio_tree_iter iter;
+ struct semaphore *semaphore;
unsigned long address;
int ret = SWAP_AGAIN;
unsigned long cursor;
unsigned long max_nl_cursor = 0;
unsigned long max_nl_size = 0;
- if (down_trylock(&mapping->i_shared_sem))
+ if (down_trylock(&mapping->i_mmap_sem))
return ret;
+ semaphore = &mapping->i_mmap_sem;
vma = __vma_prio_tree_first(&mapping->i_mmap,
&iter, pgoff, pgoff);
while (vma) {
@@ -728,6 +738,12 @@ static inline int try_to_unmap_obj(struc
&iter, pgoff, pgoff);
}
+ up(&mapping->i_mmap_sem);
+
+ if (down_trylock(&mapping->i_shared_sem))
+ return ret;
+
+ semaphore = &mapping->i_shared_sem;
vma = __vma_prio_tree_first(&mapping->i_mmap_shared,
&iter, pgoff, pgoff);
while (vma) {
@@ -813,7 +829,7 @@ static inline int try_to_unmap_obj(struc
relock:
rmap_lock(page);
out:
- up(&mapping->i_shared_sem);
+ up(semaphore);
return ret;
}
diff -puN fs/inode.c~010_sem_contention fs/inode.c
--- mmlinux-2.6/fs/inode.c~010_sem_contention 2004-04-11 22:07:40.000000000 -0400
+++ mmlinux-2.6-jaya/fs/inode.c 2004-04-11 22:07:40.000000000 -0400
@@ -185,6 +185,7 @@ void inode_init_once(struct inode *inode
sema_init(&inode->i_sem, 1);
INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
spin_lock_init(&inode->i_data.page_lock);
+ init_MUTEX(&inode->i_data.i_mmap_sem);
init_MUTEX(&inode->i_data.i_shared_sem);
atomic_set(&inode->i_data.truncate_count, 0);
INIT_LIST_HEAD(&inode->i_data.private_list);
diff -puN fs/hugetlbfs/inode.c~010_sem_contention fs/hugetlbfs/inode.c
--- mmlinux-2.6/fs/hugetlbfs/inode.c~010_sem_contention 2004-04-11 22:07:40.000000000 -0400
+++ mmlinux-2.6-jaya/fs/hugetlbfs/inode.c 2004-04-11 22:07:40.000000000 -0400
@@ -325,11 +325,15 @@ static int hugetlb_vmtruncate(struct ino
pgoff = offset >> HPAGE_SHIFT;
inode->i_size = offset;
- down(&mapping->i_shared_sem);
+ down(&mapping->i_mmap_sem);
/* Protect against page fault */
atomic_inc(&mapping->truncate_count);
if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
+ up(&mapping->i_mmap_sem);
+ down(&mapping->i_shared_sem);
+ /* Protect against page fault -- not sure this is required */
+ atomic_inc(&mapping->truncate_count);
if (unlikely(!prio_tree_empty(&mapping->i_mmap_shared)))
hugetlb_vmtruncate_list(&mapping->i_mmap_shared, pgoff);
up(&mapping->i_shared_sem);
_
^ permalink raw reply [flat|nested] 38+ messages in thread* Re: [PATCH] anobjrmap 9 priority mjb tree
2004-04-12 4:32 ` Rajesh Venkatasubramanian
@ 2004-04-12 5:24 ` Martin J. Bligh
0 siblings, 0 replies; 38+ messages in thread
From: Martin J. Bligh @ 2004-04-12 5:24 UTC (permalink / raw)
To: Rajesh Venkatasubramanian; +Cc: Hugh Dickins, linux-kernel, Andrew Morton
> This patch is an attempt at reducing the contention on i_shared_sem
> by introducing a new semaphore i_mmap_sem. The i_shared_sem covers
> i_mmap_shared tree and i_mmap_nonlinear list now, whereas i_mmap_sem
> covers i_mmap tree. This may help to reduce the contention on
> i_shared_sem if a file is mapped both private and shared. Kernel
> compile time with and without this patch did not change much, though.
>
> This patch is on top of 2.6.5-mjb1+anobjrmap9_prio. Compiled and
> tested.
>
> Martin! Are you interested in testing SDET with this patch ?
Runs exactly the same as prio ... thanks for having a crack at it though.
I guess that sharing combination isn't that common ;-(
M.
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH] anobjrmap 9 priority mjb tree
2004-04-11 17:28 ` Martin J. Bligh
2004-04-12 4:32 ` Rajesh Venkatasubramanian
@ 2004-04-12 15:46 ` Martin J. Bligh
2004-04-12 18:43 ` Hugh Dickins
1 sibling, 1 reply; 38+ messages in thread
From: Martin J. Bligh @ 2004-04-12 15:46 UTC (permalink / raw)
To: Hugh Dickins; +Cc: linux-kernel, Andrew Morton, Rajesh Venkatasubramanian
[-- Attachment #1: Type: text/plain, Size: 3432 bytes --]
>> Any chance of you doing the same comparison between 2.6.5-aa5
>> 2.6.5-aa5 minus prio-tree? (Well, needn't be -aa5, whatever comes to
>> hand. Looks like "patch -p1 -R < prio-tree" mostly works, just some
>> rejects in mm/mmap.c itself, let me know if I can help out on that.)
>>
>> If -aa is okay, I hope so, then it's surely some stupidity from me.
>
> Good idea. Not sure how easy it'll be to back prio_tree out, but I can
> surely do aa5, which would give us a good clue still. Might not be until
> this time tomorrow though.
Hmm. it's halfway between the two. There does seem to be less sem
contention, though the profile ticks in __down isn't really an accurate
measure. I'll try to think of some more accurate way to instrument sems
(maybe I can do rdtsc before and after taking the sem, and then hack
the profiling code to stash delta increment indexed by the caller's
address). Meanwhile, maybe you can decipher something from the attached
and appended ...
M.
SDET 128 (see disclaimer)
Throughput Std. Dev
2.6.5-mjb1 100.0% 1.2%
2.6.5-mjb1-prio 73.6% 0.0%
2.6.5-aa5 86.4% 1.9%
Full profile attached (to match yesterday's).
diffprofile from mjb1+prio to -aa5
11084 102.0% find_get_page
4976 101.8% clear_page_tables
4482 690.6% schedule
2322 1745.9% pgd_alloc
2036 41.2% atomic_dec_and_lock
1816 0.0% page_add_rmap
1623 16.0% __d_lookup
1428 0.0% __set_page_dirty_buffers
1275 0.0% anon_vma_unlink
1234 25.3% path_lookup
1101 25.0% remove_shared_vm_struct
990 53.7% .text.lock.dec_and_lock
921 0.0% find_get_pages_tag
883 0.0% anon_vma_link
791 52.2% follow_mount
783 250.2% unmap_vmas
764 0.0% strnlen_user
588 32.3% file_move
572 36.8% proc_pid_stat
563 57.9% lookup_mnt
543 55.4% .text.lock.file_table
498 48.0% .text.lock.dcache
488 69.0% flush_signal_handlers
467 215.2% .text.lock.base
460 27.9% kmem_cache_free
...
-217 -100.0% __do_softirq
-220 -100.0% page_update_anon_rmap
-228 -57.6% flush_tlb_page
-230 -69.9% sys_close
-233 -100.0% direct_strncpy_from_user
-233 -7.8% copy_process
-235 -21.1% kmap_atomic
-254 -37.5% sched_clock
-257 -59.6% rb_insert_color
-318 -47.7% pte_alloc_map
-330 -99.1% find_get_pages
-332 -31.5% find_vma
-340 -43.8% __vma_prio_tree_remove
-356 -29.7% vma_link
-369 -100.0% direct_strnlen_user
-401 -100.0% page_add_anon_rmap
-470 -8.6% do_no_page
-610 -8.1% __copy_to_user_ll
-619 -100.0% radix_tree_lookup
-923 -97.4% set_page_dirty
-1133 -12.6% release_pages
-1406 -53.7% __block_prepare_write
-1553 -14.5% page_remove_rmap
-1555 -100.0% page_add_obj_rmap
-1753 -38.9% do_page_fault
-2269 -68.7% __copy_user_intel
-2959 -44.8% do_anonymous_page
-3789 -100.0% .text.lock.filemap
-7671 -53.6% __wake_up
-8214 -36.8% copy_page_range
-9948 -62.1% do_wp_page
-14120 -100.0% finish_task_switch
-27763 -55.8% __down
-64017 -12.0% default_idle
-107386 -11.9% total
[-- Attachment #2: 2.6.5-aa5 --]
[-- Type: application/octet-stream, Size: 17598 bytes --]
794165 total
470672 default_idle
21952 __down
21949 find_get_page
16864 zap_pte_range
14091 copy_page_range
11798 __d_lookup
9863 clear_page_tables
9188 page_remove_rmap
7838 release_pages
6976 atomic_dec_and_lock
6949 __copy_to_user_ll
6629 __wake_up
6120 path_lookup
6066 do_wp_page
5505 remove_shared_vm_struct
5343 copy_mm
5131 schedule
5007 do_no_page
3650 do_anonymous_page
3615 pte_alloc_one
3118 free_pages_and_swap_cache
2909 filemap_nopage
2834 .text.lock.dec_and_lock
2757 do_page_fault
2755 copy_process
2662 link_path_walk
2645 free_hot_cold_page
2524 buffered_rmqueue
2455 pgd_alloc
2408 file_move
2307 follow_mount
2126 proc_pid_stat
2125 path_release
2111 kmem_cache_free
1908 release_task
1816 page_add_rmap
1699 ext2_new_inode
1676 __find_get_block
1667 task_mem
1623 __read_lock_failed
1576 __fput
1535 lookup_mnt
1535 .text.lock.dcache
1523 .text.lock.file_table
1520 number
1503 exit_notify
1428 __set_page_dirty_buffers
1365 __brelse
1358 page_address
1279 current_kernel_time
1275 anon_vma_unlink
1269 .text.lock.highmem
1257 file_kill
1214 __block_prepare_write
1195 flush_signal_handlers
1114 d_alloc
1112 do_generic_mapping_read
1100 alloc_inode
1099 generic_file_open
1096 unmap_vmas
1086 __mark_inode_dirty
1046 kmap_high
1034 __copy_user_intel
1004 handle_mm_fault
983 __copy_from_user_ll
968 vsnprintf
937 fput
921 find_get_pages_tag
916 system_call
883 anon_vma_link
880 kmap_atomic
879 proc_pid_status
866 do_mmap_pgoff
856 dnotify_parent
852 grab_block
843 vma_link
839 update_atime
839 __find_get_block_slow
833 proc_root_link
832 dput
813 proc_check_root
776 do_page_cache_readahead
769 generic_fillattr
764 strnlen_user
738 new_inode
721 find_vma
687 mmgrab
684 dentry_open
684 .text.lock.base
654 ext2_update_inode
631 vfs_read
623 generic_delete_inode
614 generic_file_aio_write_nolock
609 ext2_new_block
602 split_vma
591 block_invalidatepage
584 fd_install
554 render_sigset_t
540 pid_revalidate
539 kmem_ptr_validate
523 d_instantiate
513 del_timer_sync
502 __vma_prio_tree_insert
501 set_page_address
498 copy_files
497 add_to_page_cache
490 prune_dcache
484 complete
481 d_path
476 prio_tree_insert
463 get_tgid_list
450 read_block_bitmap
437 __vma_prio_tree_remove
424 exit_mmap
423 sched_clock
419 __alloc_pages
409 ext2_find_entry
400 may_open
392 deny_write_access
392 __free_pages
385 __generic_file_aio_read
383 real_lookup
383 mark_page_accessed
380 __blk_queue_bounce
366 exec_mmap
353 kunmap_high
352 dup_task_struct
352 .text.lock.inode
349 task_dumpable
348 pte_alloc_map
348 ext2_get_group_desc
341 filp_close
334 vfs_unlink
330 select_parent
330 __pagevec_lru_add_active
330 __filemap_fdatawrite
328 ext2_free_blocks
322 ext2_add_link
301 file_ra_state_init
290 try_to_free_buffers
290 sys_wait4
289 try_to_wake_up
285 ext2_get_inode
284 ext2_get_block
281 get_unused_fd
278 ext2_preread_inode
277 proc_fd_link
271 group_reserve_blocks
271 d_delete
268 create_buffers
263 tid_fd_revalidate
258 __remove_shared_vm_struct
256 open_namei
256 find_group_other
254 __insert_inode_hash
253 ext2_discard_prealloc
249 do_lookup
248 vfs_write
247 unlock_page
246 group_release_blocks
246 copy_strings
243 strncpy_from_user
240 kunmap_atomic
235 is_subdir
218 sigprocmask
218 do_exit
216 .text.lock.namespace
213 sys_unlink
211 proc_pid_readlink
208 sched_best_cpu
208 anon_vma_prepare
207 truncate_inode_pages
203 get_empty_filp
202 get_signal_to_deliver
200 load_elf_binary
200 in_group_p
200 file_read_actor
197 call_rcu
192 flush_old_exec
191 flush_tlb_mm
190 get_wchan
190 dnotify_flush
189 profile_exit_mmap
187 find_vma_prepare
183 d_lookup
177 generic_file_write
175 ext2_truncate
174 rb_insert_color
173 proc_delete_inode
173 do_sigaction
171 page_cache_readahead
171 bad_get_user
170 ext2_block_to_path
168 flush_tlb_page
168 find_lock_page
166 iput
165 __block_commit_write
163 vfs_getattr
161 fget
161 __d_path
159 percpu_counter_mod
159 ext2_commit_chunk
155 vm_acct_memory
151 sched_migrate_task
150 fget_light
147 pid_alive
144 rcu_do_batch
143 read_inode_bitmap
143 __insert_vm_struct
142 inode_times_differ
142 ext2_free_inode
138 ext2_readdir
138 do_execve
137 vfs_permission
137 profile_exec_unmap
137 default_wake_function
137 __pagevec_lru_add
136 schedule_tail
131 wait_for_completion
125 wake_up_forked_process
125 proc_info_read
124 kmem_cache_alloc
121 prepare_binprm
120 generic_file_read
119 vfs_readdir
119 generic_file_llseek
118 search_binary_handler
117 wait_task_zombie
116 proc_lookup
115 find_vma_prev
112 put_files_struct
110 unmap_page_range
110 ext2_delete_entry
110 eligible_child
110 cp_new_stat64
109 prep_new_page
109 lru_cache_add_active
109 alloc_pidmap
108 setup_arg_pages
106 inode_sub_bytes
106 ext2_inode_by_name
106 create_empty_buffers
103 do_munmap
102 generic_file_mmap
100 cap_vm_enough_memory
99 unlock_buffer
99 sys_close
98 getname
96 reap_timer_fnc
94 __vma_link
93 pipe_writev
91 unmap_pte_page
87 unshare_files
87 sys_read
87 pipe_readv
86 unmap_vma
86 mm_init
86 migration_thread
85 old_mmap
85 free_buffer_head
84 set_fs_pwd
83 ext2_free_branches
81 locks_remove_flock
81 inode_update_time
80 mm_alloc
80 get_offset_pit
79 test_clear_page_dirty
77 open_exec
76 locks_remove_posix
76 do_softirq
76 bad_range
74 wake_up_inode
74 permission
73 sys_brk
72 generic_commit_write
72 do_brk
70 profile_exit_task
70 d_rehash
68 proc_pid_lookup
68 drop_buffers
68 do_group_exit
68 .text.lock.namei
65 __copy_user_zeroing_intel
63 __set_page_dirty_nobuffers
62 mmput
61 mark_buffer_dirty
61 load_elf_interp
61 __vma_link_rb
61 __put_task_struct
60 syscall_exit
60 show_stat
60 lru_cache_add
59 sys_newuname
59 inode_has_buffers
57 vma_merge
56 ext2_release_inode
56 .text.lock.generic
55 exit_aio
54 prepare_to_wait
54 find_busiest_node
52 ext2_count_free_blocks
52 __kmalloc
52 .text.lock.fs_writeback
51 task_vsize
51 copy_thread
50 inode_init_once
49 sys_ioctl
49 ext2_get_page
49 copy_namespace
48 zap_pmd_range
48 do_gettimeofday
48 __get_page_state
47 free_percpu
47 chown_common
46 sched_fork
45 sys_mmap2
45 proc_pid_make_inode
45 cap_bprm_compute_creds
45 __rb_erase_color
44 ret_from_intr
44 mm_release
43 prio_tree_remove
43 find_group_orlov
43 filp_open
43 create_elf_tables
42 sys_open
42 page_waitqueue
42 __user_walk
41 rb_erase
41 proc_pid_unhash
41 ext2_alloc_branch
41 clear_inode
40 sys_chmod
40 d_free
40 .text.lock.fork
39 __rb_rotate_left
39 .text.lock.balloc
38 sys_utime
38 compute_creds
38 clear_user
38 __anon_vma_link
37 vfs_follow_link
36 proc_lookupfd
36 lru_add_drain
36 filldir64
36 __get_free_pages
35 get_write_access
35 get_jiffies_64
35 cached_lookup
34 ext2_empty_dir
34 __cond_resched
33 ext2_lookup
32 read_cache_page
32 find_pid
32 .text.lock.ioctl
31 unmap_region
31 sys_execve
31 ext2_alloc_block
31 destroy_inode
31 can_share_swap_page
30 get_unmapped_area
30 do_fork
29 proc_root_lookup
29 pipe_wait
29 cpu_idle
28 wake_up_buffer
28 truncate_complete_page
28 sys_write
28 skip_atoi
28 notify_change
28 inode_setattr
28 can_vma_merge_after
28 cache_grow
28 block_prepare_write
28 bh_lru_install
27 d_invalidate
27 .text.lock.root
26 remove_suid
26 generic_file_write_nolock
26 eventpoll_release_file
25 set_page_dirty
25 proc_pident_lookup
25 kmap_atomic_to_page
24 .text.lock.ialloc
23 vfs_stat
23 sys_rt_sigaction
23 inode_change_ok
23 flush_thread
23 fasync_helper
23 detach_vmas_to_be_unmapped
23 __rb_rotate_right
23 __clear_page_buffers
22 sys_rt_sigprocmask
22 pid_delete_dentry
22 next_thread
22 inode_add_bytes
22 ext2_prepare_write
21 task_nice
21 scsi_request_fn
21 free_pages
21 ext2_get_branch
21 .text.lock.dnotify
20 vfs_create
20 sys_access
20 pid_base_iput
20 ll_rw_block
20 get_zone_counts
20 follow_down
20 ext2_delete_inode
20 __make_request
20 __lookup_hash
20 __getblk
19 sys_llseek
19 reserve_blocks
19 finish_wait
19 error_code
19 count_open_files
18 write_profile
18 vfs_rmdir
18 set_bh_page
18 ext2_group_sparse
18 elf_map
18 balance_dirty_pages_ratelimited
17 write_inode
17 mpage_writepage
17 ext2_create
17 ext2_check_page
17 do_pipe
17 d_unhash
17 __pagevec_free
16 recalc_bh_state
16 __set_page_buffers
15 sys_set_thread_area
15 scsi_end_request
15 radix_tree_preload
15 put_dirty_page
15 mpage_writepages
15 memcpy
15 free_task
15 eventpoll_init_file
15 end_page_writeback
15 chrdev_open
15 bad_page
15 __get_user_4
15 __bread
15 .text.lock.swap
14 rwsem_down_read_failed
14 proc_root_readdir
14 is_bad_inode
14 init_fpu
14 generic_forget_inode
14 flush_all_zero_pkmaps
14 exit_itimers
14 add_wait_queue
13 sys_munmap
13 fsync_buffers_list
13 ext2_setattr
13 dget_locked
13 __mmdrop
12 setattr_mask
12 rwsem_wake
12 release_thread
12 posix_block_lock
12 nr_blockdev_pages
12 kfree
12 interruptible_sleep_on
12 inode_needs_sync
12 groups_search
12 ext2_unlink
12 ext2_set_inode_flags
12 alloc_buffer_head
12 __iget
11 syscall_call
11 sys_stat64
11 sys_mkdir
11 sys_dup2
11 sprintf
11 setup_frame
11 sched_balance_exec
11 proc_pid_readdir
11 kunmap
11 free_pgtables
11 ext2_find_near
11 do_signal
10 vsprintf
10 vfs_fstat
10 try_to_release_page
10 test_set_page_writeback
10 pagevec_lookup
10 ext2_put_inode
10 expand_fd_array
10 __rmqueue
10 .text.lock.commoncap
10 .text.lock.array
9 sys_readlink
9 restore_sigcontext
9 release_x86_irqs
9 put_filp
9 kill_fasync
9 free_hot_page
9 find_trylock_page
9 expand_stack
9 .text.lock.locks
8 vfs_lstat
8 sys_sigreturn
8 sys_fstat64
8 sync_sb_inodes
8 setup_sigcontext
8 resume_userspace
8 page_slot
8 kernel_read
8 init_new_context
8 ext2_release_file
8 ext2_alloc_inode
8 cdev_get
8 anon_vma_ctor
8 __put_ioctx
7 unmap_vma_list
7 sys_time
7 sys_getdents64
7 proc_get_inode
7 prepare_to_wait_exclusive
7 pipe_release
7 kmap
7 invalidate_inode_buffers
7 group_send_sig_info
7 get_pipe_inode
7 filemap_getpage
7 fcntl_dirnotify
7 ext2_last_byte
7 do_invalidatepage
7 block_truncate_page
7 block_commit_write
7 __sync_single_inode
7 __page_cache_release
7 .text.lock.exec
6 wake_up_process
6 up_tty_sem
6 sys_lstat64
6 smp_call_function
6 scsi_dispatch_cmd
6 save_i387
6 pgd_free
6 lock_rename
6 insert_vm_struct
6 get_vmalloc_info
6 do_proc_readlink
6 convert_fxsr_to_user
6 bio_alloc
6 __posix_lock_file
6 __down_failed
6 .text.lock.char_dev
5 unix_create1
5 sys_chown
5 run_timer_softirq
5 restore_all
5 remove_from_page_cache
5 release_blocks
5 put_unused_fd
5 proc_alloc_inode
5 pipe_new
5 handle_signal
5 get_new_inode_fast
5 ext2_make_empty
5 do_sync_write
5 do_mpage_readpage
5 do_flush_tlb_all
5 do_fcntl
5 count
5 check_tty_count
5 cache_init_objs
5 bounce_copy_vec
5 anon_vma_merge
4 vsscanf
4 unmap_underlying_metadata
4 sock_create
4 set_close_on_exec
4 send_IPI_mask_sequence
4 rwsem_down_write_failed
4 pipe_write_fasync
4 notifier_call_chain
4 kthread_should_stop
4 generic_drop_inode
4 flush_tlb_others
4 find_or_create_page
4 ext2_ioctl
4 ext2_destroy_inode
4 do_file_page
4 check_mnt
4 __pmd_alloc
4 __down_failed_interruptible
4 .text.lock.buffer
3 vfs_statfs
3 vfs_mkdir
3 sys_vhangup
3 sys_setpgid
3 sys_fcntl64
3 sys_creat
3 sock_map_fd
3 si_swapinfo
3 restore_fpu
3 remove_exclusive_swap_page
3 put_fs_struct
3 proc_tgid_base_lookup
3 proc_readfd
3 proc_file_read
3 proc_destroy_inode
3 pipe_read_fasync
3 nr_iowait
3 name_to_int
3 mempool_alloc
3 meminfo_read_proc
3 lookup_create
3 locate_fd
3 kobject_put
3 kobject_get
3 invalidate_mmap_range
3 init_buffer_head
3 iget_locked
3 get_request
3 get_node
3 find_get_pages
3 ext2_statfs
3 ext2_set_link
3 ext2_rmdir
3 ext2_get_parent
3 ext2_bg_has_super
3 exit_sem
3 destroy_context
3 del_timer
3 de_put
3 cap_bprm_secureexec
3 block_read_full_page
3 __write_lock_failed
3 __mod_timer
3 __breadahead
3 .text.lock.open
2 wait_on_page_bit
2 tty_open
2 sys_vfork
2 sys_gettimeofday
2 sys_getrlimit
2 sys_getcwd
2 sys_chdir
2 steal_locks
2 sock_alloc
2 sk_alloc
2 set_brk
2 set_binfmt
2 scsi_single_lun_run
2 remove_inode_buffers
2 release_dev
2 read_ldt
2 rcu_check_quiescent_state
2 rb_first
2 pte_alloc_kernel
2 proc_permission
2 path_walk
2 memory_open
2 math_state_restore
2 locks_insert_lock
2 invalidate_bh_lru
2 init_dev
2 grow_dev_page
2 getrusage
2 find_inode_fast
2 file_ioctl
2 fcntl_setlk
2 ext2_mkdir
2 ext2_follow_link
2 ext2_count_free_inodes
2 exit_thread
2 down_tty_sem
2 d_move
2 d_alloc_root
2 copy_semundo
2 check_ttfb_buffer
2 cap_bprm_set_security
2 can_vma_merge_before
2 bounce_end_io
2 bmap
2 background_writeout
2 activate_page
2 __up_wakeup
2 __block_write_full_page
2 .text.lock.tty_io
2 .text.lock.objrmap
1 writeback_acquire
1 work_resched
1 vmtruncate
1 vfs_statfs_native
1 vfs_rename
1 unix_release_sock
1 unix_create
1 tty_release
1 test_clear_page_writeback
1 sys_waitpid
1 sys_umask
1 sys_statfs
1 sys_setrlimit
1 sys_rmdir
1 sys_pread64
1 sys_pipe
1 sys_getpid
1 sys_fchdir
1 sys_exit_group
1 sys_clone
1 sock_init_data
1 sock_fasync
1 slab_destroy
1 shrink_dcache_parent
1 show_tty_driver
1 seq_read
1 seq_printf
1 scsi_put_command
1 scsi_io_completion
1 scsi_init_cmd_errh
1 save_i387_fxsave
1 restore_i387_fxsave
1 remove_wait_queue
1 put_io_context
1 pty_open
1 proc_readdir
1 proc_read_inode
1 proc_calc_metrics
1 prio_tree_expand
1 prepare_to_copy
1 pipe_write
1 pipe_read
1 padzero
1 nr_free_pages
1 nr_free_highpages
1 mprotect_fixup
1 memset
1 mempool_free
1 mempool_alloc_slab
1 mark_offset_tsc
1 lookup_hash
1 invalidate_complete_page
1 init_special_inode
1 i_waitq_head
1 handle_ra_miss
1 get_free_idx
1 generic_make_request
1 fs_may_remount_ro
1 free_uid
1 find_task_by_pid
1 filp_ctor
1 ext2_nobh_prepare_write
1 ext2_bg_num_gdb
1 expand_files
1 exit_fs
1 end_buffer_read_nobh
1 do_truncate
1 do_setitimer
1 do_getitimer
1 dispose_list
1 d_genocide
1 copy_strings_kernel
1 convert_fxsr_from_user
1 clear_page_dirty_for_io
1 bh_waitq_head
1 as_set_request
1 alloc_skb
1 __up
1 __pagevec_release
1 __getblk_slow
1 __end_that_request_first
1 __bforget
1 .text.lock.mmap
1 .text.lock.fault
0 writeback_release
0 writeback_inodes
0 writeback_in_progress
0 worker_thread
0 vscnprintf
0 vfs_symlink
0 vfs_rename_other
0 uptime_read_proc
0 unlock_rename
0 unlock_new_inode
0 unix_stream_connect
0 unix_sock_destructor
0 unix_mkname
0 unix_find_other
0 unhash_process
0 tty_ioctl
0 tty_fasync
0 time_out_leases
0 tasklet_action
0 task_prio
0 t_start
0 t_next
0 sys_stime
0 sys_socketcall
0 sys_socket
0 sys_sethostname
0 sys_set_tid_address
0 sys_prctl
0 sys_old_getrlimit
0 sys_mprotect
0 sys_getppid
0 sys_geteuid
0 sys_getegid
0 sys_fcntl
0 sys_connect
0 sys_clock_gettime
0 sys_clock_getres
0 sys_chroot
0 sys_alarm
0 sync_supers
0 sync_dirty_buffer
0 submit_bh
0 stat_open
0 sockfd_lookup
0 sock_wfree
0 sock_close
0 skb_release_data
0 skb_drop_fraglist
0 single_open
0 shrink_dcache_sb
0 show_vfsmnt
0 show_tty_range
0 set_page_dirty_lock
0 set_current_groups
0 seq_path
0 seq_open
0 send_sigio_to_task
0 sd_rw_intr
0 scsi_softirq
0 scsi_run_queue
0 scsi_finish_command
0 scsi_device_unbusy
0 scsi_decide_disposition
0 scsi_add_timer
0 rt_hash_code
0 restore_i387
0 remove_inode_hash
0 remove_arg_zero
0 remote_llseek
0 release_sock
0 register_reboot_notifier
0 read_zero
0 read_pages
0 read_cache_pages
0 rcu_process_callbacks
0 radix_tree_extend
0 pty_unthrottle
0 pty_close
0 profile_event_register
0 proc_self_follow_link
0 proc_pid_flush
0 proc_file_lseek
0 proc_exe_link
0 proc_dointvec
0 prep_compound_page
0 pipe_write_release
0 pipe_read_release
0 pipe_ioctl
0 pagevec_strip
0 pagevec_lookup_tag
0 page_getlink
0 osync_buffers_list
0 open_private_file
0 old_select
0 nr_running
0 nr_context_switches
0 netif_rx
0 neigh_update
0 neigh_lookup
0 mpage_readpages
0 mpage_end_io_write
0 mpage_bio_submit
0 mpage_alloc
0 move_addr_to_kernel
0 mem_open
0 maps_open
0 mapping_tagged
0 lookup_swap_cache
0 lookup_one_len
0 locks_wake_up_blocks
0 locks_init_lock
0 locks_delete_lock
0 locks_copy_lock
0 locks_alloc_lock
0 local_bh_enable
0 load_script
0 ksoftirqd
0 kmem_flagcheck
0 kill_proc_info
0 kfree_skbmem
0 kernel_fpu_begin
0 ip_route_input
0 invalidate_mmap_range_list
0 invalidate_bdev
0 intr_handler
0 inode_get_bytes
0 init_page_buffers
0 init_once
0 inet_addr_type
0 in_egroup_p
0 handle_IRQ_event
0 get_zeroed_page
0 get_page_state
0 get_io_context
0 get_dirty_limits
0 generic_file_readv
0 generic_file_aio_read
0 futex_wake
0 free_fd_array
0 free_cold_page
0 follow_up
0 fn_hash_lookup
0 flush_tlb_all
0 flock_to_posix_lock
0 fib_semantic_match
0 ext3_statfs
0 ext3_group_sparse
0 ext3_bg_num_gdb
0 ext2_writepage
0 ext2_rename
0 ext2_read_inode
0 ext2_mknod
0 ext2_dotdot
0 exit_files
0 end_that_request_chunk
0 end_buffer_read_sync
0 end_buffer_async_write
0 end_bio_bh_io_sync
0 elv_set_request
0 dupfd
0 do_writepages
0 do_sigaltstack
0 do_proc_dointvec
0 do_posix_gettime
0 do_posix_clock_monotonic_gettime_parts
0 do_notify_resume
0 devpts_get_tty
0 device_not_available
0 d_find_alias
0 d_callback
0 cp_old_stat
0 collect_sigign_sigcatch
0 chroot_fs_refs
0 check_kill_permission
0 change_protection
0 cdev_put
0 cap_inode_setxattr
0 buffer_io_error
0 buffer_insert_list
0 bounce_end_io_write
0 block_write_full_page
0 block_sync_page
0 blk_run_queues
0 blk_recount_segments
0 blk_queue_bounce
0 blk_congestion_wait
0 bio_get_nr_vecs
0 bio_endio
0 bio_destructor
0 bio_add_page
0 balance_dirty_pages
0 bad_pipe_r
0 bad_follow_link
0 alloc_uid
0 alloc_fd_array
0 add_wait_queue_exclusive
0 add_to_page_cache_lru
0 add_disk_randomness
0 __writeback_single_inode
0 __wake_up_sync
0 __wait_on_buffer
0 __unix_remove_socket
0 __unix_insert_socket
0 __netdev_rx
0 __lock_page
0 __kfree_skb
0 __clear_user
0 __bio_add_page
0 __alloc_percpu
0 .text.lock.time
0 .text.lock.task_mmu
0 .text.lock.signal
0 .text.lock.rwsem
0 .text.lock.exit
0 .text.lock.block_dev
^ permalink raw reply [flat|nested] 38+ messages in thread* Re: [PATCH] anobjrmap 9 priority mjb tree
2004-04-12 15:46 ` Martin J. Bligh
@ 2004-04-12 18:43 ` Hugh Dickins
2004-04-12 18:58 ` Rajesh Venkatasubramanian
2004-04-12 19:01 ` Martin J. Bligh
0 siblings, 2 replies; 38+ messages in thread
From: Hugh Dickins @ 2004-04-12 18:43 UTC (permalink / raw)
To: Martin J. Bligh; +Cc: linux-kernel, Andrew Morton, Rajesh Venkatasubramanian
On Mon, 12 Apr 2004, Martin J. Bligh wrote:
> >> Any chance of you doing the same comparison between 2.6.5-aa5
> >> and 2.6.5-aa5 minus prio-tree?
>
> Hmm. it's halfway between the two. There does seem to be less sem
> contention, though the profile ticks in __down isn't really an accurate
> measure.
Thanks a lot, Martin (despite my silence, I really am listening and
pondering intermittently on this). So, -aa5 shows high __down count
too, may not be any kind of accurate measure, but it's surely not good.
Mainly I'm concentrating on getting ready the next few patches
of the objrmap set for Andrew (not to be sent for a day or two).
Unless we see a plausible way forward on your SDET numbers, I
think it casts this project in doubt - but even so I do need
to focus or I'll just mess them up.
Seems as if prio tree has too high a cost there, yet we believe we
need it to handle the corner cases of objrmap.
What I want to investigate, when I'm far enough done, is the effect
of restoring i_shared_sem to the i_shared_lock it was before 2.5.57.
My fantasy is that your SDET would behave much more stably without
that as a semaphore, that prio tree just pushes it over your cliff.
It is easier to insert and remove vmas in the list than the tree,
and you can get away with leaving them in place quite often with
the list.
(Expect me to shout excitedly "Hey, the __down count has gone right
down, that proves I'm right!")
i_shared_lock changed to i_shared_sem to allow that cond_resched_lock
in unmap_vmas to solve vmtruncate latency problems? With i_mmap and
i_mmap_shared as lists, isn't it easy to insert a dummy marker vma
and drop the lock if we need resched? Resuming from marker after.
But, sadly, I doubt that can be done with the prio tree: Rajesh?
Hugh
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH] anobjrmap 9 priority mjb tree
2004-04-12 18:43 ` Hugh Dickins
@ 2004-04-12 18:58 ` Rajesh Venkatasubramanian
2004-04-12 19:01 ` Martin J. Bligh
1 sibling, 0 replies; 38+ messages in thread
From: Rajesh Venkatasubramanian @ 2004-04-12 18:58 UTC (permalink / raw)
To: Hugh Dickins; +Cc: Martin J. Bligh, linux-kernel, Andrew Morton
> Unless we see a plausible way forward on your SDET numbers, I
> think it casts this project in doubt - but even so I do need
We can try a few fancy locking tricks. But, we don't know whether
such tricks will help.
> i_shared_lock changed to i_shared_sem to allow that cond_resched_lock
> in unmap_vmas to solve vmtruncate latency problems? With i_mmap and
> i_mmap_shared as lists, isn't it easy to insert a dummy marker vma
> and drop the lock if we need resched? Resuming from marker after.
>
> But, sadly, I doubt that can be done with the prio tree: Rajesh?
Yeap. With prio_tree it is tricky. We already have the marker for
prio_tree, i.e., prio_tree_iter. But, when you drop a lock new tree
nodes may be added to the prio_tree, and the marker does not provide
any consistent meaning after the node additions.
Rajesh
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH] anobjrmap 9 priority mjb tree
2004-04-12 18:43 ` Hugh Dickins
2004-04-12 18:58 ` Rajesh Venkatasubramanian
@ 2004-04-12 19:01 ` Martin J. Bligh
2004-04-12 19:10 ` Hugh Dickins
1 sibling, 1 reply; 38+ messages in thread
From: Martin J. Bligh @ 2004-04-12 19:01 UTC (permalink / raw)
To: Hugh Dickins; +Cc: linux-kernel, Andrew Morton, Rajesh Venkatasubramanian
> On Mon, 12 Apr 2004, Martin J. Bligh wrote:
>> >> Any chance of you doing the same comparison between 2.6.5-aa5
>> >> and 2.6.5-aa5 minus prio-tree?
>>
>> Hmm. it's halfway between the two. There does seem to be less sem
>> contention, though the profile ticks in __down isn't really an accurate
>> measure.
>
> Thanks a lot, Martin (despite my silence, I really am listening and
> pondering intermittently on this). So, -aa5 shows high __down count
> too, may not be any kind of accurate measure, but it's surely not good.
>
> Mainly I'm concentrating on getting ready the next few patches
> of the objrmap set for Andrew (not to be sent for a day or two).
> Unless we see a plausible way forward on your SDET numbers, I
> think it casts this project in doubt - but even so I do need
> to focus or I'll just mess them up.
>
> Seems as if prio tree has too high a cost there, yet we believe we
> need it to handle the corner cases of objrmap.
I'm still not sure those corner cases actually occur now that we have
Oracle using remap_file_pages. But Andrew mentioned some uml thing
recently - not sure if that workload affects this or not.
> What I want to investigate, when I'm far enough done, is the effect
> of restoring i_shared_sem to the i_shared_lock it was before 2.5.57.
> My fantasy is that your SDET would behave much more stably without
> that as a semaphore, that prio tree just pushes it over your cliff.
Yeah, to be honest, maybe SDET is kind of bolloxed anyway by i_shared_sem.
It needs a better fix. This probably shouldn't hold up objrmap, but it
would be nice to have a think about ;-)
> It is easier to insert and remove vmas in the list than the tree,
> and you can get away with leaving them in place quite often with
> the list.
>
> (Expect me to shout excitedly "Hey, the __down count has gone right
> down, that proves I'm right!")
>
> i_shared_lock changed to i_shared_sem to allow that cond_resched_lock
> in unmap_vmas to solve vmtruncate latency problems? With i_mmap and
> i_mmap_shared as lists, isn't it easy to insert a dummy marker vma
> and drop the lock if we need resched? Resuming from marker after.
>
> But, sadly, I doubt that can be done with the prio tree: Rajesh?
If it were just a list, maybe RCU would be appropriate. It might be
rather write-heavy though ? I think I played with an rwsem instead
of a sem in the past too (though be careful if you try this, as for
no good reason the return codes are inverted ;-()
M.
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH] anobjrmap 9 priority mjb tree
2004-04-12 19:01 ` Martin J. Bligh
@ 2004-04-12 19:10 ` Hugh Dickins
2004-04-12 19:38 ` Rajesh Venkatasubramanian
0 siblings, 1 reply; 38+ messages in thread
From: Hugh Dickins @ 2004-04-12 19:10 UTC (permalink / raw)
To: Martin J. Bligh; +Cc: linux-kernel, Andrew Morton, Rajesh Venkatasubramanian
On Mon, 12 Apr 2004, Martin J. Bligh wrote:
>
> If it were just a list, maybe RCU would be appropriate. It might be
> rather write-heavy though ? I think I played with an rwsem instead
> of a sem in the past too (though be careful if you try this, as for
> no good reason the return codes are inverted ;-()
Yes, I think all the common paths have to write, in case the
uncommon paths (truncation and swapout) want to read: the wrong
way round for any kind of read-write optimization, isn't it?
Hugh
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH] anobjrmap 9 priority mjb tree
2004-04-12 19:10 ` Hugh Dickins
@ 2004-04-12 19:38 ` Rajesh Venkatasubramanian
2004-04-12 21:14 ` Martin J. Bligh
0 siblings, 1 reply; 38+ messages in thread
From: Rajesh Venkatasubramanian @ 2004-04-12 19:38 UTC (permalink / raw)
To: Hugh Dickins; +Cc: Martin J. Bligh, linux-kernel, Andrew Morton
On Mon, 12 Apr 2004, Hugh Dickins wrote:
> On Mon, 12 Apr 2004, Martin J. Bligh wrote:
> >
> > If it were just a list, maybe RCU would be appropriate. It might be
> > rather write-heavy though ? I think I played with an rwsem instead
> > of a sem in the past too (though be careful if you try this, as for
> > no good reason the return codes are inverted ;-()
>
> Yes, I think all the common paths have to write, in case the
> uncommon paths (truncation and swapout) want to read: the wrong
> way round for any kind of read-write optimization, isn't it?
In common workloads e.g., add libc mapping using __vma_prio_tree_insert,
mostly you do not add new nodes to the tree. Instead, you just add to
a vm_set list. I am currently considering using rwsem to optimize
such cases. Similarly __vma_prio_tree_remove can also be optimized
in some common cases. I don't know whether it will help. Let us see...
Rajesh
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH] anobjrmap 9 priority mjb tree
2004-04-12 19:38 ` Rajesh Venkatasubramanian
@ 2004-04-12 21:14 ` Martin J. Bligh
2004-04-12 21:12 ` Andrew Morton
2004-04-14 20:18 ` Rajesh Venkatasubramanian
0 siblings, 2 replies; 38+ messages in thread
From: Martin J. Bligh @ 2004-04-12 21:14 UTC (permalink / raw)
To: Rajesh Venkatasubramanian, Hugh Dickins, Andrea Arcangeli
Cc: linux-kernel, Andrew Morton
> On Mon, 12 Apr 2004, Hugh Dickins wrote:
>> On Mon, 12 Apr 2004, Martin J. Bligh wrote:
>> >
>> > If it were just a list, maybe RCU would be appropriate. It might be
>> > rather write-heavy though ? I think I played with an rwsem instead
>> > of a sem in the past too (though be careful if you try this, as for
>> > no good reason the return codes are inverted ;-()
>>
>> Yes, I think all the common paths have to write, in case the
>> uncommon paths (truncation and swapout) want to read: the wrong
>> way round for any kind of read-write optimization, isn't it?
But isn't objrmap a big read case? ;-)
> In common workloads e.g., add libc mapping using __vma_prio_tree_insert,
> mostly you do not add new nodes to the tree. Instead, you just add to
> a vm_set list. I am currently considering using rwsem to optimize
> such cases. Similarly __vma_prio_tree_remove can also be optimized
> in some common cases. I don't know whether it will help. Let us see...
Sounds interesting ... so basically you're breaking out the locking of
the tree itself separately?
M.
PS. In the diffprofiles, I observed that Andrea had killed one of the large
remaining lock entries (.text.lock.filemap). Turns out he'd turned the
locking in find_get_page from "spin_lock(&mapping->page_lock)" into
"spin_lock_irq(&mapping->tree_lock)", and I'm using readprofile, which
doesn't profile with irqs off, so it's not really disappeared, just hidden.
Not sure which sub-patch that comes from, and it turned out to be a bit of
a dead end, but whilst I'm there, I thought I'd point out this was contended,
and show the diffprofile with and without spinline for aa5:
22210 246777.8% find_trylock_page
2538 36.4% atomic_dec_and_lock
1249 146.6% grab_block
1042 99.6% kmap_high
882 29400.0% find_get_pages
868 69.1% file_kill
744 30.9% file_move
499 236.5% proc_pid_readlink
433 82.8% d_instantiate
389 110.2% kunmap_high
319 52.4% ext2_new_block
303 27.2% d_alloc
220 44.9% prune_dcache
206 3.1% __wake_up
195 26.4% new_inode
194 71.6% d_delete
161 33.5% d_path
146 53.9% group_reserve_blocks
124 11.4% __mark_inode_dirty
117 13.9% __find_get_block_slow
116 45.7% __insert_inode_hash
113 8.3% page_address
106 5.0% proc_pid_stat
...
-216 -100.0% .text.lock.namespace
-244 -1.1% __down
-352 -100.0% .text.lock.inode
-684 -100.0% .text.lock.base
-887 -96.3% find_get_pages_tag
-1269 -100.0% .text.lock.highmem
-1523 -100.0% .text.lock.file_table
-1535 -100.0% .text.lock.dcache
-1549 -0.2% total
-2834 -100.0% .text.lock.dec_and_lock
-2915 -0.6% default_idle
-21908 -99.8% find_get_page
(SDET 128 on the 16-way NUMA-Q).
(this basically shows who was taking the locks we see in profiles).
Still not quite sure why inlining the spinlocks did this, to be honest:
22210 246777.8% find_trylock_page
-21908 -99.8% find_get_page
as neither seems to call the other. Humpf.
^ permalink raw reply [flat|nested] 38+ messages in thread* Re: [PATCH] anobjrmap 9 priority mjb tree
2004-04-12 21:14 ` Martin J. Bligh
@ 2004-04-12 21:12 ` Andrew Morton
2004-04-12 21:43 ` Martin J. Bligh
2004-04-14 20:18 ` Rajesh Venkatasubramanian
1 sibling, 1 reply; 38+ messages in thread
From: Andrew Morton @ 2004-04-12 21:12 UTC (permalink / raw)
To: Martin J. Bligh; +Cc: vrajesh, hugh, andrea, linux-kernel
"Martin J. Bligh" <mbligh@aracnet.com> wrote:
>
> Turns out he'd turned the
> locking in find_get_page from "spin_lock(&mapping->page_lock)" into
> "spin_lock_irq(&mapping->tree_lock)",
That's from the use-radix-tree-walks-for-writeback code.
Use oprofile - it's NMI-based.
> and I'm using readprofile, which
> doesn't profile with irqs off, so it's not really disappeared, just hidden.
> Not sure which sub-patch that comes from, and it turned out to be a bit of
> a dead end, but whilst I'm there, I thought I'd point out this was contended,
> and show the diffprofile with and without spinline for aa5:
>
> 22210 246777.8% find_trylock_page
> 2538 36.4% atomic_dec_and_lock
profiler brokenness, surely. Almost nothing calls find_trylock_page(),
unless Andrea has done something peculiar. Use oprofile.
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH] anobjrmap 9 priority mjb tree
2004-04-12 21:12 ` Andrew Morton
@ 2004-04-12 21:43 ` Martin J. Bligh
0 siblings, 0 replies; 38+ messages in thread
From: Martin J. Bligh @ 2004-04-12 21:43 UTC (permalink / raw)
To: Andrew Morton; +Cc: vrajesh, hugh, andrea, linux-kernel
--On Monday, April 12, 2004 14:12:44 -0700 Andrew Morton <akpm@osdl.org> wrote:
> "Martin J. Bligh" <mbligh@aracnet.com> wrote:
>>
>> Turns out he'd turned the
>> locking in find_get_page from "spin_lock(&mapping->page_lock)" into
>> "spin_lock_irq(&mapping->tree_lock)",
>
> That's from the use-radix-tree-walks-for-writeback code.
>
> Use oprofile - it's NMI-based.
>
>> and I'm using readprofile, which
>> doesn't profile with irqs off, so it's not really disappeared, just hidden.
>> Not sure which sub-patch that comes from, and it turned out to be a bit of
>> a dead end, but whilst I'm there, I thought I'd point out this was contended,
>> and show the diffprofile with and without spinline for aa5:
>>
>> 22210 246777.8% find_trylock_page
>> 2538 36.4% atomic_dec_and_lock
>
> profiler brokenness, surely. Almost nothing calls find_trylock_page(),
> unless Andrea has done something peculiar. Use oprofile.
Well, he did do this:
@@ -413,11 +412,11 @@ struct page *find_trylock_page(struct ad
{
struct page *page;
- spin_lock(&mapping->page_lock);
+ spin_lock_irq(&mapping->tree_lock);
page = radix_tree_lookup(&mapping->page_tree, offset);
if (page && TestSetPageLocked(page))
page = NULL;
- spin_unlock(&mapping->page_lock);
+ spin_unlock_irq(&mapping->tree_lock);
return page;
}
Which would stop it appearing in readprofile. But why spinlock inlining
should affect that one way or the other is beyond me. I'll see about
using oprofile, but it's not a trivial conversion (it's all scripted).
There's no other occurences of that in his patchset. But you're right,
only xfs, and free_swap_and_cache seem to use it, and I'm not swapping.
M.
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH] anobjrmap 9 priority mjb tree
2004-04-12 21:14 ` Martin J. Bligh
2004-04-12 21:12 ` Andrew Morton
@ 2004-04-14 20:18 ` Rajesh Venkatasubramanian
2004-04-15 0:05 ` Andrea Arcangeli
1 sibling, 1 reply; 38+ messages in thread
From: Rajesh Venkatasubramanian @ 2004-04-14 20:18 UTC (permalink / raw)
To: Martin J. Bligh
Cc: Hugh Dickins, Andrea Arcangeli, linux-kernel, Andrew Morton
This patch is another attempt at reducing the contention on i_shared_sem.
The patch converts i_shared_sem from normal semaphore to read-write
semaphore. The locking rules used are:
1) A prio_tree cannot be modified without holding write lock.
2) However, vmas can be added and removed from a vm_set list
by just holding the read lock and a bit lock (vm_set_lock)
in the corresponding prio_tree node.
3) All objrmap functions just hold read lock now. So when we
walk a vm_set list we have to hold the corresponding
vm_set_lock.
4) Since truncate uses write lock (provides exclusion) we don't
have to take vm_set_locks.
Martin! When you get time to test your SDET with this patch, please
let me know whether this patch helps you at all. The patch applies
on top of 2.6.5-mjb1+anobjrmap9_prio_tree.
fs/hugetlbfs/inode.c | 4 -
fs/inode.c | 2
include/linux/fs.h | 2
include/linux/mm.h | 127 ++++++++++++++++++++++++++++++++++++++++++++--
include/linux/prio_tree.h | 3 +
kernel/fork.c | 34 +++++++++++-
mm/fremap.c | 4 -
mm/memory.c | 4 -
mm/mmap.c | 120 ++++++++++++++++++++++++++++++++-----------
mm/mremap.c | 8 +-
mm/prio_tree.c | 117 ++++++++++++++++++++++++++++--------------
mm/rmap.c | 46 ++++++++++------
12 files changed, 365 insertions(+), 106 deletions(-)
diff -puN fs/hugetlbfs/inode.c~110_sem_contention fs/hugetlbfs/inode.c
--- mmlinux-2.6/fs/hugetlbfs/inode.c~110_sem_contention 2004-04-14 15:49:01.000000000 -0400
+++ mmlinux-2.6-jaya/fs/hugetlbfs/inode.c 2004-04-14 15:49:01.000000000 -0400
@@ -325,14 +325,14 @@ static int hugetlb_vmtruncate(struct ino
pgoff = offset >> HPAGE_SHIFT;
inode->i_size = offset;
- down(&mapping->i_shared_sem);
+ down_write(&mapping->i_shared_sem);
/* Protect against page fault */
atomic_inc(&mapping->truncate_count);
if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
if (unlikely(!prio_tree_empty(&mapping->i_mmap_shared)))
hugetlb_vmtruncate_list(&mapping->i_mmap_shared, pgoff);
- up(&mapping->i_shared_sem);
+ up_write(&mapping->i_shared_sem);
truncate_hugepages(mapping, offset);
return 0;
}
diff -puN fs/inode.c~110_sem_contention fs/inode.c
--- mmlinux-2.6/fs/inode.c~110_sem_contention 2004-04-14 15:49:01.000000000 -0400
+++ mmlinux-2.6-jaya/fs/inode.c 2004-04-14 15:49:01.000000000 -0400
@@ -185,7 +185,7 @@ void inode_init_once(struct inode *inode
sema_init(&inode->i_sem, 1);
INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
spin_lock_init(&inode->i_data.page_lock);
- init_MUTEX(&inode->i_data.i_shared_sem);
+ init_rwsem(&inode->i_data.i_shared_sem);
atomic_set(&inode->i_data.truncate_count, 0);
INIT_LIST_HEAD(&inode->i_data.private_list);
spin_lock_init(&inode->i_data.private_lock);
diff -puN include/linux/fs.h~110_sem_contention include/linux/fs.h
--- mmlinux-2.6/include/linux/fs.h~110_sem_contention 2004-04-14 15:49:01.000000000 -0400
+++ mmlinux-2.6-jaya/include/linux/fs.h 2004-04-14 15:49:01.000000000 -0400
@@ -333,7 +333,7 @@ struct address_space {
struct prio_tree_root i_mmap; /* tree of private mappings */
struct prio_tree_root i_mmap_shared; /* tree of shared mappings */
struct list_head i_mmap_nonlinear;/*list of nonlinear mappings */
- struct semaphore i_shared_sem; /* protect both above lists */
+ struct rw_semaphore i_shared_sem; /* protect both above lists */
atomic_t truncate_count; /* Cover race condition with truncate */
unsigned long flags; /* error bits/gfp mask */
struct backing_dev_info *backing_dev_info; /* device readahead, etc */
diff -puN include/linux/mm.h~110_sem_contention include/linux/mm.h
--- mmlinux-2.6/include/linux/mm.h~110_sem_contention 2004-04-14 15:49:01.000000000 -0400
+++ mmlinux-2.6-jaya/include/linux/mm.h 2004-04-14 15:49:01.000000000 -0400
@@ -87,6 +87,10 @@ struct vm_area_struct {
/*
* shared.vm_set : list of vmas that map exactly the same set of pages
* vm_set_head : head of the vm_set list
+ *
+ * Both shared.vm_set.list and vm_set_head are protected by VM_SET_LOCK
+ * bit of the corresponding tree node's vm_flags when accessed under
+ * down_read(i_shared_sem)
*/
struct vm_area_struct *vm_set_head;
@@ -133,6 +137,8 @@ struct vm_area_struct {
#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */
#define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */
+#define VM_SET_LOCK 24 /* Lock bit for vm_set list, head */
+
/* It makes sense to apply VM_ACCOUNT to this vma. */
#define VM_MAYACCT(vma) (!!((vma)->vm_flags & VM_HUGETLB))
@@ -156,6 +162,13 @@ struct vm_area_struct {
* The following macros are used for implementing prio_tree for i_mmap{_shared}
*/
+#define vm_set_lock(vma) bit_spin_lock(VM_SET_LOCK, \
+ (unsigned long *)&(vma->vm_flags))
+#define vm_set_trylock(vma) bit_spin_trylock(VM_SET_LOCK, \
+ (unsigned long *)&(vma->vm_flags))
+#define vm_set_unlock(vma) bit_spin_unlock(VM_SET_LOCK, \
+ (unsigned long *)&(vma->vm_flags))
+
#define RADIX_INDEX(vma) ((vma)->vm_pgoff)
#define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT)
/* avoid overflow */
@@ -202,7 +215,8 @@ static inline int vma_shared_empty(struc
/*
* Helps to add a new vma that maps the same (identical) set of pages as the
- * old vma to an i_mmap tree.
+ * old vma to an i_mmap tree. No new tree node is added by this function.
+ * The new vma is added to an already existing tree node's vm_set list.
*/
static inline void __vma_prio_tree_add(struct vm_area_struct *vma,
struct vm_area_struct *old)
@@ -229,6 +243,37 @@ static inline void __vma_prio_tree_add(s
}
/*
+ * Delete a vm_set list node from an i_mmap tree. Note that this function
+ * should not be called with a tree node, i.e., shared.both.parent != NULL.
+ */
+static inline void __vma_prio_tree_del(struct vm_area_struct *vma)
+{
+ /* Leave this BUG_ON till prio_tree patch stabilizes */
+ BUG_ON(vma->shared.both.parent);
+
+ if (vma->vm_set_head) {
+ struct vm_area_struct *tree_node, *new_head;
+ /* Leave this BUG_ON till prio_tree patch stabilizes */
+ BUG_ON(vma->vm_set_head->vm_set_head != vma);
+ tree_node = vma->vm_set_head;
+ if (!list_empty(&vma->shared.vm_set.list)) {
+ new_head = list_entry(
+ vma->shared.vm_set.list.next,
+ struct vm_area_struct,
+ shared.vm_set.list);
+ list_del_init(&vma->shared.vm_set.list);
+ tree_node->vm_set_head = new_head;
+ new_head->vm_set_head = tree_node;
+ }
+ else
+ tree_node->vm_set_head = NULL;
+ } else
+ list_del_init(&vma->shared.vm_set.list);
+
+ INIT_VMA_SHARED(vma);
+}
+
+/*
* We cannot modify vm_start, vm_end, vm_pgoff fields of a vma that has been
* already present in an i_mmap{_shared} tree without modifying the tree. The
* following helper function should be used when such modifications are
@@ -250,6 +295,25 @@ static inline void __vma_modify(struct p
}
/*
+ * Find a vma with given radix_index and heap_index in the prio_tree. Return
+ * the vma pointer if found, NULL otherwise.
+ */
+static inline struct vm_area_struct *__vma_prio_tree_find(
+ struct prio_tree_root *root, unsigned long radix_index,
+ unsigned long heap_index)
+{
+ struct prio_tree_node *ptr;
+
+ ptr = prio_tree_find(root, radix_index, heap_index);
+
+ if (ptr)
+ return prio_tree_entry(ptr, struct vm_area_struct,
+ shared.prio_tree_node);
+ else
+ return NULL;
+}
+
+/*
* Helper functions to enumerate vmas that map a given file page or a set of
* contiguous file pages. The functions return vmas that at least map a single
* page in the given range of contiguous file pages.
@@ -269,11 +333,19 @@ static inline struct vm_area_struct *__v
return NULL;
}
-static inline struct vm_area_struct *__vma_prio_tree_next(
- struct vm_area_struct *vma, struct prio_tree_root *root,
- struct prio_tree_iter *iter, unsigned long begin, unsigned long end)
+static inline struct vm_area_struct *__vma_prio_tree_first_lock(
+ struct prio_tree_root *root, struct prio_tree_iter *iter,
+ unsigned long begin, unsigned long end)
+{
+ struct vm_area_struct *vma;
+ vma = __vma_prio_tree_first(root, iter, begin, end);
+ if (vma)
+ vm_set_lock(vma);
+ return vma;
+}
+
+static inline struct vm_area_struct *__vm_set_next(struct vm_area_struct *vma)
{
- struct prio_tree_node *ptr;
struct vm_area_struct *next;
if (vma->shared.both.parent) {
@@ -286,6 +358,19 @@ static inline struct vm_area_struct *__v
if (!(next->vm_set_head))
return next;
}
+ return NULL;
+}
+
+static inline struct vm_area_struct *__vma_prio_tree_next(
+ struct vm_area_struct *vma, struct prio_tree_root *root,
+ struct prio_tree_iter *iter, unsigned long begin, unsigned long end)
+{
+ struct prio_tree_node *ptr;
+ struct vm_area_struct *next;
+
+ next = __vm_set_next(vma);
+ if (next)
+ return next;
ptr = prio_tree_next(root, iter, begin, end);
@@ -296,6 +381,38 @@ static inline struct vm_area_struct *__v
return NULL;
}
+static inline void __vma_prio_tree_iter_unlock(struct prio_tree_iter *iter)
+{
+ struct vm_area_struct *vma;
+ vma = prio_tree_entry(iter->cur, struct vm_area_struct,
+ shared.prio_tree_node);
+ vm_set_unlock(vma);
+}
+
+static inline struct vm_area_struct *__vma_prio_tree_next_lock(
+ struct vm_area_struct *vma, struct prio_tree_root *root,
+ struct prio_tree_iter *iter, unsigned long begin, unsigned long end)
+{
+ struct prio_tree_node *ptr;
+ struct vm_area_struct *next;
+
+ next = __vm_set_next(vma);
+ if (next)
+ return next;
+
+ __vma_prio_tree_iter_unlock(iter);
+ ptr = prio_tree_next(root, iter, begin, end);
+
+ if (ptr) {
+ next = prio_tree_entry(ptr, struct vm_area_struct,
+ shared.prio_tree_node);
+ vm_set_lock(next);
+ return next;
+ } else
+ return NULL;
+}
+
+
/*
* mapping from the currently active vm_flags protection bits (the
* low four bits) to a page protection mask..
diff -puN include/linux/prio_tree.h~110_sem_contention include/linux/prio_tree.h
--- mmlinux-2.6/include/linux/prio_tree.h~110_sem_contention 2004-04-14 15:49:01.000000000 -0400
+++ mmlinux-2.6-jaya/include/linux/prio_tree.h 2004-04-14 15:49:01.000000000 -0400
@@ -67,6 +67,9 @@ static inline int prio_tree_right_empty(
extern struct prio_tree_node *prio_tree_insert(struct prio_tree_root *,
struct prio_tree_node *);
+extern struct prio_tree_node *prio_tree_find(struct prio_tree_root *,
+ unsigned long, unsigned long);
+
extern void prio_tree_remove(struct prio_tree_root *, struct prio_tree_node *);
extern struct prio_tree_node *prio_tree_first(struct prio_tree_root *,
diff -puN kernel/fork.c~110_sem_contention kernel/fork.c
--- mmlinux-2.6/kernel/fork.c~110_sem_contention 2004-04-14 15:49:01.000000000 -0400
+++ mmlinux-2.6-jaya/kernel/fork.c 2004-04-14 15:49:01.000000000 -0400
@@ -326,15 +326,43 @@ static inline int dup_mmap(struct mm_str
file = tmp->vm_file;
INIT_VMA_SHARED(tmp);
if (file) {
+ struct address_space *mapping = file->f_mapping;
struct inode *inode = file->f_dentry->d_inode;
get_file(file);
if (tmp->vm_flags & VM_DENYWRITE)
atomic_dec(&inode->i_writecount);
/* insert tmp into the share list, just after mpnt */
- down(&file->f_mapping->i_shared_sem);
- __vma_prio_tree_add(tmp, mpnt);
- up(&file->f_mapping->i_shared_sem);
+ if (down_write_trylock(&mapping->i_shared_sem)) {
+ __vma_prio_tree_add(tmp, mpnt);
+ up_write(&mapping->i_shared_sem);
+ }
+ else {
+ if (unlikely(mpnt->vm_flags & VM_NONLINEAR)) {
+ down_write(&mapping->i_shared_sem);
+ list_add(&tmp->shared.vm_set.list,
+ &mpnt->shared.vm_set.list);
+ up_write(&mapping->i_shared_sem);
+ }
+ else {
+ struct vm_area_struct *tree_node;
+ struct prio_tree_root *root;
+ if (mpnt->vm_flags & VM_SHARED)
+ root = &mapping->i_mmap_shared;
+ else
+ root = &mapping->i_mmap;
+
+ down_read(&mapping->i_shared_sem);
+ tree_node = __vma_prio_tree_find(root,
+ RADIX_INDEX(mpnt),
+ HEAP_INDEX(mpnt));
+ BUG_ON(!tree_node);
+ vm_set_lock(tree_node);
+ __vma_prio_tree_add(tmp, mpnt);
+ vm_set_unlock(tree_node);
+ up_read(&mapping->i_shared_sem);
+ }
+ }
}
/*
diff -puN mm/fremap.c~110_sem_contention mm/fremap.c
--- mmlinux-2.6/mm/fremap.c~110_sem_contention 2004-04-14 15:49:01.000000000 -0400
+++ mmlinux-2.6-jaya/mm/fremap.c 2004-04-14 15:49:01.000000000 -0400
@@ -203,13 +203,13 @@ asmlinkage long sys_remap_file_pages(uns
linear_pgoff += ((start - vma->vm_start) >> PAGE_SHIFT);
if (pgoff != linear_pgoff && !(vma->vm_flags & VM_NONLINEAR)) {
mapping = vma->vm_file->f_mapping;
- down(&mapping->i_shared_sem);
+ down_write(&mapping->i_shared_sem);
vma->vm_flags |= VM_NONLINEAR;
__vma_prio_tree_remove(&mapping->i_mmap_shared, vma);
INIT_VMA_SHARED_LIST(vma);
list_add_tail(&vma->shared.vm_set.list,
&mapping->i_mmap_nonlinear);
- up(&mapping->i_shared_sem);
+ up_write(&mapping->i_shared_sem);
}
/* ->populate can take a long time, so downgrade the lock. */
diff -puN mm/memory.c~110_sem_contention mm/memory.c
--- mmlinux-2.6/mm/memory.c~110_sem_contention 2004-04-14 15:49:01.000000000 -0400
+++ mmlinux-2.6-jaya/mm/memory.c 2004-04-14 15:49:01.000000000 -0400
@@ -1133,14 +1133,14 @@ void invalidate_mmap_range(struct addres
if (holeend & ~(long long)ULONG_MAX)
hlen = ULONG_MAX - hba + 1;
}
- down(&mapping->i_shared_sem);
+ down_write(&mapping->i_shared_sem);
/* Protect against page fault */
atomic_inc(&mapping->truncate_count);
if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
invalidate_mmap_range_list(&mapping->i_mmap, hba, hlen);
if (unlikely(!prio_tree_empty(&mapping->i_mmap_shared)))
invalidate_mmap_range_list(&mapping->i_mmap_shared, hba, hlen);
- up(&mapping->i_shared_sem);
+ up_write(&mapping->i_shared_sem);
}
EXPORT_SYMBOL_GPL(invalidate_mmap_range);
diff -puN mm/mmap.c~110_sem_contention mm/mmap.c
--- mmlinux-2.6/mm/mmap.c~110_sem_contention 2004-04-14 15:49:01.000000000 -0400
+++ mmlinux-2.6-jaya/mm/mmap.c 2004-04-14 15:49:01.000000000 -0400
@@ -96,10 +96,43 @@ static void remove_shared_vm_struct(stru
if (file) {
struct address_space *mapping = file->f_mapping;
- down(&mapping->i_shared_sem);
- __remove_shared_vm_struct(vma, file->f_dentry->d_inode,
- mapping);
- up(&mapping->i_shared_sem);
+ struct inode *inode = file->f_dentry->d_inode;
+
+ if (down_write_trylock(&mapping->i_shared_sem)) {
+ __remove_shared_vm_struct(vma, inode, mapping);
+ up_write(&mapping->i_shared_sem);
+ return;
+ }
+
+ if (likely(!(vma->vm_flags & VM_NONLINEAR) &&
+ !vma->shared.both.parent)) {
+ struct prio_tree_root *root;
+ struct vm_area_struct *tree_node;
+ if (vma->vm_flags & VM_SHARED)
+ root = &mapping->i_mmap_shared;
+ else
+ root = &mapping->i_mmap;
+
+ down_read(&mapping->i_shared_sem);
+ if (unlikely(vma->shared.both.parent)) {
+ up_read(&mapping->i_shared_sem);
+ goto get_write;
+ }
+ tree_node = __vma_prio_tree_find(root,
+ RADIX_INDEX(vma), HEAP_INDEX(vma));
+ BUG_ON(!tree_node);
+ vm_set_lock(tree_node);
+ if (vma->vm_flags & VM_DENYWRITE)
+ atomic_inc(&inode->i_writecount);
+ __vma_prio_tree_del(vma);
+ vm_set_unlock(tree_node);
+ up_read(&mapping->i_shared_sem);
+ return;
+ }
+get_write:
+ down_write(&mapping->i_shared_sem);
+ __remove_shared_vm_struct(vma, inode, mapping);
+ up_write(&mapping->i_shared_sem);
}
}
@@ -291,26 +324,58 @@ __vma_link(struct mm_struct *mm, struct
{
__vma_link_list(mm, vma, prev, rb_parent);
__vma_link_rb(mm, vma, rb_link, rb_parent);
- __vma_link_file(vma);
}
static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct rb_node **rb_link,
struct rb_node *rb_parent)
{
+ struct prio_tree_root *root;
+ struct vm_area_struct *tree_node;
struct address_space *mapping = NULL;
if (vma->vm_file)
mapping = vma->vm_file->f_mapping;
- if (mapping)
- down(&mapping->i_shared_sem);
- spin_lock(&mm->page_table_lock);
- __vma_link(mm, vma, prev, rb_link, rb_parent);
- spin_unlock(&mm->page_table_lock);
- if (mapping)
- up(&mapping->i_shared_sem);
+ if (mapping) {
+ if (unlikely(vma->vm_flags & VM_NONLINEAR))
+ goto get_write;
+ if (vma->vm_flags & VM_SHARED)
+ root = &mapping->i_mmap_shared;
+ else
+ root = &mapping->i_mmap;
+ down_read(&mapping->i_shared_sem);
+ tree_node = __vma_prio_tree_find(root,
+ RADIX_INDEX(vma), HEAP_INDEX(vma));
+ if (tree_node) {
+ struct inode *inode = vma->vm_file->f_dentry->d_inode;
+ if (vma->vm_flags & VM_DENYWRITE)
+ atomic_dec(&inode->i_writecount);
+ spin_lock(&mm->page_table_lock);
+ __vma_link(mm, vma, prev, rb_link, rb_parent);
+ spin_unlock(&mm->page_table_lock);
+ vm_set_lock(tree_node);
+ __vma_prio_tree_add(vma, tree_node);
+ vm_set_unlock(tree_node);
+ up_read(&mapping->i_shared_sem);
+ }
+ else {
+ up_read(&mapping->i_shared_sem);
+get_write:
+ down_write(&mapping->i_shared_sem);
+ spin_lock(&mm->page_table_lock);
+ __vma_link(mm, vma, prev, rb_link, rb_parent);
+ __vma_link_file(vma);
+ spin_unlock(&mm->page_table_lock);
+ up_write(&mapping->i_shared_sem);
+ }
+ }
+ else {
+ spin_lock(&mm->page_table_lock);
+ __vma_link(mm, vma, prev, rb_link, rb_parent);
+ spin_unlock(&mm->page_table_lock);
+ }
mark_mm_hugetlb(mm, vma);
mm->map_count++;
validate_mm(mm);
@@ -331,6 +396,7 @@ __insert_vm_struct(struct mm_struct * mm
if (__vma && __vma->vm_start < vma->vm_end)
BUG();
__vma_link(mm, vma, prev, rb_link, rb_parent);
+ __vma_link_file(vma);
mark_mm_hugetlb(mm, vma);
mm->map_count++;
validate_mm(mm);
@@ -410,7 +476,7 @@ static struct vm_area_struct *vma_merge(
spinlock_t *lock = &mm->page_table_lock;
struct inode *inode = file ? file->f_dentry->d_inode : NULL;
struct address_space *mapping = file ? file->f_mapping : NULL;
- struct semaphore *i_shared_sem;
+ struct rw_semaphore *i_shared_sem;
struct prio_tree_root *root = NULL;
/*
@@ -442,13 +508,9 @@ static struct vm_area_struct *vma_merge(
if (prev->vm_end == addr &&
can_vma_merge_after(prev, vm_flags, file, pgoff)) {
struct vm_area_struct *next;
- int need_up = 0;
- if (unlikely(file && prev->vm_next &&
- prev->vm_next->vm_file == file)) {
- down(i_shared_sem);
- need_up = 1;
- }
+ if (file)
+ down_write(i_shared_sem);
spin_lock(lock);
/*
@@ -463,20 +525,18 @@ static struct vm_area_struct *vma_merge(
next->vm_end, prev->vm_pgoff);
__remove_shared_vm_struct(next, inode, mapping);
spin_unlock(lock);
- if (need_up)
- up(i_shared_sem);
- if (file)
+ if (file) {
+ up_write(i_shared_sem);
fput(file);
-
+ }
mm->map_count--;
kmem_cache_free(vm_area_cachep, next);
return prev;
}
-
__vma_modify(root, prev, prev->vm_start, end, prev->vm_pgoff);
spin_unlock(lock);
- if (need_up)
- up(i_shared_sem);
+ if (file)
+ up_write(i_shared_sem);
return prev;
}
@@ -491,13 +551,13 @@ static struct vm_area_struct *vma_merge(
return NULL;
if (end == prev->vm_start) {
if (file)
- down(i_shared_sem);
+ down_write(i_shared_sem);
spin_lock(lock);
__vma_modify(root, prev, addr, prev->vm_end,
prev->vm_pgoff - ((end - addr) >> PAGE_SHIFT));
spin_unlock(lock);
if (file)
- up(i_shared_sem);
+ up_write(i_shared_sem);
return prev;
}
}
@@ -1362,7 +1422,7 @@ int split_vma(struct mm_struct * mm, str
}
if (mapping)
- down(&mapping->i_shared_sem);
+ down_write(&mapping->i_shared_sem);
spin_lock(&mm->page_table_lock);
if (new_below)
@@ -1375,7 +1435,7 @@ int split_vma(struct mm_struct * mm, str
spin_unlock(&mm->page_table_lock);
if (mapping)
- up(&mapping->i_shared_sem);
+ up_write(&mapping->i_shared_sem);
return 0;
}
diff -puN mm/mremap.c~110_sem_contention mm/mremap.c
--- mmlinux-2.6/mm/mremap.c~110_sem_contention 2004-04-14 15:49:01.000000000 -0400
+++ mmlinux-2.6-jaya/mm/mremap.c 2004-04-14 15:49:01.000000000 -0400
@@ -295,7 +295,7 @@ static unsigned long move_vma(struct vm_
* and we propagate stale pages into the dst afterward.
*/
mapping = vma->vm_file->f_mapping;
- down(&mapping->i_shared_sem);
+ down_read(&mapping->i_shared_sem);
}
moved_len = move_page_tables(vma, new_addr, old_addr, old_len);
if (moved_len < old_len) {
@@ -311,7 +311,7 @@ static unsigned long move_vma(struct vm_
new_addr = -ENOMEM;
}
if (mapping)
- up(&mapping->i_shared_sem);
+ up_read(&mapping->i_shared_sem);
/* Conceal VM_ACCOUNT so old reservation is not undone */
if (vm_flags & VM_ACCOUNT) {
@@ -476,7 +476,7 @@ unsigned long do_mremap(unsigned long ad
}
else
root = &mapping->i_mmap;
- down(&mapping->i_shared_sem);
+ down_write(&mapping->i_shared_sem);
}
spin_lock(&vma->vm_mm->page_table_lock);
@@ -485,7 +485,7 @@ unsigned long do_mremap(unsigned long ad
spin_unlock(&vma->vm_mm->page_table_lock);
if(mapping)
- up(&mapping->i_shared_sem);
+ up_write(&mapping->i_shared_sem);
current->mm->total_vm += pages;
if (vma->vm_flags & VM_LOCKED) {
diff -puN mm/prio_tree.c~110_sem_contention mm/prio_tree.c
--- mmlinux-2.6/mm/prio_tree.c~110_sem_contention 2004-04-14 15:49:01.000000000 -0400
+++ mmlinux-2.6-jaya/mm/prio_tree.c 2004-04-14 15:49:01.000000000 -0400
@@ -279,6 +279,66 @@ void prio_tree_remove(struct prio_tree_r
}
/*
+ * Find a prio_tree node with the given radix_index and heap_index. This
+ * algorithm takes O(log n) time. At most 64 (less than 32 in common case)
+ * nodes are visited in a 32 bit machine.
+ */
+struct prio_tree_node *prio_tree_find(struct prio_tree_root *root,
+ unsigned long radix_index, unsigned long heap_index)
+{
+ struct prio_tree_node *cur;
+ unsigned long r_index, h_index, index, mask;
+ int size_flag = 0;
+
+ if (prio_tree_empty(root) ||
+ heap_index > prio_tree_maxindex(root->index_bits))
+ return NULL;
+
+ cur = root->prio_tree_node;
+ mask = 1UL << (root->index_bits - 1);
+
+ while (mask) {
+ GET_INDEX(cur, r_index, h_index);
+
+ if (r_index == radix_index && h_index == heap_index)
+ return cur;
+
+ if (h_index < heap_index || (h_index == heap_index &&
+ r_index > radix_index))
+ return NULL;
+
+ if (size_flag)
+ index = heap_index - radix_index;
+ else
+ index = radix_index;
+
+ if (index & mask) {
+ if (prio_tree_right_empty(cur))
+ return NULL;
+ else
+ cur = cur->right;
+ }
+ else {
+ if (prio_tree_left_empty(cur))
+ return NULL;
+ else
+ cur = cur->left;
+ }
+
+ mask >>= 1;
+
+ if (!mask) {
+ mask = 1UL << (root->index_bits - 1);
+ size_flag = 1;
+ }
+ }
+ /* Should not reach here */
+ BUG();
+ return NULL;
+}
+
+
+/*
* Following functions help to enumerate all prio_tree_nodes in the tree that
* overlap with the input interval X [radix_index, heap_index]. The enumeration
* takes O(log n + m) time where 'log n' is the height of the tree (which is
@@ -529,55 +589,34 @@ void __vma_prio_tree_insert(struct prio_
void __vma_prio_tree_remove(struct prio_tree_root *root,
struct vm_area_struct *vma)
{
- struct vm_area_struct *node, *head, *new_head;
+ struct vm_area_struct *head, *new_head;
- if (vma->shared.both.parent == NULL && vma->vm_set_head == NULL) {
- list_del_init(&vma->shared.vm_set.list);
- INIT_VMA_SHARED(vma);
+ if (!vma->shared.both.parent) {
+ __vma_prio_tree_del(vma);
return;
}
if (vma->vm_set_head) {
/* Leave this BUG_ON till prio_tree patch stabilizes */
BUG_ON(vma->vm_set_head->vm_set_head != vma);
- if (vma->shared.both.parent) {
- head = vma->vm_set_head;
- if (!list_empty(&head->shared.vm_set.list)) {
- new_head = list_entry(
- head->shared.vm_set.list.next,
- struct vm_area_struct,
- shared.vm_set.list);
- list_del_init(&head->shared.vm_set.list);
- }
- else
- new_head = NULL;
+ head = vma->vm_set_head;
+ if (!list_empty(&head->shared.vm_set.list)) {
+ new_head = list_entry(head->shared.vm_set.list.next,
+ struct vm_area_struct, shared.vm_set.list);
+ list_del_init(&head->shared.vm_set.list);
+ }
+ else
+ new_head = NULL;
- prio_tree_replace(root, &vma->shared.prio_tree_node,
- &head->shared.prio_tree_node);
- head->vm_set_head = new_head;
- if (new_head)
- new_head->vm_set_head = head;
+ prio_tree_replace(root, &vma->shared.prio_tree_node,
+ &head->shared.prio_tree_node);
+ head->vm_set_head = new_head;
+ if (new_head)
+ new_head->vm_set_head = head;
- }
- else {
- node = vma->vm_set_head;
- if (!list_empty(&vma->shared.vm_set.list)) {
- new_head = list_entry(
- vma->shared.vm_set.list.next,
- struct vm_area_struct,
- shared.vm_set.list);
- list_del_init(&vma->shared.vm_set.list);
- node->vm_set_head = new_head;
- new_head->vm_set_head = node;
- }
- else
- node->vm_set_head = NULL;
- }
- INIT_VMA_SHARED(vma);
- return;
- }
+ } else
+ prio_tree_remove(root, &vma->shared.prio_tree_node);
- prio_tree_remove(root, &vma->shared.prio_tree_node);
INIT_VMA_SHARED(vma);
}
diff -puN mm/rmap.c~110_sem_contention mm/rmap.c
--- mmlinux-2.6/mm/rmap.c~110_sem_contention 2004-04-14 15:49:01.000000000 -0400
+++ mmlinux-2.6-jaya/mm/rmap.c 2004-04-14 15:49:01.000000000 -0400
@@ -279,10 +279,10 @@ static inline int page_referenced_obj(st
unsigned long address;
int referenced = 0;
- if (down_trylock(&mapping->i_shared_sem))
+ if (!down_read_trylock(&mapping->i_shared_sem))
return 0;
- vma = __vma_prio_tree_first(&mapping->i_mmap,
+ vma = __vma_prio_tree_first_lock(&mapping->i_mmap,
&iter, pgoff, pgoff);
while (vma) {
if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
@@ -297,11 +297,11 @@ static inline int page_referenced_obj(st
if (!*mapcount)
goto out;
}
- vma = __vma_prio_tree_next(vma, &mapping->i_mmap,
+ vma = __vma_prio_tree_next_lock(vma, &mapping->i_mmap,
&iter, pgoff, pgoff);
}
- vma = __vma_prio_tree_first(&mapping->i_mmap_shared,
+ vma = __vma_prio_tree_first_lock(&mapping->i_mmap_shared,
&iter, pgoff, pgoff);
while (vma) {
if (vma->vm_flags & (VM_LOCKED|VM_RESERVED)) {
@@ -315,14 +315,17 @@ static inline int page_referenced_obj(st
if (!*mapcount)
goto out;
}
- vma = __vma_prio_tree_next(vma, &mapping->i_mmap_shared,
+ vma = __vma_prio_tree_next_lock(vma, &mapping->i_mmap_shared,
&iter, pgoff, pgoff);
}
-
+
if (list_empty(&mapping->i_mmap_nonlinear))
- WARN_ON(*mapcount > 0);
+ WARN_ON(*mapcount > 0);
+ up_read(&mapping->i_shared_sem);
+ return referenced;
out:
- up(&mapping->i_shared_sem);
+ __vma_prio_tree_iter_unlock(&iter);
+ up_read(&mapping->i_shared_sem);
return referenced;
}
@@ -711,10 +714,10 @@ static inline int try_to_unmap_obj(struc
unsigned long max_nl_cursor = 0;
unsigned long max_nl_size = 0;
- if (down_trylock(&mapping->i_shared_sem))
+ if (!down_read_trylock(&mapping->i_shared_sem))
return ret;
- vma = __vma_prio_tree_first(&mapping->i_mmap,
+ vma = __vma_prio_tree_first_lock(&mapping->i_mmap,
&iter, pgoff, pgoff);
while (vma) {
if (vma->vm_mm->rss) {
@@ -722,13 +725,13 @@ static inline int try_to_unmap_obj(struc
ret = try_to_unmap_one(
page, vma->vm_mm, address, mapcount, vma);
if (ret == SWAP_FAIL || !*mapcount)
- goto out;
+ goto out_read;
}
- vma = __vma_prio_tree_next(vma, &mapping->i_mmap,
+ vma = __vma_prio_tree_next_lock(vma, &mapping->i_mmap,
&iter, pgoff, pgoff);
}
- vma = __vma_prio_tree_first(&mapping->i_mmap_shared,
+ vma = __vma_prio_tree_first_lock(&mapping->i_mmap_shared,
&iter, pgoff, pgoff);
while (vma) {
if (vma->vm_mm->rss) {
@@ -736,14 +739,18 @@ static inline int try_to_unmap_obj(struc
ret = try_to_unmap_one(
page, vma->vm_mm, address, mapcount, vma);
if (ret == SWAP_FAIL || !*mapcount)
- goto out;
+ goto out_read;
}
- vma = __vma_prio_tree_next(vma, &mapping->i_mmap_shared,
+ vma = __vma_prio_tree_next_lock(vma, &mapping->i_mmap_shared,
&iter, pgoff, pgoff);
}
if (list_empty(&mapping->i_mmap_nonlinear))
- goto out;
+ goto nolock;
+
+ up_read(&mapping->i_shared_sem);
+ if (!down_write_trylock(&mapping->i_shared_sem))
+ return ret;
list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
shared.vm_set.list) {
@@ -813,7 +820,12 @@ static inline int try_to_unmap_obj(struc
relock:
rmap_lock(page);
out:
- up(&mapping->i_shared_sem);
+ up_write(&mapping->i_shared_sem);
+ return ret;
+out_read:
+ __vma_prio_tree_iter_unlock(&iter);
+nolock:
+ up_read(&mapping->i_shared_sem);
return ret;
}
_
^ permalink raw reply [flat|nested] 38+ messages in thread* Re: [PATCH] anobjrmap 9 priority mjb tree
2004-04-14 20:18 ` Rajesh Venkatasubramanian
@ 2004-04-15 0:05 ` Andrea Arcangeli
2004-04-15 0:22 ` Martin J. Bligh
2004-04-15 3:40 ` Rajesh Venkatasubramanian
0 siblings, 2 replies; 38+ messages in thread
From: Andrea Arcangeli @ 2004-04-15 0:05 UTC (permalink / raw)
To: Rajesh Venkatasubramanian
Cc: Martin J. Bligh, Hugh Dickins, linux-kernel, Andrew Morton
On Wed, Apr 14, 2004 at 04:18:38PM -0400, Rajesh Venkatasubramanian wrote:
>
> This patch is another attempt at reducing the contention on i_shared_sem.
> The patch converts i_shared_sem from normal semaphore to read-write
> semaphore. The locking rules used are:
>
> 1) A prio_tree cannot be modified without holding write lock.
> 2) However, vmas can be added and removed from a vm_set list
> by just holding the read lock and a bit lock (vm_set_lock)
> in the corresponding prio_tree node.
no way, you cannot bitflip vm_flags unless you own the mmap_sem, this
patch seems very broken to me, it should randomly corrupt memory in
vma->vm_flags while racing against mprotect etc.. or am I missing
something?
> 3) All objrmap functions just hold read lock now. So when we
> walk a vm_set list we have to hold the corresponding
> vm_set_lock.
> 4) Since truncate uses write lock (provides exclusion) we don't
> have to take vm_set_locks.
>
> Martin! When you get time to test your SDET with this patch, please
> let me know whether this patch helps you at all. The patch applies
> on top of 2.6.5-mjb1+anobjrmap9_prio_tree.
I considered converting it to a rwsem too, details are in the email
I posted while providing the rwspinlock solution to the parisc cache
flushing code.
As I wrote there, I wasn't convinced in the common case this is going to
gain anything significant (the only thing that sleeps while the
semaphore is held is truncate and truncate during paging on the same
inode isn't an extremely common case, especially for the big apps), and
it makes it a bit more complicated, but giving it a try will be
interesting. I was mostly interested about having the objrmap code very
rarely failing the trylock during paging (that semaphore is by far the
biggest scalability hit during paging of shm, but the cacheline bouncing
won't be avoided by the rwsem). To make the paging scale better
(something SDET cannot measure) I don't need a safe vm_set_lock, I
believe simply making it a rwsem is the way to go just to make the
paging potentially scale a bit better. I rated implementing the locking
abstraction to fixup the basic parisc race as a bit higher prio, after
that it should be easy to have it implementing a rwsem for all archs w/o
cache flushing, the abstraction will have to expose a read/write
functionality for the rwlock. I'm not convinced your double locking is
going to boost anything even if it would be safe, I'd just take it in
write mode when the tree is being modified, with the only object of
avoiding the paging to block (and potentially to avoid blocking against
big truncates too).
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH] anobjrmap 9 priority mjb tree
2004-04-15 0:05 ` Andrea Arcangeli
@ 2004-04-15 0:22 ` Martin J. Bligh
2004-04-15 3:40 ` Rajesh Venkatasubramanian
1 sibling, 0 replies; 38+ messages in thread
From: Martin J. Bligh @ 2004-04-15 0:22 UTC (permalink / raw)
To: Andrea Arcangeli, Rajesh Venkatasubramanian
Cc: Hugh Dickins, linux-kernel, Andrew Morton
>> Martin! When you get time to test your SDET with this patch, please
>> let me know whether this patch helps you at all. The patch applies
>> on top of 2.6.5-mjb1+anobjrmap9_prio_tree.
>
> I considered converting it to a rwsem too, details are in the the email
> I posted while providing the rwspinlock solution to the parisc cache
> flushing code.
I will try it, but I'm not convinced it'll help. I profiled the takers
of i_shared_sem, and I think they're all writers (and I tried rwsem on
the simple linked list before with no benefit).
M.
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH] anobjrmap 9 priority mjb tree
2004-04-15 0:05 ` Andrea Arcangeli
2004-04-15 0:22 ` Martin J. Bligh
@ 2004-04-15 3:40 ` Rajesh Venkatasubramanian
2004-04-15 6:23 ` Martin J. Bligh
2004-04-15 13:00 ` Andrea Arcangeli
1 sibling, 2 replies; 38+ messages in thread
From: Rajesh Venkatasubramanian @ 2004-04-15 3:40 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Martin J. Bligh, Hugh Dickins, linux-kernel, Andrew Morton
> > 2) However, vmas can be added and removed from a vm_set list
> > by just holding the read lock and a bit lock (vm_set_lock)
> > in the corresponding prio_tree node.
>
> no way, you cannot bitflip vm_flags unless you own the mmap_sem, this
> patch seems very broken to me, it should randomly corrupt memory in
> vma->vm_flags while racing against mprotect etc.. or am I missing
> something?
I don't know why bit_spin_lock with vma->vm_flags should be a problem
if it is used without mmap_sem. Can you explain?
Anyway, in my patch, the only places that use vm_set_lock without
mmap_sem is __vma_prio_tree_first_lock and __vma_prio_tree_next_lock.
If it is really racy to use bit_spin_lock on vm_flags without mmap_sem
(I am not sure and I am not convinced that it is racy), then we can
revert these changes and take down_write on the page out path.
Well. In that case, we can use rwsem as you mentioned below: take
down_write on all modifications and take down_read on pageout. Here, you
allow multiple parallel page_referenced and try_to_unmap on the same
inode, however with only one modification at a time.
Whereas my solution will allow multiple modifications at the same
time (if possible) with only one pageout routine at a time. I chose
this solution because Martin's SDET took big hit in common cases of
adding and removing vmas from the i_mmap{_shared} data structure.
Thanks,
Rajesh
>
> > 3) All objrmap functions just hold read lock now. So when we
> > walk a vm_set list we have to hold the corresponding
> > vm_set_lock.
> > 4) Since truncate uses write lock (provides exclusion) we don't
> > have to take vm_set_locks.
> >
> > Martin! When you get time to test your SDET with this patch, please
> > let me know whether this patch helps you at all. The patch applies
> > on top of 2.6.5-mjb1+anobjrmap9_prio_tree.
>
> I considered converting it to a rwsem too, details are in the the email
> I posted while providing the rwspinlock solution to the parisc cache
> flushing code.
>
> As I wrote there, I wasn't convinced in the common case this is going to
> gain anything significant (the only thing that sleeps while teh
> semaphore is held is truncate and truncate during paging on the same
> inode isn't an extremly common case, especially for the big apps), and
> it makes it a bit more complicated, but giving it a try will be
> interesting. I was mostly interested about having the objrmap code very
> rarely failing the trylock during paging (that semaphore is by far the
> biggest scalability hit during paging of shm, but the cacheline bouncing
> won't be avoided by the rwsem). To make the paging scale better
> (something SDET cannot measure) I don't need a safe vm_set_lock, I
> believe simply making it a rwsem is the way to go just to make the
> paging potentially scale a bit better. I rated implementing the locking
> abstraction to fixup the basic parisc race as a bit higher prio, after
> that it should be easy to have it implementing a rwsem for all archs w/o
> cache flushing, the abstraction will have to expose a read/write
> functionality for the rwlock. I'm not convinced your double locking is
> going to boost anything even if it would be safe, I'd just take it in
> write mode when the tree is being modified, with the only object of
> avoiding the paging to block (and potentially to avoid blocking against
> big truncates too).
>
^ permalink raw reply [flat|nested] 38+ messages in thread* Re: [PATCH] anobjrmap 9 priority mjb tree
2004-04-15 3:40 ` Rajesh Venkatasubramanian
@ 2004-04-15 6:23 ` Martin J. Bligh
2004-04-15 10:26 ` Hugh Dickins
2004-04-15 13:00 ` Andrea Arcangeli
1 sibling, 1 reply; 38+ messages in thread
From: Martin J. Bligh @ 2004-04-15 6:23 UTC (permalink / raw)
To: Rajesh Venkatasubramanian, Andrea Arcangeli
Cc: Hugh Dickins, linux-kernel, Andrew Morton
> Wherease my solution will allow multiple modifications at the same
> time (if possible) with only one pageout routine at a time. I chose
> this solution because Martin's SDET took big hit in common cases of
> adding and removing vmas from the i_mmap{_shared} data structure.
FYI, even without prio-tree, I get a 12% boost from converting i_shared_sem
into a spinlock. I'll try doing the same on top of prio-tree next.
M.
^ permalink raw reply [flat|nested] 38+ messages in thread* Re: [PATCH] anobjrmap 9 priority mjb tree
2004-04-15 6:23 ` Martin J. Bligh
@ 2004-04-15 10:26 ` Hugh Dickins
2004-04-15 12:52 ` Andrea Arcangeli
2004-04-15 15:40 ` Martin J. Bligh
0 siblings, 2 replies; 38+ messages in thread
From: Hugh Dickins @ 2004-04-15 10:26 UTC (permalink / raw)
To: Martin J. Bligh
Cc: Rajesh Venkatasubramanian, Andrea Arcangeli, linux-kernel,
Andrew Morton
On Wed, 14 Apr 2004, Martin J. Bligh wrote:
>
> FYI, even without prio-tree, I get a 12% boost from converting i_shared_sem
> into a spinlock. I'll try doing the same on top of prio-tree next.
Good news, though not a surprise.
Any ideas how we might handle latency from vmtruncate (and
try_to_unmap) if using prio_tree with i_shared_lock spinlock?
Hugh
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH] anobjrmap 9 priority mjb tree
2004-04-15 10:26 ` Hugh Dickins
@ 2004-04-15 12:52 ` Andrea Arcangeli
2004-04-15 15:40 ` Martin J. Bligh
1 sibling, 0 replies; 38+ messages in thread
From: Andrea Arcangeli @ 2004-04-15 12:52 UTC (permalink / raw)
To: Hugh Dickins
Cc: Martin J. Bligh, Rajesh Venkatasubramanian, linux-kernel,
Andrew Morton
On Thu, Apr 15, 2004 at 11:26:09AM +0100, Hugh Dickins wrote:
> On Wed, 14 Apr 2004, Martin J. Bligh wrote:
> >
> > FYI, even without prio-tree, I get a 12% boost from converting i_shared_sem
> > into a spinlock. I'll try doing the same on top of prio-tree next.
>
> Good news, though not a surprise.
>
> Any ideas how we might handle latency from vmtruncate (and
> try_to_unmap) if using prio_tree with i_shared_lock spinlock?
we'd need to break the loop after need_resched returns 1 (and then the
second time we'd just screw the latency and go ahead). I also wanted to
make it a spinlock again like in 2.4, the semaphore probably generates
overscheduling. OTOH the spinlock saved some cpu in slightly different
workloads with big truncates (plus it made the cond_resched trivial w/o
requiring loop break) and I agree with Andrew about that, Martin isn't
benchmarking the other side, the one that made Andrew to change it to a
semaphore.
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH] anobjrmap 9 priority mjb tree
2004-04-15 10:26 ` Hugh Dickins
2004-04-15 12:52 ` Andrea Arcangeli
@ 2004-04-15 15:40 ` Martin J. Bligh
2004-04-15 16:55 ` Hugh Dickins
2004-04-15 22:33 ` Andrea Arcangeli
1 sibling, 2 replies; 38+ messages in thread
From: Martin J. Bligh @ 2004-04-15 15:40 UTC (permalink / raw)
To: Hugh Dickins
Cc: Rajesh Venkatasubramanian, Andrea Arcangeli, linux-kernel,
Andrew Morton
>> FYI, even without prio-tree, I get a 12% boost from converting i_shared_sem
>> into a spinlock. I'll try doing the same on top of prio-tree next.
>
> Good news, though not a surprise.
>
> Any ideas how we might handle latency from vmtruncate (and
> try_to_unmap) if using prio_tree with i_shared_lock spinlock?
I've been thinking about that. My rough plan is to go wild, naked and lockless.
If we arrange things in the correct order, new entries onto the list would
pick up the truncated image of the file (so they'd be OK). Entries removed
from the list don't matter anyway. We just need to make sure that everything
that was on the list when we start does get truncated.
Basically there are two sets of operations ... ones that map and unmap
the file object (address_space) and ones that alter it - we should be
able to proceed with inserts and deletes whilst truncating, though we
probably need to protect against the alterations. The two op types could
go under separate locking.
But I need to think on it some more - would not suprise me to come to the
conclusion that I'm full of shit ;-) The opinions of others would be
very welcome ...
M.
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH] anobjrmap 9 priority mjb tree
2004-04-15 15:40 ` Martin J. Bligh
@ 2004-04-15 16:55 ` Hugh Dickins
2004-04-15 17:14 ` Martin J. Bligh
2004-04-15 22:33 ` Andrea Arcangeli
1 sibling, 1 reply; 38+ messages in thread
From: Hugh Dickins @ 2004-04-15 16:55 UTC (permalink / raw)
To: Martin J. Bligh
Cc: Rajesh Venkatasubramanian, Andrea Arcangeli, linux-kernel,
Andrew Morton
On Thu, 15 Apr 2004, Martin J. Bligh wrote:
> >
> > Any ideas how we might handle latency from vmtruncate (and
> > try_to_unmap) if using prio_tree with i_shared_lock spinlock?
>
> I've been thinking about that. My rough plan is to go wild, naked and lockless.
> If we arrange things in the correct order, new entries onto the list would
It's quite easy if there's a list - though I'm not that eager to go wild,
naked and lockless with you! But what if there's a prio_tree?
Hugh
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH] anobjrmap 9 priority mjb tree
2004-04-15 16:55 ` Hugh Dickins
@ 2004-04-15 17:14 ` Martin J. Bligh
2004-04-15 17:50 ` Hugh Dickins
2004-04-15 22:40 ` Andrea Arcangeli
0 siblings, 2 replies; 38+ messages in thread
From: Martin J. Bligh @ 2004-04-15 17:14 UTC (permalink / raw)
To: Hugh Dickins
Cc: Rajesh Venkatasubramanian, Andrea Arcangeli, linux-kernel,
Andrew Morton
>> > Any ideas how we might handle latency from vmtruncate (and
>> > try_to_unmap) if using prio_tree with i_shared_lock spinlock?
>>
>> I've been thinking about that. My rough plan is to go wild, naked and lockless.
>> If we arrange things in the correct order, new entries onto the list would
>
> It's quite easy if there's a list - though I'm not that eager to go wild,
> naked and lockless with you! But what if there's a prio_tree?
I still think my list-of-lists patch fixes the original problem, and is
simpler ... I'll try to get it updated, and sent out.
M.
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH] anobjrmap 9 priority mjb tree
2004-04-15 17:14 ` Martin J. Bligh
@ 2004-04-15 17:50 ` Hugh Dickins
2004-04-15 18:42 ` Dave McCracken
[not found] ` <192710000.1082052992@flay>
2004-04-15 22:40 ` Andrea Arcangeli
1 sibling, 2 replies; 38+ messages in thread
From: Hugh Dickins @ 2004-04-15 17:50 UTC (permalink / raw)
To: Martin J. Bligh
Cc: Rajesh Venkatasubramanian, Andrea Arcangeli, linux-kernel,
Andrew Morton
On Thu, 15 Apr 2004, Martin J. Bligh wrote:
>
> I still think my list-of-lists patch fixes the original problem, and is
> simpler ... I'll try to get it updated, and sent out.
Please do, I never saw it before.
Though I have to admit I'm sceptical: prio_tree appears to be well
designed for the issue in question, list-of-lists sounds, well,
no offence, but a bit of a hack.
But we may well have overlooked the overhead of prio_tree's
complexity relative to list, and need to reconsider options.
Hugh
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH] anobjrmap 9 priority mjb tree
2004-04-15 17:50 ` Hugh Dickins
@ 2004-04-15 18:42 ` Dave McCracken
[not found] ` <192710000.1082052992@flay>
1 sibling, 0 replies; 38+ messages in thread
From: Dave McCracken @ 2004-04-15 18:42 UTC (permalink / raw)
To: Hugh Dickins, Martin J. Bligh
Cc: Rajesh Venkatasubramanian, Andrea Arcangeli, linux-kernel,
Andrew Morton
--On Thursday, April 15, 2004 18:50:42 +0100 Hugh Dickins
<hugh@veritas.com> wrote:
> Though I have to admit I'm sceptical: prio_tree appears to be well
> designed for the issue in question, list-of-lists sounds, well,
> no offence, but a bit of a hack.
It is a bit of a hack, but the theory behind it is fairly simple. It came
out of my early efforts to sort the list. Martin and I produced a theory
that many vmas have identical start and end addresses due to fork and/or
fixed address mappings. If this theory is true list-of-lists will create a
much shorter top-level list of unique start-end pairs for searching. We'd
only need to walk the second level list when we get a match to the search.
It never got any serious exposure or testing. It came out just as
everyone's attention shifted away from objrmap so no one really looked at
it.
Dave McCracken
^ permalink raw reply [flat|nested] 38+ messages in thread
[parent not found: <192710000.1082052992@flay>]
* Re: [PATCH] anobjrmap 9 priority mjb tree
[not found] ` <192710000.1082052992@flay>
@ 2004-04-15 18:47 ` Rajesh Venkatasubramanian
0 siblings, 0 replies; 38+ messages in thread
From: Rajesh Venkatasubramanian @ 2004-04-15 18:47 UTC (permalink / raw)
To: Martin J. Bligh
Cc: Hugh Dickins, Andrea Arcangeli, linux-kernel, Andrew Morton
> It has similar problems, IIRC with increasing i_shared_sem contention.
Agreed.
> But I think it solves the same issues as prio_tree,
Agreed.
> is simpler,
Agreed.
> is easier to fix up to do clever locking with.
I haven't thought about it fully, so I am not sure. But, it is likely
that locking is easier with list-of-lists.
[snip]
> diff -urpN -X /home/fletch/.diff.exclude 820-numa_large_pages/mm/mmap.c 830-list-of-lists/mm/mmap.c
> --- 820-numa_large_pages/mm/mmap.c Wed Jun 18 21:49:20 2003
> +++ 830-list-of-lists/mm/mmap.c Wed Jun 18 23:29:38 2003
> @@ -306,6 +306,56 @@ static void __vma_link_rb(struct mm_stru
> rb_insert_color(&vma->vm_rb, &mm->mm_rb);
> }
>
> +static void vma_add (struct vm_area_struct *vma,
> + struct list_head *range_list)
> +{
> + struct address_range *range;
> + struct list_head *prev, *next;
> + unsigned long start = vma->vm_pgoff;
> + unsigned long end = vma->vm_pgoff +
> + (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1);
> +
> + /* First, look for an existing range that matches ours */
> + prev = range_list;
> + list_for_each(next, range_list) {
> + range = list_entry(next, struct address_range, ranges);
> + if (range->start > start)
> + break; /* this list is sorted by start */
> + if ((range->start == start) && (range->end == end)) {
> + goto found;
> + }
> + prev = next;
> + }
Hmm.. We do a linear O(N) search for each vma added. If the range_list
has 1000 vmas, then it is really bad. Running Ingo's test-mmap3.c or
Andrew's rmap-test.c (check 3rd test Andrew did - single process,
10,000 different vmas - with different range->start and range->end)
will be slow.
The prio_tree patch optimizes these cases with O(log N) insert algorithm.
[snip]
> +static void vma_del (struct vm_area_struct *vma)
> +{
[snip]
> + next = vma->shared.next; /* stash the range list we're on */
> + list_del(&vma->shared); /* remove us from the list of vmas */
> + if (list_empty(next)) { /* we were the last vma for range */
> + range = list_entry(next, struct address_range, vmas);
> + list_del(&range->ranges);
> + kfree(range);
> + }
> +}
Agree that vma_del is much simpler.
> page_referenced_obj(struct page *page)
> {
[snip]
> + list_for_each_entry(range, &mapping->i_mmap, ranges) {
> + if (range->start > index)
> + break; /* Sorted by start address => we are done */
> + if (range->end < index)
> + continue;
Again O(N) search...
> + list_for_each_entry(vma, &range->vmas, shared)
> + referenced += page_referenced_obj_one(vma, page);
> + }
> @@ -512,7 +532,9 @@ static int
> try_to_unmap_obj(struct page *page)
> {
[snip]
> + list_for_each_entry(range, &mapping->i_mmap, ranges) {
> + if (range->start > index)
> + break; /* Sorted by start address => we are done */
> + if (range->end < index)
> + continue;
Here also O(N) search when each vma map a unique set of file pages...
Thanks for posting the code. Your old postings (almost a year ago)
regarding list-of-lists inspired me to develop prio_tree. Thanks.
Rajesh
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH] anobjrmap 9 priority mjb tree
2004-04-15 17:14 ` Martin J. Bligh
2004-04-15 17:50 ` Hugh Dickins
@ 2004-04-15 22:40 ` Andrea Arcangeli
1 sibling, 0 replies; 38+ messages in thread
From: Andrea Arcangeli @ 2004-04-15 22:40 UTC (permalink / raw)
To: Martin J. Bligh
Cc: Hugh Dickins, Rajesh Venkatasubramanian, linux-kernel,
Andrew Morton
On Thu, Apr 15, 2004 at 10:14:51AM -0700, Martin J. Bligh wrote:
> I still think my list-of-lists patch fixes the original problem, and is
> simpler ... I'll try to get it updated, and sent out.
it's a lot worse than the prio-tree IMHO, when a new range is generated
you've to loop all over the vmas etc... it's O(N) stuff for certain ops,
prio-tree is O(log(N)) for all.
If your object is to be able to use RCU (and implementing a RCU
prio-tree is going to be extremely complicated) you can attempt a
prio-skip-list, that would be a skip-list (that still provides O(log(N))
but that uses lists everywhere so that you can more easily create a
RCU-prio-skip-list, though I didn't even think if the range-lookup can
be implemented reasonably easily on top of a skip-list to create the
prio-skip-list).
but even if we could create the rcu-prio-skip-list (that would solve all
complexity issues like the prio-tree and it would allow lockless lookups
too [unlike prio-tree]) you'd still have to deal with the mess of
freeing vmas with rcu, that would cause everything else over the
vma to be freed with rcu too, mm, pgds etc... that would require quite
some changes, at the very least to be able to garbage collect the mm,pgd
from the vma free operations. I doubt it worth it, for the fast path you
cannot go lockless anyways, the lockless is only for the readonly
operations, and the readonly are the only unlikely ones (namely only
truncate and paging). So it's overdesign.
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH] anobjrmap 9 priority mjb tree
2004-04-15 15:40 ` Martin J. Bligh
2004-04-15 16:55 ` Hugh Dickins
@ 2004-04-15 22:33 ` Andrea Arcangeli
1 sibling, 0 replies; 38+ messages in thread
From: Andrea Arcangeli @ 2004-04-15 22:33 UTC (permalink / raw)
To: Martin J. Bligh
Cc: Hugh Dickins, Rajesh Venkatasubramanian, linux-kernel,
Andrew Morton
On Thu, Apr 15, 2004 at 08:40:50AM -0700, Martin J. Bligh wrote:
> >> FYI, even without prio-tree, I get a 12% boost from converting i_shared_sem
> >> into a spinlock. I'll try doing the same on top of prio-tree next.
> >
> > Good news, though not a surprise.
> >
> > Any ideas how we might handle latency from vmtruncate (and
> > try_to_unmap) if using prio_tree with i_shared_lock spinlock?
>
> I've been thinking about that. My rough plan is to go wild, naked and lockless.
> If we arrange things in the correct order, new entries onto the list would
> pick up the truncated image of the file (so they'd be OK). Entries removed
> from the list don't matter anyway. We just need to make sure that everything
> that was on the list when we start does get truncated.
entries removed must be freed with RCU, and that means vmas freed with
rcu that means mm and pgd freed with rcu and the whole vm will collapse
on you when you attempt that. I mean it's going all the way up to the
whole MM, not just the shared list of vmas.
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH] anobjrmap 9 priority mjb tree
2004-04-15 3:40 ` Rajesh Venkatasubramanian
2004-04-15 6:23 ` Martin J. Bligh
@ 2004-04-15 13:00 ` Andrea Arcangeli
2004-04-15 14:41 ` Rajesh Venkatasubramanian
1 sibling, 1 reply; 38+ messages in thread
From: Andrea Arcangeli @ 2004-04-15 13:00 UTC (permalink / raw)
To: Rajesh Venkatasubramanian
Cc: Martin J. Bligh, Hugh Dickins, linux-kernel, Andrew Morton
On Wed, Apr 14, 2004 at 11:40:52PM -0400, Rajesh Venkatasubramanian wrote:
>
> > > 2) However, vmas can be added and removed from a vm_set list
> > > by just holding the read lock and a bit lock (vm_set_lock)
> > > in the corresponding prio_tree node.
> >
> > no way, you cannot bitflip vm_flags unless you own the mmap_sem, this
> > patch seems very broken to me, it should randomly corrupt memory in
> > vma->vm_flags while racing against mprotect etc.. or am I missing
> > something?
>
> I don't know why bit_spin_lock with vma->vm_flags should be a problem
> if it is used without mmap_sem. Can you explain ?
you seem not to know all rules about the atomic operations in smp, you
cannot just set_bit on one side and use non-atomic operations on the
other side, and expect the set_bit not to invalidate the non-atomic
operations.
The effect of the mprotect may be deleted by your new concurrent
set_bit and stuff like that.
> If it is really racy to use bit_spin_lock on vm_flags without mmap_sem
it __definitely__ is racy.
> Well. In that case, we can use rwsem as you mentioned below: take
> down_write on all modifications and take down_read on pageout. Here, you
exactly, this also avoids the more complex (and racy, but the racy would
be easy to fix by adding another 4 atomic bytes to the vma) double
locking that you introduced.
> allow multiple parallel page_referenced and try_to_unmap on the same
> inode, however with only one modification at a time.
exactly.
> Whereas my solution will allow multiple modifications at the same
> time (if possible) with only one pageout routine at a time. I chose
> this solution because Martin's SDET took big hit in common cases of
> adding and removing vmas from the i_mmap{_shared} data structure.
you can still fix the smp race condition by trivially adding 4 bytes to
the vma (i.e. a vma->vm_flags_atomic), but I'd be surprised if your
double locking actually improve things, Martin is running on a very
parallel old-numa where cacheline bouncing across nodes pass through a
fibre channel IIRC, and the cacheline bouncing that the semaphore
generates is huge, it's not necessairly huge contention, it's just the
bouncing that hurts, and the down_read won't help at all for the
cacheline trashing, it'll still bounce like before. Though you may gain
something minor, but I doubt it'd be huge.
I'd suggest Martin to give a try to the racy code, it's just good enough
for a practical experiment (the race shouldn't easily trigger so it
probably passes one run of SDET safely).
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH] anobjrmap 9 priority mjb tree
2004-04-15 13:00 ` Andrea Arcangeli
@ 2004-04-15 14:41 ` Rajesh Venkatasubramanian
0 siblings, 0 replies; 38+ messages in thread
From: Rajesh Venkatasubramanian @ 2004-04-15 14:41 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Martin J. Bligh, Hugh Dickins, linux-kernel, Andrew Morton
> > I don't know why bit_spin_lock with vma->vm_flags should be a problem
> > if it is used without mmap_sem. Can you explain ?
>
> you seem not to know all rules about the atomic operations in smp, you
> cannot just set_bit on one side and use non-atomic operations on the
> other side, and expect the set_bit not to invalidate the non-atomic
> operations.
>
> The effect of the mprotect may be deleted by your new concurrent
> set_bit and stuff like that.
Thank you very much for that. Stupid me. I didn't read the code in
page->flags properly. Thanks again.
Rajesh
^ permalink raw reply [flat|nested] 38+ messages in thread
end of thread, other threads:[~2004-04-15 22:40 UTC | newest]
Thread overview: 38+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2004-04-04 12:33 [PATCH] anobjrmap 9 priority mjb tree Hugh Dickins
2004-04-09 20:39 ` Martin J. Bligh
2004-04-09 21:31 ` Rajesh Venkatasubramanian
2004-04-09 21:40 ` Martin J. Bligh
2004-04-09 23:17 ` Rajesh Venkatasubramanian
2004-04-09 21:51 ` Hugh Dickins
2004-04-09 22:01 ` Martin J. Bligh
2004-04-09 22:56 ` Martin J. Bligh
2004-04-11 16:09 ` Hugh Dickins
2004-04-11 17:28 ` Martin J. Bligh
2004-04-12 4:32 ` Rajesh Venkatasubramanian
2004-04-12 5:24 ` Martin J. Bligh
2004-04-12 15:46 ` Martin J. Bligh
2004-04-12 18:43 ` Hugh Dickins
2004-04-12 18:58 ` Rajesh Venkatasubramanian
2004-04-12 19:01 ` Martin J. Bligh
2004-04-12 19:10 ` Hugh Dickins
2004-04-12 19:38 ` Rajesh Venkatasubramanian
2004-04-12 21:14 ` Martin J. Bligh
2004-04-12 21:12 ` Andrew Morton
2004-04-12 21:43 ` Martin J. Bligh
2004-04-14 20:18 ` Rajesh Venkatasubramanian
2004-04-15 0:05 ` Andrea Arcangeli
2004-04-15 0:22 ` Martin J. Bligh
2004-04-15 3:40 ` Rajesh Venkatasubramanian
2004-04-15 6:23 ` Martin J. Bligh
2004-04-15 10:26 ` Hugh Dickins
2004-04-15 12:52 ` Andrea Arcangeli
2004-04-15 15:40 ` Martin J. Bligh
2004-04-15 16:55 ` Hugh Dickins
2004-04-15 17:14 ` Martin J. Bligh
2004-04-15 17:50 ` Hugh Dickins
2004-04-15 18:42 ` Dave McCracken
[not found] ` <192710000.1082052992@flay>
2004-04-15 18:47 ` Rajesh Venkatasubramanian
2004-04-15 22:40 ` Andrea Arcangeli
2004-04-15 22:33 ` Andrea Arcangeli
2004-04-15 13:00 ` Andrea Arcangeli
2004-04-15 14:41 ` Rajesh Venkatasubramanian
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox