* [Xenomai-core] Nocow patch.
@ 2007-01-31 8:37 Gilles Chanteperdrix
0 siblings, 0 replies; 11+ messages in thread
From: Gilles Chanteperdrix @ 2007-01-31 8:37 UTC (permalink / raw)
To: xenomai-core
[-- Attachment #1: Type: text/plain, Size: 240 bytes --]
Hi,
After testing on ARM, here is the latest version of the nocow patch,
split into three parts: the noarch part, the x86-specific patch and the
ARM-specific patch.
--
Gilles Chanteperdrix
[-- Attachment #2: vm-nocow-2.6.19-5-noarch.patch --]
[-- Type: text/x-patch, Size: 12796 bytes --]
diff -Naurdp -x '*~' -x '*.orig' -x '*.rej' ipipe-2.6.19-arm/include/linux/ipipe.h ipipe-2.6.19-arm-nocow/include/linux/ipipe.h
--- ipipe-2.6.19-arm/include/linux/ipipe.h 2007-01-15 21:33:00.000000000 +0100
+++ ipipe-2.6.19-arm-nocow/include/linux/ipipe.h 2007-01-30 21:22:26.769349729 +0100
@@ -337,6 +337,15 @@ int fastcall __ipipe_dispatch_wired(stru
void fastcall __ipipe_sync_stage(unsigned long syncmask);
+int __ipipe_update_all_pinned_mm(unsigned long start, unsigned long end);
+
+struct mm_struct;
+
+void __ipipe_unlink_pinned_mm(struct mm_struct *mm);
+
+int __ipipe_pin_range_mapping(struct mm_struct *mm,
+ unsigned long start, unsigned long end);
+
#ifndef __ipipe_sync_pipeline
#define __ipipe_sync_pipeline(syncmask) __ipipe_sync_stage(syncmask)
#endif
@@ -434,12 +443,11 @@ static inline void ipipe_init_notify(str
__ipipe_dispatch_event(IPIPE_EVENT_INIT,p);
}
-struct mm_struct;
-
static inline void ipipe_cleanup_notify(struct mm_struct *mm)
{
if (__ipipe_event_monitored_p(IPIPE_EVENT_CLEANUP))
__ipipe_dispatch_event(IPIPE_EVENT_CLEANUP,mm);
+ __ipipe_unlink_pinned_mm(mm);
}
/* Public interface */
@@ -643,6 +651,8 @@ int fastcall ipipe_set_ptd(int key,
void fastcall *ipipe_get_ptd(int key);
+int ipipe_disable_ondemand_mappings(struct task_struct *tsk);
+
#define local_irq_enable_hw_cond() local_irq_enable_hw()
#define local_irq_disable_hw_cond() local_irq_disable_hw()
#define local_irq_save_hw_cond(flags) local_irq_save_hw(flags)
@@ -690,6 +700,7 @@ void fastcall *ipipe_get_ptd(int key);
#define ipipe_cleanup_notify(mm) do { } while(0)
#define ipipe_trap_notify(t,r) 0
#define ipipe_init_proc() do { } while(0)
+#define __ipipe_update_all_pinned_mm(start, end) 0
#define local_irq_enable_hw_cond() do { } while(0)
#define local_irq_disable_hw_cond() do { } while(0)
diff -Naurdp -x '*~' -x '*.orig' -x '*.rej' ipipe-2.6.19-arm/include/linux/mm.h ipipe-2.6.19-arm-nocow/include/linux/mm.h
--- ipipe-2.6.19-arm/include/linux/mm.h 2007-01-04 22:05:12.000000000 +0100
+++ ipipe-2.6.19-arm-nocow/include/linux/mm.h 2007-01-30 21:22:26.769349729 +0100
@@ -166,6 +166,7 @@ extern unsigned int kobjsize(const void
#define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */
#define VM_MAPPED_COPY 0x01000000 /* T if mapped copy of data (nommu mmap) */
#define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */
+#define VM_PINNED 0x10000000 /* Disable faults for the vma */
#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */
#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
diff -Naurdp -x '*~' -x '*.orig' -x '*.rej' ipipe-2.6.19-arm/include/linux/sched.h ipipe-2.6.19-arm-nocow/include/linux/sched.h
--- ipipe-2.6.19-arm/include/linux/sched.h 2007-01-15 21:33:00.000000000 +0100
+++ ipipe-2.6.19-arm-nocow/include/linux/sched.h 2007-01-30 21:22:26.770349605 +0100
@@ -363,6 +363,10 @@ struct mm_struct {
/* aio bits */
rwlock_t ioctx_list_lock;
struct kioctx *ioctx_list;
+
+#if CONFIG_IPIPE
+ struct list_head pinned;
+#endif /* CONFIG_IPIPE */
};
struct sighand_struct {
diff -Naurdp -x '*~' -x '*.orig' -x '*.rej' ipipe-2.6.19-arm/lib/ioremap.c ipipe-2.6.19-arm-nocow/lib/ioremap.c
--- ipipe-2.6.19-arm/lib/ioremap.c 2007-01-15 21:33:01.000000000 +0100
+++ ipipe-2.6.19-arm-nocow/lib/ioremap.c 2007-01-30 21:22:26.771349480 +0100
@@ -85,10 +85,9 @@ int ioremap_page_range(unsigned long add
err = ioremap_pud_range(pgd, addr, next, phys_addr+addr, prot);
if (err)
break;
- set_pgdir(addr, *pgd);
} while (pgd++, addr = next, addr != end);
-
- flush_cache_vmap(start, end);
+ __ipipe_update_all_pinned_mm(start, end);
+ flush_cache_vmap(start, end);
return err;
}
diff -Naurdp -x '*~' -x '*.orig' -x '*.rej' ipipe-2.6.19-arm/mm/memory.c ipipe-2.6.19-arm-nocow/mm/memory.c
--- ipipe-2.6.19-arm/mm/memory.c 2007-01-04 22:05:15.000000000 +0100
+++ ipipe-2.6.19-arm-nocow/mm/memory.c 2007-01-30 23:35:51.960412122 +0100
@@ -50,6 +50,7 @@
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/writeback.h>
+#include <linux/vmalloc.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
@@ -418,13 +419,41 @@ struct page *vm_normal_page(struct vm_ar
return pfn_to_page(pfn);
}
+static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va)
+{
+ /*
+ * If the source page was a PFN mapping, we don't have
+ * a "struct page" for it. We do a best-effort copy by
+ * just copying from the original user address. If that
+ * fails, we just zero-fill it. Live with it.
+ */
+ if (unlikely(!src)) {
+ void *kaddr = kmap_atomic(dst, KM_USER0);
+ void __user *uaddr = (void __user *)(va & PAGE_MASK);
+
+ /*
+ * This really shouldn't fail, because the page is there
+ * in the page tables. But it might just be unreadable,
+ * in which case we just give up and fill the result with
+ * zeroes.
+ */
+ if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
+ memset(kaddr, 0, PAGE_SIZE);
+ kunmap_atomic(kaddr, KM_USER0);
+ flush_dcache_page(dst);
+ return;
+
+ }
+ copy_user_highpage(dst, src, va);
+}
+
/*
* copy one vm_area from one task to the other. Assumes the page tables
* already present in the new task to be cleared in the whole range
* covered by this vma.
*/
-static inline void
+static inline int
copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
unsigned long addr, int *rss)
@@ -466,6 +495,25 @@ copy_one_pte(struct mm_struct *dst_mm, s
* in the parent and the child
*/
if (is_cow_mapping(vm_flags)) {
+#ifdef CONFIG_IPIPE
+ if (((vm_flags|src_mm->def_flags) & (VM_LOCKED|VM_PINNED)) == (VM_LOCKED|VM_PINNED)) {
+ struct page *old_page = vm_normal_page(vma, addr, pte);
+ page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
+ if (!page)
+ return -ENOMEM;
+
+ cow_user_page(page, old_page, addr);
+ pte = mk_pte(page, vma->vm_page_prot);
+
+ if (vm_flags & VM_SHARED)
+ pte = pte_mkclean(pte);
+ pte = pte_mkold(pte);
+
+ page_dup_rmap(page);
+ rss[!!PageAnon(page)]++;
+ goto out_set_pte;
+ }
+#endif /* CONFIG_IPIPE */
ptep_set_wrprotect(src_mm, addr, src_pte);
pte = pte_wrprotect(pte);
}
@@ -487,6 +535,7 @@ copy_one_pte(struct mm_struct *dst_mm, s
out_set_pte:
set_pte_at(dst_mm, addr, dst_pte, pte);
+ return 0;
}
static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -524,7 +573,9 @@ again:
progress++;
continue;
}
- copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
+ if (copy_one_pte(dst_mm, src_mm, dst_pte,
+ src_pte, vma, addr, rss))
+ return -ENOMEM;
progress += 8;
} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
@@ -1431,34 +1482,6 @@ static inline pte_t maybe_mkwrite(pte_t
return pte;
}
-static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va)
-{
- /*
- * If the source page was a PFN mapping, we don't have
- * a "struct page" for it. We do a best-effort copy by
- * just copying from the original user address. If that
- * fails, we just zero-fill it. Live with it.
- */
- if (unlikely(!src)) {
- void *kaddr = kmap_atomic(dst, KM_USER0);
- void __user *uaddr = (void __user *)(va & PAGE_MASK);
-
- /*
- * This really shouldn't fail, because the page is there
- * in the page tables. But it might just be unreadable,
- * in which case we just give up and fill the result with
- * zeroes.
- */
- if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
- memset(kaddr, 0, PAGE_SIZE);
- kunmap_atomic(kaddr, KM_USER0);
- flush_dcache_page(dst);
- return;
-
- }
- copy_user_highpage(dst, src, va);
-}
-
/*
* This routine handles present pages, when users try to write
* to a shared page. It is done by copying the page to a new address
@@ -2676,3 +2699,157 @@ int access_process_vm(struct task_struct
return buf - old_buf;
}
+
+#ifdef CONFIG_IPIPE
+static LIST_HEAD(pinned_mms);
+static DEFINE_RWLOCK(pinned_mms_lock);
+
+static inline int ipipe_pin_pte_range(struct mm_struct *mm, pmd_t *pmd,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ spinlock_t *ptl;
+ pte_t *pte;
+
+ do {
+ pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (!pte)
+ continue;
+
+ if (!pte_present(*pte)) {
+ pte_unmap_unlock(pte, ptl);
+ continue;
+ }
+
+ if (do_wp_page(mm, vma, addr, pte, pmd, ptl, *pte) == VM_FAULT_OOM)
+ return -ENOMEM;
+ } while (addr += PAGE_SIZE, addr != end);
+ return 0;
+}
+
+static inline int ipipe_pin_pmd_range(struct mm_struct *mm, pud_t *pud,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ unsigned long next;
+ pmd_t *pmd;
+
+ pmd = pmd_offset(pud, addr);
+ do {
+ next = pmd_addr_end(addr, end);
+ if (ipipe_pin_pte_range(mm, pmd, vma, addr, end))
+ return -ENOMEM;
+ } while (pmd++, addr = next, addr != end);
+ return 0;
+}
+
+static inline int ipipe_pin_pud_range(struct mm_struct *mm, pgd_t *pgd,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ unsigned long next;
+ pud_t *pud;
+
+ pud = pud_offset(pgd, addr);
+ do {
+ next = pud_addr_end(addr, end);
+ if (ipipe_pin_pmd_range(mm, pud, vma, addr, end))
+ return -ENOMEM;
+ } while (pud++, addr = next, addr != end);
+ return 0;
+}
+
+int ipipe_disable_ondemand_mappings(struct task_struct *tsk)
+{
+ unsigned long addr, next, end;
+ struct vm_area_struct *vma;
+ struct vm_struct *area;
+ struct mm_struct *mm;
+ int result = 0;
+ pgd_t *pgd;
+
+ mm = get_task_mm(tsk);
+ if (!mm)
+ return -EPERM;
+
+ down_write(&mm->mmap_sem);
+ if (mm->def_flags & VM_PINNED)
+ goto up_mmap_sem_done;
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (!is_cow_mapping(vma->vm_flags))
+ continue;
+
+ addr = vma->vm_start;
+ end = vma->vm_end;
+
+ pgd = pgd_offset(mm, addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ if (ipipe_pin_pud_range(mm, pgd, vma, addr, next)) {
+ result = -ENOMEM;
+ up_mmap_sem_done:
+ up_write(&mm->mmap_sem);
+ goto done_mm;
+ }
+ } while (pgd++, addr = next, addr != end);
+ }
+ mm->def_flags |= VM_PINNED;
+ up_write(&mm->mmap_sem);
+
+ read_lock(&vmlist_lock);
+ down_write(&mm->mmap_sem);
+ for (area = vmlist; area; area = area->next) {
+ result = __ipipe_pin_range_mapping(mm,
+ (unsigned long) area->addr,
+ (unsigned long) area->addr
+ + area->size);
+ if (result) {
+ mm->def_flags &= ~VM_PINNED;
+ up_write(&mm->mmap_sem);
+ goto done_vmlist;
+ }
+ }
+ up_write(&mm->mmap_sem);
+
+ write_lock(&pinned_mms_lock);
+ list_add(&mm->pinned, &pinned_mms);
+ write_unlock(&pinned_mms_lock);
+
+ done_vmlist:
+ read_unlock(&vmlist_lock);
+ done_mm:
+ mmput(mm);
+ return result;
+}
+
+EXPORT_SYMBOL(ipipe_disable_ondemand_mappings);
+
+int __ipipe_update_all_pinned_mm(unsigned long start, unsigned long end)
+{
+ struct mm_struct *mm;
+ int result = 0;
+
+ read_lock(&pinned_mms_lock);
+ list_for_each_entry(mm, &pinned_mms, pinned) {
+ down_write(&mm->mmap_sem);
+ result = __ipipe_pin_range_mapping(mm, start, end);
+ up_write(&mm->mmap_sem);
+
+ if (result)
+ break;
+ }
+ read_unlock(&pinned_mms_lock);
+
+ return result;
+}
+
+void __ipipe_unlink_pinned_mm(struct mm_struct *mm)
+{
+ if (mm->def_flags & VM_PINNED) {
+ write_lock(&pinned_mms_lock);
+ list_del(&mm->pinned);
+ write_unlock(&pinned_mms_lock);
+ }
+}
+#endif
diff -Naurdp -x '*~' -x '*.orig' -x '*.rej' ipipe-2.6.19-arm/mm/mlock.c ipipe-2.6.19-arm-nocow/mm/mlock.c
--- ipipe-2.6.19-arm/mm/mlock.c 2006-05-07 16:42:15.000000000 +0200
+++ ipipe-2.6.19-arm-nocow/mm/mlock.c 2007-01-30 21:22:26.772349356 +0100
@@ -166,7 +166,7 @@ static int do_mlockall(int flags)
if (flags & MCL_FUTURE)
def_flags = VM_LOCKED;
- current->mm->def_flags = def_flags;
+ current->mm->def_flags |= def_flags;
if (flags == MCL_FUTURE)
goto out;
diff -Naurdp -x '*~' -x '*.orig' -x '*.rej' ipipe-2.6.19-arm/mm/vmalloc.c ipipe-2.6.19-arm-nocow/mm/vmalloc.c
--- ipipe-2.6.19-arm/mm/vmalloc.c 2007-01-15 21:33:01.000000000 +0100
+++ ipipe-2.6.19-arm-nocow/mm/vmalloc.c 2007-01-30 21:22:26.773349232 +0100
@@ -152,15 +152,12 @@ int map_vm_area(struct vm_struct *area,
BUG_ON(addr >= end);
pgd = pgd_offset_k(addr);
do {
- pgd_t oldpgd;
- memcpy(&oldpgd,pgd,sizeof(pgd_t));
next = pgd_addr_end(addr, end);
err = vmap_pud_range(pgd, addr, next, prot, pages);
if (err)
break;
- if (pgd_val(oldpgd) != pgd_val(*pgd))
- set_pgdir(addr, *pgd);
} while (pgd++, addr = next, addr != end);
+ __ipipe_update_all_pinned_mm((unsigned long) area->addr, end);
flush_cache_vmap((unsigned long) area->addr, end);
return err;
}
[-- Attachment #3: vm-nocow-2.6.19-5-i386.patch --]
[-- Type: text/x-patch, Size: 1333 bytes --]
--- ipipe-2.6.19/arch/i386/mm/fault.c 2007-01-10 09:44:52.000000000 +0100
+++ ipipe-2.6.19-nocow/arch/i386/mm/fault.c 2007-01-15 09:57:02.000000000 +0100
@@ -654,3 +654,18 @@ void vmalloc_sync_all(void)
}
}
#endif
+
+#ifdef CONFIG_IPIPE
+int __ipipe_pin_range_mapping(struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ unsigned long next, addr = start;
+
+ do {
+ next = pgd_addr_end(addr, end);
+ vmalloc_sync_one(mm->pgd, addr);
+ } while (addr = next, addr != end);
+
+ return 0;
+}
+#endif /* CONFIG_IPIPE */
--- ipipe-2.6.19/include/asm-i386/pgalloc.h 2007-01-10 09:44:53.000000000 +0100
+++ ipipe-2.6.19-nocow/include/asm-i386/pgalloc.h 2007-01-11 09:58:49.000000000 +0100
@@ -46,27 +46,4 @@ static inline void pte_free(struct page
#define check_pgt_cache() do { } while (0)
-static inline void set_pgdir(unsigned long address, pgd_t entry)
-{
-#ifdef CONFIG_IPIPE
- struct task_struct * p;
- struct page *page;
- pgd_t *pgd;
-
- read_lock(&tasklist_lock);
-
- for_each_process(p) {
- if(p->mm)
- *pgd_offset(p->mm,address) = entry;
- }
-
- read_unlock(&tasklist_lock);
-
- for (page = pgd_list; page; page = (struct page *)page->index) {
- pgd = (pgd_t *)page_address(page);
- pgd[address >> PGDIR_SHIFT] = entry;
- }
-#endif /* CONFIG_IPIPE */
-}
-
#endif /* _I386_PGALLOC_H */
[-- Attachment #4: vm-nocow-2.6.19-5-arm.patch --]
[-- Type: text/x-patch, Size: 1877 bytes --]
--- ipipe-2.6.19-arm/arch/arm/mm/fault.c 2007-01-30 21:33:47.000000000 +0100
+++ ipipe-2.6.19-arm-nocow/arch/arm/mm/fault.c 2007-01-30 23:23:05.513766878 +0100
@@ -330,6 +330,9 @@ do_translation_fault(unsigned long addr,
if (addr < TASK_SIZE)
return do_page_fault(addr, fsr, regs);
+ if (ipipe_trap_notify(IPIPE_TRAP_ACCESS,regs))
+ return 0;
+
index = pgd_index(addr);
/*
@@ -354,9 +357,6 @@ do_translation_fault(unsigned long addr,
return 0;
bad_area:
- if (ipipe_trap_notify(IPIPE_TRAP_ACCESS,regs))
- return 0;
-
do_bad_area(addr, fsr, regs);
return 0;
}
@@ -479,3 +479,35 @@ do_PrefetchAbort(unsigned long addr, str
do_translation_fault(addr, 0, regs);
}
+#ifdef CONFIG_IPIPE
+static void vmalloc_sync_one(pgd_t *pgd, unsigned long addr)
+{
+ unsigned int index = pgd_index(addr);
+ pgd_t *pgd_k;
+ pmd_t *pmd, *pmd_k;
+
+ pgd += index;
+ pgd_k = init_mm.pgd + index;
+
+ if (!pgd_present(*pgd))
+ set_pgd(pgd, *pgd_k);
+
+ pmd_k = pmd_offset(pgd_k, addr);
+ pmd = pmd_offset(pgd, addr);
+
+ copy_pmd(pmd, pmd_k);
+}
+
+int __ipipe_pin_range_mapping(struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ unsigned long next, addr = start;
+
+ do {
+ next = pgd_addr_end(addr, end);
+ vmalloc_sync_one(mm->pgd, addr);
+ } while (addr = next, addr != end);
+
+ return 0;
+}
+#endif /* CONFIG_IPIPE */
--- ipipe-2.6.19-arm/include/asm-arm/pgalloc.h 2007-01-30 23:47:15.711345662 +0100
+++ ipipe-2.6.19-arm-nocow/include/asm-arm/pgalloc.h 2007-01-30 23:43:39.759212585 +0100
@@ -23,11 +23,6 @@
#define _PAGE_USER_TABLE (PMD_TYPE_TABLE | PMD_BIT4 | PMD_DOMAIN(DOMAIN_USER))
#define _PAGE_KERNEL_TABLE (PMD_TYPE_TABLE | PMD_BIT4 | PMD_DOMAIN(DOMAIN_KERNEL))
-static inline void set_pgdir(unsigned long address, pgd_t entry)
-{
- /* nop */
-}
-
/*
* Since we have only two-level page tables, these are trivial
*/
^ permalink raw reply [flat|nested] 11+ messages in thread
* [Xenomai-core] Nocow patch.
@ 2007-01-10 18:05 Gilles Chanteperdrix
2007-01-11 7:38 ` Niklaus Giger
2007-01-11 15:18 ` Gilles Chanteperdrix
0 siblings, 2 replies; 11+ messages in thread
From: Gilles Chanteperdrix @ 2007-01-10 18:05 UTC (permalink / raw)
To: xenomai-core
[-- Attachment #1: Type: text/plain, Size: 1007 bytes --]
Hi,
I continued working on the idea of the nocow patch; here is a beefed-up version.
When setting the NOCOW flag for the first time:
- it faults any COW mapping of the target process, this is to handle the
case where a process would call fork (for example, in order to become a
daemon) before shadowing any thread;
- all vmalloc and ioremap areas are added to the target process vm.
A list of the mm structs of the processes which have the NOCOW flag is
maintained and used when calling vmalloc or ioremap to update only the
mappings of the processes in this list. This firstly makes it possible to
work around vmalloc and ioremap faults on architectures that do not have
pgd_list, and secondly should reduce the overhead added by the I-pipe
patch to the ioremap and vmalloc calls.
Since the VM_NOCOW flag now really means "no page fault", the NOCOW flag
was renamed "NOFAULT".
This was run on x86, but needs further testing before inclusion.
--
Gilles Chanteperdrix
[-- Attachment #2: vm-nocow-2.6.19.2.patch --]
[-- Type: text/x-patch, Size: 12492 bytes --]
diff -Naurdp -x '*~' ipipe-2.6.19/arch/i386/mm/fault.c ipipe-2.6.19-nocow/arch/i386/mm/fault.c
--- ipipe-2.6.19/arch/i386/mm/fault.c 2007-01-10 09:44:52.000000000 +0100
+++ ipipe-2.6.19-nocow/arch/i386/mm/fault.c 2007-01-10 18:08:18.000000000 +0100
@@ -654,3 +654,19 @@ void vmalloc_sync_all(void)
}
}
#endif
+
+#ifdef CONFIG_IPIPE
+int ipipe_arch_map_vm_area_to_mm(struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ unsigned long next, addr = start;
+
+ do {
+ next = pgd_addr_end(addr, end);
+ vmalloc_sync_one(mm->pgd, addr);
+ } while (addr = next, addr != end);
+
+ return 0;
+}
+#endif /* CONFIG_IPIPE */
diff -Naurdp -x '*~' ipipe-2.6.19/include/asm-i386/pgalloc.h ipipe-2.6.19-nocow/include/asm-i386/pgalloc.h
--- ipipe-2.6.19/include/asm-i386/pgalloc.h 2007-01-10 09:44:53.000000000 +0100
+++ ipipe-2.6.19-nocow/include/asm-i386/pgalloc.h 2007-01-10 12:09:27.000000000 +0100
@@ -46,27 +46,4 @@ static inline void pte_free(struct page
#define check_pgt_cache() do { } while (0)
-static inline void set_pgdir(unsigned long address, pgd_t entry)
-{
-#ifdef CONFIG_IPIPE
- struct task_struct * p;
- struct page *page;
- pgd_t *pgd;
-
- read_lock(&tasklist_lock);
-
- for_each_process(p) {
- if(p->mm)
- *pgd_offset(p->mm,address) = entry;
- }
-
- read_unlock(&tasklist_lock);
-
- for (page = pgd_list; page; page = (struct page *)page->index) {
- pgd = (pgd_t *)page_address(page);
- pgd[address >> PGDIR_SHIFT] = entry;
- }
-#endif /* CONFIG_IPIPE */
-}
-
#endif /* _I386_PGALLOC_H */
diff -Naurdp -x '*~' ipipe-2.6.19/include/linux/mm.h ipipe-2.6.19-nocow/include/linux/mm.h
--- ipipe-2.6.19/include/linux/mm.h 2007-01-04 10:10:33.000000000 +0100
+++ ipipe-2.6.19-nocow/include/linux/mm.h 2007-01-10 18:49:24.000000000 +0100
@@ -166,6 +166,7 @@ extern unsigned int kobjsize(const void
#define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */
#define VM_MAPPED_COPY 0x01000000 /* T if mapped copy of data (nommu mmap) */
#define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */
+#define VM_NOFAULT 0x10000000 /* Disable faults for the vma */
#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */
#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
diff -Naurdp -x '*~' ipipe-2.6.19/include/linux/sched.h ipipe-2.6.19-nocow/include/linux/sched.h
--- ipipe-2.6.19/include/linux/sched.h 2007-01-10 09:44:53.000000000 +0100
+++ ipipe-2.6.19-nocow/include/linux/sched.h 2007-01-10 11:08:20.000000000 +0100
@@ -363,6 +363,10 @@ struct mm_struct {
/* aio bits */
rwlock_t ioctx_list_lock;
struct kioctx *ioctx_list;
+
+#if CONFIG_IPIPE
+ struct list_head nofault;
+#endif /* CONFIG_IPIPE */
};
struct sighand_struct {
diff -Naurdp -x '*~' ipipe-2.6.19/kernel/fork.c ipipe-2.6.19-nocow/kernel/fork.c
--- ipipe-2.6.19/kernel/fork.c 2007-01-10 09:44:53.000000000 +0100
+++ ipipe-2.6.19-nocow/kernel/fork.c 2007-01-10 12:24:36.000000000 +0100
@@ -385,6 +385,7 @@ void mmput(struct mm_struct *mm)
if (atomic_dec_and_test(&mm->mm_users)) {
ipipe_cleanup_notify(mm);
+ ipipe_destroy_nofault_mm(mm);
exit_aio(mm);
exit_mmap(mm);
if (!list_empty(&mm->mmlist)) {
diff -Naurdp -x '*~' ipipe-2.6.19/lib/ioremap.c ipipe-2.6.19-nocow/lib/ioremap.c
--- ipipe-2.6.19/lib/ioremap.c 2007-01-10 09:44:53.000000000 +0100
+++ ipipe-2.6.19-nocow/lib/ioremap.c 2007-01-10 17:10:00.000000000 +0100
@@ -85,9 +85,10 @@ int ioremap_page_range(unsigned long add
err = ioremap_pud_range(pgd, addr, next, phys_addr+addr, prot);
if (err)
break;
- set_pgdir(addr, *pgd);
} while (pgd++, addr = next, addr != end);
-
+#ifdef CONFIG_IPIPE
+ ipipe_update_nofault_mms(start, end);
+#endif /* CONFIG_IPIPE */
flush_cache_vmap(start, end);
return err;
diff -Naurdp -x '*~' ipipe-2.6.19/mm/memory.c ipipe-2.6.19-nocow/mm/memory.c
--- ipipe-2.6.19/mm/memory.c 2007-01-04 10:10:35.000000000 +0100
+++ ipipe-2.6.19-nocow/mm/memory.c 2007-01-10 18:41:10.000000000 +0100
@@ -50,6 +50,9 @@
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/writeback.h>
+#ifdef CONFIG_IPIPE
+#include <linux/vmalloc.h> /* For vmlist */
+#endif /* CONFIG_IPIPE */
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
@@ -418,13 +421,41 @@ struct page *vm_normal_page(struct vm_ar
return pfn_to_page(pfn);
}
+static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va)
+{
+ /*
+ * If the source page was a PFN mapping, we don't have
+ * a "struct page" for it. We do a best-effort copy by
+ * just copying from the original user address. If that
+ * fails, we just zero-fill it. Live with it.
+ */
+ if (unlikely(!src)) {
+ void *kaddr = kmap_atomic(dst, KM_USER0);
+ void __user *uaddr = (void __user *)(va & PAGE_MASK);
+
+ /*
+ * This really shouldn't fail, because the page is there
+ * in the page tables. But it might just be unreadable,
+ * in which case we just give up and fill the result with
+ * zeroes.
+ */
+ if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
+ memset(kaddr, 0, PAGE_SIZE);
+ kunmap_atomic(kaddr, KM_USER0);
+ flush_dcache_page(dst);
+ return;
+
+ }
+ copy_user_highpage(dst, src, va);
+}
+
/*
* copy one vm_area from one task to the other. Assumes the page tables
* already present in the new task to be cleared in the whole range
* covered by this vma.
*/
-static inline void
+static inline int
copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
unsigned long addr, int *rss)
@@ -466,6 +497,25 @@ copy_one_pte(struct mm_struct *dst_mm, s
* in the parent and the child
*/
if (is_cow_mapping(vm_flags)) {
+#ifdef CONFIG_IPIPE
+ if (((vm_flags|src_mm->def_flags) & (VM_LOCKED|VM_NOFAULT)) == (VM_LOCKED|VM_NOFAULT)) {
+ struct page *old_page = vm_normal_page(vma, addr, pte);
+ page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
+ if (!page)
+ return -ENOMEM;
+
+ cow_user_page(page, old_page, addr);
+ pte = mk_pte(page, vma->vm_page_prot);
+
+ if (vm_flags & VM_SHARED)
+ pte = pte_mkclean(pte);
+ pte = pte_mkold(pte);
+
+ page_dup_rmap(page);
+ rss[!!PageAnon(page)]++;
+ goto out_set_pte;
+ }
+#endif /* CONFIG_IPIPE */
ptep_set_wrprotect(src_mm, addr, src_pte);
pte = pte_wrprotect(pte);
}
@@ -487,6 +537,7 @@ copy_one_pte(struct mm_struct *dst_mm, s
out_set_pte:
set_pte_at(dst_mm, addr, dst_pte, pte);
+ return 0;
}
static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -524,7 +575,9 @@ again:
progress++;
continue;
}
- copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
+ if (copy_one_pte(dst_mm, src_mm, dst_pte,
+ src_pte, vma, addr, rss))
+ return -ENOMEM;
progress += 8;
} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
@@ -1431,34 +1484,6 @@ static inline pte_t maybe_mkwrite(pte_t
return pte;
}
-static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va)
-{
- /*
- * If the source page was a PFN mapping, we don't have
- * a "struct page" for it. We do a best-effort copy by
- * just copying from the original user address. If that
- * fails, we just zero-fill it. Live with it.
- */
- if (unlikely(!src)) {
- void *kaddr = kmap_atomic(dst, KM_USER0);
- void __user *uaddr = (void __user *)(va & PAGE_MASK);
-
- /*
- * This really shouldn't fail, because the page is there
- * in the page tables. But it might just be unreadable,
- * in which case we just give up and fill the result with
- * zeroes.
- */
- if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
- memset(kaddr, 0, PAGE_SIZE);
- kunmap_atomic(kaddr, KM_USER0);
- flush_dcache_page(dst);
- return;
-
- }
- copy_user_highpage(dst, src, va);
-}
-
/*
* This routine handles present pages, when users try to write
* to a shared page. It is done by copying the page to a new address
@@ -2676,3 +2701,163 @@ int access_process_vm(struct task_struct
return buf - old_buf;
}
+
+#ifdef CONFIG_IPIPE
+static LIST_HEAD(nofault_mms);
+static DEFINE_RWLOCK(nofault_mms_lock);
+
+static int ipipe_fault_pte_range(struct mm_struct *mm, pmd_t *pmd,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ int progress = 0;
+ spinlock_t *ptl;
+ pte_t *pte;
+
+ again:
+ pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ do {
+ if (progress >= 32) {
+ progress = 0;
+ if (need_resched() ||
+ need_lockbreak(ptl))
+ break;
+ }
+
+ if (do_wp_page(mm, vma, addr, pte, pmd, ptl, *pte) == VM_FAULT_OOM)
+ return -ENOMEM;
+
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+ pte_unmap_unlock(pte - 1, ptl);
+ cond_resched();
+ if (addr != end)
+ goto again;
+ return 0;
+}
+
+static int ipipe_fault_pmd_range(struct mm_struct *mm, pud_t *pud,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ unsigned long next;
+ pmd_t *pmd;
+
+ pmd = pmd_offset(pud, addr);
+ do {
+ next = pmd_addr_end(addr, end);
+ if (ipipe_fault_pte_range(mm, pmd, vma, addr, end))
+ return -ENOMEM;
+ } while (pmd++, addr = next, addr != end);
+ return 0;
+}
+
+static int ipipe_fault_pud_range(struct mm_struct *mm, pgd_t *pgd,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ unsigned long next;
+ pud_t *pud;
+
+ pud = pud_offset(pgd, addr);
+ do {
+ next = pud_addr_end(addr, end);
+ if (ipipe_fault_pmd_range(mm, pud, vma, addr, end))
+ return -ENOMEM;
+ } while (pud++, addr = next, addr != end);
+ return 0;
+}
+
+int ipipe_disable_task_faults(struct task_struct *tsk)
+{
+ unsigned long addr, next, end;
+ struct vm_area_struct *vma;
+ struct vm_struct *area;
+ struct mm_struct *mm;
+ int result = 0;
+ pgd_t *pgd;
+
+ mm = get_task_mm(tsk);
+ if (!mm)
+ return -EPERM;
+
+ down_write(&mm->mmap_sem);
+ if (mm->def_flags & VM_NOFAULT)
+ goto up_mmap_sem_done;
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (!is_cow_mapping(vma->vm_flags))
+ continue;
+
+ addr = vma->vm_start;
+ end = vma->vm_end;
+
+ pgd = pgd_offset(mm, addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ if (ipipe_fault_pud_range(mm, pgd, vma, addr, next)) {
+ result = -ENOMEM;
+ up_mmap_sem_done:
+ up_write(&mm->mmap_sem);
+ goto done_mm;
+ }
+ } while (pgd++, addr = next, addr != end);
+ }
+ mm->def_flags |= VM_NOFAULT;
+ up_write(&mm->mmap_sem);
+
+ read_lock(&vmlist_lock);
+ down_write(&mm->mmap_sem);
+ for (area = vmlist; area; area = area->next) {
+ result = ipipe_arch_map_vm_area_to_mm(mm,
+ (unsigned long) area->addr,
+ (unsigned long) area->addr
+ + area->size);
+ if (result) {
+ mm->def_flags &= ~VM_NOFAULT;
+ up_write(&mm->mmap_sem);
+ goto done_vmlist;
+ }
+ }
+ up_write(&mm->mmap_sem);
+
+ write_lock(&nofault_mms_lock);
+ list_add(&mm->nofault, &nofault_mms);
+ write_unlock(&nofault_mms_lock);
+
+ done_vmlist:
+ read_unlock(&vmlist_lock);
+ done_mm:
+ mmput(mm);
+ return result;
+}
+
+EXPORT_SYMBOL(ipipe_disable_task_faults);
+
+int ipipe_update_nofault_mms(unsigned long start, unsigned long end)
+{
+ struct mm_struct *mm;
+ int result = 0;
+
+ read_lock(&nofault_mms_lock);
+ list_for_each_entry(mm, &nofault_mms, nofault) {
+ down_write(&mm->mmap_sem);
+ result = ipipe_arch_map_vm_area_to_mm(mm, start, end);
+ up_write(&mm->mmap_sem);
+
+ if (result)
+ break;
+ }
+ read_unlock(&nofault_mms_lock);
+
+ return result;
+}
+
+void ipipe_destroy_nofault_mm(struct mm_struct *mm)
+{
+ if (mm->def_flags & VM_NOFAULT) {
+ write_lock(&nofault_mms_lock);
+ list_del(&mm->nofault);
+ write_unlock(&nofault_mms_lock);
+ }
+}
+#endif
diff -Naurdp -x '*~' ipipe-2.6.19/mm/vmalloc.c ipipe-2.6.19-nocow/mm/vmalloc.c
--- ipipe-2.6.19/mm/vmalloc.c 2007-01-10 11:22:05.000000000 +0100
+++ ipipe-2.6.19-nocow/mm/vmalloc.c 2007-01-10 17:09:46.000000000 +0100
@@ -152,15 +152,14 @@ int map_vm_area(struct vm_struct *area,
BUG_ON(addr >= end);
pgd = pgd_offset_k(addr);
do {
- pgd_t oldpgd;
- memcpy(&oldpgd,pgd,sizeof(pgd_t));
next = pgd_addr_end(addr, end);
err = vmap_pud_range(pgd, addr, next, prot, pages);
if (err)
break;
- if (pgd_val(oldpgd) != pgd_val(*pgd))
- set_pgdir(addr, *pgd);
} while (pgd++, addr = next, addr != end);
+#ifdef CONFIG_IPIPE
+ ipipe_update_nofault_mms((unsigned long) area->addr, end);
+#endif /* CONFIG_IPIPE */
flush_cache_vmap((unsigned long) area->addr, end);
return err;
}
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [Xenomai-core] Nocow patch.
2007-01-10 18:05 Gilles Chanteperdrix
@ 2007-01-11 7:38 ` Niklaus Giger
2007-01-11 8:43 ` Gilles Chanteperdrix
2007-01-11 15:18 ` Gilles Chanteperdrix
1 sibling, 1 reply; 11+ messages in thread
From: Niklaus Giger @ 2007-01-11 7:38 UTC (permalink / raw)
To: xenomai
Am Mittwoch, 10. Januar 2007 19:05 schrieb Gilles Chanteperdrix:
> Hi,
<..>
> This was run on x86, but need further testing before inclusion.
I wanted to give it a try on my PPC board. But trying to apply it (on a Linux
2.6.19.1 kernel patched with
ksrc/arch/powerpc/patches/adeos-ipipe-2.6.19-ppc-1.5-01.patch) gave me the
error
> Hunk #1 succeeded at 652 (offset -2 lines).
> patching file include/asm-i386/pgalloc.h
> Reversed (or previously applied) patch detected! Assume -R? [n]
Same problem with ioremap.c.
Is there a problem or did I apply it against the wrong kernel?
Best regards
--
Niklaus Giger
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [Xenomai-core] Nocow patch.
2007-01-11 7:38 ` Niklaus Giger
@ 2007-01-11 8:43 ` Gilles Chanteperdrix
2007-01-11 8:46 ` Gilles Chanteperdrix
0 siblings, 1 reply; 11+ messages in thread
From: Gilles Chanteperdrix @ 2007-01-11 8:43 UTC (permalink / raw)
To: niklaus.giger; +Cc: xenomai
Niklaus Giger wrote:
> Am Mittwoch, 10. Januar 2007 19:05 schrieb Gilles Chanteperdrix:
>
>>Hi,
>
> <..>
>
>>This was run on x86, but need further testing before inclusion.
>
> I wanted to give it a try on my PPC board. But trying to apply it (on a Linux
> 2.6.19.1 kernel patched with
> ksrc/arch/powerpc/patches/adeos-ipipe-2.6.19-ppc-1.5-01.patch) gave me the
> error
>
>>Hunk #1 succeeded at 652 (offset -2 lines).
>>patching file include/asm-i386/pgalloc.h
>>Reversed (or previously applied) patch detected! Assume -R? [n]
>
> Same problem with ioremap.c.
> Is there a problem or did I apply it against the wrong kernel?
It is a patch for x86 only for now; other architectures will be
implemented when it is stable on x86.
--
Gilles Chanteperdrix
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [Xenomai-core] Nocow patch.
2007-01-10 18:05 Gilles Chanteperdrix
2007-01-11 7:38 ` Niklaus Giger
@ 2007-01-11 15:18 ` Gilles Chanteperdrix
2007-01-13 18:57 ` Philippe Gerum
1 sibling, 1 reply; 11+ messages in thread
From: Gilles Chanteperdrix @ 2007-01-11 15:18 UTC (permalink / raw)
To: Gilles Chanteperdrix; +Cc: xenomai-core
[-- Attachment #1: Type: text/plain, Size: 915 bytes --]
Gilles Chanteperdrix wrote:
> This was run on x86, but need further testing before inclusion.
Here is a new version, after testing. It appears to run fine. I tested
forking in real-time applications both before and after calling
rt_task_shadow, and vmallocing areas of 256 MB and memsetting them from
both non-real-time and real-time contexts, and it works.
The next step is to clean up the patch, but I have to admit that I need
some help: should I keep the functions in the files where I put them?
In what headers should I declare them? Should I define an empty
ipipe_update_nofault_mms when CONFIG_IPIPE is not set in order to avoid
a few #ifdefs?
Note that in order to use the patch, you have to call
ipipe_disable_task_faults(current) in xnshadow_map instead of simply
setting the VM_NOCOW flag.
I will now test the patch on ARM.
--
Gilles Chanteperdrix
[-- Attachment #2: vm-nocow-2.6.19.3.patch --]
[-- Type: text/x-patch, Size: 12690 bytes --]
diff -Naurdp -x '*~' ipipe-2.6.19/arch/i386/mm/fault.c ipipe-2.6.19-nocow/arch/i386/mm/fault.c
--- ipipe-2.6.19/arch/i386/mm/fault.c 2007-01-10 09:44:52.000000000 +0100
+++ ipipe-2.6.19-nocow/arch/i386/mm/fault.c 2007-01-11 09:58:49.000000000 +0100
@@ -654,3 +654,19 @@ void vmalloc_sync_all(void)
}
}
#endif
+
+#ifdef CONFIG_IPIPE
+int ipipe_arch_map_vm_area_to_mm(struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ unsigned long next, addr = start;
+
+ do {
+ next = pgd_addr_end(addr, end);
+ vmalloc_sync_one(mm->pgd, addr);
+ } while (addr = next, addr != end);
+
+ return 0;
+}
+#endif /* CONFIG_IPIPE */
diff -Naurdp -x '*~' ipipe-2.6.19/include/asm-i386/pgalloc.h ipipe-2.6.19-nocow/include/asm-i386/pgalloc.h
--- ipipe-2.6.19/include/asm-i386/pgalloc.h 2007-01-10 09:44:53.000000000 +0100
+++ ipipe-2.6.19-nocow/include/asm-i386/pgalloc.h 2007-01-11 09:58:49.000000000 +0100
@@ -46,27 +46,4 @@ static inline void pte_free(struct page
#define check_pgt_cache() do { } while (0)
-static inline void set_pgdir(unsigned long address, pgd_t entry)
-{
-#ifdef CONFIG_IPIPE
- struct task_struct * p;
- struct page *page;
- pgd_t *pgd;
-
- read_lock(&tasklist_lock);
-
- for_each_process(p) {
- if(p->mm)
- *pgd_offset(p->mm,address) = entry;
- }
-
- read_unlock(&tasklist_lock);
-
- for (page = pgd_list; page; page = (struct page *)page->index) {
- pgd = (pgd_t *)page_address(page);
- pgd[address >> PGDIR_SHIFT] = entry;
- }
-#endif /* CONFIG_IPIPE */
-}
-
#endif /* _I386_PGALLOC_H */
diff -Naurdp -x '*~' ipipe-2.6.19/include/linux/mm.h ipipe-2.6.19-nocow/include/linux/mm.h
--- ipipe-2.6.19/include/linux/mm.h 2007-01-04 10:10:33.000000000 +0100
+++ ipipe-2.6.19-nocow/include/linux/mm.h 2007-01-11 09:58:49.000000000 +0100
@@ -166,6 +166,7 @@ extern unsigned int kobjsize(const void
#define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */
#define VM_MAPPED_COPY 0x01000000 /* T if mapped copy of data (nommu mmap) */
#define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */
+#define VM_NOFAULT 0x10000000 /* Disable faults for the vma */
#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */
#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
diff -Naurdp -x '*~' ipipe-2.6.19/include/linux/sched.h ipipe-2.6.19-nocow/include/linux/sched.h
--- ipipe-2.6.19/include/linux/sched.h 2007-01-10 09:44:53.000000000 +0100
+++ ipipe-2.6.19-nocow/include/linux/sched.h 2007-01-11 09:58:49.000000000 +0100
@@ -363,6 +363,10 @@ struct mm_struct {
/* aio bits */
rwlock_t ioctx_list_lock;
struct kioctx *ioctx_list;
+
+#if CONFIG_IPIPE
+ struct list_head nofault;
+#endif /* CONFIG_IPIPE */
};
struct sighand_struct {
diff -Naurdp -x '*~' ipipe-2.6.19/kernel/fork.c ipipe-2.6.19-nocow/kernel/fork.c
--- ipipe-2.6.19/kernel/fork.c 2007-01-10 09:44:53.000000000 +0100
+++ ipipe-2.6.19-nocow/kernel/fork.c 2007-01-11 15:32:25.000000000 +0100
@@ -385,6 +385,7 @@ void mmput(struct mm_struct *mm)
if (atomic_dec_and_test(&mm->mm_users)) {
ipipe_cleanup_notify(mm);
+ ipipe_destroy_nofault_mm(mm);
exit_aio(mm);
exit_mmap(mm);
if (!list_empty(&mm->mmlist)) {
diff -Naurdp -x '*~' ipipe-2.6.19/lib/ioremap.c ipipe-2.6.19-nocow/lib/ioremap.c
--- ipipe-2.6.19/lib/ioremap.c 2007-01-10 09:44:53.000000000 +0100
+++ ipipe-2.6.19-nocow/lib/ioremap.c 2007-01-11 09:58:49.000000000 +0100
@@ -85,9 +85,10 @@ int ioremap_page_range(unsigned long add
err = ioremap_pud_range(pgd, addr, next, phys_addr+addr, prot);
if (err)
break;
- set_pgdir(addr, *pgd);
} while (pgd++, addr = next, addr != end);
-
+#ifdef CONFIG_IPIPE
+ ipipe_update_nofault_mms(start, end);
+#endif /* CONFIG_IPIPE */
flush_cache_vmap(start, end);
return err;
diff -Naurdp -x '*~' ipipe-2.6.19/mm/memory.c ipipe-2.6.19-nocow/mm/memory.c
--- ipipe-2.6.19/mm/memory.c 2007-01-04 10:10:35.000000000 +0100
+++ ipipe-2.6.19-nocow/mm/memory.c 2007-01-11 15:50:37.000000000 +0100
@@ -50,6 +50,9 @@
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/writeback.h>
+#ifdef CONFIG_IPIPE
+#include <linux/vmalloc.h> /* For vmlist */
+#endif /* CONFIG_IPIPE */
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
@@ -418,13 +421,41 @@ struct page *vm_normal_page(struct vm_ar
return pfn_to_page(pfn);
}
+static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va)
+{
+ /*
+ * If the source page was a PFN mapping, we don't have
+ * a "struct page" for it. We do a best-effort copy by
+ * just copying from the original user address. If that
+ * fails, we just zero-fill it. Live with it.
+ */
+ if (unlikely(!src)) {
+ void *kaddr = kmap_atomic(dst, KM_USER0);
+ void __user *uaddr = (void __user *)(va & PAGE_MASK);
+
+ /*
+ * This really shouldn't fail, because the page is there
+ * in the page tables. But it might just be unreadable,
+ * in which case we just give up and fill the result with
+ * zeroes.
+ */
+ if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
+ memset(kaddr, 0, PAGE_SIZE);
+ kunmap_atomic(kaddr, KM_USER0);
+ flush_dcache_page(dst);
+ return;
+
+ }
+ copy_user_highpage(dst, src, va);
+}
+
/*
* copy one vm_area from one task to the other. Assumes the page tables
* already present in the new task to be cleared in the whole range
* covered by this vma.
*/
-static inline void
+static inline int
copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
unsigned long addr, int *rss)
@@ -466,6 +497,25 @@ copy_one_pte(struct mm_struct *dst_mm, s
* in the parent and the child
*/
if (is_cow_mapping(vm_flags)) {
+#ifdef CONFIG_IPIPE
+ if (((vm_flags|src_mm->def_flags) & (VM_LOCKED|VM_NOFAULT)) == (VM_LOCKED|VM_NOFAULT)) {
+ struct page *old_page = vm_normal_page(vma, addr, pte);
+ page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
+ if (!page)
+ return -ENOMEM;
+
+ cow_user_page(page, old_page, addr);
+ pte = mk_pte(page, vma->vm_page_prot);
+
+ if (vm_flags & VM_SHARED)
+ pte = pte_mkclean(pte);
+ pte = pte_mkold(pte);
+
+ page_dup_rmap(page);
+ rss[!!PageAnon(page)]++;
+ goto out_set_pte;
+ }
+#endif /* CONFIG_IPIPE */
ptep_set_wrprotect(src_mm, addr, src_pte);
pte = pte_wrprotect(pte);
}
@@ -487,6 +537,7 @@ copy_one_pte(struct mm_struct *dst_mm, s
out_set_pte:
set_pte_at(dst_mm, addr, dst_pte, pte);
+ return 0;
}
static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -524,7 +575,9 @@ again:
progress++;
continue;
}
- copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
+ if (copy_one_pte(dst_mm, src_mm, dst_pte,
+ src_pte, vma, addr, rss))
+ return -ENOMEM;
progress += 8;
} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
@@ -1431,34 +1484,6 @@ static inline pte_t maybe_mkwrite(pte_t
return pte;
}
-static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va)
-{
- /*
- * If the source page was a PFN mapping, we don't have
- * a "struct page" for it. We do a best-effort copy by
- * just copying from the original user address. If that
- * fails, we just zero-fill it. Live with it.
- */
- if (unlikely(!src)) {
- void *kaddr = kmap_atomic(dst, KM_USER0);
- void __user *uaddr = (void __user *)(va & PAGE_MASK);
-
- /*
- * This really shouldn't fail, because the page is there
- * in the page tables. But it might just be unreadable,
- * in which case we just give up and fill the result with
- * zeroes.
- */
- if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
- memset(kaddr, 0, PAGE_SIZE);
- kunmap_atomic(kaddr, KM_USER0);
- flush_dcache_page(dst);
- return;
-
- }
- copy_user_highpage(dst, src, va);
-}
-
/*
* This routine handles present pages, when users try to write
* to a shared page. It is done by copying the page to a new address
@@ -2676,3 +2701,150 @@ int access_process_vm(struct task_struct
return buf - old_buf;
}
+
+#ifdef CONFIG_IPIPE
+static LIST_HEAD(nofault_mms);
+static DEFINE_RWLOCK(nofault_mms_lock);
+
+static int ipipe_fault_pte_range(struct mm_struct *mm, pmd_t *pmd,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ spinlock_t *ptl;
+ pte_t *pte;
+
+ do {
+ pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+
+ if (do_wp_page(mm, vma, addr, pte, pmd, ptl, *pte) == VM_FAULT_OOM)
+ return -ENOMEM;
+ } while (addr += PAGE_SIZE, addr != end);
+ return 0;
+}
+
+static int ipipe_fault_pmd_range(struct mm_struct *mm, pud_t *pud,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ unsigned long next;
+ pmd_t *pmd;
+
+ pmd = pmd_offset(pud, addr);
+ do {
+ next = pmd_addr_end(addr, end);
+ if (ipipe_fault_pte_range(mm, pmd, vma, addr, end))
+ return -ENOMEM;
+ } while (pmd++, addr = next, addr != end);
+ return 0;
+}
+
+static int ipipe_fault_pud_range(struct mm_struct *mm, pgd_t *pgd,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ unsigned long next;
+ pud_t *pud;
+
+ pud = pud_offset(pgd, addr);
+ do {
+ next = pud_addr_end(addr, end);
+ if (ipipe_fault_pmd_range(mm, pud, vma, addr, end))
+ return -ENOMEM;
+ } while (pud++, addr = next, addr != end);
+ return 0;
+}
+
+int ipipe_disable_task_faults(struct task_struct *tsk)
+{
+ unsigned long addr, next, end;
+ struct vm_area_struct *vma;
+ struct vm_struct *area;
+ struct mm_struct *mm;
+ int result = 0;
+ pgd_t *pgd;
+
+ mm = get_task_mm(tsk);
+ if (!mm)
+ return -EPERM;
+
+ down_write(&mm->mmap_sem);
+ if (mm->def_flags & VM_NOFAULT)
+ goto up_mmap_sem_done;
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (!is_cow_mapping(vma->vm_flags))
+ continue;
+
+ addr = vma->vm_start;
+ end = vma->vm_end;
+
+ pgd = pgd_offset(mm, addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ if (ipipe_fault_pud_range(mm, pgd, vma, addr, next)) {
+ result = -ENOMEM;
+ up_mmap_sem_done:
+ up_write(&mm->mmap_sem);
+ goto done_mm;
+ }
+ } while (pgd++, addr = next, addr != end);
+ }
+ mm->def_flags |= VM_NOFAULT;
+ up_write(&mm->mmap_sem);
+
+ read_lock(&vmlist_lock);
+ down_write(&mm->mmap_sem);
+ for (area = vmlist; area; area = area->next) {
+ result = ipipe_arch_map_vm_area_to_mm(mm,
+ (unsigned long) area->addr,
+ (unsigned long) area->addr
+ + area->size);
+ if (result) {
+ mm->def_flags &= ~VM_NOFAULT;
+ up_write(&mm->mmap_sem);
+ goto done_vmlist;
+ }
+ }
+ up_write(&mm->mmap_sem);
+
+ write_lock(&nofault_mms_lock);
+ list_add(&mm->nofault, &nofault_mms);
+ write_unlock(&nofault_mms_lock);
+
+ done_vmlist:
+ read_unlock(&vmlist_lock);
+ done_mm:
+ mmput(mm);
+ return result;
+}
+
+EXPORT_SYMBOL(ipipe_disable_task_faults);
+
+int ipipe_update_nofault_mms(unsigned long start, unsigned long end)
+{
+ struct mm_struct *mm;
+ int result = 0;
+
+ read_lock(&nofault_mms_lock);
+ list_for_each_entry(mm, &nofault_mms, nofault) {
+ down_write(&mm->mmap_sem);
+ result = ipipe_arch_map_vm_area_to_mm(mm, start, end);
+ up_write(&mm->mmap_sem);
+
+ if (result)
+ break;
+ }
+ read_unlock(&nofault_mms_lock);
+
+ return result;
+}
+
+void ipipe_destroy_nofault_mm(struct mm_struct *mm)
+{
+ if (mm->def_flags & VM_NOFAULT) {
+ write_lock(&nofault_mms_lock);
+ list_del(&mm->nofault);
+ write_unlock(&nofault_mms_lock);
+ }
+}
+#endif
diff -Naurdp -x '*~' ipipe-2.6.19/mm/mlock.c ipipe-2.6.19-nocow/mm/mlock.c
--- ipipe-2.6.19/mm/mlock.c 2007-01-02 10:59:48.000000000 +0100
+++ ipipe-2.6.19-nocow/mm/mlock.c 2007-01-11 15:32:09.000000000 +0100
@@ -166,7 +166,7 @@ static int do_mlockall(int flags)
if (flags & MCL_FUTURE)
def_flags = VM_LOCKED;
- current->mm->def_flags = def_flags;
+ current->mm->def_flags |= def_flags;
if (flags == MCL_FUTURE)
goto out;
diff -Naurdp -x '*~' ipipe-2.6.19/mm/vmalloc.c ipipe-2.6.19-nocow/mm/vmalloc.c
--- ipipe-2.6.19/mm/vmalloc.c 2007-01-10 11:22:05.000000000 +0100
+++ ipipe-2.6.19-nocow/mm/vmalloc.c 2007-01-11 09:58:49.000000000 +0100
@@ -152,15 +152,14 @@ int map_vm_area(struct vm_struct *area,
BUG_ON(addr >= end);
pgd = pgd_offset_k(addr);
do {
- pgd_t oldpgd;
- memcpy(&oldpgd,pgd,sizeof(pgd_t));
next = pgd_addr_end(addr, end);
err = vmap_pud_range(pgd, addr, next, prot, pages);
if (err)
break;
- if (pgd_val(oldpgd) != pgd_val(*pgd))
- set_pgdir(addr, *pgd);
} while (pgd++, addr = next, addr != end);
+#ifdef CONFIG_IPIPE
+ ipipe_update_nofault_mms((unsigned long) area->addr, end);
+#endif /* CONFIG_IPIPE */
flush_cache_vmap((unsigned long) area->addr, end);
return err;
}
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [Xenomai-core] Nocow patch.
2007-01-11 15:18 ` Gilles Chanteperdrix
@ 2007-01-13 18:57 ` Philippe Gerum
2007-01-15 10:48 ` Gilles Chanteperdrix
0 siblings, 1 reply; 11+ messages in thread
From: Philippe Gerum @ 2007-01-13 18:57 UTC (permalink / raw)
To: Gilles Chanteperdrix; +Cc: xenomai-core
On Thu, 2007-01-11 at 16:18 +0100, Gilles Chanteperdrix wrote:
> Gilles Chanteperdrix wrote:
> > This was run on x86, but need further testing before inclusion.
>
> Here is a new version, after testing. It appears to run fine. I tested
> forking in real-time applications both before and after calling
> rt_task_shadow, and vmallocing areas of 256 MB and memsetting them from
> both non-real-time and real-time contexts, and it works.
>
> The next step is to clean up the patch, but I have to admit that I need
> some help: should I keep the functions in the files where I put them ?
> in what headers should I declare them ? Should I define an empty
> ipipe_update_nofault_mms when CONFIG_IPIPE is not set in order to avoid
> a few #ifdefs ?
> diff -Naurdp -x '*~' ipipe-2.6.19/arch/i386/mm/fault.c ipipe-2.6.19-nocow/arch/i386/mm/fault.c
> --- ipipe-2.6.19/arch/i386/mm/fault.c 2007-01-10 09:44:52.000000000 +0100
> +++ ipipe-2.6.19-nocow/arch/i386/mm/fault.c 2007-01-11 09:58:49.000000000 +0100
> @@ -654,3 +654,19 @@ void vmalloc_sync_all(void)
> }
> }
> #endif
> +
> +#ifdef CONFIG_IPIPE
> +int ipipe_arch_map_vm_area_to_mm(struct mm_struct *mm,
> + unsigned long start,
> + unsigned long end)
> +{
__ipipe_pin_range_mapping() would better identify an internal routine
which somehow wires the mapping of a virtual address range into a
memory context.
[...]
> +
> +#if CONFIG_IPIPE
> + struct list_head nofault;
s,nofault,pinned, ? The point is that the NOFAULT feature does not
really disable all faults, but only faults leading to lazy/ondemand
mappings. E.g. pathological faults would still raise exceptions.
> +#endif /* CONFIG_IPIPE */
> };
>
> struct sighand_struct {
> diff -Naurdp -x '*~' ipipe-2.6.19/kernel/fork.c ipipe-2.6.19-nocow/kernel/fork.c
> --- ipipe-2.6.19/kernel/fork.c 2007-01-10 09:44:53.000000000 +0100
> +++ ipipe-2.6.19-nocow/kernel/fork.c 2007-01-11 15:32:25.000000000 +0100
> @@ -385,6 +385,7 @@ void mmput(struct mm_struct *mm)
>
> if (atomic_dec_and_test(&mm->mm_users)) {
> ipipe_cleanup_notify(mm);
> + ipipe_destroy_nofault_mm(mm);
We may want to merge both into the notification trigger. Those
nitty-gritty I-pipe details ought to be gathered; after all, removing
the mm from the pinned mm queue is also a cleanup operation. This
would also remove the need for adding a placeholder in the
!CONFIG_IPIPE case.
[...]
> -
> +#ifdef CONFIG_IPIPE
> + ipipe_update_nofault_mms(start, end);
I'd suggest something like __ipipe_update_all_pinned_mm().
[...]
> +#ifdef CONFIG_IPIPE
> +#include <linux/vmalloc.h> /* For vmlist */
> +#endif /* CONFIG_IPIPE */
No need for a noisy conditional here. Including linux/vmalloc.h has no
undesirable side-effect in the !CONFIG_IPIPE case anyway.
[...]
> +
> +#ifdef CONFIG_IPIPE
> +static LIST_HEAD(nofault_mms);
> +static DEFINE_RWLOCK(nofault_mms_lock);
> +
> +static int ipipe_fault_pte_range(struct mm_struct *mm, pmd_t *pmd,
> + struct vm_area_struct *vma,
> + unsigned long addr, unsigned long
[...]
> +static int ipipe_fault_pmd_range(struct mm_struct *mm, pud_t *pud,
> + struct vm_area_struct *vma,
> + unsigned long addr, unsigned long end)
[...]
> +static int ipipe_fault_pud_range(struct mm_struct *mm, pgd_t *pgd,
> + struct vm_area_struct *vma,
> + unsigned long addr, unsigned long end)
[...]
Those routines are good candidates for inlining.
> +int ipipe_disable_task_faults(struct task_struct *tsk)
> +{
ipipe_disable_ondemand_mappings() would be more accurate.
[...]
> +#ifdef CONFIG_IPIPE
> + ipipe_update_nofault_mms((unsigned long) area->addr, end);
> +#endif /* CONFIG_IPIPE */
Better define a nop placeholder for __ipipe_update_all_pinned_mm()
in the !CONFIG_IPIPE case instead of the conditional.
--
Philippe.
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [Xenomai-core] Nocow patch.
2007-01-13 18:57 ` Philippe Gerum
@ 2007-01-15 10:48 ` Gilles Chanteperdrix
2007-01-19 9:22 ` Gilles Chanteperdrix
0 siblings, 1 reply; 11+ messages in thread
From: Gilles Chanteperdrix @ 2007-01-15 10:48 UTC (permalink / raw)
To: rpm; +Cc: xenomai-core
[-- Attachment #1: Type: text/plain, Size: 4413 bytes --]
Philippe Gerum wrote:
> On Thu, 2007-01-11 at 16:18 +0100, Gilles Chanteperdrix wrote:
>
>>Gilles Chanteperdrix wrote:
>>
>>>This was run on x86, but need further testing before inclusion.
>>
>>Here is a new version, after testing. It appears to run fine. I tested
>>forking in real-time applications both before and after calling
>>rt_task_shadow, and vmallocing areas of 256 MB and memsetting them from
>>both non-real-time and real-time contexts, and it works.
>>
>>The next step is to clean up the patch, but I have to admit that I need
>>some help: should I keep the functions in the files where I put them ?
>>in what headers should I declare them ? Should I define an empty
>>ipipe_update_nofault_mms when CONFIG_IPIPE is not set in order to avoid
>>a few #ifdefs ?
>
>
>>diff -Naurdp -x '*~' ipipe-2.6.19/arch/i386/mm/fault.c ipipe-2.6.19-nocow/arch/i386/mm/fault.c
>>--- ipipe-2.6.19/arch/i386/mm/fault.c 2007-01-10 09:44:52.000000000 +0100
>>+++ ipipe-2.6.19-nocow/arch/i386/mm/fault.c 2007-01-11 09:58:49.000000000 +0100
>>@@ -654,3 +654,19 @@ void vmalloc_sync_all(void)
>> }
>> }
>> #endif
>>+
>>+#ifdef CONFIG_IPIPE
>>+int ipipe_arch_map_vm_area_to_mm(struct mm_struct *mm,
>>+ unsigned long start,
>>+ unsigned long end)
>>+{
>
>
> __ipipe_pin_range_mapping() would better identify an internal routine
> which somehow wires the mapping of a virtual address range into a
> memory context.
>
> [...]
>
>
>>+
>>+#if CONFIG_IPIPE
>>+ struct list_head nofault;
>
>
> s,nofault,pinned, ? The point is that the NOFAULT feature does not
> really disable all faults, but only faults leading to lazy/ondemand
> mappings. E.g. pathological faults would still raise exceptions.
>
>
>>+#endif /* CONFIG_IPIPE */
>> };
>>
>> struct sighand_struct {
>>diff -Naurdp -x '*~' ipipe-2.6.19/kernel/fork.c ipipe-2.6.19-nocow/kernel/fork.c
>>--- ipipe-2.6.19/kernel/fork.c 2007-01-10 09:44:53.000000000 +0100
>>+++ ipipe-2.6.19-nocow/kernel/fork.c 2007-01-11 15:32:25.000000000 +0100
>>@@ -385,6 +385,7 @@ void mmput(struct mm_struct *mm)
>>
>> if (atomic_dec_and_test(&mm->mm_users)) {
>> ipipe_cleanup_notify(mm);
>>+ ipipe_destroy_nofault_mm(mm);
>
>
> We may want to merge both into the notification trigger. Those
> nitty-gritty I-pipe details ought to be gathered; after all, removing
> the mm from the pinned mm queue is also a cleanup operation. This
> would also remove the need for adding a placeholder in the
> !CONFIG_IPIPE case.
>
> [...]
>
>
>>-
>>+#ifdef CONFIG_IPIPE
>>+ ipipe_update_nofault_mms(start, end);
>
>
> I'd suggest something like __ipipe_update_all_pinned_mm().
>
> [...]
>
>
>>+#ifdef CONFIG_IPIPE
>>+#include <linux/vmalloc.h> /* For vmlist */
>>+#endif /* CONFIG_IPIPE */
>
>
> No need for noisy conditional here. Including linux/vmalloc.h has no
> undesirable side-effect in the !CONFIG_IPIPE case anyway.
>
> [...]
>
>
>>+
>>+#ifdef CONFIG_IPIPE
>>+static LIST_HEAD(nofault_mms);
>>+static DEFINE_RWLOCK(nofault_mms_lock);
>>+
>>+static int ipipe_fault_pte_range(struct mm_struct *mm, pmd_t *pmd,
>>+ struct vm_area_struct *vma,
>>+ unsigned long addr, unsigned long
>
>
> [...]
>
>
>>+static int ipipe_fault_pmd_range(struct mm_struct *mm, pud_t *pud,
>>+ struct vm_area_struct *vma,
>>+ unsigned long addr, unsigned long end)
>
>
> [...]
>
>
>>+static int ipipe_fault_pud_range(struct mm_struct *mm, pgd_t *pgd,
>>+ struct vm_area_struct *vma,
>>+ unsigned long addr, unsigned long end)
>
>
> [...]
>
> Those routines are good candidates for inlining.
>
>
>>+int ipipe_disable_task_faults(struct task_struct *tsk)
>>+{
>
>
> ipipe_disable_ondemand_mappings() would be more accurate.
>
> [...]
>
>
>>+#ifdef CONFIG_IPIPE
>>+ ipipe_update_nofault_mms((unsigned long) area->addr, end);
>>+#endif /* CONFIG_IPIPE */
>
>
> Better define a nop placeholder for __ipipe_update_all_pinned_mm()
> in the !CONFIG_IPIPE case instead of the conditional.
>
Here is an updated version, following your advice.
However, after looking at the ARM patch, I am not so sure
__ipipe_update_all_pinned_mm() is the way to go on all architectures.
The ARM I-pipe handles vmalloc and ioremap faults without causing a mode
switch; I wonder whether that is not better than having
__ipipe_update_all_pinned_mm() updating page directories all over the
place.
--
Gilles Chanteperdrix
[-- Attachment #2: vm-nocow-2.6.19.4.patch --]
[-- Type: text/x-patch, Size: 13951 bytes --]
diff -Naurdp -x '*~' ipipe-2.6.19/arch/i386/mm/fault.c ipipe-2.6.19-nocow/arch/i386/mm/fault.c
--- ipipe-2.6.19/arch/i386/mm/fault.c 2007-01-10 09:44:52.000000000 +0100
+++ ipipe-2.6.19-nocow/arch/i386/mm/fault.c 2007-01-15 09:57:02.000000000 +0100
@@ -654,3 +654,18 @@ void vmalloc_sync_all(void)
}
}
#endif
+
+#ifdef CONFIG_IPIPE
+int __ipipe_pin_range_mapping(struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ unsigned long next, addr = start;
+
+ do {
+ next = pgd_addr_end(addr, end);
+ vmalloc_sync_one(mm->pgd, addr);
+ } while (addr = next, addr != end);
+
+ return 0;
+}
+#endif /* CONFIG_IPIPE */
diff -Naurdp -x '*~' ipipe-2.6.19/include/asm-i386/pgalloc.h ipipe-2.6.19-nocow/include/asm-i386/pgalloc.h
--- ipipe-2.6.19/include/asm-i386/pgalloc.h 2007-01-10 09:44:53.000000000 +0100
+++ ipipe-2.6.19-nocow/include/asm-i386/pgalloc.h 2007-01-11 09:58:49.000000000 +0100
@@ -46,27 +46,4 @@ static inline void pte_free(struct page
#define check_pgt_cache() do { } while (0)
-static inline void set_pgdir(unsigned long address, pgd_t entry)
-{
-#ifdef CONFIG_IPIPE
- struct task_struct * p;
- struct page *page;
- pgd_t *pgd;
-
- read_lock(&tasklist_lock);
-
- for_each_process(p) {
- if(p->mm)
- *pgd_offset(p->mm,address) = entry;
- }
-
- read_unlock(&tasklist_lock);
-
- for (page = pgd_list; page; page = (struct page *)page->index) {
- pgd = (pgd_t *)page_address(page);
- pgd[address >> PGDIR_SHIFT] = entry;
- }
-#endif /* CONFIG_IPIPE */
-}
-
#endif /* _I386_PGALLOC_H */
diff -Naurdp -x '*~' ipipe-2.6.19/include/linux/ipipe.h ipipe-2.6.19-nocow/include/linux/ipipe.h
--- ipipe-2.6.19/include/linux/ipipe.h 2007-01-10 09:44:53.000000000 +0100
+++ ipipe-2.6.19-nocow/include/linux/ipipe.h 2007-01-15 10:52:06.000000000 +0100
@@ -337,6 +337,15 @@ int fastcall __ipipe_dispatch_wired(stru
void fastcall __ipipe_sync_stage(unsigned long syncmask);
+int __ipipe_update_all_pinned_mm(unsigned long start, unsigned long end);
+
+struct mm_struct;
+
+void __ipipe_unlink_pinned_mm(struct mm_struct *mm);
+
+int __ipipe_pin_range_mapping(struct mm_struct *mm,
+ unsigned long start, unsigned long end);
+
#ifndef __ipipe_sync_pipeline
#define __ipipe_sync_pipeline(syncmask) __ipipe_sync_stage(syncmask)
#endif
@@ -434,12 +443,11 @@ static inline void ipipe_init_notify(str
__ipipe_dispatch_event(IPIPE_EVENT_INIT,p);
}
-struct mm_struct;
-
static inline void ipipe_cleanup_notify(struct mm_struct *mm)
{
if (__ipipe_event_monitored_p(IPIPE_EVENT_CLEANUP))
__ipipe_dispatch_event(IPIPE_EVENT_CLEANUP,mm);
+ __ipipe_unlink_pinned_mm(mm);
}
/* Public interface */
@@ -643,6 +651,8 @@ int fastcall ipipe_set_ptd(int key,
void fastcall *ipipe_get_ptd(int key);
+int ipipe_disable_ondemand_mappings(struct task_struct *tsk);
+
#define local_irq_enable_hw_cond() local_irq_enable_hw()
#define local_irq_disable_hw_cond() local_irq_disable_hw()
#define local_irq_save_hw_cond(flags) local_irq_save_hw(flags)
@@ -690,6 +700,7 @@ void fastcall *ipipe_get_ptd(int key);
#define ipipe_cleanup_notify(mm) do { } while(0)
#define ipipe_trap_notify(t,r) 0
#define ipipe_init_proc() do { } while(0)
+#define __ipipe_update_all_pinned_mm(start, end) 0
#define local_irq_enable_hw_cond() do { } while(0)
#define local_irq_disable_hw_cond() do { } while(0)
diff -Naurdp -x '*~' ipipe-2.6.19/include/linux/mm.h ipipe-2.6.19-nocow/include/linux/mm.h
--- ipipe-2.6.19/include/linux/mm.h 2007-01-04 10:10:33.000000000 +0100
+++ ipipe-2.6.19-nocow/include/linux/mm.h 2007-01-15 10:06:44.000000000 +0100
@@ -166,6 +166,7 @@ extern unsigned int kobjsize(const void
#define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */
#define VM_MAPPED_COPY 0x01000000 /* T if mapped copy of data (nommu mmap) */
#define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */
+#define VM_PINNED 0x10000000 /* Disable faults for the vma */
#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */
#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
diff -Naurdp -x '*~' ipipe-2.6.19/include/linux/sched.h ipipe-2.6.19-nocow/include/linux/sched.h
--- ipipe-2.6.19/include/linux/sched.h 2007-01-10 09:44:53.000000000 +0100
+++ ipipe-2.6.19-nocow/include/linux/sched.h 2007-01-15 10:07:45.000000000 +0100
@@ -363,6 +363,10 @@ struct mm_struct {
/* aio bits */
rwlock_t ioctx_list_lock;
struct kioctx *ioctx_list;
+
+#if CONFIG_IPIPE
+ struct list_head pinned;
+#endif /* CONFIG_IPIPE */
};
struct sighand_struct {
diff -Naurdp -x '*~' ipipe-2.6.19/lib/ioremap.c ipipe-2.6.19-nocow/lib/ioremap.c
--- ipipe-2.6.19/lib/ioremap.c 2007-01-10 09:44:53.000000000 +0100
+++ ipipe-2.6.19-nocow/lib/ioremap.c 2007-01-15 11:23:54.000000000 +0100
@@ -85,10 +85,9 @@ int ioremap_page_range(unsigned long add
err = ioremap_pud_range(pgd, addr, next, phys_addr+addr, prot);
if (err)
break;
- set_pgdir(addr, *pgd);
} while (pgd++, addr = next, addr != end);
-
- flush_cache_vmap(start, end);
+ __ipipe_update_all_pinned_mm(start, end);
+ flush_cache_vmap(start, end);
return err;
}
diff -Naurdp -x '*~' ipipe-2.6.19/mm/memory.c ipipe-2.6.19-nocow/mm/memory.c
--- ipipe-2.6.19/mm/memory.c 2007-01-04 10:10:35.000000000 +0100
+++ ipipe-2.6.19-nocow/mm/memory.c 2007-01-15 10:58:36.000000000 +0100
@@ -50,6 +50,7 @@
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/writeback.h>
+#include <linux/vmalloc.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
@@ -418,13 +419,41 @@ struct page *vm_normal_page(struct vm_ar
return pfn_to_page(pfn);
}
+static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va)
+{
+ /*
+ * If the source page was a PFN mapping, we don't have
+ * a "struct page" for it. We do a best-effort copy by
+ * just copying from the original user address. If that
+ * fails, we just zero-fill it. Live with it.
+ */
+ if (unlikely(!src)) {
+ void *kaddr = kmap_atomic(dst, KM_USER0);
+ void __user *uaddr = (void __user *)(va & PAGE_MASK);
+
+ /*
+ * This really shouldn't fail, because the page is there
+ * in the page tables. But it might just be unreadable,
+ * in which case we just give up and fill the result with
+ * zeroes.
+ */
+ if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
+ memset(kaddr, 0, PAGE_SIZE);
+ kunmap_atomic(kaddr, KM_USER0);
+ flush_dcache_page(dst);
+ return;
+
+ }
+ copy_user_highpage(dst, src, va);
+}
+
/*
* copy one vm_area from one task to the other. Assumes the page tables
* already present in the new task to be cleared in the whole range
* covered by this vma.
*/
-static inline void
+static inline int
copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
unsigned long addr, int *rss)
@@ -466,6 +495,25 @@ copy_one_pte(struct mm_struct *dst_mm, s
* in the parent and the child
*/
if (is_cow_mapping(vm_flags)) {
+#ifdef CONFIG_IPIPE
+ if (((vm_flags|src_mm->def_flags) & (VM_LOCKED|VM_PINNED)) == (VM_LOCKED|VM_PINNED)) {
+ struct page *old_page = vm_normal_page(vma, addr, pte);
+ page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
+ if (!page)
+ return -ENOMEM;
+
+ cow_user_page(page, old_page, addr);
+ pte = mk_pte(page, vma->vm_page_prot);
+
+ if (vm_flags & VM_SHARED)
+ pte = pte_mkclean(pte);
+ pte = pte_mkold(pte);
+
+ page_dup_rmap(page);
+ rss[!!PageAnon(page)]++;
+ goto out_set_pte;
+ }
+#endif /* CONFIG_IPIPE */
ptep_set_wrprotect(src_mm, addr, src_pte);
pte = pte_wrprotect(pte);
}
@@ -487,6 +535,7 @@ copy_one_pte(struct mm_struct *dst_mm, s
out_set_pte:
set_pte_at(dst_mm, addr, dst_pte, pte);
+ return 0;
}
static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -524,7 +573,9 @@ again:
progress++;
continue;
}
- copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
+ if (copy_one_pte(dst_mm, src_mm, dst_pte,
+ src_pte, vma, addr, rss))
+ return -ENOMEM;
progress += 8;
} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
@@ -1431,34 +1482,6 @@ static inline pte_t maybe_mkwrite(pte_t
return pte;
}
-static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va)
-{
- /*
- * If the source page was a PFN mapping, we don't have
- * a "struct page" for it. We do a best-effort copy by
- * just copying from the original user address. If that
- * fails, we just zero-fill it. Live with it.
- */
- if (unlikely(!src)) {
- void *kaddr = kmap_atomic(dst, KM_USER0);
- void __user *uaddr = (void __user *)(va & PAGE_MASK);
-
- /*
- * This really shouldn't fail, because the page is there
- * in the page tables. But it might just be unreadable,
- * in which case we just give up and fill the result with
- * zeroes.
- */
- if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
- memset(kaddr, 0, PAGE_SIZE);
- kunmap_atomic(kaddr, KM_USER0);
- flush_dcache_page(dst);
- return;
-
- }
- copy_user_highpage(dst, src, va);
-}
-
/*
* This routine handles present pages, when users try to write
* to a shared page. It is done by copying the page to a new address
@@ -2676,3 +2699,150 @@ int access_process_vm(struct task_struct
return buf - old_buf;
}
+
+#ifdef CONFIG_IPIPE
+/*
+ * Every mm for which ipipe_disable_ondemand_mappings() completed is
+ * linked here through its 'pinned' list member, so that
+ * __ipipe_update_all_pinned_mm() can visit all of them.
+ */
+static LIST_HEAD(pinned_mms);
+/*
+ * Guards pinned_mms: taken for writing when an mm is added or removed,
+ * for reading while the list is traversed.
+ */
+static DEFINE_RWLOCK(pinned_mms_lock);
+
+/*
+ * Break copy-on-write sharing for the pages mapped by a single page table.
+ *
+ * @mm:   address space being pinned
+ * @pmd:  page-middle-directory entry whose page table is walked
+ * @vma:  mapping that contains [addr, end)
+ * @addr: first address to process (page aligned)
+ * @end:  end of the range, exclusive; must not extend past the area
+ *        covered by @pmd
+ *
+ * Each present, write-protected pte is handed to do_wp_page(), exactly
+ * as a write fault on that address would.
+ *
+ * Returns 0 on success, -ENOMEM when do_wp_page() reports VM_FAULT_OOM.
+ */
+static inline int ipipe_pin_pte_range(struct mm_struct *mm, pmd_t *pmd,
+				      struct vm_area_struct *vma,
+				      unsigned long addr, unsigned long end)
+{
+	spinlock_t *ptl;
+	pte_t *pte;
+
+	do {
+		pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+
+		/*
+		 * Only a present, write-protected pte can be a pending
+		 * copy-on-write. Empty slots and swap/file entries have no
+		 * page to copy, and a writable pte is already private, so
+		 * none of them may be passed to do_wp_page().
+		 */
+		if (!pte_present(*pte) || pte_write(*pte)) {
+			pte_unmap_unlock(pte, ptl);
+			continue;
+		}
+
+		/*
+		 * The pte lock is not released here: do_wp_page() is relied
+		 * upon to unmap the pte and drop the lock before returning.
+		 */
+		if (do_wp_page(mm, vma, addr, pte, pmd, ptl, *pte) == VM_FAULT_OOM)
+			return -ENOMEM;
+	} while (addr += PAGE_SIZE, addr != end);
+	return 0;
+}
+
+/*
+ * Break copy-on-write sharing below one page-upper-directory entry.
+ *
+ * @mm:   address space being pinned
+ * @pud:  page-upper-directory entry whose pmd table is walked
+ * @vma:  mapping that contains [addr, end)
+ * @addr: first address to process
+ * @end:  end of the range, exclusive
+ *
+ * Returns 0 on success, -ENOMEM if a page could not be duplicated.
+ */
+static inline int ipipe_pin_pmd_range(struct mm_struct *mm, pud_t *pud,
+				      struct vm_area_struct *vma,
+				      unsigned long addr, unsigned long end)
+{
+	unsigned long next;
+	pmd_t *pmd;
+
+	pmd = pmd_offset(pud, addr);
+	do {
+		next = pmd_addr_end(addr, end);
+		/* No page table behind this entry: nothing is mapped. */
+		if (pmd_none_or_clear_bad(pmd))
+			continue;
+		/*
+		 * Stop at 'next', not 'end': the pte walker must stay
+		 * inside the page table that belongs to the current pmd.
+		 */
+		if (ipipe_pin_pte_range(mm, pmd, vma, addr, next))
+			return -ENOMEM;
+	} while (pmd++, addr = next, addr != end);
+	return 0;
+}
+
+/*
+ * Break copy-on-write sharing below one page-global-directory entry.
+ *
+ * @mm:   address space being pinned
+ * @pgd:  page-global-directory entry whose pud table is walked
+ * @vma:  mapping that contains [addr, end)
+ * @addr: first address to process
+ * @end:  end of the range, exclusive
+ *
+ * Returns 0 on success, -ENOMEM if a page could not be duplicated.
+ */
+static inline int ipipe_pin_pud_range(struct mm_struct *mm, pgd_t *pgd,
+				      struct vm_area_struct *vma,
+				      unsigned long addr, unsigned long end)
+{
+	unsigned long next;
+	pud_t *pud;
+
+	pud = pud_offset(pgd, addr);
+	do {
+		next = pud_addr_end(addr, end);
+		/* No pmd table behind this entry: nothing is mapped. */
+		if (pud_none_or_clear_bad(pud))
+			continue;
+		/*
+		 * Stop at 'next', not 'end': the pmd walker must stay
+		 * inside the table that belongs to the current pud.
+		 */
+		if (ipipe_pin_pmd_range(mm, pud, vma, addr, next))
+			return -ENOMEM;
+	} while (pud++, addr = next, addr != end);
+	return 0;
+}
+
+/*
+ * Pin the address space of @tsk so that it no longer relies on
+ * on-demand mappings.
+ *
+ * Two things are done under the mm's mmap_sem:
+ *  1. every copy-on-write mapping is walked and its pages are handed to
+ *     the pin helpers, which duplicate them up front;
+ *  2. every area currently on vmlist is passed to the arch hook
+ *     __ipipe_pin_range_mapping() for this mm.
+ * On success the mm is flagged VM_PINNED and added to pinned_mms.
+ *
+ * Returns 0 on success or when the mm is already flagged VM_PINNED,
+ * -EPERM when @tsk has no mm, -ENOMEM when a page could not be
+ * duplicated, or the error returned by __ipipe_pin_range_mapping().
+ */
+int ipipe_disable_ondemand_mappings(struct task_struct *tsk)
+{
+	unsigned long addr, next, end;
+	struct vm_area_struct *vma;
+	struct vm_struct *area;
+	struct mm_struct *mm;
+	int result = 0;
+	pgd_t *pgd;
+
+	mm = get_task_mm(tsk);
+	if (!mm)
+		return -EPERM;
+
+	/*
+	 * mmap_sem is a sleeping lock, so it is taken first and held
+	 * across both phases; the spinning vmlist_lock nests inside it.
+	 */
+	down_write(&mm->mmap_sem);
+	if (mm->def_flags & VM_PINNED)
+		goto out_unlock;
+
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (!is_cow_mapping(vma->vm_flags))
+			continue;
+
+		addr = vma->vm_start;
+		end = vma->vm_end;
+
+		pgd = pgd_offset(mm, addr);
+		do {
+			next = pgd_addr_end(addr, end);
+			/* Nothing mapped below this entry. */
+			if (pgd_none_or_clear_bad(pgd))
+				continue;
+			if (ipipe_pin_pud_range(mm, pgd, vma, addr, next)) {
+				result = -ENOMEM;
+				goto out_unlock;
+			}
+		} while (pgd++, addr = next, addr != end);
+	}
+	mm->def_flags |= VM_PINNED;
+
+	read_lock(&vmlist_lock);
+	for (area = vmlist; area; area = area->next) {
+		result = __ipipe_pin_range_mapping(mm,
+						   (unsigned long) area->addr,
+						   (unsigned long) area->addr
+						   + area->size);
+		if (result) {
+			/* Not fully pinned: do not advertise it as such. */
+			mm->def_flags &= ~VM_PINNED;
+			goto out_vmlist;
+		}
+	}
+
+	/*
+	 * This mm is not on pinned_mms yet, so a concurrent
+	 * __ipipe_update_all_pinned_mm() cannot be waiting for its
+	 * mmap_sem while we take pinned_mms_lock.
+	 */
+	write_lock(&pinned_mms_lock);
+	list_add(&mm->pinned, &pinned_mms);
+	write_unlock(&pinned_mms_lock);
+
+out_vmlist:
+	read_unlock(&vmlist_lock);
+out_unlock:
+	up_write(&mm->mmap_sem);
+	mmput(mm);
+	return result;
+}
+
+EXPORT_SYMBOL(ipipe_disable_ondemand_mappings);
+
+/*
+ * Propagate a kernel mapping covering [start, end) to every pinned mm.
+ *
+ * Each mm on pinned_mms is passed, with its mmap_sem held for writing,
+ * to __ipipe_pin_range_mapping(). The walk stops at the first failure.
+ *
+ * Returns 0 when every mm was updated (or the list is empty), otherwise
+ * the first non-zero value returned by __ipipe_pin_range_mapping().
+ *
+ * NOTE(review): down_write() may sleep while pinned_mms_lock, a
+ * spinning rwlock, is held for reading -- confirm this is acceptable or
+ * switch the list to a sleeping lock.
+ */
+int __ipipe_update_all_pinned_mm(unsigned long start, unsigned long end)
+{
+	struct list_head *pos;
+	int err = 0;
+
+	read_lock(&pinned_mms_lock);
+	list_for_each(pos, &pinned_mms) {
+		struct mm_struct *cur = list_entry(pos, struct mm_struct,
+						   pinned);
+
+		down_write(&cur->mmap_sem);
+		err = __ipipe_pin_range_mapping(cur, start, end);
+		up_write(&cur->mmap_sem);
+
+		if (err != 0)
+			break;
+	}
+	read_unlock(&pinned_mms_lock);
+
+	return err;
+}
+
+/*
+ * Drop @mm from the pinned_mms list.
+ *
+ * Does nothing unless @mm carries VM_PINNED in its def_flags, which is
+ * the marker used to tell whether the mm was put on the list.
+ */
+void __ipipe_unlink_pinned_mm(struct mm_struct *mm)
+{
+	if (!(mm->def_flags & VM_PINNED))
+		return;
+
+	write_lock(&pinned_mms_lock);
+	list_del(&mm->pinned);
+	write_unlock(&pinned_mms_lock);
+}
+#endif
diff -Naurdp -x '*~' ipipe-2.6.19/mm/mlock.c ipipe-2.6.19-nocow/mm/mlock.c
--- ipipe-2.6.19/mm/mlock.c 2007-01-02 10:59:48.000000000 +0100
+++ ipipe-2.6.19-nocow/mm/mlock.c 2007-01-11 15:32:09.000000000 +0100
@@ -166,7 +166,7 @@ static int do_mlockall(int flags)
if (flags & MCL_FUTURE)
def_flags = VM_LOCKED;
- current->mm->def_flags = def_flags;
+ current->mm->def_flags |= def_flags;
if (flags == MCL_FUTURE)
goto out;
diff -Naurdp -x '*~' ipipe-2.6.19/mm/vmalloc.c ipipe-2.6.19-nocow/mm/vmalloc.c
--- ipipe-2.6.19/mm/vmalloc.c 2007-01-10 11:22:05.000000000 +0100
+++ ipipe-2.6.19-nocow/mm/vmalloc.c 2007-01-15 10:05:57.000000000 +0100
@@ -152,15 +152,12 @@ int map_vm_area(struct vm_struct *area,
BUG_ON(addr >= end);
pgd = pgd_offset_k(addr);
do {
- pgd_t oldpgd;
- memcpy(&oldpgd,pgd,sizeof(pgd_t));
next = pgd_addr_end(addr, end);
err = vmap_pud_range(pgd, addr, next, prot, pages);
if (err)
break;
- if (pgd_val(oldpgd) != pgd_val(*pgd))
- set_pgdir(addr, *pgd);
} while (pgd++, addr = next, addr != end);
+ __ipipe_update_all_pinned_mm((unsigned long) area->addr, end);
flush_cache_vmap((unsigned long) area->addr, end);
return err;
}
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [Xenomai-core] Nocow patch.
2007-01-15 10:48 ` Gilles Chanteperdrix
@ 2007-01-19 9:22 ` Gilles Chanteperdrix
2007-01-19 9:58 ` Philippe Gerum
0 siblings, 1 reply; 11+ messages in thread
From: Gilles Chanteperdrix @ 2007-01-19 9:22 UTC (permalink / raw)
To: rpm; +Cc: xenomai-core
Gilles Chanteperdrix wrote:
> However, after looking at the ARM patch, I am not so sure
> __ipipe_update_all_pinned_mm() is the way to go on all architectures.
> The ARM I-pipe handles vmalloc and ioremap faults without causing a mode
> switch, I wonder if it is not better than having
> __ipipe_update_all_pinned_mm() updating page directories all over the
> place.
I checked with the I-pipe tracer the overhead on an ARM of a fault on a
vmalloced area: it costs around 5 us.
--
Gilles Chanteperdrix
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [Xenomai-core] Nocow patch.
2007-01-19 9:22 ` Gilles Chanteperdrix
@ 2007-01-19 9:58 ` Philippe Gerum
2007-01-19 10:10 ` Gilles Chanteperdrix
0 siblings, 1 reply; 11+ messages in thread
From: Philippe Gerum @ 2007-01-19 9:58 UTC (permalink / raw)
To: Gilles Chanteperdrix; +Cc: xenomai-core
On Fri, 2007-01-19 at 10:22 +0100, Gilles Chanteperdrix wrote:
> Gilles Chanteperdrix wrote:
> > However, after looking at the ARM patch, I am not so sure
> > __ipipe_update_all_pinned_mm() is the way to go on all architectures.
> > The ARM I-pipe handles vmalloc and ioremap faults without causing a mode
> > switch, I wonder if it is not better than having
> > __ipipe_update_all_pinned_mm() updating page directories all over the
> > place.
>
> I checked with the I-pipe tracer the overhead on an ARM of a fault on a
> vmalloced area: it costs around 5 us.
>
What is the average latency in user-space on this board for 1Khz and
10Khz periods?
Beyond that, we may want to keep both approaches at hand in the core
infrastructure; running the same benchmark on, say an integrator, may
give much higher latencies due to lousy cache issues. This still needs
to be verified though.
--
Philippe.
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [Xenomai-core] Nocow patch.
2007-01-19 9:58 ` Philippe Gerum
@ 2007-01-19 10:10 ` Gilles Chanteperdrix
0 siblings, 0 replies; 11+ messages in thread
From: Gilles Chanteperdrix @ 2007-01-19 10:10 UTC (permalink / raw)
To: rpm; +Cc: xenomai-core
Philippe Gerum wrote:
> On Fri, 2007-01-19 at 10:22 +0100, Gilles Chanteperdrix wrote:
>
>>Gilles Chanteperdrix wrote:
>>
>>>However, after looking at the ARM patch, I am not so sure
>>>__ipipe_update_all_pinned_mm() is the way to go on all architectures.
>>>The ARM I-pipe handles vmalloc and ioremap faults without causing a mode
>>>switch, I wonder if it is not better than having
>>>__ipipe_update_all_pinned_mm() updating page directories all over the
>>>place.
>>
>>I checked with the I-pipe tracer the overhead on an ARM of a fault on a
>>vmalloced area: it costs around 5 us.
>>
>
>
> What is the average latency in user-space on this board for 1Khz and
> 10Khz periods?
At 10 kHz we get a lockup. At 1 kHz, we get a worst case latency around
200 us, with an average latency around 150 us if running the cache
calibrator in the background.
>
> Beyond that, we may want to keep both approaches at hand in the core
> infrastructure; running the same benchmark on, say an integrator, may
> give much higher latencies due to lousy cache issues. This still needs
> to be verified though.
My original point was that the use of ipipe_disable_ondemand_mappings
may be traumatic on cache. I am wondering if we should not add a nucleus
syscall a bit like mlockall, to which we would pass some flags like
COW_CURRENT, VMALLOC_CURRENT, VMALLOC_FUTURE, and depending on these
flags, ipipe_disable_ondemand_mappings would only do parts of what it is
currently doing.
--
Gilles Chanteperdrix
^ permalink raw reply [flat|nested] 11+ messages in thread
end of thread, other threads:[~2007-01-31 8:37 UTC | newest]
Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-01-31 8:37 [Xenomai-core] Nocow patch Gilles Chanteperdrix
-- strict thread matches above, loose matches on Subject: below --
2007-01-10 18:05 Gilles Chanteperdrix
2007-01-11 7:38 ` Niklaus Giger
2007-01-11 8:43 ` Gilles Chanteperdrix
2007-01-11 8:46 ` Gilles Chanteperdrix
2007-01-11 15:18 ` Gilles Chanteperdrix
2007-01-13 18:57 ` Philippe Gerum
2007-01-15 10:48 ` Gilles Chanteperdrix
2007-01-19 9:22 ` Gilles Chanteperdrix
2007-01-19 9:58 ` Philippe Gerum
2007-01-19 10:10 ` Gilles Chanteperdrix
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.