* [RFP-V2 1/3] Have mmu_notifiers use SRCU so they may safely schedule. [not found] <20100202040145.555474000@alcatraz.americas.sgi.com> @ 2010-02-02 4:01 ` Robin Holt 2010-02-02 4:01 ` [RFP-V2 2/3] Fix unmap_vma() bug related to mmu_notifiers Robin Holt ` (2 subsequent siblings) 3 siblings, 0 replies; 26+ messages in thread From: Robin Holt @ 2010-02-02 4:01 UTC (permalink / raw) To: Andrew Morton, Andrea Arcangeli; +Cc: Jack Steiner, linux-mm [-- Attachment #1: mmu_notifier_srcu_v1 --] [-- Type: text/plain, Size: 9047 bytes --] From: Andrea Arcangeli <aarcange@redhat.com> With an RCU based mmu_notifier implementation, any callout to mmu_notifier_invalidate_range_start, mmu_notifier_invalidate_range_end, or mmu_notifier_invalidate_page would not be allowed to call schedule as that could potentially allow a modification to the mmu_notifier structure while it is currently being used. Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> Signed-off-by: Robin Holt <holt@sgi.com> To: Andrew Morton <akpm@linux-foundation.org> To: Andrea Arcangeli <aarcange@redhat.com> Cc: Jack Steiner <steiner@sgi.com> Cc: linux-mm@kvack.org --- include/linux/mmu_notifier.h | 3 ++ mm/mmu_notifier.c | 59 ++++++++++++++++++++++++++----------------- 2 files changed, 40 insertions(+), 22 deletions(-) Index: mmu_notifiers_sleepable_v2/include/linux/mmu_notifier.h =================================================================== --- mmu_notifiers_sleepable_v2.orig/include/linux/mmu_notifier.h 2010-02-01 11:24:27.000000000 -0600 +++ mmu_notifiers_sleepable_v2/include/linux/mmu_notifier.h 2010-02-01 11:34:38.000000000 -0600 @@ -4,6 +4,7 @@ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/mm_types.h> +#include <linux/srcu.h> struct mmu_notifier; struct mmu_notifier_ops; @@ -19,6 +20,8 @@ struct mmu_notifier_ops; struct mmu_notifier_mm { /* all mmu notifiers registerd in this mm are queued in this list */ struct hlist_head list; + /* srcu structure for this mm */ + struct srcu_struct srcu; /* to serialize the list modifications and hlist_unhashed */ spinlock_t lock; }; Index: mmu_notifiers_sleepable_v2/mm/mmu_notifier.c =================================================================== --- mmu_notifiers_sleepable_v2.orig/mm/mmu_notifier.c 2010-02-01 11:24:31.000000000 -0600 +++ mmu_notifiers_sleepable_v2/mm/mmu_notifier.c 2010-02-01 11:34:38.000000000 -0600 @@ -14,6 +14,7 @@ #include <linux/module.h> #include <linux/mm.h> #include <linux/err.h> +#include <linux/srcu.h> #include <linux/rcupdate.h> #include <linux/sched.h> @@ -24,14 +25,15 @@ * in parallel despite there being no task using this mm any more, * through the vmas outside of the exit_mmap context, such as with * vmtruncate. This serializes against mmu_notifier_unregister with - * the mmu_notifier_mm->lock in addition to RCU and it serializes - * against the other mmu notifiers with RCU. struct mmu_notifier_mm + * the mmu_notifier_mm->lock in addition to SRCU and it serializes + * against the other mmu notifiers with SRCU. struct mmu_notifier_mm * can't go away from under us as exit_mmap holds an mm_count pin * itself. */ void __mmu_notifier_release(struct mm_struct *mm) { struct mmu_notifier *mn; + int srcu; spin_lock(&mm->mmu_notifier_mm->lock); while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { @@ -46,10 +48,10 @@ void __mmu_notifier_release(struct mm_st */ hlist_del_init_rcu(&mn->hlist); /* - * RCU here will block mmu_notifier_unregister until + * SRCU here will block mmu_notifier_unregister until * ->release returns. 
*/ - rcu_read_lock(); + srcu = srcu_read_lock(&mm->mmu_notifier_mm->srcu); spin_unlock(&mm->mmu_notifier_mm->lock); /* * if ->release runs before mmu_notifier_unregister it @@ -60,13 +62,13 @@ void __mmu_notifier_release(struct mm_st */ if (mn->ops->release) mn->ops->release(mn, mm); - rcu_read_unlock(); + srcu_read_unlock(&mm->mmu_notifier_mm->srcu, srcu); spin_lock(&mm->mmu_notifier_mm->lock); } spin_unlock(&mm->mmu_notifier_mm->lock); /* - * synchronize_rcu here prevents mmu_notifier_release to + * synchronize_srcu here prevents mmu_notifier_release to * return to exit_mmap (which would proceed freeing all pages * in the mm) until the ->release method returns, if it was * invoked by mmu_notifier_unregister. @@ -74,7 +76,7 @@ void __mmu_notifier_release(struct mm_st * The mmu_notifier_mm can't go away from under us because one * mm_count is hold by exit_mmap. */ - synchronize_rcu(); + synchronize_srcu(&mm->mmu_notifier_mm->srcu); } /* @@ -87,14 +89,14 @@ int __mmu_notifier_clear_flush_young(str { struct mmu_notifier *mn; struct hlist_node *n; - int young = 0; + int young = 0, srcu; - rcu_read_lock(); + srcu = srcu_read_lock(&mm->mmu_notifier_mm->srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->clear_flush_young) young |= mn->ops->clear_flush_young(mn, mm, address); } - rcu_read_unlock(); + srcu_read_unlock(&mm->mmu_notifier_mm->srcu, srcu); return young; } @@ -124,13 +126,14 @@ void __mmu_notifier_invalidate_page(stru { struct mmu_notifier *mn; struct hlist_node *n; + int srcu; - rcu_read_lock(); + srcu = srcu_read_lock(&mm->mmu_notifier_mm->srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->invalidate_page) mn->ops->invalidate_page(mn, mm, address); } - rcu_read_unlock(); + srcu_read_unlock(&mm->mmu_notifier_mm->srcu, srcu); } void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, @@ -138,13 +141,14 @@ void __mmu_notifier_invalidate_range_sta { struct mmu_notifier *mn; struct hlist_node *n; + int srcu; - rcu_read_lock(); + srcu = srcu_read_lock(&mm->mmu_notifier_mm->srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->invalidate_range_start) mn->ops->invalidate_range_start(mn, mm, start, end); } - rcu_read_unlock(); + srcu_read_unlock(&mm->mmu_notifier_mm->srcu, srcu); } void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, @@ -152,13 +156,14 @@ void __mmu_notifier_invalidate_range_end { struct mmu_notifier *mn; struct hlist_node *n; + int srcu; - rcu_read_lock(); + srcu = srcu_read_lock(&mm->mmu_notifier_mm->srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->invalidate_range_end) mn->ops->invalidate_range_end(mn, mm, start, end); } - rcu_read_unlock(); + srcu_read_unlock(&mm->mmu_notifier_mm->srcu, srcu); } static int do_mmu_notifier_register(struct mmu_notifier *mn, @@ -175,6 +180,10 @@ static int do_mmu_notifier_register(stru if (unlikely(!mmu_notifier_mm)) goto out; + ret = init_srcu_struct(&mmu_notifier_mm->srcu); + if (unlikely(ret)) + goto out_kfree; + if (take_mmap_sem) down_write(&mm->mmap_sem); ret = mm_take_all_locks(mm); @@ -205,6 +214,9 @@ static int do_mmu_notifier_register(stru out_cleanup: if (take_mmap_sem) up_write(&mm->mmap_sem); + if (mmu_notifier_mm) + cleanup_srcu_struct(&mmu_notifier_mm->srcu); +out_kfree: /* kfree() does nothing if mmu_notifier_mm is NULL */ kfree(mmu_notifier_mm); out: @@ -245,6 +257,7 @@ EXPORT_SYMBOL_GPL(__mmu_notifier_registe void __mmu_notifier_mm_destroy(struct mm_struct 
*mm) { BUG_ON(!hlist_empty(&mm->mmu_notifier_mm->list)); + cleanup_srcu_struct(&mm->mmu_notifier_mm->srcu); kfree(mm->mmu_notifier_mm); mm->mmu_notifier_mm = LIST_POISON1; /* debug */ } @@ -252,8 +265,8 @@ void __mmu_notifier_mm_destroy(struct mm /* * This releases the mm_count pin automatically and frees the mm * structure if it was the last user of it. It serializes against - * running mmu notifiers with RCU and against mmu_notifier_unregister - * with the unregister lock + RCU. All sptes must be dropped before + * running mmu notifiers with SRCU and against mmu_notifier_unregister + * with the unregister lock + SRCU. All sptes must be dropped before * calling mmu_notifier_unregister. ->release or any other notifier * method may be invoked concurrently with mmu_notifier_unregister, * and only after mmu_notifier_unregister returned we're guaranteed @@ -265,13 +278,15 @@ void mmu_notifier_unregister(struct mmu_ spin_lock(&mm->mmu_notifier_mm->lock); if (!hlist_unhashed(&mn->hlist)) { + int srcu; + hlist_del_rcu(&mn->hlist); /* - * RCU here will force exit_mmap to wait ->release to finish + * SRCU here will force exit_mmap to wait ->release to finish * before freeing the pages. */ - rcu_read_lock(); + srcu = srcu_read_lock(&mm->mmu_notifier_mm->srcu); spin_unlock(&mm->mmu_notifier_mm->lock); /* * exit_mmap will block in mmu_notifier_release to @@ -280,7 +295,7 @@ void mmu_notifier_unregister(struct mmu_ */ if (mn->ops->release) mn->ops->release(mn, mm); - rcu_read_unlock(); + srcu_read_unlock(&mm->mmu_notifier_mm->srcu, srcu); } else spin_unlock(&mm->mmu_notifier_mm->lock); @@ -288,7 +303,7 @@ void mmu_notifier_unregister(struct mmu_ * Wait any running method to finish, of course including * ->release if it was run by mmu_notifier_relase instead of us. */ - synchronize_rcu(); + synchronize_srcu(&mm->mmu_notifier_mm->srcu); BUG_ON(atomic_read(&mm->mm_count) <= 0); -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 26+ messages in thread
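The shape of the conversion in patch 1/3 is the standard SRCU reader/updater pairing: every notifier callout now runs inside a per-mm SRCU read-side critical section, where (unlike under rcu_read_lock()) the callee may schedule, while the release/unregister paths use synchronize_srcu() so a notifier cannot be freed while a reader might still be dereferencing it. A minimal sketch of that pattern, detached from the mmu_notifier specifics (my_notifier, my_srcu and my_lock are illustrative names, not symbols from the patch):

	#include <linux/srcu.h>
	#include <linux/rculist.h>
	#include <linux/spinlock.h>
	#include <linux/slab.h>

	struct my_notifier {
		struct list_head list;
		void (*callback)(struct my_notifier *n);
	};

	static struct srcu_struct my_srcu;	/* init_srcu_struct(&my_srcu) at setup */
	static LIST_HEAD(my_notifiers);
	static DEFINE_SPINLOCK(my_lock);	/* serializes list updates */

	/* Reader side: callbacks are free to schedule inside the SRCU section. */
	static void call_notifiers(void)
	{
		struct my_notifier *n;
		int idx;

		idx = srcu_read_lock(&my_srcu);
		list_for_each_entry_rcu(n, &my_notifiers, list)
			n->callback(n);			/* may sleep */
		srcu_read_unlock(&my_srcu, idx);
	}

	/* Updater side: unlink, then wait out every reader before freeing. */
	static void remove_notifier(struct my_notifier *n)
	{
		spin_lock(&my_lock);
		list_del_rcu(&n->list);
		spin_unlock(&my_lock);
		synchronize_srcu(&my_srcu);	/* no reader can still see n */
		kfree(n);
	}

The patch instantiates exactly this shape, with the srcu_struct made per-mm (inside struct mmu_notifier_mm) so a sleeping reader in one mm never stalls unregistration in an unrelated mm.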
* [RFP-V2 2/3] Fix unmap_vma() bug related to mmu_notifiers [not found] <20100202040145.555474000@alcatraz.americas.sgi.com> 2010-02-02 4:01 ` [RFP-V2 1/3] Have mmu_notifiers use SRCU so they may safely schedule Robin Holt @ 2010-02-02 4:01 ` Robin Holt 2010-02-02 4:01 ` [RFP-V2 3/3] Make mmu_notifier_invalidate_range_start able to sleep Robin Holt 2010-02-02 8:09 ` [RFP-V2 0/3] " Christoph Hellwig 3 siblings, 0 replies; 26+ messages in thread From: Robin Holt @ 2010-02-02 4:01 UTC (permalink / raw) To: Andrew Morton, Andrea Arcangeli; +Cc: Jack Steiner, linux-mm [-- Attachment #1: mmu_notifier_tlb_v1 --] [-- Type: text/plain, Size: 4409 bytes --] unmap_vmas() can fail to correctly flush the TLB if a callout to mmu_notifier_invalidate_range_start() sleeps. The mmu_gather list is initialized prior to the callout. If it is reused while the thread is sleeping, the mm field may be invalid. If the task migrates to a different cpu, the task may use the wrong mmu_gather. The patch changes unmap_vmas() to initialize the mmu_gather AFTER the mmu_notifier completes. Signed-off-by: Jack Steiner <steiner@sgi.com> To: Andrew Morton <akpm@linux-foundation.org> To: Andrea Arcangeli <aarcange@redhat.com> Cc: Jack Steiner <steiner@sgi.com> Cc: linux-mm@kvack.org --- include/linux/mm.h | 2 +- mm/memory.c | 11 +++++++---- mm/mmap.c | 6 ++---- 3 files changed, 10 insertions(+), 9 deletions(-) Index: linux/include/linux/mm.h =================================================================== --- linux.orig/include/linux/mm.h 2010-01-25 01:45:37.000000000 -0600 +++ linux/include/linux/mm.h 2010-01-25 11:32:21.000000000 -0600 @@ -761,7 +761,7 @@ unsigned long zap_page_range(struct vm_a unsigned long unmap_vmas(struct mmu_gather **tlb, struct vm_area_struct *start_vma, unsigned long start_addr, unsigned long end_addr, unsigned long *nr_accounted, - struct zap_details *); + struct zap_details *, int fullmm); /** * mm_walk - callbacks for walk_page_range Index: linux/mm/memory.c =================================================================== --- linux.orig/mm/memory.c 2010-01-25 01:45:37.000000000 -0600 +++ linux/mm/memory.c 2010-01-25 11:32:21.000000000 -0600 @@ -1010,17 +1010,21 @@ static unsigned long unmap_page_range(st unsigned long unmap_vmas(struct mmu_gather **tlbp, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, unsigned long *nr_accounted, - struct zap_details *details) + struct zap_details *details, int fullmm) { long zap_work = ZAP_BLOCK_SIZE; unsigned long tlb_start = 0; /* For tlb_finish_mmu */ int tlb_start_valid = 0; unsigned long start = start_addr; spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; - int fullmm = (*tlbp)->fullmm; struct mm_struct *mm = vma->vm_mm; + /* + * mmu_notifier_invalidate_range_start can sleep. 
Don't initialize + mmu_gather until it completes + */ mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); + *tlbp = tlb_gather_mmu(mm, fullmm); for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { unsigned long end; @@ -1108,9 +1112,8 @@ unsigned long zap_page_range(struct vm_a unsigned long nr_accounted = 0; lru_add_drain(); - tlb = tlb_gather_mmu(mm, 0); update_hiwater_rss(mm); - end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); + end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details, 0); if (tlb) tlb_finish_mmu(tlb, address, end); return end; Index: linux/mm/mmap.c =================================================================== --- linux.orig/mm/mmap.c 2010-01-25 01:45:37.000000000 -0600 +++ linux/mm/mmap.c 2010-01-25 11:35:55.000000000 -0600 @@ -1824,9 +1824,8 @@ static void unmap_region(struct mm_struc unsigned long nr_accounted = 0; lru_add_drain(); - tlb = tlb_gather_mmu(mm, 0); update_hiwater_rss(mm); - unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); + unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL, 0); vm_unacct_memory(nr_accounted); free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, next? next->vm_start: 0); @@ -2168,10 +2167,9 @@ void exit_mmap(struct mm_struct *mm) lru_add_drain(); flush_cache_mm(mm); - tlb = tlb_gather_mmu(mm, 1); /* update_hiwater_rss(mm) here? but nobody should be looking */ /* Use -1 here to ensure all VMAs in the mm are unmapped */ - end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); + end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL, 1); vm_unacct_memory(nr_accounted); free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0); -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 26+ messages in thread
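The essence of patch 2/3 is pure ordering: tlb_gather_mmu() hands back per-cpu state, and nothing may schedule between claiming it and tlb_finish_mmu(), so the potentially sleeping notifier callout has to come first. A sketch of the new ordering in unmap_vmas(), reduced to the part that matters (unmap_vmas_sketch is an illustrative name, not the patched function itself):

	#include <linux/mm.h>
	#include <linux/mmu_notifier.h>
	#include <asm/tlb.h>

	static unsigned long unmap_vmas_sketch(struct mmu_gather **tlbp,
					       struct mm_struct *mm,
					       unsigned long start_addr,
					       unsigned long end_addr, int fullmm)
	{
		/* 1. Sleepable work first, while no per-cpu state is held. */
		mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);

		/*
		 * 2. Only now claim the per-cpu mmu_gather; from here until
		 * the caller runs tlb_finish_mmu(), sleeping could migrate
		 * the task or let another unmap on this cpu reuse the
		 * structure.
		 */
		*tlbp = tlb_gather_mmu(mm, fullmm);

		/* 3. Walk the vmas and zap into *tlbp as before. */
		return end_addr;
	}

This is also why the callers (zap_page_range(), unmap_region(), exit_mmap()) lose their tlb_gather_mmu() calls above and pass fullmm down instead.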
* [RFP-V2 3/3] Make mmu_notifier_invalidate_range_start able to sleep. [not found] <20100202040145.555474000@alcatraz.americas.sgi.com> 2010-02-02 4:01 ` [RFP-V2 1/3] Have mmu_notifiers use SRCU so they may safely schedule Robin Holt 2010-02-02 4:01 ` [RFP-V2 2/3] Fix unmap_vma() bug related to mmu_notifiers Robin Holt @ 2010-02-02 4:01 ` Robin Holt 2010-02-02 8:09 ` [RFP-V2 0/3] " Christoph Hellwig 3 siblings, 0 replies; 26+ messages in thread From: Robin Holt @ 2010-02-02 4:01 UTC (permalink / raw) To: Andrew Morton, Andrea Arcangeli; +Cc: Jack Steiner, linux-mm [-- Attachment #1: mmu_notifier_truncate_sleepable_v1 --] [-- Type: text/plain, Size: 13226 bytes --] Make the truncate case handle the need to sleep. We accomplish this by failing the mmu_notifier_invalidate_range_start(... atomic==1) case which in turn falls back to unmap_mapping_range_vma() with the restart_address == start_address. In that case, we make an additional callout to mmu_notifier_invalidate_range_start(... atomic==0) after the i_mmap_lock has been released. Signed-off-by: Robin Holt <holt@sgi.com> To: Andrew Morton <akpm@linux-foundation.org> To: Andrea Arcangeli <aarcange@redhat.com> Cc: Jack Steiner <steiner@sgi.com> Cc: linux-mm@kvack.org --- drivers/misc/sgi-gru/grutlbpurge.c | 8 +++++--- include/linux/mmu_notifier.h | 22 ++++++++++++---------- mm/fremap.c | 2 +- mm/hugetlb.c | 2 +- mm/memory.c | 29 +++++++++++++++++++++++------ mm/mmu_notifier.c | 10 ++++++---- mm/mprotect.c | 2 +- mm/mremap.c | 2 +- virt/kvm/kvm_main.c | 11 +++++++---- 9 files changed, 57 insertions(+), 31 deletions(-) Index: mmu_notifiers_sleepable_v2/drivers/misc/sgi-gru/grutlbpurge.c =================================================================== --- mmu_notifiers_sleepable_v2.orig/drivers/misc/sgi-gru/grutlbpurge.c 2010-02-01 21:10:06.000000000 -0600 +++ mmu_notifiers_sleepable_v2/drivers/misc/sgi-gru/grutlbpurge.c 2010-02-01 21:41:19.000000000 -0600 @@ -219,9 +219,10 @@ void gru_flush_all_tlb(struct gru_state /* * MMUOPS notifier callout functions */ -static void gru_invalidate_range_start(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, unsigned long end) +static int gru_invalidate_range_start(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, unsigned long end, + bool atomic) { struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct, ms_notifier); @@ -231,6 +232,7 @@ static void gru_invalidate_range_start(s gru_dbg(grudev, "gms %p, start 0x%lx, end 0x%lx, act %d\n", gms, start, end, atomic_read(&gms->ms_range_active)); gru_flush_tlb_range(gms, start, end - start); + return 0; } static void gru_invalidate_range_end(struct mmu_notifier *mn, Index: mmu_notifiers_sleepable_v2/include/linux/mmu_notifier.h =================================================================== --- mmu_notifiers_sleepable_v2.orig/include/linux/mmu_notifier.h 2010-02-01 21:10:20.000000000 -0600 +++ mmu_notifiers_sleepable_v2/include/linux/mmu_notifier.h 2010-02-01 21:10:21.000000000 -0600 @@ -127,9 +127,10 @@ struct mmu_notifier_ops { * address space but may still be referenced by sptes until * the last refcount is dropped. 
*/ - void (*invalidate_range_start)(struct mmu_notifier *mn, + int (*invalidate_range_start)(struct mmu_notifier *mn, struct mm_struct *mm, - unsigned long start, unsigned long end); + unsigned long start, unsigned long end, + bool atomic); void (*invalidate_range_end)(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end); @@ -170,8 +171,8 @@ extern void __mmu_notifier_change_pte(st unsigned long address, pte_t pte); extern void __mmu_notifier_invalidate_page(struct mm_struct *mm, unsigned long address); -extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, - unsigned long start, unsigned long end); +extern int __mmu_notifier_invalidate_range_start(struct mm_struct *mm, + unsigned long start, unsigned long end, bool atomic); extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, unsigned long start, unsigned long end); @@ -203,11 +204,12 @@ static inline void mmu_notifier_invalida __mmu_notifier_invalidate_page(mm, address); } -static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm, - unsigned long start, unsigned long end) +static inline int mmu_notifier_invalidate_range_start(struct mm_struct *mm, + unsigned long start, unsigned long end, bool atomic) { if (mm_has_notifiers(mm)) - __mmu_notifier_invalidate_range_start(mm, start, end); + return __mmu_notifier_invalidate_range_start(mm, start, end, atomic); + return 0; } static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm, @@ -288,10 +290,10 @@ static inline void mmu_notifier_invalida unsigned long address) { } - -static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm, - unsigned long start, unsigned long end) +static inline int mmu_notifier_invalidate_range_start(struct mm_struct *mm, + unsigned long start, unsigned long end, bool atomic) { + return 0; } static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm, Index: mmu_notifiers_sleepable_v2/mm/fremap.c =================================================================== --- mmu_notifiers_sleepable_v2.orig/mm/fremap.c 2010-02-01 21:10:06.000000000 -0600 +++ mmu_notifiers_sleepable_v2/mm/fremap.c 2010-02-01 21:41:19.000000000 -0600 @@ -226,7 +226,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsign vma->vm_flags = saved_flags; } - mmu_notifier_invalidate_range_start(mm, start, start + size); + mmu_notifier_invalidate_range_start(mm, start, start + size, 0); err = populate_range(mm, vma, start, size, pgoff); mmu_notifier_invalidate_range_end(mm, start, start + size); if (!err && !(flags & MAP_NONBLOCK)) { Index: mmu_notifiers_sleepable_v2/mm/hugetlb.c =================================================================== --- mmu_notifiers_sleepable_v2.orig/mm/hugetlb.c 2010-02-01 21:10:06.000000000 -0600 +++ mmu_notifiers_sleepable_v2/mm/hugetlb.c 2010-02-01 21:41:19.000000000 -0600 @@ -2159,7 +2159,7 @@ void __unmap_hugepage_range(struct vm_ar BUG_ON(start & ~huge_page_mask(h)); BUG_ON(end & ~huge_page_mask(h)); - mmu_notifier_invalidate_range_start(mm, start, end); + mmu_notifier_invalidate_range_start(mm, start, end, 0); spin_lock(&mm->page_table_lock); for (address = start; address < end; address += sz) { ptep = huge_pte_offset(mm, address); Index: mmu_notifiers_sleepable_v2/mm/memory.c =================================================================== --- mmu_notifiers_sleepable_v2.orig/mm/memory.c 2010-02-01 21:10:21.000000000 -0600 +++ mmu_notifiers_sleepable_v2/mm/memory.c 2010-02-01 21:41:19.000000000 -0600 @@ -786,7 +786,7 @@ int 
copy_page_range(struct mm_struct *ds * is_cow_mapping() returns true. */ if (is_cow_mapping(vma->vm_flags)) - mmu_notifier_invalidate_range_start(src_mm, addr, end); + mmu_notifier_invalidate_range_start(src_mm, addr, end, 0); ret = 0; dst_pgd = pgd_offset(dst_mm, addr); @@ -990,7 +990,8 @@ static unsigned long unmap_page_range(st * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here * @details: details of nonlinear truncation or shared cache invalidation * - * Returns the end address of the unmapping (restart addr if interrupted). + * Returns the end address of the unmapping (restart addr if interrupted, start + * if the i_mmap_lock is held and mmu_notifier_invalidate_range_start() needs to sleep). * * Unmap all pages in the vma list. * @@ -1023,7 +1024,10 @@ unsigned long unmap_vmas(struct mmu_gath * mmu_notifier_invalidate_range_start can sleep. Don't initialize * mmu_gather until it completes */ - mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); + if (mmu_notifier_invalidate_range_start(mm, start_addr, + end_addr, (i_mmap_lock != NULL))) + goto out; + *tlbp = tlb_gather_mmu(mm, fullmm); for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { unsigned long end; @@ -1107,7 +1111,7 @@ unsigned long zap_page_range(struct vm_a unsigned long size, struct zap_details *details) { struct mm_struct *mm = vma->vm_mm; - struct mmu_gather *tlb; + struct mmu_gather *tlb = NULL; unsigned long end = address + size; unsigned long nr_accounted = 0; @@ -1908,7 +1912,7 @@ int apply_to_page_range(struct mm_struct int err; BUG_ON(addr >= end); - mmu_notifier_invalidate_range_start(mm, start, end); + mmu_notifier_invalidate_range_start(mm, start, end, 0); pgd = pgd_offset(mm, addr); do { next = pgd_addr_end(addr, end); @@ -2329,6 +2333,7 @@ static int unmap_mapping_range_vma(struc { unsigned long restart_addr; int need_break; + int need_unlocked_invalidate; /* * files that support invalidating or truncating portions of the @@ -2350,7 +2355,9 @@ again: restart_addr = zap_page_range(vma, start_addr, end_addr - start_addr, details); - need_break = need_resched() || spin_needbreak(details->i_mmap_lock); + need_unlocked_invalidate = (restart_addr == start_addr); + need_break = need_resched() || spin_needbreak(details->i_mmap_lock) || + need_unlocked_invalidate; if (restart_addr >= end_addr) { /* We have now completed this vma: mark it so */ @@ -2365,6 +2372,16 @@ again: } spin_unlock(details->i_mmap_lock); + if (need_unlocked_invalidate) { + /* + * zap_page_range failed to make any progress because the + * mmu_notifier_invalidate_range_start was called atomically + * while the callee needed to sleep. In that event, we
+ */ + mmu_notifier_invalidate_range_start(vma->vm_mm, start_addr, end_addr, 0); + mmu_notifier_invalidate_range_end(vma->vm_mm, start_addr, end_addr); + } cond_resched(); spin_lock(details->i_mmap_lock); return -EINTR; Index: mmu_notifiers_sleepable_v2/mm/mmu_notifier.c =================================================================== --- mmu_notifiers_sleepable_v2.orig/mm/mmu_notifier.c 2010-02-01 21:10:20.000000000 -0600 +++ mmu_notifiers_sleepable_v2/mm/mmu_notifier.c 2010-02-01 21:41:19.000000000 -0600 @@ -135,20 +135,22 @@ void __mmu_notifier_invalidate_page(stru } srcu_read_unlock(&mm->mmu_notifier_mm->srcu, srcu); } - -void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, - unsigned long start, unsigned long end) +int __mmu_notifier_invalidate_range_start(struct mm_struct *mm, + unsigned long start, unsigned long end, bool atomic) { struct mmu_notifier *mn; struct hlist_node *n; int srcu; + int ret = 0; srcu = srcu_read_lock(&mm->mmu_notifier_mm->srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->invalidate_range_start) - mn->ops->invalidate_range_start(mn, mm, start, end); + ret |= mn->ops->invalidate_range_start(mn, mm, start, + end, atomic); } srcu_read_unlock(&mm->mmu_notifier_mm->srcu, srcu); + return ret; } void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, Index: mmu_notifiers_sleepable_v2/mm/mprotect.c =================================================================== --- mmu_notifiers_sleepable_v2.orig/mm/mprotect.c 2010-02-01 21:10:06.000000000 -0600 +++ mmu_notifiers_sleepable_v2/mm/mprotect.c 2010-02-01 21:41:19.000000000 -0600 @@ -204,7 +204,7 @@ success: dirty_accountable = 1; } - mmu_notifier_invalidate_range_start(mm, start, end); + mmu_notifier_invalidate_range_start(mm, start, end, 0); if (is_vm_hugetlb_page(vma)) hugetlb_change_protection(vma, start, end, vma->vm_page_prot); else Index: mmu_notifiers_sleepable_v2/mm/mremap.c =================================================================== --- mmu_notifiers_sleepable_v2.orig/mm/mremap.c 2010-02-01 21:10:06.000000000 -0600 +++ mmu_notifiers_sleepable_v2/mm/mremap.c 2010-02-01 21:41:19.000000000 -0600 @@ -82,7 +82,7 @@ static void move_ptes(struct vm_area_str old_start = old_addr; mmu_notifier_invalidate_range_start(vma->vm_mm, - old_start, old_end); + old_start, old_end, 0); if (vma->vm_file) { /* * Subtle point from Rajesh Venkatasubramanian: before Index: mmu_notifiers_sleepable_v2/virt/kvm/kvm_main.c =================================================================== --- mmu_notifiers_sleepable_v2.orig/virt/kvm/kvm_main.c 2010-02-01 21:10:06.000000000 -0600 +++ mmu_notifiers_sleepable_v2/virt/kvm/kvm_main.c 2010-02-01 21:41:19.000000000 -0600 @@ -259,10 +259,11 @@ static void kvm_mmu_notifier_change_pte( spin_unlock(&kvm->mmu_lock); } -static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, - unsigned long end) +static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, + unsigned long end, + bool atomic) { struct kvm *kvm = mmu_notifier_to_kvm(mn); int need_tlb_flush = 0; @@ -281,6 +282,8 @@ static void kvm_mmu_notifier_invalidate_ /* we've to flush the tlb before the pages can be freed */ if (need_tlb_flush) kvm_flush_remote_tlbs(kvm); + + return 0; } static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to 
majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 26+ messages in thread
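Seen from the notifier implementor's side, the new contract of patch 3/3 is: invalidate_range_start() now learns whether it was invoked from atomic context, and returning nonzero asks the core VM to back off and repeat the callout from a sleepable context. A hypothetical XPMEM-style notifier (the xpmem_* names below are made up for illustration; the series contains no XPMEM code) would look roughly like:

	static int xpmem_invalidate_range_start(struct mmu_notifier *mn,
						struct mm_struct *mm,
						unsigned long start,
						unsigned long end, bool atomic)
	{
		struct xpmem_segment *seg =
			container_of(mn, struct xpmem_segment, mmu_not);

		/*
		 * Tearing down remote ptes means messaging other hosts,
		 * which sleeps.  If the caller is atomic and there is
		 * work to do, punt: the core retries with atomic == 0
		 * after dropping the i_mmap_lock.
		 */
		if (atomic && xpmem_range_is_exported(seg, start, end))
			return 1;

		xpmem_clear_remote_ptes(seg, start, end);	/* may sleep */
		return 0;
	}

GRU and KVM never need to sleep here, which is why their conversions above only grow the extra parameter and unconditionally return 0.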
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep. [not found] <20100202040145.555474000@alcatraz.americas.sgi.com> ` (2 preceding siblings ...) 2010-02-02 4:01 ` [RFP-V2 3/3] Make mmu_notifier_invalidate_range_start able to sleep Robin Holt @ 2010-02-02 8:09 ` Christoph Hellwig 2010-02-02 12:59 ` Andrea Arcangeli 2010-02-02 13:35 ` Robin Holt 3 siblings, 2 replies; 26+ messages in thread From: Christoph Hellwig @ 2010-02-02 8:09 UTC (permalink / raw) To: Robin Holt; +Cc: Andrew Morton, Andrea Arcangeli, Jack Steiner, linux-mm On Mon, Feb 01, 2010 at 10:01:45PM -0600, Robin Holt wrote: > XPMEM would like to utilize mmu_notifiers to track page table entry > changes of the segment and keep the attachment page table/tlb information > consistent. Given that SGI just pushes XPMEM directly into the distributions instead of adding it upstream I don't really see the relevance of these patches. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep. 2010-02-02 8:09 ` [RFP-V2 0/3] " Christoph Hellwig @ 2010-02-02 12:59 ` Andrea Arcangeli 2010-02-02 13:13 ` Andrea Arcangeli 2010-02-02 13:23 ` Robin Holt 1 sibling, 2 replies; 26+ messages in thread From: Andrea Arcangeli @ 2010-02-02 12:59 UTC (permalink / raw) To: Christoph Hellwig; +Cc: Robin Holt, Andrew Morton, Jack Steiner, linux-mm On Tue, Feb 02, 2010 at 03:09:47AM -0500, Christoph Hellwig wrote: > On Mon, Feb 01, 2010 at 10:01:45PM -0600, Robin Holt wrote: > > XPMEM would like to utilize mmu_notifiers to track page table entry > > changes of the segment and keep the attachment page table/tlb information > > consistent. > > Given that SGI just pushes XPMEM directly into the distributions instead > of adding it upstream I don't really see the relevance of these patches. That will then prevent upstream modules from building against those kernels. Not a huge issue; for a distro that's an ok compromise. My real issue with mainline is that XPMEM is ok with breaking and corrupting memory if people use XPMEM on top of shared mappings (instead of anonymous ones), something a two liner change to the userland app opening the xpmem device is enough to trigger; but when the next mmu notifier user comes and asks for full scheduling across shared mappings too, because it needs security and a not-trusted user can open /dev/xpmem (or wherever that device is located), we'll have to undo this work and fix it the real way (with config option MMU_NOTIFIER_SLEEPABLE=y). But if distros have to support XPMEM in default kernels, this hack is better because it won't slow down the locking, even if it leaves holes and corrupts memory when XPMEM can be opened by a luser. It really depends on whether the user having access to the XPMEM device is malicious; if we know it's not (assume avatar distributed rendering in a closed environment or whatever), this again is an ok hack. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep. 2010-02-02 12:59 ` Andrea Arcangeli @ 2010-02-02 13:13 ` Andrea Arcangeli 2010-02-02 13:29 ` Robin Holt 0 siblings, 1 reply; 26+ messages in thread From: Andrea Arcangeli @ 2010-02-02 13:13 UTC (permalink / raw) To: Christoph Hellwig; +Cc: Robin Holt, Andrew Morton, Jack Steiner, linux-mm On Tue, Feb 02, 2010 at 01:59:43PM +0100, Andrea Arcangeli wrote: > slow down the locking, even if it leaves holes and corrupts memory when > XPMEM can be opened by a luser. It really depends on whether the user having > access to the XPMEM device is malicious; if we know it's not (assume > avatar distributed rendering in a closed environment or whatever), this > again is an ok hack. From another point of view: if the userland has to be as trusted as the kernel for this hack to be ok, I don't get why it's not ok to just schedule unconditionally in the invalidate_range_start without altering the API and gracefully deadlock in the i_mmap_lock. If the secondary mappings cannot be torn down without scheduling, it means the page will be swapped out but the physical pages can still be written to despite the page being swapped out and reused by something else, leading to trivial memory corruption if the user having access to the xpmem device is malicious. Like Andrew already said, we've no clue what the "bool atomic" parameter will be used for and so it's next to impossible to judge the validity of this hack (because a hack it is). We don't know how xpmem will react to that event, all we know is that it won't be able to invalidate secondary mappings by the time this call returns, leading to memory corruption. If it panics or if it ignores the invalidate when atomic=1, it's equivalent to or worse than just scheduling in atomic. If it schedules in atomic there's zero risk of memory corruption at least and no need to alter the API. So even for distros this hack to the API isn't necessary; only srcu and tlb flush deferral is needed. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep. 2010-02-02 13:13 ` Andrea Arcangeli @ 2010-02-02 13:29 ` Robin Holt 2010-02-02 13:40 ` Andrea Arcangeli 0 siblings, 1 reply; 26+ messages in thread From: Robin Holt @ 2010-02-02 13:29 UTC (permalink / raw) To: Andrea Arcangeli Cc: Christoph Hellwig, Robin Holt, Andrew Morton, Jack Steiner, linux-mm On Tue, Feb 02, 2010 at 02:13:41PM +0100, Andrea Arcangeli wrote: > On Tue, Feb 02, 2010 at 01:59:43PM +0100, Andrea Arcangeli wrote: > > slow down the locking, even if it leaves holes and corrupts memory when > > XPMEM can be opened by a luser. It really depends on whether the user having > > access to the XPMEM device is malicious; if we know it's not (assume > > avatar distributed rendering in a closed environment or whatever), this > > again is an ok hack. > > From another point of view: if the userland has to be as trusted as > the kernel for this hack to be ok, I don't get why it's not ok to > just schedule unconditionally in the invalidate_range_start without > altering the API and gracefully deadlock in the i_mmap_lock. If the > secondary mappings cannot be torn down without scheduling, it means the > page will be swapped out but the physical pages can still be written > to despite the page being swapped out and reused by something else, > leading to trivial memory corruption if the user having access to > the xpmem device is malicious. > > Like Andrew already said, we've no clue what the "bool atomic" > parameter will be used for and so it's next to impossible to judge the > validity of this hack (because a hack it is). We don't know how > xpmem will react to that event, all we know is that it won't be able > to invalidate secondary mappings by the time this call returns, leading > to memory corruption. If it panics or if it ignores the invalidate > when atomic=1, it's equivalent to or worse than just scheduling in The atomic==1 case is only for the truncate case, correct? XPMEM is holding reference counts on the pages it exports (get_user_pages) so they are not freed even when the zap_page_range has completed. What I think we are dealing with is an inconsistent appearance to userland. The one task would SIGBUS if it touches the memory. The other would be able to read/write it just fine until the asynchronous zap of the attachment completed. Robin -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 26+ messages in thread
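The pin Robin refers to is an ordinary elevated page refcount taken at export time. Roughly, using the get_user_pages() signature of this era (pin_export_range and unpin_export_range below are illustrations, not XPMEM code):

	#include <linux/mm.h>
	#include <linux/sched.h>

	/* Pin npages user pages starting at uaddr. */
	static int pin_export_range(struct task_struct *tsk, struct mm_struct *mm,
				    unsigned long uaddr, struct page **pages,
				    int npages)
	{
		int got;

		down_read(&mm->mmap_sem);
		got = get_user_pages(tsk, mm, uaddr, npages,
				     1 /* write */, 0 /* force */, pages, NULL);
		up_read(&mm->mmap_sem);
		return got;	/* number of pages actually pinned, or -errno */
	}

	/*
	 * Each pinned page carries an extra refcount: zap_page_range() can
	 * clear the ptes, but the page stays allocated until the exporter
	 * drops its references, so remote accesses never hit freed/reused
	 * memory; they only see pages that have already vanished from the
	 * exporting mm, hence the "inconsistent appearance" above.
	 */
	static void unpin_export_range(struct page **pages, int npages)
	{
		int i;

		for (i = 0; i < npages; i++)
			put_page(pages[i]);
	}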
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep. 2010-02-02 13:29 ` Robin Holt @ 2010-02-02 13:40 ` Andrea Arcangeli 2010-02-02 13:51 ` Robin Holt 0 siblings, 1 reply; 26+ messages in thread From: Andrea Arcangeli @ 2010-02-02 13:40 UTC (permalink / raw) To: Robin Holt; +Cc: Christoph Hellwig, Andrew Morton, Jack Steiner, linux-mm On Tue, Feb 02, 2010 at 07:29:20AM -0600, Robin Holt wrote: > The atomic==1 case is only for the truncate case, correct? XPMEM is Correct. > holding reference counts on the pages it exports (get_user_pages) so > they are not freed even when the zap_page_range has completed. What I > think we are dealing with is an inconsistent appearance to userland. > The one task would SIGBUS if it touches the memory. The other would > be able to read/write it just fine until the asynchronous zap of the > attachment completed. Ok, thanks to the page pin it won't randomly corrupt memory, but it can still screw the runtime of an unmodified unaware program. I think you've to figure out how important it is that you won't deadlock if a luser modifies userland because this isn't a complete approach and as much as I care about your workload that is ok with this, I cannot exclude that a usage might materialize in the future where a sigbus while another thread still accesses the remote pages is not ok and may screw userland in a more subtle way than a visible kernel deadlock. We can do this now and undo it later, nothing very problematic, but considering this isn't a fully transparent solution, I don't see the big deal in just scheduling in atomic if the user does what it can't do (there will be unexpected behavior in his app anyway if he does that). I don't see a problem in applying srcu and the tlb gather patch in distro kernels; those won't even prevent the upstream modules from building against those kernels, and there will be no change of API. In general making the methods sleepable doesn't need to alter the API at all... the reason for this change of API is that we're not actually making all of them sleepable, only a few. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep. 2010-02-02 13:40 ` Andrea Arcangeli @ 2010-02-02 13:51 ` Robin Holt 2010-02-02 14:10 ` Andrea Arcangeli 0 siblings, 1 reply; 26+ messages in thread From: Robin Holt @ 2010-02-02 13:51 UTC (permalink / raw) To: Andrea Arcangeli Cc: Robin Holt, Christoph Hellwig, Andrew Morton, Jack Steiner, linux-mm On Tue, Feb 02, 2010 at 02:40:47PM +0100, Andrea Arcangeli wrote: > On Tue, Feb 02, 2010 at 07:29:20AM -0600, Robin Holt wrote: > > The atomic==1 case is only for the truncate case, correct? XPMEM is > > Correct. > > > holding reference counts on the pages it exports (get_user_pages) so > > they are not freed even when the zap_page_range has completed. What I > > think we are dealing with is an inconsistent appearance to userland. > > The one task would SIGBUS if it touches the memory. The other would > > be able to read/write it just fine until the asynchronous zap of the > > attachment completed. > > Ok, thanks to the page pin it won't randomly corrupt memory, but it > can still screw the runtime of an unmodified unaware program. I think > you've to figure out how important it is that you won't deadlock if > a luser modifies userland because this isn't a complete approach and as > much as I care about your workload that is ok with this, I cannot > exclude that a usage might materialize in the future where a sigbus while > another thread still accesses the remote pages is not ok and may screw > userland in a more subtle way than a visible kernel deadlock. We > can do this now and undo it later, nothing very problematic, but > considering this isn't a fully transparent solution, I don't see the > big deal in just scheduling in atomic if the user does what it can't do > (there will be unexpected behavior in his app anyway if he does that). I am sorry, I mixed up implementations. I had worked late into the evening on an alternative to this patch which did not require the unlock-the-i_mmap_lock, _inv_range_start, relock, return -EINTR sequence. That approach would have had the caveats above. The approach we are discussing here does not have any difference in userland behavior. When called with atomic==1, XPMEM will detect that we have exported pages in that address range and return !0 indicating we need to sleep to satisfy this call. The i_mmap_lock would be released and _inv_range_start would be called again with atomic==0. This time the pages would be cleared from the attachments, and unmap_mapping_range_vma would then retry. Hopefully the call this time with atomic==1 would return 0 indicating it did not need to sleep. > I don't see a problem in applying srcu and the tlb gather patch in > distro kernels; those won't even prevent the upstream modules from building > against those kernels, and there will be no change of API. In general > making the methods sleepable doesn't need to alter the API at > all... the reason for this change of API is that we're not actually > making all of them sleepable, only a few. I don't see the change in API with this method either. Robin -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep. 2010-02-02 13:51 ` Robin Holt @ 2010-02-02 14:10 ` Andrea Arcangeli 2010-02-02 14:21 ` Robin Holt 0 siblings, 1 reply; 26+ messages in thread From: Andrea Arcangeli @ 2010-02-02 14:10 UTC (permalink / raw) To: Robin Holt; +Cc: Christoph Hellwig, Andrew Morton, Jack Steiner, linux-mm On Tue, Feb 02, 2010 at 07:51:41AM -0600, Robin Holt wrote: > I don't see the change in API with this method either. The API change I'm referring to is the reason that you had to patch virt/kvm/kvm_main.c and drivers/misc/sgi-gru/grutlbpurge.c to prevent compile failure. That isn't needed if we really make mmu notifier sleepable like my old patches did just fine. Except they slowed down the locking to achieve it... (the slowdown should be confined to a config option) and you don't want that I guess. But if you didn't need to return -EINVAL I think your userland would also be safer. The only problem I can see is that you would then have trouble convincing distros to build with the slower locking, and you basically are ok with breaking userland in truncate to be sure your module will work with a default binary distro kernel. It's a tradeoff and I'm not against it but it has to be well documented that this is a hack to be practical on binary shipped kernels. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep. 2010-02-02 14:10 ` Andrea Arcangeli @ 2010-02-02 14:21 ` Robin Holt 2010-02-02 14:59 ` Andrea Arcangeli 0 siblings, 1 reply; 26+ messages in thread From: Robin Holt @ 2010-02-02 14:21 UTC (permalink / raw) To: Andrea Arcangeli Cc: Robin Holt, Christoph Hellwig, Andrew Morton, Jack Steiner, linux-mm On Tue, Feb 02, 2010 at 03:10:36PM +0100, Andrea Arcangeli wrote: > On Tue, Feb 02, 2010 at 07:51:41AM -0600, Robin Holt wrote: > > I don't see the change in API with this method either. > > The API change I'm referring to is the reason that you had to patch > virt/kvm/kvm_main.c and drivers/misc/sgi-gru/grutlbpurge.c to prevent So the API is an mmu_notifier thing and not external. I think adding reference counting to the VMA and converting the i_mmap_lock to i_mmap_sem might have a slightly larger impact on users of kernel headers than this proposal. > compile failure. That isn't needed if we really make mmu notifier > sleepable like my old patches did just fine. Except they slowed down > the locking to achieve it... (the slowdown should be confined to a > config option) and you don't want that I guess. But if you didn't need Your argument seems ridiculous. Take this larger series of patches which touches many parts of the kernel and has a runtime downside for 99% of the user community but only when configured on, and then try and argue with the distros that they should slow all users down for our 1%. > to return -EINVAL I think your userland would also be safer. The only I think you missed my correction to an earlier statement. This patcheset does not have any data corruption or userland inconsistency. I had mistakenly spoken of a patchset I am working up as a lesser alternative to this one. > problem I can see is that you would then have trouble convincing > distros to build with the slower locking, and you basically are ok with > breaking userland in truncate to be sure your module will work with a > default binary distro kernel. It's a tradeoff and I'm not against it > but it has to be well documented that this is a hack to be practical > on binary shipped kernels. This is no more a hack than the other long list of compromises that have been made in the past. Very similar to your huge page patchset which invalidates a page by using the range callout. NIHS is not the same as a hack. Robin -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep. 2010-02-02 14:21 ` Robin Holt @ 2010-02-02 14:59 ` Andrea Arcangeli 2010-02-02 15:21 ` Robin Holt 0 siblings, 1 reply; 26+ messages in thread From: Andrea Arcangeli @ 2010-02-02 14:59 UTC (permalink / raw) To: Robin Holt; +Cc: Christoph Hellwig, Andrew Morton, Jack Steiner, linux-mm On Tue, Feb 02, 2010 at 08:21:30AM -0600, Robin Holt wrote: > Your argument seems ridiculous. Take this larger series of patches which > touches many parts of the kernel and has a runtime downside for 99% of > the user community but only when configured on, and then try and argue > with the distros that they should slow all users down for our 1%. I wasn't suggesting asking to apply such a patch to a kernel distro! If that's what you understood... that would have been the only ridiculous thing about it. If you want to send patches to a distro, your hack is probably the max, as it alters kABI but not so much. > > to return -EINVAL I think your userland would also be safer. The only > > I think you missed my correction to an earlier statement. This patcheset > does not have any data corruption or userland inconsistency. I had mistakenly > spoken of a patchset I am working up as a lesser alternative to this one. If there is never data corruption or userland inconsistency when I do mmap(MAP_SHARED) truncate(0) then I've to wonder why at all you need any modification if you already can handle remote spte invalidation through atomic sections. That is ridiculous that you can handle it through atomic-section truncate without sleepability, and you still ask sleepability for mmu notifier in the first place... > This is no more a hack than the other long list of compromises that have > been made in the past. Very similar to your huge page patchset which > invalidates a page by using the range callout. NIHS is not the same as > a hack. My hack is just to avoid having to modify the mmu notifier API, to reduce the amount of mangling I have to ask people to digest at once; I simply can't do too many things at once. It's not the right example of a compromise, as it's going to get fixed without any downside... no tradeoff at all. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep. 2010-02-02 14:59 ` Andrea Arcangeli @ 2010-02-02 15:21 ` Robin Holt 2010-02-02 16:01 ` Andrea Arcangeli 0 siblings, 1 reply; 26+ messages in thread From: Robin Holt @ 2010-02-02 15:21 UTC (permalink / raw) To: Andrea Arcangeli Cc: Robin Holt, Christoph Hellwig, Andrew Morton, Jack Steiner, linux-mm On Tue, Feb 02, 2010 at 03:59:11PM +0100, Andrea Arcangeli wrote: > > I think you missed my correction to an earlier statement. This patcheset > > does not have any data corruption or userland inconsistency. I had mistakenly > > spoken of a patchset I am working up as a lesser alternative to this one. > > If there is never data corruption or userland inconsistency when I do > mmap(MAP_SHARED) truncate(0) then I've to wonder why at all you need > any modification if you already can handle remote spte invalidation > through atomic sections. That is ridiculous that you can handle it > through atomic-section truncate without sleepability, and you still > ask sleepability for mmu notifier in the first place... In the truncate(0) example you provide, the sequence would be as follows: On the first call from unmap_vmas into _inv_range_start(atomic==1), XPMEM would scan the segment's PFN table. If there were pages in that range which have been exported, we would return !0 without doing any invalidation. The unmap_vmas code would see the non-zero return and return start_addr back to zap_page_range and further to unmap_mapping_range_vma where need_unlocked_invalidate would be set. unmap_mapping_range_vma would then unlock the i_mmap_lock, call _inv_range_start(atomic==0) which would clear all the remote page tables and TLBs. It would then reaquire the i_mmap_lock and retry. This time unmap_vmas would call _inv_range_start(atomic==1). XPMEM would scan the segment's PFN table and find there were no pages exported and return 0. Things would proceed as normal from there. No corruption. No intrusive locking additions that negatively affect the vast majority of users. A compromise. The only downside I can see at all in the CONFIG_MMU_NOTIFIER=n case is unmap_mapping_range_vma is slightly larger. Robin -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep. 2010-02-02 15:21 ` Robin Holt @ 2010-02-02 16:01 ` Andrea Arcangeli 2010-02-02 16:39 ` Robin Holt 0 siblings, 1 reply; 26+ messages in thread From: Andrea Arcangeli @ 2010-02-02 16:01 UTC (permalink / raw) To: Robin Holt; +Cc: Christoph Hellwig, Andrew Morton, Jack Steiner, linux-mm On Tue, Feb 02, 2010 at 09:21:42AM -0600, Robin Holt wrote: > unmap_mapping_range_vma would then unlock the i_mmap_lock, call > _inv_range_start(atomic==0) which would clear all the remote page tables > and TLBs. It would then reaquire the i_mmap_lock and retry. I guess you missed why we hold the i_mmap_lock there... it's not like gratuitous locking complication there's an actual reason why it's taken, and it's to avoid the vma to be freed from under you: + if (need_unlocked_invalidate) { + /* + * If zap_page_range failed to make any progress because the + * mmu_notifier_invalidate_range_start was called atomically + * while the callee needed to sleep. In that event, we + * make the callout while the i_mmap_lock is released. + */ + mmu_notifier_invalidate_range_start(vma->vm_mm, start_addr, end_addr, 0); + mmu_notifier_invalidate_range_end(vma->vm_mm, start_addr, end_addr); The above runs with vma being a dangling pointer, it'll corrupt memory randomly and crash. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep. 2010-02-02 16:01 ` Andrea Arcangeli @ 2010-02-02 16:39 ` Robin Holt 2010-02-02 16:52 ` Andrea Arcangeli 0 siblings, 1 reply; 26+ messages in thread From: Robin Holt @ 2010-02-02 16:39 UTC (permalink / raw) To: Andrea Arcangeli Cc: Robin Holt, Christoph Hellwig, Andrew Morton, Jack Steiner, linux-mm On Tue, Feb 02, 2010 at 05:01:46PM +0100, Andrea Arcangeli wrote: > On Tue, Feb 02, 2010 at 09:21:42AM -0600, Robin Holt wrote: > > unmap_mapping_range_vma would then unlock the i_mmap_lock, call > > _inv_range_start(atomic==0) which would clear all the remote page tables > > and TLBs. It would then reaquire the i_mmap_lock and retry. > > I guess you missed why we hold the i_mmap_lock there... it's not like > gratuitous locking complication there's an actual reason why it's > taken, and it's to avoid the vma to be freed from under you: Oversight on my part. Sorry. Will this work? static int unmap_mapping_range_vma(struct vm_area_struct *vma, ... if (need_unlocked_invalidate) { mm = vma->vm_mm; atomic_inc(&mm->mm_users); } spin_unlock(details->i_mmap_lock); if (need_unlocked_invalidate) { /* * zap_page_range failed to make any progress because the * mmu_notifier_invalidate_range_start was called atomically * while the callee needed to sleep. In that event, we * make the callout while the i_mmap_lock is released. */ mmu_notifier_invalidate_range_start(mm, start_addr, end_addr, 0); mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); mmput(mm); } cond_resched(); spin_lock(details->i_mmap_lock); return -EINTR; Robin -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep. 2010-02-02 16:39 ` Robin Holt @ 2010-02-02 16:52 ` Andrea Arcangeli 2010-02-02 16:59 ` Robin Holt 0 siblings, 1 reply; 26+ messages in thread From: Andrea Arcangeli @ 2010-02-02 16:52 UTC (permalink / raw) To: Robin Holt; +Cc: Christoph Hellwig, Andrew Morton, Jack Steiner, linux-mm On Tue, Feb 02, 2010 at 10:39:30AM -0600, Robin Holt wrote: > On Tue, Feb 02, 2010 at 05:01:46PM +0100, Andrea Arcangeli wrote: > > On Tue, Feb 02, 2010 at 09:21:42AM -0600, Robin Holt wrote: > > > unmap_mapping_range_vma would then unlock the i_mmap_lock, call > > > _inv_range_start(atomic==0) which would clear all the remote page tables > > > and TLBs. It would then reaquire the i_mmap_lock and retry. > > > > I guess you missed why we hold the i_mmap_lock there... it's not like > > gratuitous locking complication there's an actual reason why it's > > taken, and it's to avoid the vma to be freed from under you: > > Oversight on my part. Sorry. > > Will this work? No, it still corrupts memory as before. You need to re-run find_vma under mmap_sem. Then it could work... Also it's wrong to pin mm_users, you only need mm_count here, as you only need to run find_vma, you don't need to prevent exit to free the pages indefinitely while you're blocked. > static int unmap_mapping_range_vma(struct vm_area_struct *vma, > ... > if (need_unlocked_invalidate) { > mm = vma->vm_mm; > atomic_inc(&mm->mm_users); > } > spin_unlock(details->i_mmap_lock); > if (need_unlocked_invalidate) { > /* > * zap_page_range failed to make any progress because the > * mmu_notifier_invalidate_range_start was called atomically > * while the callee needed to sleep. In that event, we > * make the callout while the i_mmap_lock is released. > */ > mmu_notifier_invalidate_range_start(mm, start_addr, end_addr, 0); > mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); > mmput(mm); > } > cond_resched(); > spin_lock(details->i_mmap_lock); > return -EINTR; > > Robin > -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep. 2010-02-02 16:52 ` Andrea Arcangeli @ 2010-02-02 16:59 ` Robin Holt 2010-02-02 17:31 ` Robin Holt 2010-02-02 20:17 ` Andrea Arcangeli 0 siblings, 2 replies; 26+ messages in thread From: Robin Holt @ 2010-02-02 16:59 UTC (permalink / raw) To: Andrea Arcangeli Cc: Robin Holt, Christoph Hellwig, Andrew Morton, Jack Steiner, linux-mm On Tue, Feb 02, 2010 at 05:52:24PM +0100, Andrea Arcangeli wrote: > On Tue, Feb 02, 2010 at 10:39:30AM -0600, Robin Holt wrote: > > On Tue, Feb 02, 2010 at 05:01:46PM +0100, Andrea Arcangeli wrote: > > > On Tue, Feb 02, 2010 at 09:21:42AM -0600, Robin Holt wrote: > > > > unmap_mapping_range_vma would then unlock the i_mmap_lock, call > > > > _inv_range_start(atomic==0) which would clear all the remote page tables > > > > and TLBs. It would then reaquire the i_mmap_lock and retry. > > > > > > I guess you missed why we hold the i_mmap_lock there... it's not like > > > gratuitous locking complication there's an actual reason why it's > > > taken, and it's to avoid the vma to be freed from under you: > > > > Oversight on my part. Sorry. > > > > Will this work? > > No, it still corrupts memory as before. You need to re-run find_vma > under mmap_sem. Then it could work... But we don't use the vma for anything. The _invalidate_range_start/end is using the mm. XPMEM and GRU don't use the vma. Does KVM? Since it isn't passed in, I would expect that anybody trying to use the vma is going to have to do a find_vma themselves. Did I miss something? > Also it's wrong to pin mm_users, you only need mm_count here, as you > only need to run find_vma, you don't need to prevent exit to free the > pages indefinitely while you're blocked. Is this better? static int unmap_mapping_range_vma(struct vm_area_struct *vma, ... if (need_unlocked_invalidate) { mm = vma->vm_mm; atomic_inc(&mm->mm_count); } spin_unlock(details->i_mmap_lock); if (need_unlocked_invalidate) { /* * zap_page_range failed to make any progress because the * mmu_notifier_invalidate_range_start was called atomically * while the callee needed to sleep. In that event, we * make the callout while the i_mmap_lock is released. */ mmu_notifier_invalidate_range_start(mm, start_addr, end_addr, 0); mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); mmdrop(mm); } Robin -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep.
  2010-02-02 16:59                 ` Robin Holt
@ 2010-02-02 17:31                   ` Robin Holt
  2010-02-02 20:27                     ` Andrea Arcangeli
  0 siblings, 1 reply; 26+ messages in thread
From: Robin Holt @ 2010-02-02 17:31 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Robin Holt, Christoph Hellwig, Andrew Morton, Jack Steiner, linux-mm

On Tue, Feb 02, 2010 at 10:59:03AM -0600, Robin Holt wrote:
> On Tue, Feb 02, 2010 at 05:52:24PM +0100, Andrea Arcangeli wrote:
> > On Tue, Feb 02, 2010 at 10:39:30AM -0600, Robin Holt wrote:
> > > On Tue, Feb 02, 2010 at 05:01:46PM +0100, Andrea Arcangeli wrote:
> > > > On Tue, Feb 02, 2010 at 09:21:42AM -0600, Robin Holt wrote:
> > > > > unmap_mapping_range_vma would then unlock the i_mmap_lock, call
> > > > > _inv_range_start(atomic==0) which would clear all the remote page
> > > > > tables and TLBs.  It would then reacquire the i_mmap_lock and retry.
> > > >
> > > > I guess you missed why we hold the i_mmap_lock there... it's not
> > > > gratuitous locking complication; there's an actual reason why it's
> > > > taken, and that is to keep the vma from being freed from under you:
> > >
> > > Oversight on my part.  Sorry.
> > >
> > > Will this work?
> >
> > No, it still corrupts memory as before.  You need to re-run find_vma
> > under mmap_sem.  Then it could work...
>
> But we don't use the vma for anything.  The _invalidate_range_start/end
> is using the mm.  XPMEM and GRU don't use the vma.  Does KVM?  Since it
> isn't passed in, I would expect that anybody trying to use the vma is
> going to have to do a find_vma themselves.  Did I miss something?
>
> > Also it's wrong to pin mm_users: you only need mm_count here, as you
> > only need to run find_vma; you don't need to prevent exit from freeing
> > the pages indefinitely while you're blocked.
>
> Is this better?

Not better.  Still need to grab the mmap_sem.  How about this?

static int unmap_mapping_range_vma(struct vm_area_struct *vma,
...
	if (need_unlocked_invalidate) {
		mm = vma->vm_mm;
		atomic_inc(&mm->mm_count);
	}
	spin_unlock(details->i_mmap_lock);
	if (need_unlocked_invalidate) {
		/*
		 * zap_page_range failed to make any progress because the
		 * mmu_notifier_invalidate_range_start was called atomically
		 * while the callee needed to sleep.  In that event, we
		 * make the callout while the i_mmap_lock is released.
		 */
		down_read(&mm->mmap_sem);
		mmu_notifier_invalidate_range_start(mm, start_addr, end_addr, 0);
		mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
		up_read(&mm->mmap_sem);
		mmdrop(mm);
	}

Robin
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep.
  2010-02-02 17:31                   ` Robin Holt
@ 2010-02-02 20:27                     ` Andrea Arcangeli
  0 siblings, 0 replies; 26+ messages in thread
From: Andrea Arcangeli @ 2010-02-02 20:27 UTC (permalink / raw)
To: Robin Holt; +Cc: Christoph Hellwig, Andrew Morton, Jack Steiner, linux-mm

On Tue, Feb 02, 2010 at 11:31:58AM -0600, Robin Holt wrote:
> Not better.  Still need to grab the mmap_sem.  How about this?

I don't think you need to grab mmap_sem; the previous patch was ok, if
we agree with the tradeoff that you still can't schedule in
->invalidate_page and it's unclear why that is ok.  After you apply
transparent hugepage support it'll break, and I'll have to deal with
it somehow.
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep.
  2010-02-02 16:59                 ` Robin Holt
  2010-02-02 17:31                   ` Robin Holt
@ 2010-02-02 20:17                   ` Andrea Arcangeli
  2010-02-03  0:48                     ` Robin Holt
  1 sibling, 1 reply; 26+ messages in thread
From: Andrea Arcangeli @ 2010-02-02 20:17 UTC (permalink / raw)
To: Robin Holt; +Cc: Christoph Hellwig, Andrew Morton, Jack Steiner, linux-mm

On Tue, Feb 02, 2010 at 10:59:03AM -0600, Robin Holt wrote:
> But we don't use the vma for anything.  The _invalidate_range_start/end
> is using the mm.  XPMEM and GRU don't use the vma.  Does KVM?  Since it
> isn't passed in, I would expect that anybody trying to use the vma is
> going to have to do a find_vma themselves.  Did I miss something?

No, sorry; we are passing down the mm, not the vma, so it should be ok
already.

> Is this better?
>
> static int unmap_mapping_range_vma(struct vm_area_struct *vma,
> ...
> 	if (need_unlocked_invalidate) {
> 		mm = vma->vm_mm;
> 		atomic_inc(&mm->mm_count);
> 	}
> 	spin_unlock(details->i_mmap_lock);
> 	if (need_unlocked_invalidate) {
> 		/*
> 		 * zap_page_range failed to make any progress because the
> 		 * mmu_notifier_invalidate_range_start was called atomically
> 		 * while the callee needed to sleep.  In that event, we
> 		 * make the callout while the i_mmap_lock is released.
> 		 */
> 		mmu_notifier_invalidate_range_start(mm, start_addr, end_addr, 0);
> 		mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
> 		mmdrop(mm);
> 	}

Yes, with mm_count it's better, and this way it should be safe.  I
think it's an ok tradeoff; hopefully then nobody will ask to schedule
in ->invalidate_page.  Still it'd be interesting (back to Andrew's
argument) to understand what is fundamentally different such that you
are ok not to schedule in ->invalidate_page but you absolutely need it
here.  And yes, this will break my transparent hugepage patch too, which
can't schedule inside the anon_vma->lock and uses the range calls to
be safer (then maybe we can require the mmu notifier users to check
PageTransHuge against the pages and handle the invalidate through
->invalidate_page, or we can add ->invalidate_transhuge_page).
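(Purely as a sketch of that last suggestion: an
->invalidate_transhuge_page op would slot into mmu_notifier_ops
roughly like this.  The name and signature are hypothetical; nothing
like this was posted or merged.)

	struct mmu_notifier_ops {
		/* ... existing ops: release, clear_flush_young,
		 * invalidate_page, invalidate_range_start/end ... */

		/*
		 * Hypothetical: synchronously invalidate a single
		 * transparent hugepage mapping.  Callable under the
		 * PT/anon_vma lock, so implementations must not sleep.
		 */
		void (*invalidate_transhuge_page)(struct mmu_notifier *mn,
						  struct mm_struct *mm,
						  unsigned long address);
	};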
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep.
  2010-02-02 20:17                   ` Andrea Arcangeli
@ 2010-02-03  0:48                     ` Robin Holt
  2010-02-03 17:14                       ` Andrea Arcangeli
  0 siblings, 1 reply; 26+ messages in thread
From: Robin Holt @ 2010-02-03  0:48 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Robin Holt, Christoph Hellwig, Andrew Morton, Jack Steiner, linux-mm

> Yes, with mm_count it's better, and this way it should be safe.  I
> think it's an ok tradeoff; hopefully then nobody will ask to schedule
> in ->invalidate_page.  Still it'd be interesting (back to Andrew's
> argument) to understand what is fundamentally different such that you
> are ok not to schedule in ->invalidate_page but you absolutely need it
> here.  And yes, this will break my transparent hugepage patch too, which

In the _invalidate_page case, it is called by the kernel from sites
where the kernel is relying upon the reference count to take the page
out of use while keeping the page's data clean and ready to be
released.  If the page is marked dirty, etc., the kernel will "do the
right thing" with the page to maintain data consistency.

The _invalidate_range_start/end pairs are used in places where the
caller's address space is being modified.  If we allow the attachers
to continue to use the old pages from the old mapping even for a short
time after the process has started to use the new pages, there would
be silent data corruption.

A difference is the kernel's expectations.  The truncate case is the
one place where the kernel's expectation for _invalidate_range_start/end
more closely matches that of _invalidate_page.  The new version of the
patch I was babbling about basically adds that concept to the
_invalidate_range_start callout as a parameter, essentially changing
the bool atomic into a flag indicating the kernel does not expect this
step to be complete prior to finishing this callout.

I don't like that second patch, which is why I have not posted it.
It relies upon a fuzzy "adequate" period of time between when the file
is truncated down and when it may be extended again to ensure data
consistency.  Shrink and extend too quickly and problems will ensue.

> can't schedule inside the anon_vma->lock and uses the range calls to
> be safer (then maybe we can require the mmu notifier users to check
> PageTransHuge against the pages and handle the invalidate through
> ->invalidate_page, or we can add ->invalidate_transhuge_page).

I don't think that is a problem.  I don't think the GRU has any issues
at all.  I believe that invalidating even a standard-size page will
flush the entire TLB.  Jack was going to verify that the last time I
talked with him.  If it behaved any differently, I would be surprised,
as it would be inconsistent with nearly every other TLB out there.

XPMEM will currently not work, but I believe I can get it to work
quite easily, as I can walk the segment's PFN table without acquiring
any sleeping locks and decide to expand the page size for any
invalidation within that range.  With that, at the time of the
callout, I can schedule an invalidation of the appropriate size.

As for the transparent huge page patches, I have only skimmed them
lightly, as I really don't have much time to understand your intention.
I do think I am agreeing with Christoph Lameter that using the
migration-style mechanism is probably better, in that it handles the
invalidate_page callouts, follows the page reference counts, and
allows my asynchronous invalidation expectation to persist.

If I read it correctly, your patch would now require invalidate_page
to be complete and have reference counts pushed back down upon return.
I probably missed a key portion of that patch and would welcome the
chance to be informed.

Thanks,
Robin
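(For concreteness, the "flag instead of bool atomic" idea described in
the message above might look roughly like this.  The flag names and
values are invented for illustration; they are not from any posted
patch.)

	#define MMU_NOTIFY_ATOMIC	0x1	/* caller cannot sleep */
	#define MMU_NOTIFY_MAY_DEFER	0x2	/* teardown may complete after
						 * the callout returns, as in
						 * the truncate case */

	void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
						   unsigned long start,
						   unsigned long end,
						   int flags);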
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep.
  2010-02-03  0:48                     ` Robin Holt
@ 2010-02-03 17:14                       ` Andrea Arcangeli
  2010-02-03 17:18                         ` Andrea Arcangeli
  2010-02-03 19:54                         ` Robin Holt
  0 siblings, 2 replies; 26+ messages in thread
From: Andrea Arcangeli @ 2010-02-03 17:14 UTC (permalink / raw)
To: Robin Holt; +Cc: Christoph Hellwig, Andrew Morton, Jack Steiner, linux-mm

On Tue, Feb 02, 2010 at 06:48:33PM -0600, Robin Holt wrote:
> In the _invalidate_page case, it is called by the kernel from sites
> where the kernel is relying upon the reference count to take the page
> out of use while keeping the page's data clean and ready to be
> released.  If the page is marked dirty, etc., the kernel will "do the
> right thing" with the page to maintain data consistency.
>
> The _invalidate_range_start/end pairs are used in places where the
> caller's address space is being modified.  If we allow the attachers
> to continue to use the old pages from the old mapping even for a short
> time after the process has started to use the new pages, there would
> be silent data corruption.

Just to show how fragile your assumption is: your code is already
generating mm corruption in fork and in ksm... the set_pte_at_notify
invalidate_page has to run immediately and be effective immediately,
despite being called with the PT lock held.

> A difference is the kernel's expectations.  The truncate case is the

do_wp_page and the ksm ->invalidate_page or ->change_pte have
expectations different from the ones you describe.  It's the first
time I hear about the semantics of invalidate_page and range_start/end
being different.  The only reason there are two different calls is
that start/end wants to run a single tlb flush for the teardown of
many ptes.  That isn't doable with the ptep_clear_flush or set_pte_at
versions, because there is a single pte to invalidate, not multiple;
so for the latter we simply have a single call and there's no need for
a start/stop range.

> one place where the kernel's expectation for _invalidate_range_start/end
> more closely matches that of _invalidate_page.  The new version of the
> patch I was babbling about basically adds that concept to the
> _invalidate_range_start callout as a parameter, essentially changing
> the bool atomic into a flag indicating the kernel does not expect this
> step to be complete prior to finishing this callout.

I don't get it, sorry.

> I don't like that second patch, which is why I have not posted it.
> It relies upon a fuzzy "adequate" period of time between when the file
> is truncated down and when it may be extended again to ensure data
> consistency.  Shrink and extend too quickly and problems will ensue.

Not sure I get it: it's all i_mutex serialized, so i_size can't change
under the code under discussion; I don't see the too-quick shrink and
extend.

> > can't schedule inside the anon_vma->lock and uses the range calls to
> > be safer (then maybe we can require the mmu notifier users to check
> > PageTransHuge against the pages and handle the invalidate through
> > ->invalidate_page, or we can add ->invalidate_transhuge_page).
>
> I don't think that is a problem.  I don't think the GRU has any issues
> at all.  I believe that invalidating even a standard-size page will
> flush the entire TLB.  Jack was going to verify that the last time I
> talked with him.  If it behaved any differently, I would be surprised,
> as it would be inconsistent with nearly every other TLB out there.

GRU sure is fine, as it has no sleepability requirement.  My
pmdp_clear_flush_notify calls invalidate_range_start/end to flush the
entire mmu range, in case the secondary mmu couldn't map the hugepage
in a single huge tlb (like kvm does).

> XPMEM will currently not work, but I believe I can get it to work
> quite easily, as I can walk the segment's PFN table without acquiring
> any sleeping locks and decide to expand the page size for any
> invalidation within that range.  With that, at the time of the
> callout, I can schedule an invalidation of the appropriate size.

Scheduling it, or doing it later, isn't safe in khugepaged, ksm,
do_wp_page etc., all running under the PT lock.  It also isn't ok for
pmdp_clear_flush_notify calling
mmu_notifier_invalidate_range_start/end(atomic=0): it's unacceptable
to fail and not flush sptes, as I have to migrate a writable page in
khugepaged, and data writes will be lost if you don't really stop the
remote dma before pmdp_clear_flush_notify returns!

> As for the transparent huge page patches, I have only skimmed them
> lightly, as I really don't have much time to understand your intention.
> I do think I am agreeing with Christoph Lameter that using the
> migration-style mechanism is probably better, in that it handles the
> invalidate_page callouts, follows the page reference counts, and
> allows my asynchronous invalidation expectation to persist.

I think your asynchronous invalidation expectations are invalid
regardless of khugepaged and pmdp_clear_flush_notify.

> If I read it correctly, your patch would now require invalidate_page
> to be complete and have reference counts pushed back down upon return.
> I probably missed a key portion of that patch and would welcome the
> chance to be informed.

The problem isn't the reference counts.  The problem is that the
writes coming from remote must not go to a page regardless of its page
count; this isn't about khugepaged only.  The reads coming from remote
also must not read from local memory after invalidate, or do_wp_page
will break, as the writes through the pte will go to the copied page
while the remote mapping still reads from the old page.  It's a pure
accident that it works ok in swap (because swap won't alter the
pte->pfn mapping after the invalidate, and it won't actually lose the
swap entry until the only refcount left is the one for the lru).
Every other place where invalidate_page is called and the pte->pfn
mapping is altered (and not just marked nonpresent and remapped to the
same pfn in case of a swapcache fault) requires synchronous updating
of secondary sptes in invalidate_page too.
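(A sketch of the pmdp_clear_flush_notify shape referred to above,
adapted to this thread's proposed range_start signature with its
trailing atomic argument.  This is a reconstruction for illustration,
not a quote from the transparent hugepage patchset.)

	#define pmdp_clear_flush_notify(__vma, __address, __pmdp)	\
	({								\
		pmd_t __pmd;						\
		struct vm_area_struct *___vma = (__vma);		\
		unsigned long ___addr = (__address) & HPAGE_PMD_MASK;	\
		/* the caller may hold the PT/anon_vma lock, so the	\
		 * secondary mmu must tear down its mapping of the	\
		 * whole huge range before this returns			\
		 */							\
		mmu_notifier_invalidate_range_start(___vma->vm_mm,	\
				___addr, ___addr + HPAGE_PMD_SIZE, 1);	\
		__pmd = pmdp_clear_flush(___vma, ___addr, __pmdp);	\
		mmu_notifier_invalidate_range_end(___vma->vm_mm,	\
				___addr, ___addr + HPAGE_PMD_SIZE);	\
		__pmd;							\
	})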
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep.
  2010-02-03 17:14                       ` Andrea Arcangeli
@ 2010-02-03 17:18                         ` Andrea Arcangeli
  0 siblings, 0 replies; 26+ messages in thread
From: Andrea Arcangeli @ 2010-02-03 17:18 UTC (permalink / raw)
To: Robin Holt; +Cc: Christoph Hellwig, Andrew Morton, Jack Steiner, linux-mm

On Wed, Feb 03, 2010 at 06:14:13PM +0100, Andrea Arcangeli wrote:
> generating mm corruption in fork and in ksm... the set_pte_at_notify

With fork I meant later, during the do_wp_page... to avoid confusion.
(Fork itself is ok, as it can schedule, so you won't have to defer the
invalidate.)
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep.
  2010-02-03 17:14                       ` Andrea Arcangeli
  2010-02-03 17:18                         ` Andrea Arcangeli
@ 2010-02-03 19:54                         ` Robin Holt
  1 sibling, 0 replies; 26+ messages in thread
From: Robin Holt @ 2010-02-03 19:54 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Robin Holt, Christoph Hellwig, Andrew Morton, Jack Steiner, linux-mm

On Wed, Feb 03, 2010 at 06:14:13PM +0100, Andrea Arcangeli wrote:
> On Tue, Feb 02, 2010 at 06:48:33PM -0600, Robin Holt wrote:
> > In the _invalidate_page case, it is called by the kernel from sites
> > where the kernel is relying upon the reference count to take the page
> > out of use while keeping the page's data clean and ready to be
> > released.  If the page is marked dirty, etc., the kernel will "do the
> > right thing" with the page to maintain data consistency.
> >
> > The _invalidate_range_start/end pairs are used in places where the
> > caller's address space is being modified.  If we allow the attachers
> > to continue to use the old pages from the old mapping even for a short
> > time after the process has started to use the new pages, there would
> > be silent data corruption.
>
> Just to show how fragile your assumption is: your code is already
> generating mm corruption in fork and in ksm... the set_pte_at_notify
> invalidate_page has to run immediately and be effective immediately,
> despite being called with the PT lock held.

Actually, we don't generate corruption, but the story is a little more
complex.  At fork time, the invalidate range happens to clear all of
the segment's page table.  When XPMEM goes to refill the entry, we
always call get_user_pages(..., write=1, ...).  That will result in a
page callout, but we have no entry in the segment's page table, so the
callout is safely ignored.  By the time the process's pte gets
established, it has already broken COW, so we are back to a safe state.

Robin
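(Illustrative sketch of the refill path described above.  XPMEM is out
of tree, so the function and names here are made up; only the
get_user_pages() call reflects the real 2.6.33-era API.)

	/* hypothetical XPMEM-style refill: write=1 forces the COW
	 * break before the remote side ever maps the page */
	static int xpmem_refill_page(struct mm_struct *mm,
				     unsigned long addr,
				     struct page **pagep)
	{
		int ret;

		down_read(&mm->mmap_sem);
		ret = get_user_pages(current, mm, addr, 1,
				     1 /* write */, 0 /* force */,
				     pagep, NULL);
		up_read(&mm->mmap_sem);
		return ret;
	}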
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep.
  2010-02-02 12:59     ` Andrea Arcangeli
  2010-02-02 13:13       ` Andrea Arcangeli
@ 2010-02-02 13:23       ` Robin Holt
  1 sibling, 0 replies; 26+ messages in thread
From: Robin Holt @ 2010-02-02 13:23 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Christoph Hellwig, Robin Holt, Andrew Morton, Jack Steiner, linux-mm

On Tue, Feb 02, 2010 at 01:59:43PM +0100, Andrea Arcangeli wrote:
> On Tue, Feb 02, 2010 at 03:09:47AM -0500, Christoph Hellwig wrote:
> > On Mon, Feb 01, 2010 at 10:01:45PM -0600, Robin Holt wrote:
> > > XPMEM would like to utilize mmu_notifiers to track page table entry
> > > changes of the segment and keep the attachment page table/tlb
> > > information consistent.
> >
> > Given that SGI just pushes XPMEM directly into the distributions
> > instead of adding it upstream I don't really see the relevance of
> > these patches.
>
> That will then prevent upstream modules from building against those
> kernels.  Not a huge issue; for a distro that's an ok compromise.  My
> real issue with mainline is this: XPMEM is ok with breaking and
> corrupting memory if people use XPMEM on top of shared mappings
> (instead of anonymous ones), reachable by a two-liner change to the
> userland app opening the xpmem device.  But when the next mmu notifier
> user comes along and asks for full scheduling across shared mappings
> too, because it needs security and untrusted users can open /dev/xpmem
> (or wherever that device is located), we'll have to undo this work and
> fix it the real way (with a config option MMU_NOTIFIER_SLEEPABLE=y).
> Yet if distros have to support XPMEM in default kernels, this hack is
> better, because it won't slow down the locking even if it leaves holes
> and corrupts memory when XPMEM can be opened by lusers.  It really
> depends if the user having

Where is it leaving holes and corrupting memory?

Robin
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep.
  2010-02-02  8:09   ` [RFP-V2 0/3] " Christoph Hellwig
  2010-02-02 12:59     ` Andrea Arcangeli
@ 2010-02-02 13:35     ` Robin Holt
  1 sibling, 0 replies; 26+ messages in thread
From: Robin Holt @ 2010-02-02 13:35 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Robin Holt, Andrew Morton, Andrea Arcangeli, Jack Steiner, linux-mm

On Tue, Feb 02, 2010 at 03:09:47AM -0500, Christoph Hellwig wrote:
> On Mon, Feb 01, 2010 at 10:01:45PM -0600, Robin Holt wrote:
> > XPMEM would like to utilize mmu_notifiers to track page table entry
> > changes of the segment and keep the attachment page table/tlb
> > information consistent.
>
> Given that SGI just pushes XPMEM directly into the distributions
> instead of adding it upstream I don't really see the relevance of
> these patches.

XPMEM has been pushed to the community in the past and will be again.
We are not pushing it to the distros.  We have asked them to take very
minor patches, which have all, with the exception of one, been accepted
upstream.  The one which has not been accepted upstream has not even
been pushed, and that is only turning on MMU_NOTIFIER when CONFIG_IA64
&& CONFIG_SGI_XP are set.

We build xpmem as a GPL out-of-tree kernel module and library.  The
sources are shipped with the SGI ProPack product CD.  Any customer
could rebuild the kernel module with a simple
rpmbuild --rebuild xpmem*.src.rpm if they wanted.

Thanks,
Robin
end of thread, other threads:[~2010-02-03 19:54 UTC | newest]

Thread overview: 26+ messages
     [not found] <20100202040145.555474000@alcatraz.americas.sgi.com>
2010-02-02  4:01 ` [RFP-V2 1/3] Have mmu_notifiers use SRCU so they may safely schedule Robin Holt
2010-02-02  4:01 ` [RFP-V2 2/3] Fix unmap_vma() bug related to mmu_notifiers Robin Holt
2010-02-02  4:01 ` [RFP-V2 3/3] Make mmu_notifier_invalidate_range_start able to sleep Robin Holt
2010-02-02  8:09 ` [RFP-V2 0/3] " Christoph Hellwig
2010-02-02 12:59   ` Andrea Arcangeli
2010-02-02 13:13     ` Andrea Arcangeli
2010-02-02 13:29       ` Robin Holt
2010-02-02 13:40         ` Andrea Arcangeli
2010-02-02 13:51           ` Robin Holt
2010-02-02 14:10             ` Andrea Arcangeli
2010-02-02 14:21               ` Robin Holt
2010-02-02 14:59                 ` Andrea Arcangeli
2010-02-02 15:21                   ` Robin Holt
2010-02-02 16:01                     ` Andrea Arcangeli
2010-02-02 16:39                       ` Robin Holt
2010-02-02 16:52                         ` Andrea Arcangeli
2010-02-02 16:59                           ` Robin Holt
2010-02-02 17:31                             ` Robin Holt
2010-02-02 20:27                               ` Andrea Arcangeli
2010-02-02 20:17                             ` Andrea Arcangeli
2010-02-03  0:48                               ` Robin Holt
2010-02-03 17:14                                 ` Andrea Arcangeli
2010-02-03 17:18                                   ` Andrea Arcangeli
2010-02-03 19:54                                   ` Robin Holt
2010-02-02 13:23     ` Robin Holt
2010-02-02 13:35   ` Robin Holt