* [RFP-V2 1/3] Have mmu_notifiers use SRCU so they may safely schedule. [not found] <20100202040145.555474000@alcatraz.americas.sgi.com> @ 2010-02-02 4:01 ` Robin Holt 2010-02-02 4:01 ` [RFP-V2 2/3] Fix unmap_vma() bug related to mmu_notifiers Robin Holt ` (2 subsequent siblings) 3 siblings, 0 replies; 26+ messages in thread From: Robin Holt @ 2010-02-02 4:01 UTC (permalink / raw) To: Andrew Morton, Andrea Arcangeli; +Cc: Jack Steiner, linux-mm [-- Attachment #1: mmu_notifier_srcu_v1 --] [-- Type: text/plain, Size: 9047 bytes --] From: Andrea Arcangeli <aarcange@redhat.com> With an RCU based mmu_notifier implementation, any callout to mmu_notifier_invalidate_range_start, mmu_notifier_invalidate_range_end, or mmu_notifier_invalidate_page would not be allowed to call schedule as that could potentially allow a modification to the mmu_notifier structure while it is currently being used. Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> Signed-off-by: Robin Holt <holt@sgi.com> To: Andrew Morton <akpm@linux-foundation.org> To: Andrea Arcangeli <aarcange@redhat.com> Cc: Jack Steiner <steiner@sgi.com> Cc: linux-mm@kvack.org --- include/linux/mmu_notifier.h | 3 ++ mm/mmu_notifier.c | 59 ++++++++++++++++++++++++++----------------- 2 files changed, 40 insertions(+), 22 deletions(-) Index: mmu_notifiers_sleepable_v2/include/linux/mmu_notifier.h =================================================================== --- mmu_notifiers_sleepable_v2.orig/include/linux/mmu_notifier.h 2010-02-01 11:24:27.000000000 -0600 +++ mmu_notifiers_sleepable_v2/include/linux/mmu_notifier.h 2010-02-01 11:34:38.000000000 -0600 @@ -4,6 +4,7 @@ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/mm_types.h> +#include <linux/srcu.h> struct mmu_notifier; struct mmu_notifier_ops; @@ -19,6 +20,8 @@ struct mmu_notifier_ops; struct mmu_notifier_mm { /* all mmu notifiers registerd in this mm are queued in this list */ struct hlist_head list; + /* srcu structure for this mm */ + struct srcu_struct srcu; /* to serialize the list modifications and hlist_unhashed */ spinlock_t lock; }; Index: mmu_notifiers_sleepable_v2/mm/mmu_notifier.c =================================================================== --- mmu_notifiers_sleepable_v2.orig/mm/mmu_notifier.c 2010-02-01 11:24:31.000000000 -0600 +++ mmu_notifiers_sleepable_v2/mm/mmu_notifier.c 2010-02-01 11:34:38.000000000 -0600 @@ -14,6 +14,7 @@ #include <linux/module.h> #include <linux/mm.h> #include <linux/err.h> +#include <linux/srcu.h> #include <linux/rcupdate.h> #include <linux/sched.h> @@ -24,14 +25,15 @@ * in parallel despite there being no task using this mm any more, * through the vmas outside of the exit_mmap context, such as with * vmtruncate. This serializes against mmu_notifier_unregister with - * the mmu_notifier_mm->lock in addition to RCU and it serializes - * against the other mmu notifiers with RCU. struct mmu_notifier_mm + * the mmu_notifier_mm->lock in addition to SRCU and it serializes + * against the other mmu notifiers with SRCU. struct mmu_notifier_mm * can't go away from under us as exit_mmap holds an mm_count pin * itself. */ void __mmu_notifier_release(struct mm_struct *mm) { struct mmu_notifier *mn; + int srcu; spin_lock(&mm->mmu_notifier_mm->lock); while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { @@ -46,10 +48,10 @@ void __mmu_notifier_release(struct mm_st */ hlist_del_init_rcu(&mn->hlist); /* - * RCU here will block mmu_notifier_unregister until + * SRCU here will block mmu_notifier_unregister until * ->release returns. 
*/ - rcu_read_lock(); + srcu = srcu_read_lock(&mm->mmu_notifier_mm->srcu); spin_unlock(&mm->mmu_notifier_mm->lock); /* * if ->release runs before mmu_notifier_unregister it @@ -60,13 +62,13 @@ void __mmu_notifier_release(struct mm_st */ if (mn->ops->release) mn->ops->release(mn, mm); - rcu_read_unlock(); + srcu_read_unlock(&mm->mmu_notifier_mm->srcu, srcu); spin_lock(&mm->mmu_notifier_mm->lock); } spin_unlock(&mm->mmu_notifier_mm->lock); /* - * synchronize_rcu here prevents mmu_notifier_release to + * synchronize_srcu here prevents mmu_notifier_release to * return to exit_mmap (which would proceed freeing all pages * in the mm) until the ->release method returns, if it was * invoked by mmu_notifier_unregister. @@ -74,7 +76,7 @@ void __mmu_notifier_release(struct mm_st * The mmu_notifier_mm can't go away from under us because one * mm_count is hold by exit_mmap. */ - synchronize_rcu(); + synchronize_srcu(&mm->mmu_notifier_mm->srcu); } /* @@ -87,14 +89,14 @@ int __mmu_notifier_clear_flush_young(str { struct mmu_notifier *mn; struct hlist_node *n; - int young = 0; + int young = 0, srcu; - rcu_read_lock(); + srcu = srcu_read_lock(&mm->mmu_notifier_mm->srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->clear_flush_young) young |= mn->ops->clear_flush_young(mn, mm, address); } - rcu_read_unlock(); + srcu_read_unlock(&mm->mmu_notifier_mm->srcu, srcu); return young; } @@ -124,13 +126,14 @@ void __mmu_notifier_invalidate_page(stru { struct mmu_notifier *mn; struct hlist_node *n; + int srcu; - rcu_read_lock(); + srcu = srcu_read_lock(&mm->mmu_notifier_mm->srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->invalidate_page) mn->ops->invalidate_page(mn, mm, address); } - rcu_read_unlock(); + srcu_read_unlock(&mm->mmu_notifier_mm->srcu, srcu); } void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, @@ -138,13 +141,14 @@ void __mmu_notifier_invalidate_range_sta { struct mmu_notifier *mn; struct hlist_node *n; + int srcu; - rcu_read_lock(); + srcu = srcu_read_lock(&mm->mmu_notifier_mm->srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->invalidate_range_start) mn->ops->invalidate_range_start(mn, mm, start, end); } - rcu_read_unlock(); + srcu_read_unlock(&mm->mmu_notifier_mm->srcu, srcu); } void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, @@ -152,13 +156,14 @@ void __mmu_notifier_invalidate_range_end { struct mmu_notifier *mn; struct hlist_node *n; + int srcu; - rcu_read_lock(); + srcu = srcu_read_lock(&mm->mmu_notifier_mm->srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->invalidate_range_end) mn->ops->invalidate_range_end(mn, mm, start, end); } - rcu_read_unlock(); + srcu_read_unlock(&mm->mmu_notifier_mm->srcu, srcu); } static int do_mmu_notifier_register(struct mmu_notifier *mn, @@ -175,6 +180,10 @@ static int do_mmu_notifier_register(stru if (unlikely(!mmu_notifier_mm)) goto out; + ret = init_srcu_struct(&mmu_notifier_mm->srcu); + if (unlikely(ret)) + goto out_kfree; + if (take_mmap_sem) down_write(&mm->mmap_sem); ret = mm_take_all_locks(mm); @@ -205,6 +214,9 @@ static int do_mmu_notifier_register(stru out_cleanup: if (take_mmap_sem) up_write(&mm->mmap_sem); + if (mmu_notifier_mm) + cleanup_srcu_struct(&mmu_notifier_mm->srcu); +out_kfree: /* kfree() does nothing if mmu_notifier_mm is NULL */ kfree(mmu_notifier_mm); out: @@ -245,6 +257,7 @@ EXPORT_SYMBOL_GPL(__mmu_notifier_registe void __mmu_notifier_mm_destroy(struct mm_struct 
*mm) { BUG_ON(!hlist_empty(&mm->mmu_notifier_mm->list)); + cleanup_srcu_struct(&mm->mmu_notifier_mm->srcu); kfree(mm->mmu_notifier_mm); mm->mmu_notifier_mm = LIST_POISON1; /* debug */ } @@ -252,8 +265,8 @@ void __mmu_notifier_mm_destroy(struct mm /* * This releases the mm_count pin automatically and frees the mm * structure if it was the last user of it. It serializes against - * running mmu notifiers with RCU and against mmu_notifier_unregister - * with the unregister lock + RCU. All sptes must be dropped before + * running mmu notifiers with SRCU and against mmu_notifier_unregister + * with the unregister lock + SRCU. All sptes must be dropped before * calling mmu_notifier_unregister. ->release or any other notifier * method may be invoked concurrently with mmu_notifier_unregister, * and only after mmu_notifier_unregister returned we're guaranteed @@ -265,13 +278,15 @@ void mmu_notifier_unregister(struct mmu_ spin_lock(&mm->mmu_notifier_mm->lock); if (!hlist_unhashed(&mn->hlist)) { + int srcu; + hlist_del_rcu(&mn->hlist); /* - * RCU here will force exit_mmap to wait ->release to finish + * SRCU here will force exit_mmap to wait ->release to finish * before freeing the pages. */ - rcu_read_lock(); + srcu = srcu_read_lock(&mm->mmu_notifier_mm->srcu); spin_unlock(&mm->mmu_notifier_mm->lock); /* * exit_mmap will block in mmu_notifier_release to @@ -280,7 +295,7 @@ void mmu_notifier_unregister(struct mmu_ */ if (mn->ops->release) mn->ops->release(mn, mm); - rcu_read_unlock(); + srcu_read_unlock(&mm->mmu_notifier_mm->srcu, srcu); } else spin_unlock(&mm->mmu_notifier_mm->lock); @@ -288,7 +303,7 @@ void mmu_notifier_unregister(struct mmu_ * Wait any running method to finish, of course including * ->release if it was run by mmu_notifier_relase instead of us. */ - synchronize_rcu(); + synchronize_srcu(&mm->mmu_notifier_mm->srcu); BUG_ON(atomic_read(&mm->mm_count) <= 0); -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 26+ messages in thread
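The shape of the conversion in patch 1/3 is the standard SRCU reader/updater pairing: every notifier callout now runs inside a per-mm SRCU read-side critical section, where (unlike under rcu_read_lock()) the callee may schedule, while the release/unregister paths use synchronize_srcu() so a notifier cannot be freed while a reader might still be dereferencing it. A minimal sketch of that pattern, detached from the mmu_notifier specifics (my_notifier, my_srcu and my_lock are illustrative names, not symbols from the patch):

	#include <linux/srcu.h>
	#include <linux/rculist.h>
	#include <linux/spinlock.h>
	#include <linux/slab.h>

	struct my_notifier {
		struct list_head list;
		void (*callback)(struct my_notifier *n);
	};

	static struct srcu_struct my_srcu;	/* init_srcu_struct(&my_srcu) at setup */
	static LIST_HEAD(my_notifiers);
	static DEFINE_SPINLOCK(my_lock);	/* serializes list updates */

	/* Reader side: callbacks are free to schedule inside the SRCU section. */
	static void call_notifiers(void)
	{
		struct my_notifier *n;
		int idx;

		idx = srcu_read_lock(&my_srcu);
		list_for_each_entry_rcu(n, &my_notifiers, list)
			n->callback(n);			/* may sleep */
		srcu_read_unlock(&my_srcu, idx);
	}

	/* Updater side: unlink, then wait out every reader before freeing. */
	static void remove_notifier(struct my_notifier *n)
	{
		spin_lock(&my_lock);
		list_del_rcu(&n->list);
		spin_unlock(&my_lock);
		synchronize_srcu(&my_srcu);	/* no reader can still see n */
		kfree(n);
	}

The patch instantiates exactly this shape, with the srcu_struct made per-mm (inside struct mmu_notifier_mm) so a sleeping reader in one mm never stalls unregistration in an unrelated mm.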
* [RFP-V2 2/3] Fix unmap_vma() bug related to mmu_notifiers [not found] <20100202040145.555474000@alcatraz.americas.sgi.com> 2010-02-02 4:01 ` [RFP-V2 1/3] Have mmu_notifiers use SRCU so they may safely schedule Robin Holt @ 2010-02-02 4:01 ` Robin Holt 2010-02-02 4:01 ` [RFP-V2 3/3] Make mmu_notifier_invalidate_range_start able to sleep Robin Holt 2010-02-02 8:09 ` [RFP-V2 0/3] " Christoph Hellwig 3 siblings, 0 replies; 26+ messages in thread From: Robin Holt @ 2010-02-02 4:01 UTC (permalink / raw) To: Andrew Morton, Andrea Arcangeli; +Cc: Jack Steiner, linux-mm [-- Attachment #1: mmu_notifier_tlb_v1 --] [-- Type: text/plain, Size: 4409 bytes --] unmap_vmas() can fail to correctly flush the TLB if a callout to mmu_notifier_invalidate_range_start() sleeps. The mmu_gather list is initialized prior to the callout. If it is reused while the thread is sleeping, the mm field may be invalid. If the task migrates to a different cpu, the task may use the wrong mmu_gather. The patch changes unmap_vmas() to initialize the mmu_gather AFTER the mmu_notifier completes. Signed-off-by: Jack Steiner <steiner@sgi.com> To: Andrew Morton <akpm@linux-foundation.org> To: Andrea Arcangeli <aarcange@redhat.com> Cc: Jack Steiner <steiner@sgi.com> Cc: linux-mm@kvack.org --- include/linux/mm.h | 2 +- mm/memory.c | 11 +++++++---- mm/mmap.c | 6 ++---- 3 files changed, 10 insertions(+), 9 deletions(-) Index: linux/include/linux/mm.h =================================================================== --- linux.orig/include/linux/mm.h 2010-01-25 01:45:37.000000000 -0600 +++ linux/include/linux/mm.h 2010-01-25 11:32:21.000000000 -0600 @@ -761,7 +761,7 @@ unsigned long zap_page_range(struct vm_a unsigned long unmap_vmas(struct mmu_gather **tlb, struct vm_area_struct *start_vma, unsigned long start_addr, unsigned long end_addr, unsigned long *nr_accounted, - struct zap_details *); + struct zap_details *, int fullmm); /** * mm_walk - callbacks for walk_page_range Index: linux/mm/memory.c =================================================================== --- linux.orig/mm/memory.c 2010-01-25 01:45:37.000000000 -0600 +++ linux/mm/memory.c 2010-01-25 11:32:21.000000000 -0600 @@ -1010,17 +1010,21 @@ static unsigned long unmap_page_range(st unsigned long unmap_vmas(struct mmu_gather **tlbp, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, unsigned long *nr_accounted, - struct zap_details *details) + struct zap_details *details, int fullmm) { long zap_work = ZAP_BLOCK_SIZE; unsigned long tlb_start = 0; /* For tlb_finish_mmu */ int tlb_start_valid = 0; unsigned long start = start_addr; spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; - int fullmm = (*tlbp)->fullmm; struct mm_struct *mm = vma->vm_mm; + /* + * mmu_notifier_invalidate_range_start can sleep. 
Don't initialize + mmu_gather until it completes + */ mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); + *tlbp = tlb_gather_mmu(mm, fullmm); for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { unsigned long end; @@ -1108,9 +1112,8 @@ unsigned long zap_page_range(struct vm_a unsigned long nr_accounted = 0; lru_add_drain(); - tlb = tlb_gather_mmu(mm, 0); update_hiwater_rss(mm); - end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); + end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details, 0); if (tlb) tlb_finish_mmu(tlb, address, end); return end; Index: linux/mm/mmap.c =================================================================== --- linux.orig/mm/mmap.c 2010-01-25 01:45:37.000000000 -0600 +++ linux/mm/mmap.c 2010-01-25 11:35:55.000000000 -0600 @@ -1824,9 +1824,8 @@ static void unmap_region(struct mm_struc unsigned long nr_accounted = 0; lru_add_drain(); - tlb = tlb_gather_mmu(mm, 0); update_hiwater_rss(mm); - unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); + unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL, 0); vm_unacct_memory(nr_accounted); free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, next? next->vm_start: 0); @@ -2168,10 +2167,9 @@ void exit_mmap(struct mm_struct *mm) lru_add_drain(); flush_cache_mm(mm); - tlb = tlb_gather_mmu(mm, 1); /* update_hiwater_rss(mm) here? but nobody should be looking */ /* Use -1 here to ensure all VMAs in the mm are unmapped */ - end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); + end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL, 1); vm_unacct_memory(nr_accounted); free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0); -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 26+ messages in thread
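The essence of patch 2/3 is pure ordering: tlb_gather_mmu() hands back per-cpu state, and nothing may schedule between claiming it and tlb_finish_mmu(), so the potentially sleeping notifier callout has to come first. A sketch of the new ordering in unmap_vmas(), reduced to the part that matters (unmap_vmas_sketch is an illustrative name, not the patched function itself):

	#include <linux/mm.h>
	#include <linux/mmu_notifier.h>
	#include <asm/tlb.h>

	static unsigned long unmap_vmas_sketch(struct mmu_gather **tlbp,
					       struct mm_struct *mm,
					       unsigned long start_addr,
					       unsigned long end_addr, int fullmm)
	{
		/* 1. Sleepable work first, while no per-cpu state is held. */
		mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);

		/*
		 * 2. Only now claim the per-cpu mmu_gather; from here until
		 * the caller runs tlb_finish_mmu(), sleeping could migrate
		 * the task or let another unmap on this cpu reuse the
		 * structure.
		 */
		*tlbp = tlb_gather_mmu(mm, fullmm);

		/* 3. Walk the vmas and zap into *tlbp as before. */
		return end_addr;
	}

This is also why the callers (zap_page_range(), unmap_region(), exit_mmap()) lose their tlb_gather_mmu() calls above and pass fullmm down instead.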
* [RFP-V2 3/3] Make mmu_notifier_invalidate_range_start able to sleep. [not found] <20100202040145.555474000@alcatraz.americas.sgi.com> 2010-02-02 4:01 ` [RFP-V2 1/3] Have mmu_notifiers use SRCU so they may safely schedule Robin Holt 2010-02-02 4:01 ` [RFP-V2 2/3] Fix unmap_vma() bug related to mmu_notifiers Robin Holt @ 2010-02-02 4:01 ` Robin Holt 2010-02-02 8:09 ` [RFP-V2 0/3] " Christoph Hellwig 3 siblings, 0 replies; 26+ messages in thread From: Robin Holt @ 2010-02-02 4:01 UTC (permalink / raw) To: Andrew Morton, Andrea Arcangeli; +Cc: Jack Steiner, linux-mm [-- Attachment #1: mmu_notifier_truncate_sleepable_v1 --] [-- Type: text/plain, Size: 13226 bytes --] Make the truncate case handle the need to sleep. We accomplish this by failing the mmu_notifier_invalidate_range_start(... atomic==1) case which in turn falls back to unmap_mapping_range_vma() with the restart_address == start_address. In that case, we make an additional callout to mmu_notifier_invalidate_range_start(... atomic==0) after the i_mmap_lock has been released. Signed-off-by: Robin Holt <holt@sgi.com> To: Andrew Morton <akpm@linux-foundation.org> To: Andrea Arcangeli <aarcange@redhat.com> Cc: Jack Steiner <steiner@sgi.com> Cc: linux-mm@kvack.org --- drivers/misc/sgi-gru/grutlbpurge.c | 8 +++++--- include/linux/mmu_notifier.h | 22 ++++++++++++---------- mm/fremap.c | 2 +- mm/hugetlb.c | 2 +- mm/memory.c | 29 +++++++++++++++++++++++------ mm/mmu_notifier.c | 10 ++++++---- mm/mprotect.c | 2 +- mm/mremap.c | 2 +- virt/kvm/kvm_main.c | 11 +++++++---- 9 files changed, 57 insertions(+), 31 deletions(-) Index: mmu_notifiers_sleepable_v2/drivers/misc/sgi-gru/grutlbpurge.c =================================================================== --- mmu_notifiers_sleepable_v2.orig/drivers/misc/sgi-gru/grutlbpurge.c 2010-02-01 21:10:06.000000000 -0600 +++ mmu_notifiers_sleepable_v2/drivers/misc/sgi-gru/grutlbpurge.c 2010-02-01 21:41:19.000000000 -0600 @@ -219,9 +219,10 @@ void gru_flush_all_tlb(struct gru_state /* * MMUOPS notifier callout functions */ -static void gru_invalidate_range_start(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, unsigned long end) +static int gru_invalidate_range_start(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, unsigned long end, + bool atomic) { struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct, ms_notifier); @@ -231,6 +232,7 @@ static void gru_invalidate_range_start(s gru_dbg(grudev, "gms %p, start 0x%lx, end 0x%lx, act %d\n", gms, start, end, atomic_read(&gms->ms_range_active)); gru_flush_tlb_range(gms, start, end - start); + return 0; } static void gru_invalidate_range_end(struct mmu_notifier *mn, Index: mmu_notifiers_sleepable_v2/include/linux/mmu_notifier.h =================================================================== --- mmu_notifiers_sleepable_v2.orig/include/linux/mmu_notifier.h 2010-02-01 21:10:20.000000000 -0600 +++ mmu_notifiers_sleepable_v2/include/linux/mmu_notifier.h 2010-02-01 21:10:21.000000000 -0600 @@ -127,9 +127,10 @@ struct mmu_notifier_ops { * address space but may still be referenced by sptes until * the last refcount is dropped. 
*/ - void (*invalidate_range_start)(struct mmu_notifier *mn, + int (*invalidate_range_start)(struct mmu_notifier *mn, struct mm_struct *mm, - unsigned long start, unsigned long end); + unsigned long start, unsigned long end, + bool atomic); void (*invalidate_range_end)(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end); @@ -170,8 +171,8 @@ extern void __mmu_notifier_change_pte(st unsigned long address, pte_t pte); extern void __mmu_notifier_invalidate_page(struct mm_struct *mm, unsigned long address); -extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, - unsigned long start, unsigned long end); +extern int __mmu_notifier_invalidate_range_start(struct mm_struct *mm, + unsigned long start, unsigned long end, bool atomic); extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, unsigned long start, unsigned long end); @@ -203,11 +204,12 @@ static inline void mmu_notifier_invalida __mmu_notifier_invalidate_page(mm, address); } -static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm, - unsigned long start, unsigned long end) +static inline int mmu_notifier_invalidate_range_start(struct mm_struct *mm, + unsigned long start, unsigned long end, bool atomic) { if (mm_has_notifiers(mm)) - __mmu_notifier_invalidate_range_start(mm, start, end); + return __mmu_notifier_invalidate_range_start(mm, start, end, atomic); + return 0; } static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm, @@ -288,10 +290,10 @@ static inline void mmu_notifier_invalida unsigned long address) { } - -static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm, - unsigned long start, unsigned long end) +static inline int mmu_notifier_invalidate_range_start(struct mm_struct *mm, + unsigned long start, unsigned long end, bool atomic) { + return 0; } static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm, Index: mmu_notifiers_sleepable_v2/mm/fremap.c =================================================================== --- mmu_notifiers_sleepable_v2.orig/mm/fremap.c 2010-02-01 21:10:06.000000000 -0600 +++ mmu_notifiers_sleepable_v2/mm/fremap.c 2010-02-01 21:41:19.000000000 -0600 @@ -226,7 +226,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsign vma->vm_flags = saved_flags; } - mmu_notifier_invalidate_range_start(mm, start, start + size); + mmu_notifier_invalidate_range_start(mm, start, start + size, 0); err = populate_range(mm, vma, start, size, pgoff); mmu_notifier_invalidate_range_end(mm, start, start + size); if (!err && !(flags & MAP_NONBLOCK)) { Index: mmu_notifiers_sleepable_v2/mm/hugetlb.c =================================================================== --- mmu_notifiers_sleepable_v2.orig/mm/hugetlb.c 2010-02-01 21:10:06.000000000 -0600 +++ mmu_notifiers_sleepable_v2/mm/hugetlb.c 2010-02-01 21:41:19.000000000 -0600 @@ -2159,7 +2159,7 @@ void __unmap_hugepage_range(struct vm_ar BUG_ON(start & ~huge_page_mask(h)); BUG_ON(end & ~huge_page_mask(h)); - mmu_notifier_invalidate_range_start(mm, start, end); + mmu_notifier_invalidate_range_start(mm, start, end, 0); spin_lock(&mm->page_table_lock); for (address = start; address < end; address += sz) { ptep = huge_pte_offset(mm, address); Index: mmu_notifiers_sleepable_v2/mm/memory.c =================================================================== --- mmu_notifiers_sleepable_v2.orig/mm/memory.c 2010-02-01 21:10:21.000000000 -0600 +++ mmu_notifiers_sleepable_v2/mm/memory.c 2010-02-01 21:41:19.000000000 -0600 @@ -786,7 +786,7 @@ int 
copy_page_range(struct mm_struct *ds * is_cow_mapping() returns true. */ if (is_cow_mapping(vma->vm_flags)) - mmu_notifier_invalidate_range_start(src_mm, addr, end); + mmu_notifier_invalidate_range_start(src_mm, addr, end, 0); ret = 0; dst_pgd = pgd_offset(dst_mm, addr); @@ -990,7 +990,8 @@ static unsigned long unmap_page_range(st * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here * @details: details of nonlinear truncation or shared cache invalidation * - * Returns the end address of the unmapping (restart addr if interrupted). + * Returns the end address of the unmapping (restart addr if interrupted, start + * if the i_mmap_lock is held and mmu_notifier_invalidate_range_start() needs to sleep). * * Unmap all pages in the vma list. * @@ -1023,7 +1024,10 @@ unsigned long unmap_vmas(struct mmu_gath * mmu_notifier_invalidate_range_start can sleep. Don't initialize * mmu_gather until it completes */ - mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); + if (mmu_notifier_invalidate_range_start(mm, start_addr, + end_addr, (i_mmap_lock != NULL))) + goto out; + *tlbp = tlb_gather_mmu(mm, fullmm); for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { unsigned long end; @@ -1107,7 +1111,7 @@ unsigned long zap_page_range(struct vm_a unsigned long size, struct zap_details *details) { struct mm_struct *mm = vma->vm_mm; - struct mmu_gather *tlb; + struct mmu_gather *tlb = NULL; unsigned long end = address + size; unsigned long nr_accounted = 0; @@ -1908,7 +1912,7 @@ int apply_to_page_range(struct mm_struct int err; BUG_ON(addr >= end); - mmu_notifier_invalidate_range_start(mm, start, end); + mmu_notifier_invalidate_range_start(mm, start, end, 0); pgd = pgd_offset(mm, addr); do { next = pgd_addr_end(addr, end); @@ -2329,6 +2333,7 @@ static int unmap_mapping_range_vma(struc { unsigned long restart_addr; int need_break; + int need_unlocked_invalidate; /* * files that support invalidating or truncating portions of the @@ -2350,7 +2355,9 @@ again: restart_addr = zap_page_range(vma, start_addr, end_addr - start_addr, details); - need_break = need_resched() || spin_needbreak(details->i_mmap_lock); + need_unlocked_invalidate = (restart_addr == start_addr); + need_break = need_resched() || spin_needbreak(details->i_mmap_lock) || + need_unlocked_invalidate; if (restart_addr >= end_addr) { /* We have now completed this vma: mark it so */ @@ -2365,6 +2372,16 @@ again: } spin_unlock(details->i_mmap_lock); + if (need_unlocked_invalidate) { + /* + * zap_page_range failed to make any progress because the + * mmu_notifier_invalidate_range_start was called atomically + * while the callee needed to sleep. In that event, we
+ */ + mmu_notifier_invalidate_range_start(vma->vm_mm, start_addr, end_addr, 0); + mmu_notifier_invalidate_range_end(vma->vm_mm, start_addr, end_addr); + } cond_resched(); spin_lock(details->i_mmap_lock); return -EINTR; Index: mmu_notifiers_sleepable_v2/mm/mmu_notifier.c =================================================================== --- mmu_notifiers_sleepable_v2.orig/mm/mmu_notifier.c 2010-02-01 21:10:20.000000000 -0600 +++ mmu_notifiers_sleepable_v2/mm/mmu_notifier.c 2010-02-01 21:41:19.000000000 -0600 @@ -135,20 +135,22 @@ void __mmu_notifier_invalidate_page(stru } srcu_read_unlock(&mm->mmu_notifier_mm->srcu, srcu); } - -void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, - unsigned long start, unsigned long end) +int __mmu_notifier_invalidate_range_start(struct mm_struct *mm, + unsigned long start, unsigned long end, bool atomic) { struct mmu_notifier *mn; struct hlist_node *n; int srcu; + int ret = 0; srcu = srcu_read_lock(&mm->mmu_notifier_mm->srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->invalidate_range_start) - mn->ops->invalidate_range_start(mn, mm, start, end); + ret |= mn->ops->invalidate_range_start(mn, mm, start, + end, atomic); } srcu_read_unlock(&mm->mmu_notifier_mm->srcu, srcu); + return ret; } void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, Index: mmu_notifiers_sleepable_v2/mm/mprotect.c =================================================================== --- mmu_notifiers_sleepable_v2.orig/mm/mprotect.c 2010-02-01 21:10:06.000000000 -0600 +++ mmu_notifiers_sleepable_v2/mm/mprotect.c 2010-02-01 21:41:19.000000000 -0600 @@ -204,7 +204,7 @@ success: dirty_accountable = 1; } - mmu_notifier_invalidate_range_start(mm, start, end); + mmu_notifier_invalidate_range_start(mm, start, end, 0); if (is_vm_hugetlb_page(vma)) hugetlb_change_protection(vma, start, end, vma->vm_page_prot); else Index: mmu_notifiers_sleepable_v2/mm/mremap.c =================================================================== --- mmu_notifiers_sleepable_v2.orig/mm/mremap.c 2010-02-01 21:10:06.000000000 -0600 +++ mmu_notifiers_sleepable_v2/mm/mremap.c 2010-02-01 21:41:19.000000000 -0600 @@ -82,7 +82,7 @@ static void move_ptes(struct vm_area_str old_start = old_addr; mmu_notifier_invalidate_range_start(vma->vm_mm, - old_start, old_end); + old_start, old_end, 0); if (vma->vm_file) { /* * Subtle point from Rajesh Venkatasubramanian: before Index: mmu_notifiers_sleepable_v2/virt/kvm/kvm_main.c =================================================================== --- mmu_notifiers_sleepable_v2.orig/virt/kvm/kvm_main.c 2010-02-01 21:10:06.000000000 -0600 +++ mmu_notifiers_sleepable_v2/virt/kvm/kvm_main.c 2010-02-01 21:41:19.000000000 -0600 @@ -259,10 +259,11 @@ static void kvm_mmu_notifier_change_pte( spin_unlock(&kvm->mmu_lock); } -static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, - unsigned long end) +static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, + unsigned long end, + bool atomic) { struct kvm *kvm = mmu_notifier_to_kvm(mn); int need_tlb_flush = 0; @@ -281,6 +282,8 @@ static void kvm_mmu_notifier_invalidate_ /* we've to flush the tlb before the pages can be freed */ if (need_tlb_flush) kvm_flush_remote_tlbs(kvm); + + return 0; } static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to 
majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 26+ messages in thread
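Seen from the notifier implementor's side, the new contract of patch 3/3 is: invalidate_range_start() now learns whether it was invoked from atomic context, and returning nonzero asks the core VM to back off and repeat the callout from a sleepable context. A hypothetical XPMEM-style notifier (the xpmem_* names below are made up for illustration; the series contains no XPMEM code) would look roughly like:

	static int xpmem_invalidate_range_start(struct mmu_notifier *mn,
						struct mm_struct *mm,
						unsigned long start,
						unsigned long end, bool atomic)
	{
		struct xpmem_segment *seg =
			container_of(mn, struct xpmem_segment, mmu_not);

		/*
		 * Tearing down remote ptes means messaging other hosts,
		 * which sleeps.  If the caller is atomic and there is
		 * work to do, punt: the core retries with atomic == 0
		 * after dropping the i_mmap_lock.
		 */
		if (atomic && xpmem_range_is_exported(seg, start, end))
			return 1;

		xpmem_clear_remote_ptes(seg, start, end);	/* may sleep */
		return 0;
	}

GRU and KVM never need to sleep here, which is why their conversions above only grow the extra parameter and unconditionally return 0.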
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep. [not found] <20100202040145.555474000@alcatraz.americas.sgi.com> ` (2 preceding siblings ...) 2010-02-02 4:01 ` [RFP-V2 3/3] Make mmu_notifier_invalidate_range_start able to sleep Robin Holt @ 2010-02-02 8:09 ` Christoph Hellwig 2010-02-02 12:59 ` Andrea Arcangeli 2010-02-02 13:35 ` Robin Holt 3 siblings, 2 replies; 26+ messages in thread From: Christoph Hellwig @ 2010-02-02 8:09 UTC (permalink / raw) To: Robin Holt; +Cc: Andrew Morton, Andrea Arcangeli, Jack Steiner, linux-mm On Mon, Feb 01, 2010 at 10:01:45PM -0600, Robin Holt wrote: > XPMEM would like to utilize mmu_notifiers to track page table entry > changes of the segment and keep the attachment page table/tlb information > consistent. Given that SGI just pushes XPMEM directly into the distributions instead of adding it upstream I don't really see the relevance of these patches. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep. 2010-02-02 8:09 ` [RFP-V2 0/3] " Christoph Hellwig @ 2010-02-02 12:59 ` Andrea Arcangeli 2010-02-02 13:13 ` Andrea Arcangeli 2010-02-02 13:23 ` Robin Holt 1 sibling, 2 replies; 26+ messages in thread From: Andrea Arcangeli @ 2010-02-02 12:59 UTC (permalink / raw) To: Christoph Hellwig; +Cc: Robin Holt, Andrew Morton, Jack Steiner, linux-mm On Tue, Feb 02, 2010 at 03:09:47AM -0500, Christoph Hellwig wrote: > On Mon, Feb 01, 2010 at 10:01:45PM -0600, Robin Holt wrote: > > XPMEM would like to utilize mmu_notifiers to track page table entry > > changes of the segment and keep the attachment page table/tlb information > > consistent. > > Given that SGI just pushes XPMEM directly into the distributions instead > of adding it upstream I don't really see the relevance of these patches. That will then prevent upstream modules from building against those kernels. Not a huge issue; for a distro that's an ok compromise. My real issue with mainline is that XPMEM is ok with breaking and corrupting memory if people use XPMEM on top of shared mappings (instead of anonymous ones), something a two liner change to the userland app opening the xpmem device is enough to trigger; but when the next mmu notifier user comes and asks for full scheduling across shared mappings too, because it needs security and a not-trusted user can open /dev/xpmem (or wherever that device is located), we'll have to undo this work and fix it the real way (with config option MMU_NOTIFIER_SLEEPABLE=y). But if distros have to support XPMEM in default kernels, this hack is better because it won't slow down the locking, even if it leaves holes and corrupts memory when XPMEM can be opened by a luser. It really depends on whether the user having access to the XPMEM device is malicious; if we know it's not (assume avatar distributed rendering in a closed environment or whatever), this again is an ok hack. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep. 2010-02-02 12:59 ` Andrea Arcangeli @ 2010-02-02 13:13 ` Andrea Arcangeli 2010-02-02 13:29 ` Robin Holt 0 siblings, 1 reply; 26+ messages in thread From: Andrea Arcangeli @ 2010-02-02 13:13 UTC (permalink / raw) To: Christoph Hellwig; +Cc: Robin Holt, Andrew Morton, Jack Steiner, linux-mm On Tue, Feb 02, 2010 at 01:59:43PM +0100, Andrea Arcangeli wrote: > slow down the locking, even if it leaves holes and corrupts memory when > XPMEM can be opened by a luser. It really depends on whether the user having > access to the XPMEM device is malicious; if we know it's not (assume > avatar distributed rendering in a closed environment or whatever), this > again is an ok hack. From another point of view: if the userland has to be as trusted as the kernel for this hack to be ok, I don't get why it's not ok to just schedule unconditionally in the invalidate_range_start without altering the API and gracefully deadlock in the i_mmap_lock. If the secondary mappings cannot be torn down without scheduling, it means the page will be swapped out but the physical pages can still be written to despite the page being swapped out and reused by something else, leading to trivial memory corruption if the user having access to the xpmem device is malicious. Like Andrew already said, we've no clue what the "bool atomic" parameter will be used for and so it's next to impossible to judge the validity of this hack (because a hack it is). We don't know how xpmem will react to that event, all we know is that it won't be able to invalidate secondary mappings by the time this call returns, leading to memory corruption. If it panics or if it ignores the invalidate when atomic=1, it's equivalent to or worse than just scheduling in atomic. If it schedules in atomic there's zero risk of memory corruption at least and no need to alter the API. So even for distros this hack to the API isn't necessary; only srcu and tlb flush deferral is needed. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep. 2010-02-02 13:13 ` Andrea Arcangeli @ 2010-02-02 13:29 ` Robin Holt 2010-02-02 13:40 ` Andrea Arcangeli 0 siblings, 1 reply; 26+ messages in thread From: Robin Holt @ 2010-02-02 13:29 UTC (permalink / raw) To: Andrea Arcangeli Cc: Christoph Hellwig, Robin Holt, Andrew Morton, Jack Steiner, linux-mm On Tue, Feb 02, 2010 at 02:13:41PM +0100, Andrea Arcangeli wrote: > On Tue, Feb 02, 2010 at 01:59:43PM +0100, Andrea Arcangeli wrote: > > slow down the locking, even if it leaves holes and corrupts memory when > > XPMEM can be opened by a luser. It really depends on whether the user having > > access to the XPMEM device is malicious; if we know it's not (assume > > avatar distributed rendering in a closed environment or whatever), this > > again is an ok hack. > > From another point of view: if the userland has to be as trusted as > the kernel for this hack to be ok, I don't get why it's not ok to > just schedule unconditionally in the invalidate_range_start without > altering the API and gracefully deadlock in the i_mmap_lock. If the > secondary mappings cannot be torn down without scheduling, it means the > page will be swapped out but the physical pages can still be written > to despite the page being swapped out and reused by something else, > leading to trivial memory corruption if the user having access to > the xpmem device is malicious. > > Like Andrew already said, we've no clue what the "bool atomic" > parameter will be used for and so it's next to impossible to judge the > validity of this hack (because a hack it is). We don't know how > xpmem will react to that event, all we know is that it won't be able > to invalidate secondary mappings by the time this call returns, leading > to memory corruption. If it panics or if it ignores the invalidate > when atomic=1, it's equivalent to or worse than just scheduling in The atomic==1 case is only for the truncate case, correct? XPMEM is holding reference counts on the pages it exports (get_user_pages) so they are not freed even when the zap_page_range has completed. What I think we are dealing with is an inconsistent appearance to userland. The one task would SIGBUS if it touches the memory. The other would be able to read/write it just fine until the asynchronous zap of the attachment completed. Robin -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 26+ messages in thread
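The pin Robin refers to is an ordinary elevated page refcount taken at export time. Roughly, using the get_user_pages() signature of this era (pin_export_range and unpin_export_range below are illustrations, not XPMEM code):

	#include <linux/mm.h>
	#include <linux/sched.h>

	/* Pin npages user pages starting at uaddr. */
	static int pin_export_range(struct task_struct *tsk, struct mm_struct *mm,
				    unsigned long uaddr, struct page **pages,
				    int npages)
	{
		int got;

		down_read(&mm->mmap_sem);
		got = get_user_pages(tsk, mm, uaddr, npages,
				     1 /* write */, 0 /* force */, pages, NULL);
		up_read(&mm->mmap_sem);
		return got;	/* number of pages actually pinned, or -errno */
	}

	/*
	 * Each pinned page carries an extra refcount: zap_page_range() can
	 * clear the ptes, but the page stays allocated until the exporter
	 * drops its references, so remote accesses never hit freed/reused
	 * memory; they only see pages that have already vanished from the
	 * exporting mm, hence the "inconsistent appearance" above.
	 */
	static void unpin_export_range(struct page **pages, int npages)
	{
		int i;

		for (i = 0; i < npages; i++)
			put_page(pages[i]);
	}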
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep. 2010-02-02 13:29 ` Robin Holt @ 2010-02-02 13:40 ` Andrea Arcangeli 2010-02-02 13:51 ` Robin Holt 0 siblings, 1 reply; 26+ messages in thread From: Andrea Arcangeli @ 2010-02-02 13:40 UTC (permalink / raw) To: Robin Holt; +Cc: Christoph Hellwig, Andrew Morton, Jack Steiner, linux-mm On Tue, Feb 02, 2010 at 07:29:20AM -0600, Robin Holt wrote: > The atomic==1 case is only for the truncate case, correct? XPMEM is Correct. > holding reference counts on the pages it exports (get_user_pages) so > they are not freed even when the zap_page_range has completed. What I > think we are dealing with is an inconsistent appearance to userland. > The one task would SIGBUS if it touches the memory. The other would > be able to read/write it just fine until the asynchronous zap of the > attachment completed. Ok, thanks to the page pin it won't randomly corrupt memory, but it can still screw the runtime of an unmodified unaware program. I think you've to figure out how important it is that you won't deadlock if a luser modifies userland because this isn't a complete approach and as much as I care about your workload that is ok with this, I cannot exclude that a usage might materialize in the future where a sigbus while another thread still accesses the remote pages is not ok and may screw userland in a more subtle way than a visible kernel deadlock. We can do this now and undo it later, nothing very problematic, but considering this isn't a fully transparent solution, I don't see the big deal in just scheduling in atomic if the user does what it can't do (there will be unexpected behavior in his app anyway if he does that). I don't see a problem in applying srcu and the tlb gather patch in distro kernels; those won't even prevent the upstream modules from building against those kernels, and there will be no change of API. In general making the methods sleepable doesn't need to alter the API at all... the reason for this change of API is that we're not actually making all of them sleepable, only a few. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep. 2010-02-02 13:40 ` Andrea Arcangeli @ 2010-02-02 13:51 ` Robin Holt 2010-02-02 14:10 ` Andrea Arcangeli 0 siblings, 1 reply; 26+ messages in thread From: Robin Holt @ 2010-02-02 13:51 UTC (permalink / raw) To: Andrea Arcangeli Cc: Robin Holt, Christoph Hellwig, Andrew Morton, Jack Steiner, linux-mm On Tue, Feb 02, 2010 at 02:40:47PM +0100, Andrea Arcangeli wrote: > On Tue, Feb 02, 2010 at 07:29:20AM -0600, Robin Holt wrote: > > The atomic==1 case is only for the truncate case, correct? XPMEM is > > Correct. > > > holding reference counts on the pages it exports (get_user_pages) so > > they are not freed even when the zap_page_range has completed. What I > > think we are dealing with is an inconsistent appearance to userland. > > The one task would SIGBUS if it touches the memory. The other would > > be able to read/write it just fine until the asynchronous zap of the > > attachment completed. > > Ok, thanks to the page pin it won't randomly corrupt memory, but it > can still screw the runtime of an unmodified unaware program. I think > you've to figure out how important it is that you won't deadlock if > a luser modifies userland because this isn't a complete approach and as > much as I care about your workload that is ok with this, I cannot > exclude that a usage might materialize in the future where a sigbus while > another thread still accesses the remote pages is not ok and may screw > userland in a more subtle way than a visible kernel deadlock. We > can do this now and undo it later, nothing very problematic, but > considering this isn't a fully transparent solution, I don't see the > big deal in just scheduling in atomic if the user does what it can't do > (there will be unexpected behavior in his app anyway if he does that). I am sorry, I mixed up implementations. I had worked late into the evening on an alternative to this patch which did not require the unlock-the-i_mmap_lock, _inv_range_start, relock, return -EINTR sequence. That approach would have had the caveats above. The approach we are discussing here does not have any difference in userland behavior. When called with atomic==1, XPMEM will detect that we have exported pages in that address range and return !0 indicating we need to sleep to satisfy this call. The i_mmap_lock would be released and _inv_range_start would be called again with atomic==0. This time the pages would be cleared from the attachments, and unmap_mapping_range_vma would then retry. Hopefully the call this time with atomic==1 would return 0 indicating it did not need to sleep. > I don't see a problem in applying srcu and the tlb gather patch in > distro kernels; those won't even prevent the upstream modules from building > against those kernels, and there will be no change of API. In general > making the methods sleepable doesn't need to alter the API at > all... the reason for this change of API is that we're not actually > making all of them sleepable, only a few. I don't see the change in API with this method either. Robin -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep. 2010-02-02 13:51 ` Robin Holt @ 2010-02-02 14:10 ` Andrea Arcangeli 2010-02-02 14:21 ` Robin Holt 0 siblings, 1 reply; 26+ messages in thread From: Andrea Arcangeli @ 2010-02-02 14:10 UTC (permalink / raw) To: Robin Holt; +Cc: Christoph Hellwig, Andrew Morton, Jack Steiner, linux-mm On Tue, Feb 02, 2010 at 07:51:41AM -0600, Robin Holt wrote: > I don't see the change in API with this method either. The API change I'm referring to is the reason that you had to patch virt/kvm/kvm_main.c and drivers/misc/sgi-gru/grutlbpurge.c to prevent compile failure. That isn't needed if we really make mmu notifier sleepable like my old patches did just fine. Except they slowed down the locking to achieve it... (the slowdown should be confined to a config option) and you don't want that I guess. But if you didn't need to return -EINVAL I think your userland would also be safer. The only problem I can see is that you would then have trouble convincing distros to build with the slower locking, and you basically are ok with breaking userland in truncate to be sure your module will work with a default binary distro kernel. It's a tradeoff and I'm not against it but it has to be well documented that this is a hack to be practical on binary shipped kernels. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep. 2010-02-02 14:10 ` Andrea Arcangeli @ 2010-02-02 14:21 ` Robin Holt 2010-02-02 14:59 ` Andrea Arcangeli 0 siblings, 1 reply; 26+ messages in thread From: Robin Holt @ 2010-02-02 14:21 UTC (permalink / raw) To: Andrea Arcangeli Cc: Robin Holt, Christoph Hellwig, Andrew Morton, Jack Steiner, linux-mm On Tue, Feb 02, 2010 at 03:10:36PM +0100, Andrea Arcangeli wrote: > On Tue, Feb 02, 2010 at 07:51:41AM -0600, Robin Holt wrote: > > I don't see the change in API with this method either. > > The API change I'm referring to is the reason that you had to patch > virt/kvm/kvm_main.c and drivers/misc/sgi-gru/grutlbpurge.c to prevent So the API is an mmu_notifier thing and not external. I think adding reference counting to the VMA and converting the i_mmap_lock to i_mmap_sem might have a slightly larger impact on users of kernel headers than this proposal. > compile failure. That isn't needed if we really make mmu notifier > sleepable like my old patches did just fine. Except they slowed down > the locking to achieve it... (the slowdown should be confined to a > config option) and you don't want that I guess. But if you didn't need Your argument seems ridiculous. Take this larger series of patches which touches many parts of the kernel and has a runtime downside for 99% of the user community but only when configured on, and then try and argue with the distros that they should slow all users down for our 1%. > to return -EINVAL I think your userland would also be safer. The only I think you missed my correction to an earlier statement. This patcheset does not have any data corruption or userland inconsistency. I had mistakenly spoken of a patchset I am working up as a lesser alternative to this one. > problem I can see is that you would then have trouble convincing > distros to build with the slower locking, and you basically are ok with > breaking userland in truncate to be sure your module will work with a > default binary distro kernel. It's a tradeoff and I'm not against it > but it has to be well documented that this is a hack to be practical > on binary shipped kernels. This is no more a hack than the other long list of compromises that have been made in the past. Very similar to your huge page patchset which invalidates a page by using the range callout. NIHS is not the same as a hack. Robin -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep. 2010-02-02 14:21 ` Robin Holt @ 2010-02-02 14:59 ` Andrea Arcangeli 2010-02-02 15:21 ` Robin Holt 0 siblings, 1 reply; 26+ messages in thread From: Andrea Arcangeli @ 2010-02-02 14:59 UTC (permalink / raw) To: Robin Holt; +Cc: Christoph Hellwig, Andrew Morton, Jack Steiner, linux-mm On Tue, Feb 02, 2010 at 08:21:30AM -0600, Robin Holt wrote: > Your argument seems ridiculous. Take this larger series of patches which > touches many parts of the kernel and has a runtime downside for 99% of > the user community but only when configured on, and then try and argue > with the distros that they should slow all users down for our 1%. I wasn't suggesting asking to apply such a patch to a kernel distro! If that's what you understood... that would have been the only ridiculous thing about it. If you want to send patches to a distro, your hack is probably the max, as it alters kABI but not so much. > > to return -EINVAL I think your userland would also be safer. The only > > I think you missed my correction to an earlier statement. This patcheset > does not have any data corruption or userland inconsistency. I had mistakenly > spoken of a patchset I am working up as a lesser alternative to this one. If there is never data corruption or userland inconsistency when I do mmap(MAP_SHARED) truncate(0) then I've to wonder why at all you need any modification if you already can handle remote spte invalidation through atomic sections. That is ridiculous that you can handle it through atomic-section truncate without sleepability, and you still ask sleepability for mmu notifier in the first place... > This is no more a hack than the other long list of compromises that have > been made in the past. Very similar to your huge page patchset which > invalidates a page by using the range callout. NIHS is not the same as > a hack. My hack is just to avoid having to modify the mmu notifier API, to reduce the amount of mangling I have to ask people to digest at once; I simply can't do too many things at once. It's not the right example of a compromise, as it's going to get fixed without any downside... no tradeoff at all. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep. 2010-02-02 14:59 ` Andrea Arcangeli @ 2010-02-02 15:21 ` Robin Holt 2010-02-02 16:01 ` Andrea Arcangeli 0 siblings, 1 reply; 26+ messages in thread From: Robin Holt @ 2010-02-02 15:21 UTC (permalink / raw) To: Andrea Arcangeli Cc: Robin Holt, Christoph Hellwig, Andrew Morton, Jack Steiner, linux-mm On Tue, Feb 02, 2010 at 03:59:11PM +0100, Andrea Arcangeli wrote: > > I think you missed my correction to an earlier statement. This patcheset > > does not have any data corruption or userland inconsistency. I had mistakenly > > spoken of a patchset I am working up as a lesser alternative to this one. > > If there is never data corruption or userland inconsistency when I do > mmap(MAP_SHARED) truncate(0) then I've to wonder why at all you need > any modification if you already can handle remote spte invalidation > through atomic sections. That is ridiculous that you can handle it > through atomic-section truncate without sleepability, and you still > ask sleepability for mmu notifier in the first place... In the truncate(0) example you provide, the sequence would be as follows: On the first call from unmap_vmas into _inv_range_start(atomic==1), XPMEM would scan the segment's PFN table. If there were pages in that range which have been exported, we would return !0 without doing any invalidation. The unmap_vmas code would see the non-zero return and return start_addr back to zap_page_range and further to unmap_mapping_range_vma where need_unlocked_invalidate would be set. unmap_mapping_range_vma would then unlock the i_mmap_lock, call _inv_range_start(atomic==0) which would clear all the remote page tables and TLBs. It would then reaquire the i_mmap_lock and retry. This time unmap_vmas would call _inv_range_start(atomic==1). XPMEM would scan the segment's PFN table and find there were no pages exported and return 0. Things would proceed as normal from there. No corruption. No intrusive locking additions that negatively affect the vast majority of users. A compromise. The only downside I can see at all in the CONFIG_MMU_NOTIFIER=n case is unmap_mapping_range_vma is slightly larger. Robin -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep. 2010-02-02 15:21 ` Robin Holt @ 2010-02-02 16:01 ` Andrea Arcangeli 2010-02-02 16:39 ` Robin Holt 0 siblings, 1 reply; 26+ messages in thread From: Andrea Arcangeli @ 2010-02-02 16:01 UTC (permalink / raw) To: Robin Holt; +Cc: Christoph Hellwig, Andrew Morton, Jack Steiner, linux-mm On Tue, Feb 02, 2010 at 09:21:42AM -0600, Robin Holt wrote: > unmap_mapping_range_vma would then unlock the i_mmap_lock, call > _inv_range_start(atomic==0) which would clear all the remote page tables > and TLBs. It would then reaquire the i_mmap_lock and retry. I guess you missed why we hold the i_mmap_lock there... it's not like gratuitous locking complication there's an actual reason why it's taken, and it's to avoid the vma to be freed from under you: + if (need_unlocked_invalidate) { + /* + * If zap_page_range failed to make any progress because the + * mmu_notifier_invalidate_range_start was called atomically + * while the callee needed to sleep. In that event, we + * make the callout while the i_mmap_lock is released. + */ + mmu_notifier_invalidate_range_start(vma->vm_mm, start_addr, end_addr, 0); + mmu_notifier_invalidate_range_end(vma->vm_mm, start_addr, end_addr); The above runs with vma being a dangling pointer, it'll corrupt memory randomly and crash. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep. 2010-02-02 16:01 ` Andrea Arcangeli @ 2010-02-02 16:39 ` Robin Holt 2010-02-02 16:52 ` Andrea Arcangeli 0 siblings, 1 reply; 26+ messages in thread From: Robin Holt @ 2010-02-02 16:39 UTC (permalink / raw) To: Andrea Arcangeli Cc: Robin Holt, Christoph Hellwig, Andrew Morton, Jack Steiner, linux-mm On Tue, Feb 02, 2010 at 05:01:46PM +0100, Andrea Arcangeli wrote: > On Tue, Feb 02, 2010 at 09:21:42AM -0600, Robin Holt wrote: > > unmap_mapping_range_vma would then unlock the i_mmap_lock, call > > _inv_range_start(atomic==0) which would clear all the remote page tables > > and TLBs. It would then reaquire the i_mmap_lock and retry. > > I guess you missed why we hold the i_mmap_lock there... it's not like > gratuitous locking complication there's an actual reason why it's > taken, and it's to avoid the vma to be freed from under you: Oversight on my part. Sorry. Will this work? static int unmap_mapping_range_vma(struct vm_area_struct *vma, ... if (need_unlocked_invalidate) { mm = vma->vm_mm; atomic_inc(&mm->mm_users); } spin_unlock(details->i_mmap_lock); if (need_unlocked_invalidate) { /* * zap_page_range failed to make any progress because the * mmu_notifier_invalidate_range_start was called atomically * while the callee needed to sleep. In that event, we * make the callout while the i_mmap_lock is released. */ mmu_notifier_invalidate_range_start(mm, start_addr, end_addr, 0); mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); mmput(mm); } cond_resched(); spin_lock(details->i_mmap_lock); return -EINTR; Robin -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep. 2010-02-02 16:39 ` Robin Holt @ 2010-02-02 16:52 ` Andrea Arcangeli 2010-02-02 16:59 ` Robin Holt 0 siblings, 1 reply; 26+ messages in thread From: Andrea Arcangeli @ 2010-02-02 16:52 UTC (permalink / raw) To: Robin Holt; +Cc: Christoph Hellwig, Andrew Morton, Jack Steiner, linux-mm On Tue, Feb 02, 2010 at 10:39:30AM -0600, Robin Holt wrote: > On Tue, Feb 02, 2010 at 05:01:46PM +0100, Andrea Arcangeli wrote: > > On Tue, Feb 02, 2010 at 09:21:42AM -0600, Robin Holt wrote: > > > unmap_mapping_range_vma would then unlock the i_mmap_lock, call > > > _inv_range_start(atomic==0) which would clear all the remote page tables > > > and TLBs. It would then reaquire the i_mmap_lock and retry. > > > > I guess you missed why we hold the i_mmap_lock there... it's not like > > gratuitous locking complication there's an actual reason why it's > > taken, and it's to avoid the vma to be freed from under you: > > Oversight on my part. Sorry. > > Will this work? No, it still corrupts memory as before. You need to re-run find_vma under mmap_sem. Then it could work... Also it's wrong to pin mm_users, you only need mm_count here, as you only need to run find_vma, you don't need to prevent exit to free the pages indefinitely while you're blocked. > static int unmap_mapping_range_vma(struct vm_area_struct *vma, > ... > if (need_unlocked_invalidate) { > mm = vma->vm_mm; > atomic_inc(&mm->mm_users); > } > spin_unlock(details->i_mmap_lock); > if (need_unlocked_invalidate) { > /* > * zap_page_range failed to make any progress because the > * mmu_notifier_invalidate_range_start was called atomically > * while the callee needed to sleep. In that event, we > * make the callout while the i_mmap_lock is released. > */ > mmu_notifier_invalidate_range_start(mm, start_addr, end_addr, 0); > mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); > mmput(mm); > } > cond_resched(); > spin_lock(details->i_mmap_lock); > return -EINTR; > > Robin > -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep. 2010-02-02 16:52 ` Andrea Arcangeli @ 2010-02-02 16:59 ` Robin Holt 2010-02-02 17:31 ` Robin Holt 2010-02-02 20:17 ` Andrea Arcangeli 0 siblings, 2 replies; 26+ messages in thread From: Robin Holt @ 2010-02-02 16:59 UTC (permalink / raw) To: Andrea Arcangeli Cc: Robin Holt, Christoph Hellwig, Andrew Morton, Jack Steiner, linux-mm On Tue, Feb 02, 2010 at 05:52:24PM +0100, Andrea Arcangeli wrote: > On Tue, Feb 02, 2010 at 10:39:30AM -0600, Robin Holt wrote: > > On Tue, Feb 02, 2010 at 05:01:46PM +0100, Andrea Arcangeli wrote: > > > On Tue, Feb 02, 2010 at 09:21:42AM -0600, Robin Holt wrote: > > > > unmap_mapping_range_vma would then unlock the i_mmap_lock, call > > > > _inv_range_start(atomic==0) which would clear all the remote page tables > > > > and TLBs. It would then reaquire the i_mmap_lock and retry. > > > > > > I guess you missed why we hold the i_mmap_lock there... it's not like > > > gratuitous locking complication there's an actual reason why it's > > > taken, and it's to avoid the vma to be freed from under you: > > > > Oversight on my part. Sorry. > > > > Will this work? > > No, it still corrupts memory as before. You need to re-run find_vma > under mmap_sem. Then it could work... But we don't use the vma for anything. The _invalidate_range_start/end is using the mm. XPMEM and GRU don't use the vma. Does KVM? Since it isn't passed in, I would expect that anybody trying to use the vma is going to have to do a find_vma themselves. Did I miss something? > Also it's wrong to pin mm_users, you only need mm_count here, as you > only need to run find_vma, you don't need to prevent exit to free the > pages indefinitely while you're blocked. Is this better? static int unmap_mapping_range_vma(struct vm_area_struct *vma, ... if (need_unlocked_invalidate) { mm = vma->vm_mm; atomic_inc(&mm->mm_count); } spin_unlock(details->i_mmap_lock); if (need_unlocked_invalidate) { /* * zap_page_range failed to make any progress because the * mmu_notifier_invalidate_range_start was called atomically * while the callee needed to sleep. In that event, we * make the callout while the i_mmap_lock is released. */ mmu_notifier_invalidate_range_start(mm, start_addr, end_addr, 0); mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); mmdrop(mm); } Robin -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep.
  2010-02-02 16:59                 ` Robin Holt
@ 2010-02-02 17:31                   ` Robin Holt
  2010-02-02 20:27                     ` Andrea Arcangeli
  0 siblings, 1 reply; 26+ messages in thread
From: Robin Holt @ 2010-02-02 17:31 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Robin Holt, Christoph Hellwig, Andrew Morton, Jack Steiner, linux-mm

On Tue, Feb 02, 2010 at 10:59:03AM -0600, Robin Holt wrote:
> On Tue, Feb 02, 2010 at 05:52:24PM +0100, Andrea Arcangeli wrote:
> > On Tue, Feb 02, 2010 at 10:39:30AM -0600, Robin Holt wrote:
> > > On Tue, Feb 02, 2010 at 05:01:46PM +0100, Andrea Arcangeli wrote:
> > > > On Tue, Feb 02, 2010 at 09:21:42AM -0600, Robin Holt wrote:
> > > > > unmap_mapping_range_vma would then unlock the i_mmap_lock, call
> > > > > _inv_range_start(atomic==0) which would clear all the remote page
> > > > > tables and TLBs.  It would then reacquire the i_mmap_lock and retry.
> > > >
> > > > I guess you missed why we hold the i_mmap_lock there... it's not
> > > > gratuitous locking complication; there's an actual reason why it's
> > > > taken, and that is to keep the vma from being freed from under you:
> > >
> > > Oversight on my part.  Sorry.
> > >
> > > Will this work?
> >
> > No, it still corrupts memory as before.  You need to re-run find_vma
> > under mmap_sem.  Then it could work...
>
> But we don't use the vma for anything.  The _invalidate_range_start/end
> is using the mm.  XPMEM and GRU don't use the vma.  Does KVM?  Since it
> isn't passed in, I would expect that anybody trying to use the vma is
> going to have to do a find_vma themselves.  Did I miss something?
>
> > Also it's wrong to pin mm_users: you only need mm_count here, as you
> > only need to run find_vma; you don't need to prevent exit from freeing
> > the pages indefinitely while you're blocked.
>
> Is this better?

Not better.  Still need to grab the mmap_sem.  How about this?

static int unmap_mapping_range_vma(struct vm_area_struct *vma,
...
	if (need_unlocked_invalidate) {
		mm = vma->vm_mm;
		atomic_inc(&mm->mm_count);
	}
	spin_unlock(details->i_mmap_lock);
	if (need_unlocked_invalidate) {
		/*
		 * zap_page_range failed to make any progress because the
		 * mmu_notifier_invalidate_range_start was called atomically
		 * while the callee needed to sleep.  In that event, we
		 * make the callout while the i_mmap_lock is released.
		 */
		down_read(&mm->mmap_sem);
		mmu_notifier_invalidate_range_start(mm, start_addr, end_addr, 0);
		mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
		up_read(&mm->mmap_sem);
		mmdrop(mm);
	}

Robin
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep.
  2010-02-02 17:31                   ` Robin Holt
@ 2010-02-02 20:27                     ` Andrea Arcangeli
  0 siblings, 0 replies; 26+ messages in thread
From: Andrea Arcangeli @ 2010-02-02 20:27 UTC (permalink / raw)
To: Robin Holt; +Cc: Christoph Hellwig, Andrew Morton, Jack Steiner, linux-mm

On Tue, Feb 02, 2010 at 11:31:58AM -0600, Robin Holt wrote:
> Not better.  Still need to grab the mmap_sem.  How about this?

I don't think you need to grab mmap_sem; the previous patch was ok, if
we agree with the tradeoff that you still can't schedule in
->invalidate_page and it's unclear why that is ok.  After you apply
transparent hugepage support it'll break, and I'll have to deal with
it somehow.
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep.
  2010-02-02 16:59                 ` Robin Holt
  2010-02-02 17:31                   ` Robin Holt
@ 2010-02-02 20:17                   ` Andrea Arcangeli
  2010-02-03  0:48                     ` Robin Holt
  1 sibling, 1 reply; 26+ messages in thread
From: Andrea Arcangeli @ 2010-02-02 20:17 UTC (permalink / raw)
To: Robin Holt; +Cc: Christoph Hellwig, Andrew Morton, Jack Steiner, linux-mm

On Tue, Feb 02, 2010 at 10:59:03AM -0600, Robin Holt wrote:
> But we don't use the vma for anything.  The _invalidate_range_start/end
> is using the mm.  XPMEM and GRU don't use the vma.  Does KVM?  Since it
> isn't passed in, I would expect that anybody trying to use the vma is
> going to have to do a find_vma themselves.  Did I miss something?

No, sorry; we are passing down the mm, not the vma, so it should be ok
already.

> Is this better?
>
> static int unmap_mapping_range_vma(struct vm_area_struct *vma,
> ...
> 	if (need_unlocked_invalidate) {
> 		mm = vma->vm_mm;
> 		atomic_inc(&mm->mm_count);
> 	}
> 	spin_unlock(details->i_mmap_lock);
> 	if (need_unlocked_invalidate) {
> 		/*
> 		 * zap_page_range failed to make any progress because the
> 		 * mmu_notifier_invalidate_range_start was called atomically
> 		 * while the callee needed to sleep.  In that event, we
> 		 * make the callout while the i_mmap_lock is released.
> 		 */
> 		mmu_notifier_invalidate_range_start(mm, start_addr, end_addr, 0);
> 		mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
> 		mmdrop(mm);
> 	}

Yes, with mm_count it's better, and this way it should be safe.  I
think it's an ok tradeoff; hopefully then nobody will ask to schedule
in ->invalidate_page.  Still it'd be interesting (back to Andrew's
argument) to understand what is fundamentally different such that you
are ok not to schedule in ->invalidate_page but you absolutely need it
here.  And yes, this will break my transparent hugepage patch too, which
can't schedule inside the anon_vma->lock and uses the range calls to
be safer (then maybe we can require the mmu notifier users to check
PageTransHuge against the pages and handle the invalidate through
->invalidate_page, or we can add ->invalidate_transhuge_page).
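(Purely as a sketch of that last suggestion: an
->invalidate_transhuge_page op would slot into mmu_notifier_ops
roughly like this.  The name and signature are hypothetical; nothing
like this was posted or merged.)

	struct mmu_notifier_ops {
		/* ... existing ops: release, clear_flush_young,
		 * invalidate_page, invalidate_range_start/end ... */

		/*
		 * Hypothetical: synchronously invalidate a single
		 * transparent hugepage mapping.  Callable under the
		 * PT/anon_vma lock, so implementations must not sleep.
		 */
		void (*invalidate_transhuge_page)(struct mmu_notifier *mn,
						  struct mm_struct *mm,
						  unsigned long address);
	};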
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep.
  2010-02-02 20:17                   ` Andrea Arcangeli
@ 2010-02-03  0:48                     ` Robin Holt
  2010-02-03 17:14                       ` Andrea Arcangeli
  0 siblings, 1 reply; 26+ messages in thread
From: Robin Holt @ 2010-02-03  0:48 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Robin Holt, Christoph Hellwig, Andrew Morton, Jack Steiner, linux-mm

> Yes, with mm_count it's better, and this way it should be safe.  I
> think it's an ok tradeoff; hopefully then nobody will ask to schedule
> in ->invalidate_page.  Still it'd be interesting (back to Andrew's
> argument) to understand what is fundamentally different such that you
> are ok not to schedule in ->invalidate_page but you absolutely need it
> here.  And yes, this will break my transparent hugepage patch too, which

In the _invalidate_page case, it is called by the kernel from sites
where the kernel is relying upon the reference count to take the page
out of use while keeping the page's data clean and ready to be
released.  If the page is marked dirty, etc., the kernel will "do the
right thing" with the page to maintain data consistency.

The _invalidate_range_start/end pairs are used in places where the
caller's address space is being modified.  If we allow the attachers
to continue to use the old pages from the old mapping even for a short
time after the process has started to use the new pages, there would
be silent data corruption.

A difference is the kernel's expectations.  The truncate case is the
one place where the kernel's expectation for _invalidate_range_start/end
more closely matches that of _invalidate_page.  The new version of the
patch I was babbling about basically adds that concept to the
_invalidate_range_start callout as a parameter, essentially changing
the bool atomic into a flag indicating the kernel does not expect this
step to be complete prior to finishing this callout.

I don't like that second patch, which is why I have not posted it.
It relies upon a fuzzy "adequate" period of time between when the file
is truncated down and when it may be extended again to ensure data
consistency.  Shrink and extend too quickly and problems will ensue.

> can't schedule inside the anon_vma->lock and uses the range calls to
> be safer (then maybe we can require the mmu notifier users to check
> PageTransHuge against the pages and handle the invalidate through
> ->invalidate_page, or we can add ->invalidate_transhuge_page).

I don't think that is a problem.  I don't think the GRU has any issues
at all.  I believe that invalidating even a standard-size page will
flush the entire TLB.  Jack was going to verify that the last time I
talked with him.  If it behaved any differently, I would be surprised,
as it would be inconsistent with nearly every other TLB out there.

XPMEM will currently not work, but I believe I can get it to work
quite easily, as I can walk the segment's PFN table without acquiring
any sleeping locks and decide to expand the page size for any
invalidation within that range.  With that, at the time of the
callout, I can schedule an invalidation of the appropriate size.

As for the transparent huge page patches, I have only skimmed them
lightly, as I really don't have much time to understand your intention.
I do think I am agreeing with Christoph Lameter that using the
migration-style mechanism is probably better, in that it handles the
invalidate_page callouts, follows the page reference counts, and
allows my asynchronous invalidation expectation to persist.

If I read it correctly, your patch would now require invalidate_page
to be complete and have reference counts pushed back down upon return.
I probably missed a key portion of that patch and would welcome the
chance to be informed.

Thanks,
Robin
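(For concreteness, the "flag instead of bool atomic" idea described in
the message above might look roughly like this.  The flag names and
values are invented for illustration; they are not from any posted
patch.)

	#define MMU_NOTIFY_ATOMIC	0x1	/* caller cannot sleep */
	#define MMU_NOTIFY_MAY_DEFER	0x2	/* teardown may complete after
						 * the callout returns, as in
						 * the truncate case */

	void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
						   unsigned long start,
						   unsigned long end,
						   int flags);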
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep.
  2010-02-03  0:48                     ` Robin Holt
@ 2010-02-03 17:14                       ` Andrea Arcangeli
  2010-02-03 17:18                         ` Andrea Arcangeli
  2010-02-03 19:54                         ` Robin Holt
  0 siblings, 2 replies; 26+ messages in thread
From: Andrea Arcangeli @ 2010-02-03 17:14 UTC (permalink / raw)
To: Robin Holt; +Cc: Christoph Hellwig, Andrew Morton, Jack Steiner, linux-mm

On Tue, Feb 02, 2010 at 06:48:33PM -0600, Robin Holt wrote:
> In the _invalidate_page case, it is called by the kernel from sites
> where the kernel is relying upon the reference count to take the page
> out of use while keeping the page's data clean and ready to be
> released.  If the page is marked dirty, etc., the kernel will "do the
> right thing" with the page to maintain data consistency.
>
> The _invalidate_range_start/end pairs are used in places where the
> caller's address space is being modified.  If we allow the attachers
> to continue to use the old pages from the old mapping even for a short
> time after the process has started to use the new pages, there would
> be silent data corruption.

Just to show how fragile your assumption is: your code is already
generating mm corruption in fork and in ksm... the set_pte_at_notify
invalidate_page has to run immediately and be effective immediately,
despite being called with the PT lock held.

> A difference is the kernel's expectations.  The truncate case is the

do_wp_page and the ksm ->invalidate_page or ->change_pte have
expectations different from the ones you describe.  It's the first
time I hear about the semantics of invalidate_page and range_start/end
being different.  The only reason there are two different calls is
that start/end wants to run a single tlb flush for the teardown of
many ptes.  That isn't doable with the ptep_clear_flush or set_pte_at
versions, because there is a single pte to invalidate, not multiple;
so for the latter we simply have a single call and there's no need for
a start/stop range.

> one place where the kernel's expectation for _invalidate_range_start/end
> more closely matches that of _invalidate_page.  The new version of the
> patch I was babbling about basically adds that concept to the
> _invalidate_range_start callout as a parameter, essentially changing
> the bool atomic into a flag indicating the kernel does not expect this
> step to be complete prior to finishing this callout.

I don't get it, sorry.

> I don't like that second patch, which is why I have not posted it.
> It relies upon a fuzzy "adequate" period of time between when the file
> is truncated down and when it may be extended again to ensure data
> consistency.  Shrink and extend too quickly and problems will ensue.

Not sure I get it: it's all i_mutex serialized, so i_size can't change
under the code under discussion; I don't see the too-quick shrink and
extend.

> > can't schedule inside the anon_vma->lock and uses the range calls to
> > be safer (then maybe we can require the mmu notifier users to check
> > PageTransHuge against the pages and handle the invalidate through
> > ->invalidate_page, or we can add ->invalidate_transhuge_page).
>
> I don't think that is a problem.  I don't think the GRU has any issues
> at all.  I believe that invalidating even a standard-size page will
> flush the entire TLB.  Jack was going to verify that the last time I
> talked with him.  If it behaved any differently, I would be surprised,
> as it would be inconsistent with nearly every other TLB out there.

GRU sure is fine, as it has no sleepability requirement.  My
pmdp_clear_flush_notify calls invalidate_range_start/end to flush the
entire mmu range, in case the secondary mmu couldn't map the hugepage
in a single huge tlb (like kvm does).

> XPMEM will currently not work, but I believe I can get it to work
> quite easily, as I can walk the segment's PFN table without acquiring
> any sleeping locks and decide to expand the page size for any
> invalidation within that range.  With that, at the time of the
> callout, I can schedule an invalidation of the appropriate size.

Scheduling it, or doing it later, isn't safe in khugepaged, ksm,
do_wp_page etc., all running under the PT lock.  It also isn't ok for
pmdp_clear_flush_notify calling
mmu_notifier_invalidate_range_start/end(atomic=0): it's unacceptable
to fail and not flush sptes, as I have to migrate a writable page in
khugepaged, and data writes will be lost if you don't really stop the
remote dma before pmdp_clear_flush_notify returns!

> As for the transparent huge page patches, I have only skimmed them
> lightly, as I really don't have much time to understand your intention.
> I do think I am agreeing with Christoph Lameter that using the
> migration-style mechanism is probably better, in that it handles the
> invalidate_page callouts, follows the page reference counts, and
> allows my asynchronous invalidation expectation to persist.

I think your asynchronous invalidation expectations are invalid
regardless of khugepaged and pmdp_clear_flush_notify.

> If I read it correctly, your patch would now require invalidate_page
> to be complete and have reference counts pushed back down upon return.
> I probably missed a key portion of that patch and would welcome the
> chance to be informed.

The problem isn't the reference counts.  The problem is that the
writes coming from remote must not go to a page regardless of its page
count; this isn't about khugepaged only.  The reads coming from remote
also must not read from local memory after invalidate, or do_wp_page
will break, as the writes through the pte will go to the copied page
while the remote mapping still reads from the old page.  It's a pure
accident that it works ok in swap (because swap won't alter the
pte->pfn mapping after the invalidate, and it won't actually lose the
swap entry until the only refcount left is the one for the lru).
Every other place where invalidate_page is called and the pte->pfn
mapping is altered (and not just marked nonpresent and remapped to the
same pfn in case of a swapcache fault) requires synchronous updating
of secondary sptes in invalidate_page too.
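(A sketch of the pmdp_clear_flush_notify shape referred to above,
adapted to this thread's proposed range_start signature with its
trailing atomic argument.  This is a reconstruction for illustration,
not a quote from the transparent hugepage patchset.)

	#define pmdp_clear_flush_notify(__vma, __address, __pmdp)	\
	({								\
		pmd_t __pmd;						\
		struct vm_area_struct *___vma = (__vma);		\
		unsigned long ___addr = (__address) & HPAGE_PMD_MASK;	\
		/* the caller may hold the PT/anon_vma lock, so the	\
		 * secondary mmu must tear down its mapping of the	\
		 * whole huge range before this returns			\
		 */							\
		mmu_notifier_invalidate_range_start(___vma->vm_mm,	\
				___addr, ___addr + HPAGE_PMD_SIZE, 1);	\
		__pmd = pmdp_clear_flush(___vma, ___addr, __pmdp);	\
		mmu_notifier_invalidate_range_end(___vma->vm_mm,	\
				___addr, ___addr + HPAGE_PMD_SIZE);	\
		__pmd;							\
	})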
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep.
  2010-02-03 17:14                       ` Andrea Arcangeli
@ 2010-02-03 17:18                         ` Andrea Arcangeli
  0 siblings, 0 replies; 26+ messages in thread
From: Andrea Arcangeli @ 2010-02-03 17:18 UTC (permalink / raw)
To: Robin Holt; +Cc: Christoph Hellwig, Andrew Morton, Jack Steiner, linux-mm

On Wed, Feb 03, 2010 at 06:14:13PM +0100, Andrea Arcangeli wrote:
> generating mm corruption in fork and in ksm... the set_pte_at_notify

With fork I meant later, during the do_wp_page... to avoid confusion.
(Fork itself is ok, as it can schedule, so you won't have to defer the
invalidate.)
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep.
  2010-02-03 17:14                       ` Andrea Arcangeli
  2010-02-03 17:18                         ` Andrea Arcangeli
@ 2010-02-03 19:54                         ` Robin Holt
  1 sibling, 0 replies; 26+ messages in thread
From: Robin Holt @ 2010-02-03 19:54 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Robin Holt, Christoph Hellwig, Andrew Morton, Jack Steiner, linux-mm

On Wed, Feb 03, 2010 at 06:14:13PM +0100, Andrea Arcangeli wrote:
> On Tue, Feb 02, 2010 at 06:48:33PM -0600, Robin Holt wrote:
> > In the _invalidate_page case, it is called by the kernel from sites
> > where the kernel is relying upon the reference count to take the page
> > out of use while keeping the page's data clean and ready to be
> > released.  If the page is marked dirty, etc., the kernel will "do the
> > right thing" with the page to maintain data consistency.
> >
> > The _invalidate_range_start/end pairs are used in places where the
> > caller's address space is being modified.  If we allow the attachers
> > to continue to use the old pages from the old mapping even for a short
> > time after the process has started to use the new pages, there would
> > be silent data corruption.
>
> Just to show how fragile your assumption is: your code is already
> generating mm corruption in fork and in ksm... the set_pte_at_notify
> invalidate_page has to run immediately and be effective immediately,
> despite being called with the PT lock held.

Actually, we don't generate corruption, but the story is a little more
complex.  At fork time, the invalidate range happens to clear all of
the segment's page table.  When XPMEM goes to refill the entry, we
always call get_user_pages(..., write=1, ...).  That will result in a
page callout, but we have no entry in the segment's page table, so the
callout is safely ignored.  By the time the process's pte gets
established, it has already broken COW, so we are back to a safe state.

Robin
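(Illustrative sketch of the refill path described above.  XPMEM is out
of tree, so the function and names here are made up; only the
get_user_pages() call reflects the real 2.6.33-era API.)

	/* hypothetical XPMEM-style refill: write=1 forces the COW
	 * break before the remote side ever maps the page */
	static int xpmem_refill_page(struct mm_struct *mm,
				     unsigned long addr,
				     struct page **pagep)
	{
		int ret;

		down_read(&mm->mmap_sem);
		ret = get_user_pages(current, mm, addr, 1,
				     1 /* write */, 0 /* force */,
				     pagep, NULL);
		up_read(&mm->mmap_sem);
		return ret;
	}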
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep.
  2010-02-02 12:59     ` Andrea Arcangeli
  2010-02-02 13:13       ` Andrea Arcangeli
@ 2010-02-02 13:23       ` Robin Holt
  1 sibling, 0 replies; 26+ messages in thread
From: Robin Holt @ 2010-02-02 13:23 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Christoph Hellwig, Robin Holt, Andrew Morton, Jack Steiner, linux-mm

On Tue, Feb 02, 2010 at 01:59:43PM +0100, Andrea Arcangeli wrote:
> On Tue, Feb 02, 2010 at 03:09:47AM -0500, Christoph Hellwig wrote:
> > On Mon, Feb 01, 2010 at 10:01:45PM -0600, Robin Holt wrote:
> > > XPMEM would like to utilize mmu_notifiers to track page table entry
> > > changes of the segment and keep the attachment page table/tlb
> > > information consistent.
> >
> > Given that SGI just pushes XPMEM directly into the distributions
> > instead of adding it upstream I don't really see the relevance of
> > these patches.
>
> That will then prevent upstream modules from building against those
> kernels.  Not a huge issue; for a distro that's an ok compromise.  My
> real issue with mainline is this: XPMEM is ok with breaking and
> corrupting memory if people use XPMEM on top of shared mappings
> (instead of anonymous ones), reachable by a two-liner change to the
> userland app opening the xpmem device.  But when the next mmu notifier
> user comes along and asks for full scheduling across shared mappings
> too, because it needs security and untrusted users can open /dev/xpmem
> (or wherever that device is located), we'll have to undo this work and
> fix it the real way (with a config option MMU_NOTIFIER_SLEEPABLE=y).
> Yet if distros have to support XPMEM in default kernels, this hack is
> better, because it won't slow down the locking even if it leaves holes
> and corrupts memory when XPMEM can be opened by lusers.  It really
> depends if the user having

Where is it leaving holes and corrupting memory?

Robin
* Re: [RFP-V2 0/3] Make mmu_notifier_invalidate_range_start able to sleep.
  2010-02-02  8:09   ` [RFP-V2 0/3] " Christoph Hellwig
  2010-02-02 12:59     ` Andrea Arcangeli
@ 2010-02-02 13:35     ` Robin Holt
  1 sibling, 0 replies; 26+ messages in thread
From: Robin Holt @ 2010-02-02 13:35 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Robin Holt, Andrew Morton, Andrea Arcangeli, Jack Steiner, linux-mm

On Tue, Feb 02, 2010 at 03:09:47AM -0500, Christoph Hellwig wrote:
> On Mon, Feb 01, 2010 at 10:01:45PM -0600, Robin Holt wrote:
> > XPMEM would like to utilize mmu_notifiers to track page table entry
> > changes of the segment and keep the attachment page table/tlb
> > information consistent.
>
> Given that SGI just pushes XPMEM directly into the distributions
> instead of adding it upstream I don't really see the relevance of
> these patches.

XPMEM has been pushed to the community in the past and will be again.
We are not pushing it to the distros.  We have asked them to take very
minor patches, which have all, with the exception of one, been accepted
upstream.  The one which has not been accepted upstream has not even
been pushed, and that is only turning on MMU_NOTIFIER when CONFIG_IA64
&& CONFIG_SGI_XP are set.

We build xpmem as a GPL out-of-tree kernel module and library.  The
sources are shipped with the SGI ProPack product CD.  Any customer
could rebuild the kernel module with a simple
rpmbuild --rebuild xpmem*.src.rpm if they wanted.

Thanks,
Robin
end of thread, other threads:[~2010-02-03 19:54 UTC | newest]

Thread overview: 26+ messages
     [not found] <20100202040145.555474000@alcatraz.americas.sgi.com>
2010-02-02  4:01 ` [RFP-V2 1/3] Have mmu_notifiers use SRCU so they may safely schedule Robin Holt
2010-02-02  4:01 ` [RFP-V2 2/3] Fix unmap_vma() bug related to mmu_notifiers Robin Holt
2010-02-02  4:01 ` [RFP-V2 3/3] Make mmu_notifier_invalidate_range_start able to sleep Robin Holt
2010-02-02  8:09 ` [RFP-V2 0/3] " Christoph Hellwig
2010-02-02 12:59   ` Andrea Arcangeli
2010-02-02 13:13     ` Andrea Arcangeli
2010-02-02 13:29       ` Robin Holt
2010-02-02 13:40         ` Andrea Arcangeli
2010-02-02 13:51           ` Robin Holt
2010-02-02 14:10             ` Andrea Arcangeli
2010-02-02 14:21               ` Robin Holt
2010-02-02 14:59                 ` Andrea Arcangeli
2010-02-02 15:21                   ` Robin Holt
2010-02-02 16:01                     ` Andrea Arcangeli
2010-02-02 16:39                       ` Robin Holt
2010-02-02 16:52                         ` Andrea Arcangeli
2010-02-02 16:59                           ` Robin Holt
2010-02-02 17:31                             ` Robin Holt
2010-02-02 20:27                               ` Andrea Arcangeli
2010-02-02 20:17                             ` Andrea Arcangeli
2010-02-03  0:48                               ` Robin Holt
2010-02-03 17:14                                 ` Andrea Arcangeli
2010-02-03 17:18                                   ` Andrea Arcangeli
2010-02-03 19:54                                   ` Robin Holt
2010-02-02 13:23     ` Robin Holt
2010-02-02 13:35   ` Robin Holt