All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] rmaplock 1/6 PageAnon in mapping
@ 2004-07-12 21:49 Hugh Dickins
  2004-07-12 21:50 ` [PATCH] rmaplock 2/6 kill page_map_lock Hugh Dickins
                   ` (5 more replies)
  0 siblings, 6 replies; 13+ messages in thread
From: Hugh Dickins @ 2004-07-12 21:49 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel

First of a batch of six patches, based on 2.6.7-mm7 but applying easily
to 2.6.8-rc1: to eliminate rmap's page_map_lock, replace its trylocking
by spinlocking, and settle a couple of issues noticed on the way.

rmaplock 1/6 PageAnon in mapping

Replace the PG_anon page->flags bit by setting the lower bit of the
pointer in page->mapping when it's anon_vma: PAGE_MAPPING_ANON bit.

We're about to eliminate the locking which kept the flags and mapping
in synch: it's much easier to work on a local copy of page->mapping,
than worry about whether flags and mapping are in synch (though I
imagine it could be done, at greater cost, with some barriers).

Signed-off-by: Hugh Dickins <hugh@veritas.com>

 include/linux/mm.h         |   23 ++++++++++++++++-------
 include/linux/page-flags.h |    6 ------
 mm/page_alloc.c            |    3 ---
 mm/rmap.c                  |   20 +++-----------------
 4 files changed, 19 insertions(+), 33 deletions(-)

--- 2.6.7-mm7/include/linux/mm.h	2004-07-09 10:53:43.000000000 +0100
+++ rmaplock1/include/linux/mm.h	2004-07-12 18:20:07.949982720 +0100
@@ -205,11 +205,12 @@ struct page {
 					 * if PagePrivate set; used for
 					 * swp_entry_t if PageSwapCache
 					 */
-	struct address_space *mapping;	/* If PG_anon clear, points to
+	struct address_space *mapping;	/* If low bit clear, points to
 					 * inode address_space, or NULL.
 					 * If page mapped as anonymous
-					 * memory, PG_anon is set, and
-					 * it points to anon_vma object.
+					 * memory, low bit is set, and
+					 * it points to anon_vma object:
+					 * see PAGE_MAPPING_ANON below.
 					 */
 	pgoff_t index;			/* Our offset within mapping. */
 	struct list_head lru;		/* Pageout list, eg. active_list
@@ -433,24 +434,32 @@ void page_address_init(void);
 
 /*
  * On an anonymous page mapped into a user virtual memory area,
- * page->mapping points to its anon_vma, not to a struct address_space.
+ * page->mapping points to its anon_vma, not to a struct address_space;
+ * with the PAGE_MAPPING_ANON bit set to distinguish it.
  *
  * Please note that, confusingly, "page_mapping" refers to the inode
  * address_space which maps the page from disk; whereas "page_mapped"
  * refers to user virtual address space into which the page is mapped.
  */
+#define PAGE_MAPPING_ANON	1
+
 extern struct address_space swapper_space;
 static inline struct address_space *page_mapping(struct page *page)
 {
-	struct address_space *mapping = NULL;
+	struct address_space *mapping = page->mapping;
 
 	if (unlikely(PageSwapCache(page)))
 		mapping = &swapper_space;
-	else if (likely(!PageAnon(page)))
-		mapping = page->mapping;
+	else if (unlikely((unsigned long)mapping & PAGE_MAPPING_ANON))
+		mapping = NULL;
 	return mapping;
 }
 
+static inline int PageAnon(struct page *page)
+{
+	return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
+}
+
 /*
  * Return the pagecache index of the passed page.  Regular pagecache pages
  * use ->index whereas swapcache pages use ->private
--- 2.6.7-mm7/include/linux/page-flags.h	2004-07-09 10:53:44.000000000 +0100
+++ rmaplock1/include/linux/page-flags.h	2004-07-12 18:20:07.950982568 +0100
@@ -76,8 +76,6 @@
 #define PG_reclaim		18	/* To be reclaimed asap */
 #define PG_compound		19	/* Part of a compound page */
 
-#define PG_anon			20	/* Anonymous: anon_vma in mapping */
-
 
 /*
  * Global page accounting.  One instance per CPU.  Only unsigned longs are
@@ -292,10 +290,6 @@ extern unsigned long __read_page_state(u
 #define SetPageCompound(page)	set_bit(PG_compound, &(page)->flags)
 #define ClearPageCompound(page)	clear_bit(PG_compound, &(page)->flags)
 
-#define PageAnon(page)		test_bit(PG_anon, &(page)->flags)
-#define SetPageAnon(page)	set_bit(PG_anon, &(page)->flags)
-#define ClearPageAnon(page)	clear_bit(PG_anon, &(page)->flags)
-
 #ifdef CONFIG_SWAP
 #define PageSwapCache(page)	test_bit(PG_swapcache, &(page)->flags)
 #define SetPageSwapCache(page)	set_bit(PG_swapcache, &(page)->flags)
--- 2.6.7-mm7/mm/page_alloc.c	2004-07-09 10:53:46.000000000 +0100
+++ rmaplock1/mm/page_alloc.c	2004-07-12 18:20:07.952982264 +0100
@@ -88,7 +88,6 @@ static void bad_page(const char *functio
 			1 << PG_active	|
 			1 << PG_dirty	|
 			1 << PG_maplock |
-			1 << PG_anon    |
 			1 << PG_swapcache |
 			1 << PG_writeback);
 	set_page_count(page, 0);
@@ -230,7 +229,6 @@ static inline void free_pages_check(cons
 			1 << PG_reclaim	|
 			1 << PG_slab	|
 			1 << PG_maplock |
-			1 << PG_anon    |
 			1 << PG_swapcache |
 			1 << PG_writeback )))
 		bad_page(function, page);
@@ -353,7 +351,6 @@ static void prep_new_page(struct page *p
 			1 << PG_dirty	|
 			1 << PG_reclaim	|
 			1 << PG_maplock |
-			1 << PG_anon    |
 			1 << PG_swapcache |
 			1 << PG_writeback )))
 		bad_page(__FUNCTION__, page);
--- 2.6.7-mm7/mm/rmap.c	2004-07-09 10:53:46.000000000 +0100
+++ rmaplock1/mm/rmap.c	2004-07-12 18:20:07.954981960 +0100
@@ -168,7 +168,6 @@ static inline void clear_page_anon(struc
 {
 	BUG_ON(!page->mapping);
 	page->mapping = NULL;
-	ClearPageAnon(page);
 }
 
 /*
@@ -243,7 +242,7 @@ out:
 static inline int page_referenced_anon(struct page *page)
 {
 	unsigned int mapcount = page->mapcount;
-	struct anon_vma *anon_vma = (struct anon_vma *) page->mapping;
+	struct anon_vma *anon_vma = (void *) page->mapping - PAGE_MAPPING_ANON;
 	struct vm_area_struct *vma;
 	int referenced = 0;
 
@@ -344,31 +343,18 @@ void page_add_anon_rmap(struct page *pag
 	BUG_ON(PageReserved(page));
 	BUG_ON(!anon_vma);
 
+	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
 	index = (address - vma->vm_start) >> PAGE_SHIFT;
 	index += vma->vm_pgoff;
 	index >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
 
-	/*
-	 * Setting and clearing PG_anon must always happen inside
-	 * page_map_lock to avoid races between mapping and
-	 * unmapping on different processes of the same
-	 * shared cow swapcache page. And while we take the
-	 * page_map_lock PG_anon cannot change from under us.
-	 * Actually PG_anon cannot change under fork either
-	 * since fork holds a reference on the page so it cannot
-	 * be unmapped under fork and in turn copy_page_range is
-	 * allowed to read PG_anon outside the page_map_lock.
-	 */
 	page_map_lock(page);
 	if (!page->mapcount) {
-		BUG_ON(PageAnon(page));
 		BUG_ON(page->mapping);
-		SetPageAnon(page);
 		page->index = index;
 		page->mapping = (struct address_space *) anon_vma;
 		inc_page_state(nr_mapped);
 	} else {
-		BUG_ON(!PageAnon(page));
 		BUG_ON(page->index != index);
 		BUG_ON(page->mapping != (struct address_space *) anon_vma);
 	}
@@ -623,7 +609,7 @@ out_unlock:
 
 static inline int try_to_unmap_anon(struct page *page)
 {
-	struct anon_vma *anon_vma = (struct anon_vma *) page->mapping;
+	struct anon_vma *anon_vma = (void *) page->mapping - PAGE_MAPPING_ANON;
 	struct vm_area_struct *vma;
 	int ret = SWAP_AGAIN;
 


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH] rmaplock 2/6 kill page_map_lock
  2004-07-12 21:49 [PATCH] rmaplock 1/6 PageAnon in mapping Hugh Dickins
@ 2004-07-12 21:50 ` Hugh Dickins
  2004-07-12 21:52 ` [PATCH] rmaplock 3/6 SLAB_DESTROY_BY_RCU Hugh Dickins
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 13+ messages in thread
From: Hugh Dickins @ 2004-07-12 21:50 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel

The pte_chains rmap used pte_chain_lock (bit_spin_lock on PG_chainlock)
to lock its pte_chains.  We kept this (as page_map_lock: bit_spin_lock
on PG_maplock) when we moved to objrmap.  But the file objrmap locks its
vma tree with mapping->i_mmap_lock, and the anon objrmap locks its vma
list with anon_vma->lock: so isn't the page_map_lock superfluous?

Pretty much, yes.  The mapcount was protected by it, and needs to become
an atomic: starting at -1 like page _count, so nr_mapped can be tracked
precisely up and down.  The last page_remove_rmap can't clear anon page
mapping any more, because of races with page_add_rmap; from which some
BUG_ONs must go for the same reason, but they've served their purpose.

vmscan decisions are naturally racy, little change there beyond removing
page_map_lock/unlock.  But to stabilize the file-backed page->mapping
against truncation while acquiring i_mmap_lock, page_referenced_file now
needs page lock to be held even for refill_inactive_zone.  There's a
similar issue in acquiring anon_vma->lock, where page lock doesn't help:
which this patch pretends to handle, but actually it needs the next.

Roughly 10% cut off lmbench fork numbers on my 2*HT*P4.  Must confess
my testing failed to show the races even while they were knowingly
exposed: would benefit from testing on racier equipment.

Signed-off-by: Hugh Dickins <hugh@veritas.com>

 include/linux/mm.h         |   22 ++++++-
 include/linux/page-flags.h |    3 -
 include/linux/rmap.h       |   13 +---
 mm/page_alloc.c            |   10 +--
 mm/rmap.c                  |  135 +++++++++++++++++++++++++--------------------
 mm/vmscan.c                |   37 ++----------
 6 files changed, 110 insertions(+), 110 deletions(-)

--- rmaplock1/include/linux/mm.h	2004-07-12 18:20:07.949982720 +0100
+++ rmaplock2/include/linux/mm.h	2004-07-12 18:20:22.331796352 +0100
@@ -195,10 +195,9 @@ struct page {
 	page_flags_t flags;		/* Atomic flags, some possibly
 					 * updated asynchronously */
 	atomic_t _count;		/* Usage count, see below. */
-	unsigned int mapcount;		/* Count of ptes mapped in mms,
+	atomic_t _mapcount;		/* Count of ptes mapped in mms,
 					 * to show when page is mapped
-					 * & limit reverse map searches,
-					 * protected by PG_maplock.
+					 * & limit reverse map searches.
 					 */
 	unsigned long private;		/* Mapping-private opaque data:
 					 * usually used for buffer_heads
@@ -472,11 +471,26 @@ static inline pgoff_t page_index(struct 
 }
 
 /*
+ * The atomic page->_mapcount, like _count, starts from -1:
+ * so that transitions both from it and to it can be tracked,
+ * using atomic_inc_and_test and atomic_add_negative(-1).
+ */
+static inline void reset_page_mapcount(struct page *page)
+{
+	atomic_set(&(page)->_mapcount, -1);
+}
+
+static inline int page_mapcount(struct page *page)
+{
+	return atomic_read(&(page)->_mapcount) + 1;
+}
+
+/*
  * Return true if this page is mapped into pagetables.
  */
 static inline int page_mapped(struct page *page)
 {
-	return page->mapcount != 0;
+	return atomic_read(&(page)->_mapcount) >= 0;
 }
 
 /*
--- rmaplock1/include/linux/page-flags.h	2004-07-12 18:20:07.950982568 +0100
+++ rmaplock2/include/linux/page-flags.h	2004-07-12 18:20:22.332796200 +0100
@@ -69,12 +69,11 @@
 #define PG_private		12	/* Has something at ->private */
 #define PG_writeback		13	/* Page is under writeback */
 #define PG_nosave		14	/* Used for system suspend/resume */
-#define PG_maplock		15	/* Lock bit for rmap to ptes */
+#define PG_compound		15	/* Part of a compound page */
 
 #define PG_swapcache		16	/* Swap page: swp_entry_t in private */
 #define PG_mappedtodisk		17	/* Has blocks allocated on-disk */
 #define PG_reclaim		18	/* To be reclaimed asap */
-#define PG_compound		19	/* Part of a compound page */
 
 
 /*
--- rmaplock1/include/linux/rmap.h	2004-06-16 06:20:36.000000000 +0100
+++ rmaplock2/include/linux/rmap.h	2004-07-12 18:20:22.333796048 +0100
@@ -9,11 +9,6 @@
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 
-#define page_map_lock(page) \
-	bit_spin_lock(PG_maplock, (unsigned long *)&(page)->flags)
-#define page_map_unlock(page) \
-	bit_spin_unlock(PG_maplock, (unsigned long *)&(page)->flags)
-
 /*
  * The anon_vma heads a list of private "related" vmas, to scan if
  * an anonymous page pointing to this anon_vma needs to be unmapped:
@@ -87,15 +82,13 @@ void page_remove_rmap(struct page *);
  */
 static inline void page_dup_rmap(struct page *page)
 {
-	page_map_lock(page);
-	page->mapcount++;
-	page_map_unlock(page);
+	atomic_inc(&page->_mapcount);
 }
 
 /*
  * Called from mm/vmscan.c to handle paging out
  */
-int page_referenced(struct page *);
+int page_referenced(struct page *, int is_locked);
 int try_to_unmap(struct page *);
 
 #else	/* !CONFIG_MMU */
@@ -104,7 +97,7 @@ int try_to_unmap(struct page *);
 #define anon_vma_prepare(vma)	(0)
 #define anon_vma_link(vma)	do {} while (0)
 
-#define page_referenced(page)	TestClearPageReferenced(page)
+#define page_referenced(page,l)	TestClearPageReferenced(page)
 #define try_to_unmap(page)	SWAP_FAIL
 
 #endif	/* CONFIG_MMU */
--- rmaplock1/mm/page_alloc.c	2004-07-12 18:20:07.952982264 +0100
+++ rmaplock2/mm/page_alloc.c	2004-07-12 18:20:22.335795744 +0100
@@ -78,7 +78,7 @@ static void bad_page(const char *functio
 		function, current->comm, page);
 	printk(KERN_EMERG "flags:0x%08lx mapping:%p mapcount:%d count:%d\n",
 		(unsigned long)page->flags, page->mapping,
-		(int)page->mapcount, page_count(page));
+		page_mapcount(page), page_count(page));
 	printk(KERN_EMERG "Backtrace:\n");
 	dump_stack();
 	printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
@@ -87,12 +87,11 @@ static void bad_page(const char *functio
 			1 << PG_lru	|
 			1 << PG_active	|
 			1 << PG_dirty	|
-			1 << PG_maplock |
 			1 << PG_swapcache |
 			1 << PG_writeback);
 	set_page_count(page, 0);
+	reset_page_mapcount(page);
 	page->mapping = NULL;
-	page->mapcount = 0;
 }
 
 #ifndef CONFIG_HUGETLB_PAGE
@@ -228,7 +227,6 @@ static inline void free_pages_check(cons
 			1 << PG_active	|
 			1 << PG_reclaim	|
 			1 << PG_slab	|
-			1 << PG_maplock |
 			1 << PG_swapcache |
 			1 << PG_writeback )))
 		bad_page(function, page);
@@ -350,7 +348,6 @@ static void prep_new_page(struct page *p
 			1 << PG_active	|
 			1 << PG_dirty	|
 			1 << PG_reclaim	|
-			1 << PG_maplock |
 			1 << PG_swapcache |
 			1 << PG_writeback )))
 		bad_page(__FUNCTION__, page);
@@ -512,6 +509,8 @@ static void fastcall free_hot_cold_page(
 
 	kernel_map_pages(page, 1, 0);
 	inc_page_state(pgfree);
+	if (PageAnon(page))
+		page->mapping = NULL;
 	free_pages_check(__FUNCTION__, page);
 	pcp = &zone->pageset[get_cpu()].pcp[cold];
 	local_irq_save(flags);
@@ -1399,6 +1398,7 @@ void __init memmap_init_zone(struct page
 	for (page = start; page < (start + size); page++) {
 		set_page_zone(page, NODEZONE(nid, zone));
 		set_page_count(page, 0);
+		reset_page_mapcount(page);
 		SetPageReserved(page);
 		INIT_LIST_HEAD(&page->lru);
 #ifdef WANT_PAGE_VIRTUAL
--- rmaplock1/mm/rmap.c	2004-07-12 18:20:07.954981960 +0100
+++ rmaplock2/mm/rmap.c	2004-07-12 18:20:22.338795288 +0100
@@ -163,13 +163,6 @@ void __init anon_vma_init(void)
 		sizeof(struct anon_vma), 0, SLAB_PANIC, anon_vma_ctor, NULL);
 }
 
-/* this needs the page->flags PG_maplock held */
-static inline void clear_page_anon(struct page *page)
-{
-	BUG_ON(!page->mapping);
-	page->mapping = NULL;
-}
-
 /*
  * At what user virtual address is page expected in vma?
  */
@@ -239,15 +232,22 @@ out:
 	return referenced;
 }
 
-static inline int page_referenced_anon(struct page *page)
+static int page_referenced_anon(struct page *page)
 {
-	unsigned int mapcount = page->mapcount;
+	unsigned int mapcount;
 	struct anon_vma *anon_vma = (void *) page->mapping - PAGE_MAPPING_ANON;
 	struct vm_area_struct *vma;
 	int referenced = 0;
 
+	/*
+	 * Recheck mapcount: it is not safe to take anon_vma->lock after
+	 * last page_remove_rmap, since struct anon_vma might be reused.
+	 */
+	mapcount = page_mapcount(page);
+	if (!mapcount)
+		return referenced;
+
 	spin_lock(&anon_vma->lock);
-	BUG_ON(list_empty(&anon_vma->head));
 	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
 		referenced += page_referenced_one(page, vma, &mapcount);
 		if (!mapcount)
@@ -271,18 +271,39 @@ static inline int page_referenced_anon(s
  * The spinlock address_space->i_mmap_lock is tried.  If it can't be gotten,
  * assume a reference count of 0, so try_to_unmap will then have a go.
  */
-static inline int page_referenced_file(struct page *page)
+static int page_referenced_file(struct page *page)
 {
-	unsigned int mapcount = page->mapcount;
+	unsigned int mapcount;
 	struct address_space *mapping = page->mapping;
 	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 	struct vm_area_struct *vma = NULL;
 	struct prio_tree_iter iter;
 	int referenced = 0;
 
+	/*
+	 * The caller's checks on page->mapping and !PageAnon have made
+	 * sure that this is a file page: the check for page->mapping
+	 * excludes the case just before it gets set on an anon page.
+	 */
+	BUG_ON(PageAnon(page));
+
+	/*
+	 * The page lock not only makes sure that page->mapping cannot
+	 * suddenly be NULLified by truncation, it makes sure that the
+	 * structure at mapping cannot be freed and reused yet,
+	 * so we can safely take mapping->i_mmap_lock.
+	 */
+	BUG_ON(!PageLocked(page));
+
 	if (!spin_trylock(&mapping->i_mmap_lock))
 		return 0;
 
+	/*
+	 * i_mmap_lock does not stabilize mapcount at all, but mapcount
+	 * is more likely to be accurate if we note it after spinning.
+	 */
+	mapcount = page_mapcount(page);
+
 	while ((vma = vma_prio_tree_next(vma, &mapping->i_mmap,
 					&iter, pgoff, pgoff)) != NULL) {
 		if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
@@ -302,12 +323,12 @@ static inline int page_referenced_file(s
 /**
  * page_referenced - test if the page was referenced
  * @page: the page to test
+ * @is_locked: caller holds lock on the page
  *
  * Quick test_and_clear_referenced for all mappings to a page,
  * returns the number of ptes which referenced the page.
- * Caller needs to hold the rmap lock.
  */
-int page_referenced(struct page *page)
+int page_referenced(struct page *page, int is_locked)
 {
 	int referenced = 0;
 
@@ -317,11 +338,17 @@ int page_referenced(struct page *page)
 	if (TestClearPageReferenced(page))
 		referenced++;
 
-	if (page->mapcount && page->mapping) {
+	if (page_mapped(page) && page->mapping) {
 		if (PageAnon(page))
 			referenced += page_referenced_anon(page);
-		else
+		else if (is_locked)
+			referenced += page_referenced_file(page);
+		else if (TestSetPageLocked(page))
+			referenced++;
+		else if (page->mapping) {
 			referenced += page_referenced_file(page);
+			unlock_page(page);
+		}
 	}
 	return referenced;
 }
@@ -348,18 +375,12 @@ void page_add_anon_rmap(struct page *pag
 	index += vma->vm_pgoff;
 	index >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
 
-	page_map_lock(page);
-	if (!page->mapcount) {
-		BUG_ON(page->mapping);
+	if (atomic_inc_and_test(&page->_mapcount)) {
 		page->index = index;
 		page->mapping = (struct address_space *) anon_vma;
 		inc_page_state(nr_mapped);
-	} else {
-		BUG_ON(page->index != index);
-		BUG_ON(page->mapping != (struct address_space *) anon_vma);
 	}
-	page->mapcount++;
-	page_map_unlock(page);
+	/* else checking page index and mapping is racy */
 }
 
 /**
@@ -374,11 +395,8 @@ void page_add_file_rmap(struct page *pag
 	if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
 		return;
 
-	page_map_lock(page);
-	if (!page->mapcount)
+	if (atomic_inc_and_test(&page->_mapcount))
 		inc_page_state(nr_mapped);
-	page->mapcount++;
-	page_map_unlock(page);
 }
 
 /**
@@ -390,18 +408,20 @@ void page_add_file_rmap(struct page *pag
 void page_remove_rmap(struct page *page)
 {
 	BUG_ON(PageReserved(page));
-	BUG_ON(!page->mapcount);
 
-	page_map_lock(page);
-	page->mapcount--;
-	if (!page->mapcount) {
+	if (atomic_add_negative(-1, &page->_mapcount)) {
+		BUG_ON(page_mapcount(page) < 0);
+		/*
+		 * It would be tidy to reset the PageAnon mapping here,
+		 * but that might overwrite a racing page_add_anon_rmap
+		 * which increments mapcount after us but sets mapping
+		 * before us: so leave the reset to free_hot_cold_page,
+		 * and remember that it's only reliable while mapped.
+		 */
 		if (page_test_and_clear_dirty(page))
 			set_page_dirty(page);
-		if (PageAnon(page))
-			clear_page_anon(page);
 		dec_page_state(nr_mapped);
 	}
-	page_map_unlock(page);
 }
 
 /*
@@ -469,7 +489,7 @@ static int try_to_unmap_one(struct page 
 	 * the original page for.
 	 */
 	if (PageSwapCache(page) &&
-	    page_count(page) != page->mapcount + 2) {
+	    page_count(page) != page_mapcount(page) + 2) {
 		ret = SWAP_FAIL;
 		goto out_unmap;
 	}
@@ -495,8 +515,7 @@ static int try_to_unmap_one(struct page 
 	}
 
 	mm->rss--;
-	BUG_ON(!page->mapcount);
-	page->mapcount--;
+	page_remove_rmap(page);
 	page_cache_release(page);
 
 out_unmap:
@@ -607,17 +626,23 @@ out_unlock:
 	return SWAP_AGAIN;
 }
 
-static inline int try_to_unmap_anon(struct page *page)
+static int try_to_unmap_anon(struct page *page)
 {
 	struct anon_vma *anon_vma = (void *) page->mapping - PAGE_MAPPING_ANON;
 	struct vm_area_struct *vma;
 	int ret = SWAP_AGAIN;
 
+	/*
+	 * Recheck mapped: it is not safe to take anon_vma->lock after
+	 * last page_remove_rmap, since struct anon_vma might be reused.
+	 */
+	if (!page_mapped(page))
+		return ret;
+
 	spin_lock(&anon_vma->lock);
-	BUG_ON(list_empty(&anon_vma->head));
 	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
 		ret = try_to_unmap_one(page, vma);
-		if (ret == SWAP_FAIL || !page->mapcount)
+		if (ret == SWAP_FAIL || !page_mapped(page))
 			break;
 	}
 	spin_unlock(&anon_vma->lock);
@@ -636,7 +661,7 @@ static inline int try_to_unmap_anon(stru
  * The spinlock address_space->i_mmap_lock is tried.  If it can't be gotten,
  * return a temporary error.
  */
-static inline int try_to_unmap_file(struct page *page)
+static int try_to_unmap_file(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -654,7 +679,7 @@ static inline int try_to_unmap_file(stru
 	while ((vma = vma_prio_tree_next(vma, &mapping->i_mmap,
 					&iter, pgoff, pgoff)) != NULL) {
 		ret = try_to_unmap_one(page, vma);
-		if (ret == SWAP_FAIL || !page->mapcount)
+		if (ret == SWAP_FAIL || !page_mapped(page))
 			goto out;
 	}
 
@@ -683,8 +708,9 @@ static inline int try_to_unmap_file(stru
 	 * The mapcount of the page we came in with is irrelevant,
 	 * but even so use it as a guide to how hard we should try?
 	 */
-	mapcount = page->mapcount;
-	page_map_unlock(page);
+	mapcount = page_mapcount(page);
+	if (!mapcount)
+		goto out;
 	cond_resched_lock(&mapping->i_mmap_lock);
 
 	max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
@@ -707,7 +733,7 @@ static inline int try_to_unmap_file(stru
 				cursor += CLUSTER_SIZE;
 				vma->vm_private_data = (void *) cursor;
 				if ((int)mapcount <= 0)
-					goto relock;
+					goto out;
 			}
 			if (ret != SWAP_FAIL)
 				vma->vm_private_data =
@@ -728,8 +754,6 @@ static inline int try_to_unmap_file(stru
 		if (!(vma->vm_flags & VM_RESERVED))
 			vma->vm_private_data = NULL;
 	}
-relock:
-	page_map_lock(page);
 out:
 	spin_unlock(&mapping->i_mmap_lock);
 	return ret;
@@ -740,11 +764,11 @@ out:
  * @page: the page to get unmapped
  *
  * Tries to remove all the page table entries which are mapping this
- * page, used in the pageout path.  Caller must hold the page lock
- * and its rmap lock.  Return values are:
+ * page, used in the pageout path.  Caller must hold the page lock.
+ * Return values are:
  *
  * SWAP_SUCCESS	- we succeeded in removing all mappings
- * SWAP_AGAIN	- we missed a trylock, try again later
+ * SWAP_AGAIN	- we missed a mapping, try again later
  * SWAP_FAIL	- the page is unswappable
  */
 int try_to_unmap(struct page *page)
@@ -753,20 +777,13 @@ int try_to_unmap(struct page *page)
 
 	BUG_ON(PageReserved(page));
 	BUG_ON(!PageLocked(page));
-	BUG_ON(!page->mapcount);
 
 	if (PageAnon(page))
 		ret = try_to_unmap_anon(page);
 	else
 		ret = try_to_unmap_file(page);
 
-	if (!page->mapcount) {
-		if (page_test_and_clear_dirty(page))
-			set_page_dirty(page);
-		if (PageAnon(page))
-			clear_page_anon(page);
-		dec_page_state(nr_mapped);
+	if (!page_mapped(page))
 		ret = SWAP_SUCCESS;
-	}
 	return ret;
 }
--- rmaplock1/mm/vmscan.c	2004-07-09 10:53:47.000000000 +0100
+++ rmaplock2/mm/vmscan.c	2004-07-12 18:20:22.340794984 +0100
@@ -209,7 +209,7 @@ static int shrink_slab(unsigned long sca
 	return 0;
 }
 
-/* Must be called with page's rmap lock held. */
+/* Called without lock on whether page is mapped, so answer is unstable */
 static inline int page_mapping_inuse(struct page *page)
 {
 	struct address_space *mapping;
@@ -367,26 +367,19 @@ static int shrink_list(struct list_head 
 		if (page_mapped(page) || PageSwapCache(page))
 			sc->nr_scanned++;
 
-		page_map_lock(page);
-		referenced = page_referenced(page);
-		if (referenced && page_mapping_inuse(page)) {
-			/* In active use or really unfreeable.  Activate it. */
-			page_map_unlock(page);
+		referenced = page_referenced(page, 1);
+		/* In active use or really unfreeable?  Activate it. */
+		if (referenced && page_mapping_inuse(page))
 			goto activate_locked;
-		}
 
 #ifdef CONFIG_SWAP
 		/*
 		 * Anonymous process memory has backing store?
 		 * Try to allocate it some swap space here.
-		 *
-		 * XXX: implement swap clustering ?
 		 */
 		if (PageAnon(page) && !PageSwapCache(page)) {
-			page_map_unlock(page);
 			if (!add_to_swap(page))
 				goto activate_locked;
-			page_map_lock(page);
 		}
 #endif /* CONFIG_SWAP */
 
@@ -401,16 +394,13 @@ static int shrink_list(struct list_head 
 		if (page_mapped(page) && mapping) {
 			switch (try_to_unmap(page)) {
 			case SWAP_FAIL:
-				page_map_unlock(page);
 				goto activate_locked;
 			case SWAP_AGAIN:
-				page_map_unlock(page);
 				goto keep_locked;
 			case SWAP_SUCCESS:
 				; /* try to free the page below */
 			}
 		}
-		page_map_unlock(page);
 
 		if (PageDirty(page)) {
 			if (referenced)
@@ -713,25 +703,12 @@ refill_inactive_zone(struct zone *zone, 
 		page = lru_to_page(&l_hold);
 		list_del(&page->lru);
 		if (page_mapped(page)) {
-			if (!reclaim_mapped) {
-				list_add(&page->lru, &l_active);
-				continue;
-			}
-			page_map_lock(page);
-			if (page_referenced(page)) {
-				page_map_unlock(page);
+			if (!reclaim_mapped ||
+			    (total_swap_pages == 0 && PageAnon(page)) ||
+			    page_referenced(page, 0)) {
 				list_add(&page->lru, &l_active);
 				continue;
 			}
-			page_map_unlock(page);
-		}
-		/*
-		 * FIXME: need to consider page_count(page) here if/when we
-		 * reap orphaned pages via the LRU (Daniel's locking stuff)
-		 */
-		if (total_swap_pages == 0 && PageAnon(page)) {
-			list_add(&page->lru, &l_active);
-			continue;
 		}
 		list_add(&page->lru, &l_inactive);
 	}


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH] rmaplock 3/6 SLAB_DESTROY_BY_RCU
  2004-07-12 21:49 [PATCH] rmaplock 1/6 PageAnon in mapping Hugh Dickins
  2004-07-12 21:50 ` [PATCH] rmaplock 2/6 kill page_map_lock Hugh Dickins
@ 2004-07-12 21:52 ` Hugh Dickins
  2004-07-13 16:36   ` Manfred Spraul
  2004-07-12 21:53 ` [PATCH] rmaplock 4/6 mm lock ordering Hugh Dickins
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 13+ messages in thread
From: Hugh Dickins @ 2004-07-12 21:52 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Manfred Spraul, linux-kernel

With page_map_lock gone, how to stabilize page->mapping's anon_vma while
acquiring anon_vma->lock in page_referenced_anon and try_to_unmap_anon?

The page cannot actually be freed (vmscan holds reference), but however
much we check page_mapped (which guarantees that anon_vma is in use - or
would guarantee that if we added suitable barriers), there's no locking
against page becoming unmapped the instant after, then anon_vma freed.

It's okay to take anon_vma->lock after it's freed, so long as it remains
a struct anon_vma (its list would become empty, or perhaps reused for an
unrelated anon_vma: but no problem since we always check that the page
located is the right one); but corruption if that memory gets reused for
some other purpose.

This is not unique: it's liable to be problem whenever the kernel tries
to approach a structure obliquely.  It's generally solved with an atomic
reference count; but one advantage of anon_vma over anonmm is that it
does not have such a count, and it would be a backward step to add one.

Therefore... implement SLAB_DESTROY_BY_RCU flag, to guarantee that such
a kmem_cache_alloc'ed structure cannot get freed to other use while the
rcu_read_lock is held i.e. preempt disabled; and use that for anon_vma.

I hope SLAB_DESTROY_BY_RCU can be useful elsewhere; but though it's safe
for little anon_vma, I'd hesitate before using it generally, on shrinker
caches or kmem_cache_destroyable caches.

Signed-off-by: Hugh Dickins <hugh@veritas.com>

 include/linux/slab.h |    1 
 mm/rmap.c            |   50 ++++++++++++++++++++++++++++---------------
 mm/slab.c            |   59 +++++++++++++++++++++++++++++++++++++++++++++------
 3 files changed, 87 insertions(+), 23 deletions(-)

--- rmaplock2/include/linux/slab.h	2004-07-09 10:53:44.000000000 +0100
+++ rmaplock3/include/linux/slab.h	2004-07-12 18:20:35.250832360 +0100
@@ -45,6 +45,7 @@ typedef struct kmem_cache_s kmem_cache_t
 #define SLAB_RECLAIM_ACCOUNT	0x00020000UL	/* track pages allocated to indicate
 						   what is reclaimable later*/
 #define SLAB_PANIC		0x00040000UL	/* panic if kmem_cache_create() fails */
+#define SLAB_DESTROY_BY_RCU	0x00080000UL	/* defer freeing pages to RCU */
 
 /* flags passed to a constructor func */
 #define	SLAB_CTOR_CONSTRUCTOR	0x001UL		/* if not set, then deconstructor */
--- rmaplock2/mm/rmap.c	2004-07-12 18:20:22.338795288 +0100
+++ rmaplock3/mm/rmap.c	2004-07-12 18:20:35.274828712 +0100
@@ -30,6 +30,7 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/rmap.h>
+#include <linux/rcupdate.h>
 
 #include <asm/tlbflush.h>
 
@@ -159,8 +160,31 @@ static void anon_vma_ctor(void *data, km
 
 void __init anon_vma_init(void)
 {
-	anon_vma_cachep = kmem_cache_create("anon_vma",
-		sizeof(struct anon_vma), 0, SLAB_PANIC, anon_vma_ctor, NULL);
+	anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
+			0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor, NULL);
+}
+
+/*
+ * Getting a lock on a stable anon_vma from a page off the LRU is
+ * tricky: page_lock_anon_vma relies on RCU to guard against the races.
+ */
+static struct anon_vma *page_lock_anon_vma(struct page *page)
+{
+	struct anon_vma *anon_vma = NULL;
+	unsigned long anon_mapping;
+
+	rcu_read_lock();
+	anon_mapping = (unsigned long) page->mapping;
+	if (!(anon_mapping & PAGE_MAPPING_ANON))
+		goto out;
+	if (!page_mapped(page))
+		goto out;
+
+	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
+	spin_lock(&anon_vma->lock);
+out:
+	rcu_read_unlock();
+	return anon_vma;
 }
 
 /*
@@ -235,19 +259,15 @@ out:
 static int page_referenced_anon(struct page *page)
 {
 	unsigned int mapcount;
-	struct anon_vma *anon_vma = (void *) page->mapping - PAGE_MAPPING_ANON;
+	struct anon_vma *anon_vma;
 	struct vm_area_struct *vma;
 	int referenced = 0;
 
-	/*
-	 * Recheck mapcount: it is not safe to take anon_vma->lock after
-	 * last page_remove_rmap, since struct anon_vma might be reused.
-	 */
-	mapcount = page_mapcount(page);
-	if (!mapcount)
+	anon_vma = page_lock_anon_vma(page);
+	if (!anon_vma)
 		return referenced;
 
-	spin_lock(&anon_vma->lock);
+	mapcount = page_mapcount(page);
 	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
 		referenced += page_referenced_one(page, vma, &mapcount);
 		if (!mapcount)
@@ -628,18 +648,14 @@ out_unlock:
 
 static int try_to_unmap_anon(struct page *page)
 {
-	struct anon_vma *anon_vma = (void *) page->mapping - PAGE_MAPPING_ANON;
+	struct anon_vma *anon_vma;
 	struct vm_area_struct *vma;
 	int ret = SWAP_AGAIN;
 
-	/*
-	 * Recheck mapped: it is not safe to take anon_vma->lock after
-	 * last page_remove_rmap, since struct anon_vma might be reused.
-	 */
-	if (!page_mapped(page))
+	anon_vma = page_lock_anon_vma(page);
+	if (!anon_vma)
 		return ret;
 
-	spin_lock(&anon_vma->lock);
 	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
 		ret = try_to_unmap_one(page, vma);
 		if (ret == SWAP_FAIL || !page_mapped(page))
--- rmaplock2/mm/slab.c	2004-07-09 10:53:46.000000000 +0100
+++ rmaplock3/mm/slab.c	2004-07-12 18:20:35.277828256 +0100
@@ -91,6 +91,7 @@
 #include	<linux/cpu.h>
 #include	<linux/sysctl.h>
 #include	<linux/module.h>
+#include	<linux/rcupdate.h>
 
 #include	<asm/uaccess.h>
 #include	<asm/cacheflush.h>
@@ -139,11 +140,13 @@
 			 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
 			 SLAB_NO_REAP | SLAB_CACHE_DMA | \
 			 SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \
-			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC)
+			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
+			 SLAB_DESTROY_BY_RCU)
 #else
 # define CREATE_MASK	(SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \
 			 SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \
-			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC)
+			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
+			 SLAB_DESTROY_BY_RCU)
 #endif
 
 /*
@@ -190,6 +193,29 @@ struct slab {
 };
 
 /*
+ * struct slab_rcu
+ *
+ * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
+ * arrange for kmem_freepages to be called via RCU.  This is useful if
+ * we need to approach a kernel structure obliquely, from its address
+ * obtained without the usual locking.  We can lock the structure to
+ * stabilize it and check it's still at the given address, only if we
+ * can be sure that the memory has not been meanwhile reused for some
+ * other kind of object (which our subsystem's lock might corrupt).
+ *
+ * rcu_read_lock before reading the address, then rcu_read_unlock after
+ * taking the spinlock within the structure expected at that address.
+ *
+ * We assume struct slab_rcu can overlay struct slab when destroying.
+ * Care needed to use kmem_cache_destroy on a SLAB_DESTROY_BY_RCU cache.
+ */
+struct slab_rcu {
+	struct rcu_head		head;
+	kmem_cache_t		*cachep;
+	void			*addr;
+};
+
+/*
  * struct array_cache
  *
  * Per cpu structures
@@ -883,6 +909,16 @@ static void kmem_freepages(kmem_cache_t 
 		atomic_sub(1<<cachep->gfporder, &slab_reclaim_pages);
 }
 
+static void kmem_rcu_free(struct rcu_head *head)
+{
+	struct slab_rcu *slab_rcu = (struct slab_rcu *) head;
+	kmem_cache_t *cachep = slab_rcu->cachep;
+
+	kmem_freepages(cachep, slab_rcu->addr);
+	if (OFF_SLAB(cachep))
+		kmem_cache_free(cachep->slabp_cache, slab_rcu);
+}
+
 #if DEBUG
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
@@ -1036,6 +1072,8 @@ static void check_poison_obj(kmem_cache_
  */
 static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
 {
+	void *addr = slabp->s_mem - slabp->colouroff;
+
 #if DEBUG
 	int i;
 	for (i = 0; i < cachep->num; i++) {
@@ -1071,10 +1109,19 @@ static void slab_destroy (kmem_cache_t *
 		}
 	}
 #endif
-	
-	kmem_freepages(cachep, slabp->s_mem-slabp->colouroff);
-	if (OFF_SLAB(cachep))
-		kmem_cache_free(cachep->slabp_cache, slabp);
+
+	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
+		struct slab_rcu *slab_rcu;
+
+		slab_rcu = (struct slab_rcu *) slabp;
+		slab_rcu->cachep = cachep;
+		slab_rcu->addr = addr;
+		call_rcu(&slab_rcu->head, kmem_rcu_free);
+	} else {
+		kmem_freepages(cachep, addr);
+		if (OFF_SLAB(cachep))
+			kmem_cache_free(cachep->slabp_cache, slabp);
+	}
 }
 
 /**


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH] rmaplock 4/6 mm lock ordering
  2004-07-12 21:49 [PATCH] rmaplock 1/6 PageAnon in mapping Hugh Dickins
  2004-07-12 21:50 ` [PATCH] rmaplock 2/6 kill page_map_lock Hugh Dickins
  2004-07-12 21:52 ` [PATCH] rmaplock 2/6 SLAB_DESTROY_BY_RCU Hugh Dickins
@ 2004-07-12 21:53 ` Hugh Dickins
  2004-07-12 21:54 ` [PATCH] rmaplock 5/6 unuse_process mmap_sem Hugh Dickins
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 13+ messages in thread
From: Hugh Dickins @ 2004-07-12 21:53 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel

With page_map_lock out of the way, there's no need for page_referenced
and try_to_unmap to use trylocks - provided we switch anon_vma->lock and
mm->page_table_lock around in anon_vma_prepare.  Though I suppose it's
possible that we'll find that vmscan makes better progress with trylocks
than spinning - we're free to choose trylocks again if so.

Try to update the mm lock ordering documentation in filemap.c.
But I still find it confusing, and I've no idea of where to stop.
So add the mm lock ordering list I can understand to rmap.c.

Signed-off-by: Hugh Dickins <hugh@veritas.com>

 mm/filemap.c |   15 ++++++----
 mm/rmap.c    |   82 ++++++++++++++++++++++++++++++++---------------------------
 2 files changed, 54 insertions(+), 43 deletions(-)

--- rmaplock3/mm/filemap.c	2004-07-09 10:53:46.000000000 +0100
+++ rmaplock4/mm/filemap.c	2004-07-12 18:20:48.023890560 +0100
@@ -60,7 +60,6 @@
  *      ->swap_list_lock
  *        ->swap_device_lock	(exclusive_swap_page, others)
  *          ->mapping->tree_lock
- *    ->page_map_lock()		(try_to_unmap_file)
  *
  *  ->i_sem
  *    ->i_mmap_lock		(truncate->unmap_mapping_range)
@@ -83,16 +82,20 @@
  *    ->sb_lock			(fs/fs-writeback.c)
  *    ->mapping->tree_lock	(__sync_single_inode)
  *
+ *  ->i_mmap_lock
+ *    ->anon_vma.lock		(vma_adjust)
+ *
+ *  ->anon_vma.lock
+ *    ->page_table_lock		(anon_vma_prepare and various)
+ *
  *  ->page_table_lock
  *    ->swap_device_lock	(try_to_unmap_one)
  *    ->private_lock		(try_to_unmap_one)
  *    ->tree_lock		(try_to_unmap_one)
  *    ->zone.lru_lock		(follow_page->mark_page_accessed)
- *    ->page_map_lock()		(page_add_anon_rmap)
- *      ->tree_lock		(page_remove_rmap->set_page_dirty)
- *      ->private_lock		(page_remove_rmap->set_page_dirty)
- *      ->inode_lock		(page_remove_rmap->set_page_dirty)
- *    ->anon_vma.lock		(anon_vma_prepare)
+ *    ->private_lock		(page_remove_rmap->set_page_dirty)
+ *    ->tree_lock		(page_remove_rmap->set_page_dirty)
+ *    ->inode_lock		(page_remove_rmap->set_page_dirty)
  *    ->inode_lock		(zap_pte_range->set_page_dirty)
  *    ->private_lock		(zap_pte_range->__set_page_dirty_buffers)
  *
--- rmaplock3/mm/rmap.c	2004-07-12 18:20:35.274828712 +0100
+++ rmaplock4/mm/rmap.c	2004-07-12 18:20:48.025890256 +0100
@@ -18,9 +18,30 @@
  */
 
 /*
- * Locking: see "Lock ordering" summary in filemap.c.
- * In swapout, page_map_lock is held on entry to page_referenced and
- * try_to_unmap, so they trylock for i_mmap_lock and page_table_lock.
+ * Lock ordering in mm:
+ *
+ * inode->i_sem	(while writing or truncating, not reading or faulting)
+ *   inode->i_alloc_sem
+ *
+ * When a page fault occurs in writing from user to file, down_read
+ * of mmap_sem nests within i_sem; in sys_msync, i_sem nests within
+ * down_read of mmap_sem; i_sem and down_write of mmap_sem are never
+ * taken together; in truncation, i_sem is taken outermost.
+ *
+ * mm->mmap_sem
+ *   page->flags PG_locked (lock_page)
+ *     mapping->i_mmap_lock
+ *       anon_vma->lock
+ *         mm->page_table_lock
+ *           zone->lru_lock (in mark_page_accessed)
+ *           swap_list_lock (in swap_free etc's swap_info_get)
+ *             swap_device_lock (in swap_duplicate, swap_info_get)
+ *             mapping->private_lock (in __set_page_dirty_buffers)
+ *             inode_lock (in set_page_dirty's __mark_inode_dirty)
+ *               sb_lock (within inode_lock in fs/fs-writeback.c)
+ *               mapping->tree_lock (widely used, in set_page_dirty,
+ *                         in arch-dependent flush_dcache_mmap_lock,
+ *                         within inode_lock in __sync_single_inode)
  */
 
 #include <linux/mm.h>
@@ -64,28 +85,32 @@ int anon_vma_prepare(struct vm_area_stru
 	might_sleep();
 	if (unlikely(!anon_vma)) {
 		struct mm_struct *mm = vma->vm_mm;
-		struct anon_vma *allocated = NULL;
+		struct anon_vma *allocated, *locked;
 
 		anon_vma = find_mergeable_anon_vma(vma);
-		if (!anon_vma) {
+		if (anon_vma) {
+			allocated = NULL;
+			locked = anon_vma;
+			spin_lock(&locked->lock);
+		} else {
 			anon_vma = anon_vma_alloc();
 			if (unlikely(!anon_vma))
 				return -ENOMEM;
 			allocated = anon_vma;
+			locked = NULL;
 		}
 
 		/* page_table_lock to protect against threads */
 		spin_lock(&mm->page_table_lock);
 		if (likely(!vma->anon_vma)) {
-			if (!allocated)
-				spin_lock(&anon_vma->lock);
 			vma->anon_vma = anon_vma;
 			list_add(&vma->anon_vma_node, &anon_vma->head);
-			if (!allocated)
-				spin_unlock(&anon_vma->lock);
 			allocated = NULL;
 		}
 		spin_unlock(&mm->page_table_lock);
+
+		if (locked)
+			spin_unlock(&locked->lock);
 		if (unlikely(allocated))
 			anon_vma_free(allocated);
 	}
@@ -225,8 +250,7 @@ static int page_referenced_one(struct pa
 	if (address == -EFAULT)
 		goto out;
 
-	if (!spin_trylock(&mm->page_table_lock))
-		goto out;
+	spin_lock(&mm->page_table_lock);
 
 	pgd = pgd_offset(mm, address);
 	if (!pgd_present(*pgd))
@@ -287,9 +311,6 @@ static int page_referenced_anon(struct p
  * of references it found.
  *
  * This function is only called from page_referenced for object-based pages.
- *
- * The spinlock address_space->i_mmap_lock is tried.  If it can't be gotten,
- * assume a reference count of 0, so try_to_unmap will then have a go.
  */
 static int page_referenced_file(struct page *page)
 {
@@ -315,8 +336,7 @@ static int page_referenced_file(struct p
 	 */
 	BUG_ON(!PageLocked(page));
 
-	if (!spin_trylock(&mapping->i_mmap_lock))
-		return 0;
+	spin_lock(&mapping->i_mmap_lock);
 
 	/*
 	 * i_mmap_lock does not stabilize mapcount at all, but mapcount
@@ -468,8 +488,7 @@ static int try_to_unmap_one(struct page 
 	 * We need the page_table_lock to protect us from page faults,
 	 * munmap, fork, etc...
 	 */
-	if (!spin_trylock(&mm->page_table_lock))
-		goto out;
+	spin_lock(&mm->page_table_lock);
 
 	pgd = pgd_offset(mm, address);
 	if (!pgd_present(*pgd))
@@ -568,7 +587,7 @@ out:
 #define CLUSTER_SIZE	min(32*PAGE_SIZE, PMD_SIZE)
 #define CLUSTER_MASK	(~(CLUSTER_SIZE - 1))
 
-static int try_to_unmap_cluster(unsigned long cursor,
+static void try_to_unmap_cluster(unsigned long cursor,
 	unsigned int *mapcount, struct vm_area_struct *vma)
 {
 	struct mm_struct *mm = vma->vm_mm;
@@ -585,8 +604,7 @@ static int try_to_unmap_cluster(unsigned
 	 * We need the page_table_lock to protect us from page faults,
 	 * munmap, fork, etc...
 	 */
-	if (!spin_trylock(&mm->page_table_lock))
-		return SWAP_FAIL;
+	spin_lock(&mm->page_table_lock);
 
 	address = (vma->vm_start + cursor) & CLUSTER_MASK;
 	end = address + CLUSTER_SIZE;
@@ -643,7 +661,6 @@ static int try_to_unmap_cluster(unsigned
 
 out_unlock:
 	spin_unlock(&mm->page_table_lock);
-	return SWAP_AGAIN;
 }
 
 static int try_to_unmap_anon(struct page *page)
@@ -673,9 +690,6 @@ static int try_to_unmap_anon(struct page
  * contained in the address_space struct it points to.
  *
  * This function is only called from try_to_unmap for object-based pages.
- *
- * The spinlock address_space->i_mmap_lock is tried.  If it can't be gotten,
- * return a temporary error.
  */
 static int try_to_unmap_file(struct page *page)
 {
@@ -689,9 +703,7 @@ static int try_to_unmap_file(struct page
 	unsigned long max_nl_size = 0;
 	unsigned int mapcount;
 
-	if (!spin_trylock(&mapping->i_mmap_lock))
-		return ret;
-
+	spin_lock(&mapping->i_mmap_lock);
 	while ((vma = vma_prio_tree_next(vma, &mapping->i_mmap,
 					&iter, pgoff, pgoff)) != NULL) {
 		ret = try_to_unmap_one(page, vma);
@@ -714,8 +726,10 @@ static int try_to_unmap_file(struct page
 			max_nl_size = cursor;
 	}
 
-	if (max_nl_size == 0)	/* any nonlinears locked or reserved */
+	if (max_nl_size == 0) {	/* any nonlinears locked or reserved */
+		ret = SWAP_FAIL;
 		goto out;
+	}
 
 	/*
 	 * We don't try to search for this page in the nonlinear vmas,
@@ -742,19 +756,13 @@ static int try_to_unmap_file(struct page
 			while (vma->vm_mm->rss &&
 				cursor < max_nl_cursor &&
 				cursor < vma->vm_end - vma->vm_start) {
-				ret = try_to_unmap_cluster(
-						cursor, &mapcount, vma);
-				if (ret == SWAP_FAIL)
-					break;
+				try_to_unmap_cluster(cursor, &mapcount, vma);
 				cursor += CLUSTER_SIZE;
 				vma->vm_private_data = (void *) cursor;
 				if ((int)mapcount <= 0)
 					goto out;
 			}
-			if (ret != SWAP_FAIL)
-				vma->vm_private_data =
-					(void *) max_nl_cursor;
-			ret = SWAP_AGAIN;
+			vma->vm_private_data = (void *) max_nl_cursor;
 		}
 		cond_resched_lock(&mapping->i_mmap_lock);
 		max_nl_cursor += CLUSTER_SIZE;


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH] rmaplock 5/6 unuse_process mmap_sem
  2004-07-12 21:49 [PATCH] rmaplock 1/6 PageAnon in mapping Hugh Dickins
                   ` (2 preceding siblings ...)
  2004-07-12 21:53 ` [PATCH] rmaplock 4/6 mm lock ordering Hugh Dickins
@ 2004-07-12 21:54 ` Hugh Dickins
  2004-07-12 21:55 ` [PATCH] rmaplock 6/6 swapoff use anon_vma Hugh Dickins
  2004-07-13  4:48 ` [PATCH] rmaplock 1/6 PageAnon in mapping Andrew Morton
  5 siblings, 0 replies; 13+ messages in thread
From: Hugh Dickins @ 2004-07-12 21:54 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel

Updating the mm lock ordering documentation drew attention to the fact
that we were wrong to blithely add down_read(&mm->mmap_sem) to swapoff's
unuse_process, while it holds swapcache page lock: not very likely, but
it could deadlock against, say, mlock faulting a page back in from swap.

But it looks like these days it's safe to drop and reacquire page lock
if down_read_trylock fails: the page lock is held to stop try_to_unmap
unmapping the page's ptes as fast as try_to_unuse is mapping them back
in; but the recent fix for get_user_pages works to prevent that too.

Signed-off-by: Hugh Dickins <hugh@veritas.com>

 mm/rmap.c     |    4 ++++
 mm/swapfile.c |   10 +++++++++-
 2 files changed, 13 insertions(+), 1 deletion(-)

--- rmaplock4/mm/rmap.c	2004-07-12 18:20:48.025890256 +0100
+++ rmaplock5/mm/rmap.c	2004-07-12 18:21:02.456696440 +0100
@@ -526,6 +526,10 @@ static int try_to_unmap_one(struct page 
 	 * an exclusive swap page, do_wp_page will replace it by a copy
 	 * page, and the user never get to see the data GUP was holding
 	 * the original page for.
+	 *
+	 * This test is also useful for when swapoff (unuse_process) has
+	 * to drop page lock: its reference to the page stops existing
+	 * ptes from being unmapped, so swapoff can make progress.
 	 */
 	if (PageSwapCache(page) &&
 	    page_count(page) != page_mapcount(page) + 2) {
--- rmaplock4/mm/swapfile.c	2004-07-09 10:53:46.000000000 +0100
+++ rmaplock5/mm/swapfile.c	2004-07-12 18:21:02.458696136 +0100
@@ -548,7 +548,15 @@ static int unuse_process(struct mm_struc
 	/*
 	 * Go through process' page directory.
 	 */
-	down_read(&mm->mmap_sem);
+	if (!down_read_trylock(&mm->mmap_sem)) {
+		/*
+		 * Our reference to the page stops try_to_unmap_one from
+		 * unmapping its ptes, so swapoff can make progress.
+		 */
+		unlock_page(page);
+		down_read(&mm->mmap_sem);
+		lock_page(page);
+	}
 	spin_lock(&mm->page_table_lock);
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 		if (!is_vm_hugetlb_page(vma)) {


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH] rmaplock 6/6 swapoff use anon_vma
  2004-07-12 21:49 [PATCH] rmaplock 1/6 PageAnon in mapping Hugh Dickins
                   ` (3 preceding siblings ...)
  2004-07-12 21:54 ` [PATCH] rmaplock 5/6 unuse_process mmap_sem Hugh Dickins
@ 2004-07-12 21:55 ` Hugh Dickins
  2004-07-13  4:48 ` [PATCH] rmaplock 1/6 PageAnon in mapping Andrew Morton
  5 siblings, 0 replies; 13+ messages in thread
From: Hugh Dickins @ 2004-07-12 21:55 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel

Swapoff can make good use of a page's anon_vma and index, while it's
still left in swapcache, or once it's brought back in and the first pte
mapped back: unuse_vma go directly to just one page of only those vmas
with the same anon_vma.  And unuse_process can skip any vmas without an
anon_vma (extending the hugetlb check: hugetlb vmas have no anon_vma).

This just hacks in on top of the existing procedure, still going through
all the vmas of all the mms in mmlist.  A more elegant procedure might
replace mmlist by a list of anon_vmas: but that would be more work to
implement, with apparently more overhead in the common paths.

Signed-off-by: Hugh Dickins <hugh@veritas.com>

 include/linux/rmap.h |    5 +++++
 mm/rmap.c            |   20 ++++++++++++++++++++
 mm/swapfile.c        |   23 ++++++++++++++++-------
 3 files changed, 41 insertions(+), 7 deletions(-)

--- rmaplock5/include/linux/rmap.h	2004-07-12 18:20:22.333796048 +0100
+++ rmaplock6/include/linux/rmap.h	2004-07-12 18:21:16.752523144 +0100
@@ -91,6 +91,11 @@ static inline void page_dup_rmap(struct 
 int page_referenced(struct page *, int is_locked);
 int try_to_unmap(struct page *);
 
+/*
+ * Used by swapoff to help locate where page is expected in vma.
+ */
+unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);
+
 #else	/* !CONFIG_MMU */
 
 #define anon_vma_init()		do {} while (0)
--- rmaplock5/mm/rmap.c	2004-07-12 18:21:02.456696440 +0100
+++ rmaplock6/mm/rmap.c	2004-07-12 18:21:16.767520864 +0100
@@ -231,6 +231,24 @@ vma_address(struct page *page, struct vm
 }
 
 /*
+ * At what user virtual address is page expected in vma? checking that the
+ * page matches the vma: currently only used by unuse_process, on anon pages.
+ */
+unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
+{
+	if (PageAnon(page)) {
+		if ((void *)vma->anon_vma !=
+		    (void *)page->mapping - PAGE_MAPPING_ANON)
+			return -EFAULT;
+	} else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
+		if (vma->vm_file->f_mapping != page->mapping)
+			return -EFAULT;
+	} else
+		return -EFAULT;
+	return vma_address(page, vma);
+}
+
+/*
  * Subfunctions of page_referenced: page_referenced_one called
  * repeatedly from either page_referenced_anon or page_referenced_file.
  */
@@ -457,6 +475,8 @@ void page_remove_rmap(struct page *page)
 		 * which increments mapcount after us but sets mapping
 		 * before us: so leave the reset to free_hot_cold_page,
 		 * and remember that it's only reliable while mapped.
+		 * Leaving it set also helps swapoff to reinstate ptes
+		 * faster for those pages still in swapcache.
 		 */
 		if (page_test_and_clear_dirty(page))
 			set_page_dirty(page);
--- rmaplock5/mm/swapfile.c	2004-07-12 18:21:02.458696136 +0100
+++ rmaplock6/mm/swapfile.c	2004-07-12 18:21:16.769520560 +0100
@@ -520,14 +520,24 @@ static unsigned long unuse_pgd(struct vm
 }
 
 /* vma->vm_mm->page_table_lock is held */
-static unsigned long unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
+static unsigned long unuse_vma(struct vm_area_struct * vma,
 	swp_entry_t entry, struct page *page)
 {
-	unsigned long start = vma->vm_start, end = vma->vm_end;
+	pgd_t *pgdir;
+	unsigned long start, end;
 	unsigned long foundaddr;
 
-	if (start >= end)
-		BUG();
+	if (page->mapping) {
+		start = page_address_in_vma(page, vma);
+		if (start == -EFAULT)
+			return 0;
+		else
+			end = start + PAGE_SIZE;
+	} else {
+		start = vma->vm_start;
+		end = vma->vm_end;
+	}
+	pgdir = pgd_offset(vma->vm_mm, start);
 	do {
 		foundaddr = unuse_pgd(vma, pgdir, start, end - start,
 						entry, page);
@@ -559,9 +569,8 @@ static int unuse_process(struct mm_struc
 	}
 	spin_lock(&mm->page_table_lock);
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
-		if (!is_vm_hugetlb_page(vma)) {
-			pgd_t * pgd = pgd_offset(mm, vma->vm_start);
-			foundaddr = unuse_vma(vma, pgd, entry, page);
+		if (vma->anon_vma) {
+			foundaddr = unuse_vma(vma, entry, page);
 			if (foundaddr)
 				break;
 		}


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] rmaplock 1/6 PageAnon in mapping
  2004-07-12 21:49 [PATCH] rmaplock 1/6 PageAnon in mapping Hugh Dickins
                   ` (4 preceding siblings ...)
  2004-07-12 21:55 ` [PATCH] rmaplock 6/6 swapoff use anon_vma Hugh Dickins
@ 2004-07-13  4:48 ` Andrew Morton
  2004-07-13  7:08   ` Hugh Dickins
  5 siblings, 1 reply; 13+ messages in thread
From: Andrew Morton @ 2004-07-13  4:48 UTC (permalink / raw)
  To: Hugh Dickins; +Cc: linux-kernel

Hugh Dickins <hugh@veritas.com> wrote:
>
> Replace the PG_anon page->flags bit by setting the lower bit of the
>  pointer in page->mapping when it's anon_vma: PAGE_MAPPING_ANON bit.

This is likely to cause conniptions in various quarters.  Is there no
alternative?

It might make things more palatable to hide the setting, clearing, testing
and masking of this bit behind some set of wrapper functions.  Maybe.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] rmaplock 1/6 PageAnon in mapping
  2004-07-13  4:48 ` [PATCH] rmaplock 1/6 PageAnon in mapping Andrew Morton
@ 2004-07-13  7:08   ` Hugh Dickins
  0 siblings, 0 replies; 13+ messages in thread
From: Hugh Dickins @ 2004-07-13  7:08 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel

On Mon, 12 Jul 2004, Andrew Morton wrote:
> Hugh Dickins <hugh@veritas.com> wrote:
> >
> > Replace the PG_anon page->flags bit by setting the lower bit of the
> >  pointer in page->mapping when it's anon_vma: PAGE_MAPPING_ANON bit.
> 
> This is likely to cause conniptions in various quarters.  Is there no
> alternative?

I've little doubt barriers would provide an alternative;
but I'm hopeless with barriers, so couldn't manage it myself;
and not alone in finding them difficult; and surely more expensive.

> It might make things more palatable to hide the setting, clearing testing
> amd masking of this bit behind some set of wrapper functions.  Maybe.

Conniptious in some quarters that the name shouts too.  But where does it
appear?  In mm.h, the inlines to hide it from wider gaze.  And just four
times in rmap.c.  I'd have been happier to say three times, but even so,
I don't think burying it deeper helps anyone.

Hugh


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] rmaplock 2/6 SLAB_DESTROY_BY_RCU
  2004-07-12 21:52 ` [PATCH] rmaplock 2/6 SLAB_DESTROY_BY_RCU Hugh Dickins
@ 2004-07-13 16:36   ` Manfred Spraul
  2004-07-13 19:53     ` Hugh Dickins
  0 siblings, 1 reply; 13+ messages in thread
From: Manfred Spraul @ 2004-07-13 16:36 UTC (permalink / raw)
  To: Hugh Dickins; +Cc: Andrew Morton, linux-kernel

Hugh Dickins wrote:

>It's okay to take anon_vma->lock after it's freed, so long as it remains
>a struct anon_vma (its list would become empty, or perhaps reused for an
>unrelated anon_vma: but no problem since we always check that the page
>located is the right one); but corruption if that memory gets reused for
>some other purpose.
>
>  
>
An interesting idea:
The slab caches are object caches. If a rcu user only needs a valid 
object but doesn't care which one then there is no need to wait for a 
quiescent cycle after free - the quiescent cycle can be delayed until 
the destructor is called.

But there are two flaws in your patch:
- you must disable poisoning and unmapping if SLAB_DESTROY_BY_RCU is set.
- either delay the dtor calls as well or fail if an object has a non-NULL 
dtor and SLAB_DESTROY_BY_RCU is set.

--
    Manfred

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] rmaplock 2/6 SLAB_DESTROY_BY_RCU
  2004-07-13 16:36   ` Manfred Spraul
@ 2004-07-13 19:53     ` Hugh Dickins
  2004-07-13 20:36       ` Manfred Spraul
  0 siblings, 1 reply; 13+ messages in thread
From: Hugh Dickins @ 2004-07-13 19:53 UTC (permalink / raw)
  To: Manfred Spraul; +Cc: Andrew Morton, linux-kernel

On Tue, 13 Jul 2004, Manfred Spraul wrote:
> An interesting idea:
> The slab caches are object caches. If a rcu user only needs a valid 
> object but doesn't care which one then there is no need to wait for a 
> quiescent cycle after free - the quiescent cycle can be delayed until 
> the destructor is called.

Sorry, to be honest, I've not understood you there at all.
I wonder if you're seeing some other use than I intended.

> But there are two flaws in your patch:
> - you must disable poisoning and unmapping if SLAB_DESTROY_BY_RCU is set.

How right you are!  Thank you.  Does the further patch below suit?

> - either delay the dtor calls a well or fail if an object has a non-NULL 
> dtor and SLAB_DESTROY_BY_RCU is set.

Doesn't that rather depend on what the dtor does?  I'm not used to how
destructors are commonly used, but I can easily imagine a destructor
which, say, frees up some attached objects, but still leaves this cache
object recognizably itself, good enough for the reference-after-free
which SLAB_DESTROY_BY_RCU is allowing - all we need to avoid is the
page being freed (or poisoned or unmapped) and reused.  I don't see a
need to prohibit or delay dtor necessarily - but there's no question,
those who make use of this SLAB_DESTROY_BY_RCU technique do still
need to take great care when referencing after free.

Hugh

--- rmaplock6/mm/slab.c	2004-07-12 18:20:35.277828256 +0100
+++ rmaplock7/mm/slab.c	2004-07-13 20:24:27.193884264 +0100
@@ -1196,8 +1196,11 @@ kmem_cache_create (const char *name, siz
 	 */
 	if ((size < 4096 || fls(size-1) == fls(size-1+3*BYTES_PER_WORD)))
 		flags |= SLAB_RED_ZONE|SLAB_STORE_USER;
-	flags |= SLAB_POISON;
+	if (!(flags & SLAB_DESTROY_BY_RCU))
+		flags |= SLAB_POISON;
 #endif
+	if (flags & SLAB_DESTROY_BY_RCU)
+		BUG_ON(flags & SLAB_POISON);
 #endif
 	/*
 	 * Always checks flags, a caller might be expecting debug


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] rmaplock 2/6 SLAB_DESTROY_BY_RCU
  2004-07-13 19:53     ` Hugh Dickins
@ 2004-07-13 20:36       ` Manfred Spraul
  2004-07-13 22:17         ` William Lee Irwin III
  2004-07-14 20:25         ` Hugh Dickins
  0 siblings, 2 replies; 13+ messages in thread
From: Manfred Spraul @ 2004-07-13 20:36 UTC (permalink / raw)
  To: Hugh Dickins; +Cc: Andrew Morton, linux-kernel

Hugh Dickins wrote:

>On Tue, 13 Jul 2004, Manfred Spraul wrote:
>  
>
>>An interesting idea:
>>The slab caches are object caches. If a rcu user only needs a valid 
>>object but doesn't care which one then there is no need to wait for a 
>>quiescent cycle after free - the quiescent cycle can be delayed until 
>>the destructor is called.
>>    
>>
>
>Sorry, to be honest, I've not understood you there at all.
>I wonder if you're seeing some other use than I intended.
>
>  
>
No, I just tried to reword your idea and I obviously failed.
Previously, I always assumed that the free call had to be delayed, but 
you have shown that this is wrong: If a rcu user can detect that an 
object changed its identity (or if he doesn't care), then it's not 
necessary to delay the free call, it's sufficient to delay the object 
destruction. And since the slab cache is an object cache delaying 
destruction causes virtually no overhead at all.
A second restriction is that it must be possible to recover from 
identity changes. I think this rules out the dentry cache - the hash 
chains point to wrong positions after a free/alloc/reuse cycle, it's not 
possible to recover from that.

>>But there are two flaws in your patch:
>>- you must disable poisoning and unmapping if SLAB_DESTROY_BY_RCU is set.
>>    
>>
>
>How right you are!  Thank you.  Does the further patch below suit?
>
>  
>
Yes, that should work.

>>- either delay the dtor calls a well or fail if an object has a non-NULL 
>>dtor and SLAB_DESTROY_BY_RCU is set.
>>    
>>
>
>Doesn't that rather depend on what the dtor does?  I'm not used to how
>destructors are commonly used, but I can easily imagine a destructor
>which, say, frees up some attached objects, but still leaves this cache
>object recognizably itself, good enough for the reference-after-free
>which SLAB_DESTROY_BY_RCU is allowing - all we need to avoid is the
>page being freed (or poisoned or unmapped) and reused.
>
This would introduce a concept of a half destroyed object. I think it's 
a bad idea, too easy to introduce bugs.
The correct fix would be to move the whole slab_destroy call into the 
rcu callback instead of just the kmem_cache_free call, but locking would 
be tricky - kmem_cache_destroy would have to wait for completion of 
outstanding rcu calls, etc.
Thus I'd propose a quick fix (fail if there is a dtor - are there any 
slab caches with dtors at all?) and in the long run slab_destroy should 
be moved into the rcu callback.

--
    Manfred

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] rmaplock 2/6 SLAB_DESTROY_BY_RCU
  2004-07-13 20:36       ` Manfred Spraul
@ 2004-07-13 22:17         ` William Lee Irwin III
  2004-07-14 20:25         ` Hugh Dickins
  1 sibling, 0 replies; 13+ messages in thread
From: William Lee Irwin III @ 2004-07-13 22:17 UTC (permalink / raw)
  To: Manfred Spraul; +Cc: Hugh Dickins, Andrew Morton, linux-kernel

On Tue, Jul 13, 2004 at 10:36:08PM +0200, Manfred Spraul wrote:
> Thus I'd propose a quick fix (fail if there is a dtor - are there any 
> slab caches with dtors at all?) and in the long run slab_destroy should 
> be moved into the rcu callback.

Yes, ia32 pgd and pmd slabs have dtors.


-- wli

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] rmaplock 2/6 SLAB_DESTROY_BY_RCU
  2004-07-13 20:36       ` Manfred Spraul
  2004-07-13 22:17         ` William Lee Irwin III
@ 2004-07-14 20:25         ` Hugh Dickins
  1 sibling, 0 replies; 13+ messages in thread
From: Hugh Dickins @ 2004-07-14 20:25 UTC (permalink / raw)
  To: Manfred Spraul; +Cc: Andrew Morton, linux-kernel

On Tue, 13 Jul 2004, Manfred Spraul wrote:
> Hugh Dickins wrote:
> >I wonder if you're seeing some other use than I intended.
> >
> No, I just tried to reworden your idea and I obviously failed.
> Previously, I always assumed that the free call had to be delayed, but 
> you have shown that this is wrong: If a rcu user can detect that an 
> object changed its identity (or if he doesn't care), then it's not 
> necessary to delay the free call, it's sufficient to delay the object 
> destruction. And since the slab cache is an object cache delaying 
> destruction causes virtually no overhead at all.

Actually, I now see it is a different use that you're seeing.  You're
thinking of how a subsystem which uses RCU might manage its kmem_cache,
that SLAB_DESTROY_BY_RCU might allow it to free directly where before
it had to leave freeing to RCU callback.  Yes, it could be used that
way, though I don't know if that's particularly useful - if the
subsystem uses RCU anyway, isn't it easy to let RCU do the freeing?

My reason for adding the flag is different: for a subsystem which is
almost entirely ignorant of RCU, which has good and simple locking for
its structures from one direction, but sometimes needs to approach
them from another direction, where it's hard to get the lock safely:
once it holds the spinlock within the structure, it's stabilized; but
at any instant prior to getting that spinlock, the structure containing
it might be freed.  SLAB_DESTROY_BY_RCU on the cache, with rcu_read_lock
while trying to get to that spinlock, can solve this.

> A second restriction is that it must be possible to recover from 
> identity changes. I think this rules out the dentry cache - the hash 
> chains point to wrong positions after a free/alloc/reuse cycle, it's not 
> possible to recover from that.

Yes, I don't think SLAB_DESTROY_BY_RCU should be splashed around all over.
Another reason, it slightly delays the freeing of pages from its caches,
so is probably not appropriate for those major caches which vmscan needs
to be able to shrink (and know how much it has shrunk).

> This would introduce a concept of a half destroyed object. I think it's 
> a bad idea, too easy to introduce bugs.

Fair enough.

> The correct fix would be to move the whole slab_destroy call into the 
> rcu callback instead of just the kmem_cache_free call, but locking would 
> be tricky

Yes, I didn't want or need to get into that.  Keep it simple for now.

> - kmem_cache_destroy would have to wait for completion of 
> outstanding rcu calls, etc.

I think it already needs that wait if SLAB_DESTROY_BY_RCU (unless we
change kmem_freepages to take several args, such as order, instead of
peeking at cachep->gfporder).  I didn't worry about that before, since
I didn't need to use kmem_cache_destroy: but I can see you prefer to
guard your territory properly, so I've now added a synchronize_kernel.

> Thus I'd propose a quick fix (fail if there is a dtor - are there any 
> slab caches with dtors at all?) and in the long run slab_destroy should 
> be moved into the rcu callback.

Okay.  My current patch to 2.6.8-rc1-mm1 slab.h and slab.c below.

Thanks a lot for considering this, Manfred.

Hugh

--- 2.6.8-rc1-mm1/include/linux/slab.h	2004-07-14 06:11:52.566031480 +0100
+++ linux/include/linux/slab.h	2004-07-14 19:02:37.217301704 +0100
@@ -45,6 +45,7 @@ typedef struct kmem_cache_s kmem_cache_t
 #define SLAB_RECLAIM_ACCOUNT	0x00020000UL	/* track pages allocated to indicate
 						   what is reclaimable later*/
 #define SLAB_PANIC		0x00040000UL	/* panic if kmem_cache_create() fails */
+#define SLAB_DESTROY_BY_RCU	0x00080000UL	/* defer freeing pages to RCU */
 
 /* flags passed to a constructor func */
 #define	SLAB_CTOR_CONSTRUCTOR	0x001UL		/* if not set, then deconstructor */
--- 2.6.8-rc1-mm1/mm/slab.c	2004-07-14 06:11:52.864986032 +0100
+++ linux/mm/slab.c	2004-07-14 19:02:37.233299272 +0100
@@ -91,6 +91,7 @@
 #include	<linux/cpu.h>
 #include	<linux/sysctl.h>
 #include	<linux/module.h>
+#include	<linux/rcupdate.h>
 
 #include	<asm/uaccess.h>
 #include	<asm/cacheflush.h>
@@ -139,11 +140,13 @@
 			 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
 			 SLAB_NO_REAP | SLAB_CACHE_DMA | \
 			 SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \
-			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC)
+			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
+			 SLAB_DESTROY_BY_RCU)
 #else
 # define CREATE_MASK	(SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \
 			 SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \
-			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC)
+			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
+			 SLAB_DESTROY_BY_RCU)
 #endif
 
 /*
@@ -190,6 +193,28 @@ struct slab {
 };
 
 /*
+ * struct slab_rcu
+ *
+ * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
+ * arrange for kmem_freepages to be called via RCU.  This is useful if
+ * we need to approach a kernel structure obliquely, from its address
+ * obtained without the usual locking.  We can lock the structure to
+ * stabilize it and check it's still at the given address, only if we
+ * can be sure that the memory has not been meanwhile reused for some
+ * other kind of object (which our subsystem's lock might corrupt).
+ *
+ * rcu_read_lock before reading the address, then rcu_read_unlock after
+ * taking the spinlock within the structure expected at that address.
+ *
+ * We assume struct slab_rcu can overlay struct slab when destroying.
+ */
+struct slab_rcu {
+	struct rcu_head		head;
+	kmem_cache_t		*cachep;
+	void			*addr;
+};
+
+/*
  * struct array_cache
  *
  * Per cpu structures
@@ -883,6 +908,16 @@ static void kmem_freepages(kmem_cache_t 
 		atomic_sub(1<<cachep->gfporder, &slab_reclaim_pages);
 }
 
+static void kmem_rcu_free(struct rcu_head *head)
+{
+	struct slab_rcu *slab_rcu = (struct slab_rcu *) head;
+	kmem_cache_t *cachep = slab_rcu->cachep;
+
+	kmem_freepages(cachep, slab_rcu->addr);
+	if (OFF_SLAB(cachep))
+		kmem_cache_free(cachep->slabp_cache, slab_rcu);
+}
+
 #if DEBUG
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
@@ -1036,6 +1071,8 @@ static void check_poison_obj(kmem_cache_
  */
 static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
 {
+	void *addr = slabp->s_mem - slabp->colouroff;
+
 #if DEBUG
 	int i;
 	for (i = 0; i < cachep->num; i++) {
@@ -1071,10 +1108,19 @@ static void slab_destroy (kmem_cache_t *
 		}
 	}
 #endif
-	
-	kmem_freepages(cachep, slabp->s_mem-slabp->colouroff);
-	if (OFF_SLAB(cachep))
-		kmem_cache_free(cachep->slabp_cache, slabp);
+
+	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
+		struct slab_rcu *slab_rcu;
+
+		slab_rcu = (struct slab_rcu *) slabp;
+		slab_rcu->cachep = cachep;
+		slab_rcu->addr = addr;
+		call_rcu(&slab_rcu->head, kmem_rcu_free);
+	} else {
+		kmem_freepages(cachep, addr);
+		if (OFF_SLAB(cachep))
+			kmem_cache_free(cachep->slabp_cache, slabp);
+	}
 }
 
 /**
@@ -1149,9 +1195,15 @@ kmem_cache_create (const char *name, siz
 	 */
 	if ((size < 4096 || fls(size-1) == fls(size-1+3*BYTES_PER_WORD)))
 		flags |= SLAB_RED_ZONE|SLAB_STORE_USER;
-	flags |= SLAB_POISON;
+	if (!(flags & SLAB_DESTROY_BY_RCU))
+		flags |= SLAB_POISON;
 #endif
+	if (flags & SLAB_DESTROY_BY_RCU)
+		BUG_ON(flags & SLAB_POISON);
 #endif
+	if (flags & SLAB_DESTROY_BY_RCU)
+		BUG_ON(dtor);
+
 	/*
 	 * Always checks flags, a caller might be expecting debug
 	 * support which isn't available.
@@ -1563,6 +1615,9 @@ int kmem_cache_destroy (kmem_cache_t * c
 		return 1;
 	}
 
+	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
+		synchronize_kernel();
+
 	/* no cpu_online check required here since we clear the percpu
 	 * array on cpu offline and set this to NULL.
 	 */


^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2004-07-14 20:25 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2004-07-12 21:49 [PATCH] rmaplock 1/6 PageAnon in mapping Hugh Dickins
2004-07-12 21:50 ` [PATCH] rmaplock 2/6 kill page_map_lock Hugh Dickins
2004-07-12 21:52 ` [PATCH] rmaplock 2/6 SLAB_DESTROY_BY_RCU Hugh Dickins
2004-07-13 16:36   ` Manfred Spraul
2004-07-13 19:53     ` Hugh Dickins
2004-07-13 20:36       ` Manfred Spraul
2004-07-13 22:17         ` William Lee Irwin III
2004-07-14 20:25         ` Hugh Dickins
2004-07-12 21:53 ` [PATCH] rmaplock 4/6 mm lock ordering Hugh Dickins
2004-07-12 21:54 ` [PATCH] rmaplock 5/6 unuse_process mmap_sem Hugh Dickins
2004-07-12 21:55 ` [PATCH] rmaplock 6/6 swapoff use anon_vma Hugh Dickins
2004-07-13  4:48 ` [PATCH] rmaplock 1/6 PageAnon in mapping Andrew Morton
2004-07-13  7:08   ` Hugh Dickins

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.