* [RFC]vmscan: doing page_referenced() in batch way
@ 2010-09-29  2:57 Shaohua Li
  2010-10-06 20:10 ` Andrew Morton
  0 siblings, 1 reply; 5+ messages in thread
From: Shaohua Li @ 2010-09-29  2:57 UTC (permalink / raw)
  To: linux-mm; +Cc: riel, Andrew Morton, Andi Kleen, hughd

When memory pressure is high, page_referenced() causes a lot of lock contention
on anon_vma->lock or mapping->i_mmap_lock. Since pages from one file usually
sit side by side on the LRU list, we can lock several pages in
shrink_page_list() and batch the page_referenced() calls to avoid some of the
lock/unlock cycles, which should reduce lock contention a lot. The locking rule
documented in rmap.c is:
page_lock
	mapping->i_mmap_lock
		anon_vma->lock
For a batch of pages, we take the page lock on all of them first, check their
references, and only then release the i_mmap_lock or anon_vma lock. This does
not seem to break the rule to me.
Before I further polish the patch, I'd like to know if there is anything
preventing us from doing such batching here.

Thanks,
Shaohua

---
 include/linux/rmap.h |    7 ++--
 mm/internal.h        |   11 ++++++
 mm/memory-failure.c  |    2 -
 mm/rmap.c            |   60 ++++++++++++++++++++++++++++------
 mm/vmscan.c          |   88 ++++++++++++++++++++++++++++++++++++++++++++++-----
 5 files changed, 147 insertions(+), 21 deletions(-)

Index: linux/include/linux/rmap.h
===================================================================
--- linux.orig/include/linux/rmap.h	2010-09-29 09:13:04.000000000 +0800
+++ linux/include/linux/rmap.h	2010-09-29 10:29:54.000000000 +0800
@@ -181,8 +181,10 @@ static inline void page_dup_rmap(struct 
 /*
  * Called from mm/vmscan.c to handle paging out
  */
+struct page_reference_control;
 int page_referenced(struct page *, int is_locked,
-			struct mem_cgroup *cnt, unsigned long *vm_flags);
+			struct mem_cgroup *cnt, unsigned long *vm_flags,
+			struct page_reference_control *prc);
 int page_referenced_one(struct page *, struct vm_area_struct *,
 	unsigned long address, unsigned int *mapcount, unsigned long *vm_flags);
 
@@ -230,7 +232,8 @@ int try_to_munlock(struct page *);
 /*
  * Called by memory-failure.c to kill processes.
  */
-struct anon_vma *page_lock_anon_vma(struct page *page);
+struct anon_vma *page_lock_anon_vma(struct page *page,
+	struct page_reference_control *prc);
 void page_unlock_anon_vma(struct anon_vma *anon_vma);
 int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
 
Index: linux/mm/internal.h
===================================================================
--- linux.orig/mm/internal.h	2010-09-29 09:13:05.000000000 +0800
+++ linux/mm/internal.h	2010-09-29 10:29:54.000000000 +0800
@@ -249,6 +249,17 @@ int __get_user_pages(struct task_struct 
 #define ZONE_RECLAIM_FULL	-1
 #define ZONE_RECLAIM_SOME	0
 #define ZONE_RECLAIM_SUCCESS	1
+
+#define PRC_PAGE_NUM 8
+struct page_reference_control {
+	int num;
+	struct page *pages[PRC_PAGE_NUM];
+	int references[PRC_PAGE_NUM];
+	struct anon_vma *anon_vma;
+	struct address_space *mapping;
+	/* no ksm */
+};
+
 #endif
 
 extern int hwpoison_filter(struct page *p);
Index: linux/mm/rmap.c
===================================================================
--- linux.orig/mm/rmap.c	2010-09-29 09:13:05.000000000 +0800
+++ linux/mm/rmap.c	2010-09-29 10:30:09.000000000 +0800
@@ -314,7 +314,8 @@ void __init anon_vma_init(void)
  * Getting a lock on a stable anon_vma from a page off the LRU is
  * tricky: page_lock_anon_vma rely on RCU to guard against the races.
  */
-struct anon_vma *page_lock_anon_vma(struct page *page)
+struct anon_vma *page_lock_anon_vma(struct page *page,
+	struct page_reference_control *prc)
 {
 	struct anon_vma *anon_vma, *root_anon_vma;
 	unsigned long anon_mapping;
@@ -328,6 +329,22 @@ struct anon_vma *page_lock_anon_vma(stru
 
 	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
 	root_anon_vma = ACCESS_ONCE(anon_vma->root);
+
+	if (prc) {
+		if (root_anon_vma == prc->anon_vma) {
+			rcu_read_unlock();
+			return root_anon_vma;
+		}
+		if (prc->anon_vma) {
+			page_unlock_anon_vma(prc->anon_vma);
+			prc->anon_vma = NULL;
+		}
+		if (prc->mapping) {
+			spin_unlock(&prc->mapping->i_mmap_lock);
+			prc->mapping = NULL;
+		}
+		prc->anon_vma = root_anon_vma;
+	}
 	spin_lock(&root_anon_vma->lock);
 
 	/*
@@ -530,14 +547,15 @@ out:
 
 static int page_referenced_anon(struct page *page,
 				struct mem_cgroup *mem_cont,
-				unsigned long *vm_flags)
+				unsigned long *vm_flags,
+				struct page_reference_control *prc)
 {
 	unsigned int mapcount;
 	struct anon_vma *anon_vma;
 	struct anon_vma_chain *avc;
 	int referenced = 0;
 
-	anon_vma = page_lock_anon_vma(page);
+	anon_vma = page_lock_anon_vma(page, prc);
 	if (!anon_vma)
 		return referenced;
 
@@ -560,7 +578,8 @@ static int page_referenced_anon(struct p
 			break;
 	}
 
-	page_unlock_anon_vma(anon_vma);
+	if (!prc)
+		page_unlock_anon_vma(anon_vma);
 	return referenced;
 }
 
@@ -579,7 +598,8 @@ static int page_referenced_anon(struct p
  */
 static int page_referenced_file(struct page *page,
 				struct mem_cgroup *mem_cont,
-				unsigned long *vm_flags)
+				unsigned long *vm_flags,
+				struct page_reference_control *prc)
 {
 	unsigned int mapcount;
 	struct address_space *mapping = page->mapping;
@@ -603,8 +623,25 @@ static int page_referenced_file(struct p
 	 */
 	BUG_ON(!PageLocked(page));
 
-	spin_lock(&mapping->i_mmap_lock);
+	if (prc) {
+		if (mapping == prc->mapping) {
+			goto skip_lock;
+		}
+		if (prc->anon_vma) {
+			page_unlock_anon_vma(prc->anon_vma);
+			prc->anon_vma = NULL;
+		}
+		if (prc->mapping) {
+			spin_unlock(&prc->mapping->i_mmap_lock);
+			prc->mapping = NULL;
+		}
+		prc->mapping = mapping;
+
+		spin_lock(&mapping->i_mmap_lock);
+	} else
+		spin_lock(&mapping->i_mmap_lock);
 
+skip_lock:
 	/*
 	 * i_mmap_lock does not stabilize mapcount at all, but mapcount
 	 * is more likely to be accurate if we note it after spinning.
@@ -628,7 +665,8 @@ static int page_referenced_file(struct p
 			break;
 	}
 
-	spin_unlock(&mapping->i_mmap_lock);
+	if (!prc)
+		spin_unlock(&mapping->i_mmap_lock);
 	return referenced;
 }
 
@@ -645,7 +683,7 @@ static int page_referenced_file(struct p
 int page_referenced(struct page *page,
 		    int is_locked,
 		    struct mem_cgroup *mem_cont,
-		    unsigned long *vm_flags)
+		    unsigned long *vm_flags, struct page_reference_control *prc)
 {
 	int referenced = 0;
 	int we_locked = 0;
@@ -664,10 +702,10 @@ int page_referenced(struct page *page,
 								vm_flags);
 		else if (PageAnon(page))
 			referenced += page_referenced_anon(page, mem_cont,
-								vm_flags);
+								vm_flags, prc);
 		else if (page->mapping)
 			referenced += page_referenced_file(page, mem_cont,
-								vm_flags);
+								vm_flags, prc);
 		if (we_locked)
 			unlock_page(page);
 	}
@@ -1239,7 +1277,7 @@ static int try_to_unmap_anon(struct page
 	struct anon_vma_chain *avc;
 	int ret = SWAP_AGAIN;
 
-	anon_vma = page_lock_anon_vma(page);
+	anon_vma = page_lock_anon_vma(page, NULL);
 	if (!anon_vma)
 		return ret;
 
Index: linux/mm/vmscan.c
===================================================================
--- linux.orig/mm/vmscan.c	2010-09-29 10:29:48.000000000 +0800
+++ linux/mm/vmscan.c	2010-09-29 10:39:26.000000000 +0800
@@ -40,6 +40,7 @@
 #include <linux/memcontrol.h>
 #include <linux/delayacct.h>
 #include <linux/sysctl.h>
+#include <linux/ksm.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -571,12 +572,12 @@ enum page_references {
 };
 
 static enum page_references page_check_references(struct page *page,
-						  struct scan_control *sc)
+	struct scan_control *sc, struct page_reference_control *prc)
 {
 	int referenced_ptes, referenced_page;
 	unsigned long vm_flags;
 
-	referenced_ptes = page_referenced(page, 1, sc->mem_cgroup, &vm_flags);
+	referenced_ptes = page_referenced(page, 1, sc->mem_cgroup, &vm_flags, prc);
 	referenced_page = TestClearPageReferenced(page);
 
 	/* Lumpy reclaim - ignore references */
@@ -640,6 +641,44 @@ static noinline_for_stack void free_page
 	pagevec_free(&freed_pvec);
 }
 
+static void do_prc_batch(struct scan_control *sc,
+	struct page_reference_control *prc)
+{
+	int i;
+	for (i = 0; i < prc->num; i++)
+		prc->references[i] = page_check_references(prc->pages[i], sc,
+			prc);
+	/*
+	 * we must release all locks here; the lock ordering requires
+	 * pagelock->
+	 *   mapping->i_mmap_lock->
+	 *     anon_vma->lock
+	 * releasing the locks guarantees we don't break the rule in the next run
+	 */
+	if (prc->anon_vma) {
+		page_unlock_anon_vma(prc->anon_vma);
+		prc->anon_vma = NULL;
+	}
+	if (prc->mapping) {
+		spin_unlock(&prc->mapping->i_mmap_lock);
+		prc->mapping = NULL;
+	}
+}
+
+static int page_check_references_batch(struct page *page, struct scan_control *sc,
+	struct page_reference_control *prc)
+{
+	/* bypass ksm pages */
+	if (PageKsm(page))
+		return 1;
+	if (prc->num < PRC_PAGE_NUM)
+		prc->pages[prc->num] = page;
+	prc->num++;
+	if (prc->num == PRC_PAGE_NUM)
+		return 1;
+	return 0;
+}
+
 /*
  * shrink_page_list() returns the number of reclaimed pages
  */
@@ -651,18 +690,36 @@ static unsigned long shrink_page_list(st
 	LIST_HEAD(free_pages);
 	int pgactivate = 0;
 	unsigned long nr_reclaimed = 0;
+	struct page_reference_control prc;
+	int do_batch_count = 0;
+
+	prc.num = 0;
+	prc.anon_vma = NULL;
+	prc.mapping = NULL;
 
 	cond_resched();
 
-	while (!list_empty(page_list)) {
+	while (!list_empty(page_list) || prc.num > 0) {
 		enum page_references references;
 		struct address_space *mapping;
-		struct page *page;
+		struct page *uninitialized_var(page);
 		int may_enter_fs;
 
 		cond_resched();
 
-		page = lru_to_page(page_list);
+		if (do_batch_count)
+			goto do_one_batch_page;
+
+		/* bypass ksm pages */
+		if (!list_empty(page_list)) {
+			page = lru_to_page(page_list);
+			if (PageKsm(page) && prc.num > 0)
+				goto do_batch_pages;
+		}
+
+		if (list_empty(page_list) && prc.num > 0)
+			goto do_batch_pages;
+
 		list_del(&page->lru);
 
 		if (!trylock_page(page))
@@ -700,7 +757,22 @@ static unsigned long shrink_page_list(st
 				goto keep_locked;
 		}
 
-		references = page_check_references(page, sc);
+		if (!page_check_references_batch(page, sc, &prc))
+			continue;
+do_batch_pages:
+		do_prc_batch(sc, &prc);
+		do_batch_count = prc.num;
+do_one_batch_page:
+		/* be careful to not reorder the pages */
+		page = prc.pages[do_batch_count - prc.num];
+		references = prc.references[do_batch_count - prc.num];
+		prc.num--;
+		if (prc.num <= 0)
+			do_batch_count = 0;
+		/* The page might be changed, recheck */
+		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
+			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
+
 		switch (references) {
 		case PAGEREF_ACTIVATE:
 			goto activate_locked;
@@ -857,6 +929,8 @@ keep:
 		VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
 	}
 
+	BUG_ON(prc.num > 0);
+
 	free_page_list(&free_pages);
 
 	list_splice(&ret_pages, page_list);
@@ -1465,7 +1539,7 @@ static void shrink_active_list(unsigned 
 			continue;
 		}
 
-		if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
+		if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags, NULL)) {
 			nr_rotated++;
 			/*
 			 * Identify referenced, file-backed active pages and
Index: linux/mm/memory-failure.c
===================================================================
--- linux.orig/mm/memory-failure.c	2010-09-29 09:13:05.000000000 +0800
+++ linux/mm/memory-failure.c	2010-09-29 10:29:54.000000000 +0800
@@ -382,7 +382,7 @@ static void collect_procs_anon(struct pa
 	struct anon_vma *av;
 
 	read_lock(&tasklist_lock);
-	av = page_lock_anon_vma(page);
+	av = page_lock_anon_vma(page, NULL);
 	if (av == NULL)	/* Not actually mapped anymore */
 		goto out;
 	for_each_process (tsk) {


* Re: [RFC]vmscan: doing page_referenced() in batch way
  2010-09-29  2:57 [RFC]vmscan: doing page_referenced() in batch way Shaohua Li
@ 2010-10-06 20:10 ` Andrew Morton
  2010-10-10 17:10   ` Rik van Riel
  2010-10-13  4:53   ` Shaohua Li
  0 siblings, 2 replies; 5+ messages in thread
From: Andrew Morton @ 2010-10-06 20:10 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-mm, riel, Andi Kleen, hughd

On Wed, 29 Sep 2010 10:57:33 +0800
Shaohua Li <shaohua.li@intel.com> wrote:

> When memory pressure is high, page_referenced() causes a lot of lock contention
> on anon_vma->lock or mapping->i_mmap_lock. Since pages from one file usually
> sit side by side on the LRU list, we can lock several pages in
> shrink_page_list() and batch the page_referenced() calls to avoid some of the
> lock/unlock cycles, which should reduce lock contention a lot. The locking rule
> documented in rmap.c is:
> page_lock
> 	mapping->i_mmap_lock
> 		anon_vma->lock
> For a batch of pages, we take the page lock on all of them first, check their
> references, and only then release the i_mmap_lock or anon_vma lock. This does
> not seem to break the rule to me.
> Before I further polish the patch, I'd like to know if there is anything
> preventing us from doing such batching here.

The patch adds quite a bit of complexity, so we'd need to see benchmark
testing results which justify it, please.

Also, the entire patch is irrelevant for uniprocessor machines, so the
runtime overhead and code-size increases for CONFIG_SMP=n builds should
be as low as possible - ideally zero.  Please quantify this as well
within the changelog if you pursue this work.

>
> ...
>
> +#define PRC_PAGE_NUM 8
> +struct page_reference_control {
> +	int num;
> +	struct page *pages[PRC_PAGE_NUM];
> +	int references[PRC_PAGE_NUM];
> +	struct anon_vma *anon_vma;
> +	struct address_space *mapping;
> +	/* no ksm */
> +};

hm, 120 bytes of stack consumed, deep in page reclaim.

>  #endif
>  
>  extern int hwpoison_filter(struct page *p);
>
> ...
>
>  static int page_referenced_file(struct page *page,
>  				struct mem_cgroup *mem_cont,
> -				unsigned long *vm_flags)
> +				unsigned long *vm_flags,
> +				struct page_reference_control *prc)
>  {
>  	unsigned int mapcount;
>  	struct address_space *mapping = page->mapping;
> @@ -603,8 +623,25 @@ static int page_referenced_file(struct p
>  	 */
>  	BUG_ON(!PageLocked(page));
>  
> -	spin_lock(&mapping->i_mmap_lock);
> +	if (prc) {
> +		if (mapping == prc->mapping) {
> +			goto skip_lock;
> +		}
> +		if (prc->anon_vma) {
> +			page_unlock_anon_vma(prc->anon_vma);
> +			prc->anon_vma = NULL;
> +		}
> +		if (prc->mapping) {
> +			spin_unlock(&prc->mapping->i_mmap_lock);
> +			prc->mapping = NULL;
> +		}
> +		prc->mapping = mapping;
> +
> +		spin_lock(&mapping->i_mmap_lock);
> +	} else
> +		spin_lock(&mapping->i_mmap_lock);

Move the spin_lock() outside, remove the `else' part.
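
A rough sketch of that restructuring (illustrative only, not code from the
thread; the prc bookkeeping is kept exactly as in the patch):

	if (prc) {
		if (mapping == prc->mapping)
			goto skip_lock;
		if (prc->anon_vma) {
			page_unlock_anon_vma(prc->anon_vma);
			prc->anon_vma = NULL;
		}
		if (prc->mapping) {
			spin_unlock(&prc->mapping->i_mmap_lock);
			prc->mapping = NULL;
		}
		prc->mapping = mapping;
	}
	/* single lock site, shared by the prc and non-prc cases */
	spin_lock(&mapping->i_mmap_lock);
skip_lock:
	/* ... mapcount snapshot and vma walk continue as before ... */

The behaviour is unchanged: the same-mapping batch case still jumps over the
lock, and every other case takes it exactly once.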

> +skip_lock:
>  	/*
>  	 * i_mmap_lock does not stabilize mapcount at all, but mapcount
>  	 * is more likely to be accurate if we note it after spinning.
> @@ -628,7 +665,8 @@ static int page_referenced_file(struct p
>  			break;
>  	}
>  
> -	spin_unlock(&mapping->i_mmap_lock);
> +	if (!prc)
> +		spin_unlock(&mapping->i_mmap_lock);
>  	return referenced;
>  }
>  
>
> ...
>
> +static void do_prc_batch(struct scan_control *sc,
> +	struct page_reference_control *prc)
> +{
> +	int i;
> +	for (i = 0; i < prc->num; i++)
> +		prc->references[i] = page_check_references(prc->pages[i], sc,
> +			prc);
> +	/*
> +	 * we must release all locks here; the lock ordering requires
> +	 * pagelock->
> +	 *   mapping->i_mmap_lock->
> +	 *     anon_vma->lock
> +	 * releasing the locks guarantees we don't break the rule in the next run
> +	 */
> +	if (prc->anon_vma) {
> +		page_unlock_anon_vma(prc->anon_vma);
> +		prc->anon_vma = NULL;
> +	}
> +	if (prc->mapping) {
> +		spin_unlock(&prc->mapping->i_mmap_lock);
> +		prc->mapping = NULL;
> +	}
> +}

I didn't check the locking alterations.

> +static int page_check_references_batch(struct page *page, struct scan_control *sc,
> +	struct page_reference_control *prc)
> +{
> +	/* bypass ksm pages */
> +	if (PageKsm(page))
> +		return 1;

A general point about code comments: the comment shouldn't describe
"what" the code is doing unless that is unobvious.  The comment should
explain "why" the code is doing something.  Because that is something
which the code itself cannot explain.
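
For example (illustrative only; the "why" wording is a guess at the intent
behind the /* no ksm */ note in the struct, not taken from the thread):

	/* a "what" comment just restates the code: */
	/* bypass ksm pages */
	if (PageKsm(page))
		return 1;

	/* a "why" comment adds what the code cannot say: */
	/*
	 * KSM pages are walked via page_referenced_ksm(), which the prc
	 * lock caching does not cover, so handle them outside the batch.
	 */
	if (PageKsm(page))
		return 1;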

> +	if (prc->num < PRC_PAGE_NUM)
> +		prc->pages[prc->num] = page;
> +	prc->num++;
> +	if (prc->num == PRC_PAGE_NUM)
> +		return 1;
> +	return 0;
> +}
>
> ...
>

* Re: [RFC]vmscan: doing page_referenced() in batch way
  2010-10-06 20:10 ` Andrew Morton
@ 2010-10-10 17:10   ` Rik van Riel
  2010-10-11  0:49     ` Shaohua Li
  2010-10-13  4:53   ` Shaohua Li
  1 sibling, 1 reply; 5+ messages in thread
From: Rik van Riel @ 2010-10-10 17:10 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Shaohua Li, linux-mm, Andi Kleen, hughd

On 10/06/2010 04:10 PM, Andrew Morton wrote:
> On Wed, 29 Sep 2010 10:57:33 +0800
> Shaohua Li <shaohua.li@intel.com> wrote:
>
>> [... changelog and patch hunks quoted in full ...]
>
> I didn't check the locking alterations.

I've tried to wrap my head around them, but haven't
yet been able to convince myself that it is safe.

What if we have multiple pages sharing the same
mapping or root anon_vma, which is not the same
as the prc->anon_vma ?

* Re: [RFC]vmscan: doing page_referenced() in batch way
  2010-10-10 17:10   ` Rik van Riel
@ 2010-10-11  0:49     ` Shaohua Li
  0 siblings, 0 replies; 5+ messages in thread
From: Shaohua Li @ 2010-10-11  0:49 UTC (permalink / raw)
  To: Rik van Riel; +Cc: Andrew Morton, linux-mm, Andi Kleen, hughd@google.com

On Mon, Oct 11, 2010 at 01:10:01AM +0800, Rik van Riel wrote:
> On 10/06/2010 04:10 PM, Andrew Morton wrote:
> > [... changelog and patch hunks quoted in full ...]
> >
> > I didn't check the locking alterations.
>
> I've tried to wrap my head around them, but haven't
> yet been able to convince myself that it is safe.
>
> What if we have multiple pages sharing the same
> mapping or root anon_vma, which is not the same
> as the prc->anon_vma ?
If their mapping or anon_vma is different, page_lock_anon_vma()
or page_referenced_file() in the patch will release the cached lock
and take the new one. Since adjacent pages usually share the same
mapping or anon_vma, this release/retake scheme already avoids
quite a lot of locking. Is this safe?
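
Condensed, the anon side of that pattern looks like this (reduced from the
page_lock_anon_vma() hunk above, with comments added here; the RCU handling
and the NULL resets are omitted, and the file side caches prc->mapping the
same way):

	if (root_anon_vma == prc->anon_vma)
		return root_anon_vma;	/* adjacent page, lock already held */

	/* different lock: drop whichever one is currently cached ... */
	if (prc->anon_vma)
		page_unlock_anon_vma(prc->anon_vma);
	if (prc->mapping)
		spin_unlock(&prc->mapping->i_mmap_lock);

	/* ... then cache and take the new one */
	prc->anon_vma = root_anon_vma;
	spin_lock(&root_anon_vma->lock);

So a batch holds at most one i_mmap_lock or anon_vma lock at any time, always
taken after the page locks, which is what keeps the documented ordering intact.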

* Re: [RFC]vmscan: doing page_referenced() in batch way
  2010-10-06 20:10 ` Andrew Morton
  2010-10-10 17:10   ` Rik van Riel
@ 2010-10-13  4:53   ` Shaohua Li
  1 sibling, 0 replies; 5+ messages in thread
From: Shaohua Li @ 2010-10-13  4:53 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-mm, riel@redhat.com, Andi Kleen, hughd@google.com

On Thu, Oct 07, 2010 at 04:10:52AM +0800, Andrew Morton wrote:
> On Wed, 29 Sep 2010 10:57:33 +0800
> Shaohua Li <shaohua.li@intel.com> wrote:
> 
> > [... changelog quoted in full ...]
Thanks for your time.

> The patch adds quite a bit of complexity, so we'd need to see benchmark
> testing results which justify it, please.
My test only shows around 10% improvement, which is below my expectations.
try_to_unmap() causes quite a lot of contention on the same locks, which makes
batching page_referenced() alone not very helpful. It looks like we could batch
try_to_unmap() too. I'll report back later when I have data with try_to_unmap()
batched.

Thanks,
Shaohua
