* [PATCH 1/3] mm, vmscan: count accidental reclaimed pages failed to put into lru
@ 2013-04-09 1:21 Joonsoo Kim
From: Joonsoo Kim @ 2013-04-09 1:21 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-kernel, linux-mm, Mel Gorman, Hugh Dickins, Rik van Riel,
Minchan Kim, Joonsoo Kim
In shrink_(in)active_list(), we can fail to put pages back onto the lru,
and these pages end up being reclaimed as a side effect. Currently, such
pages are not counted in sc->nr_reclaimed, but with this information we
can stop reclaiming earlier and so reduce the overhead of reclaim.
Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 0f615eb..5d60ae0 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -365,7 +365,7 @@ void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask);
extern void __free_pages(struct page *page, unsigned int order);
extern void free_pages(unsigned long addr, unsigned int order);
extern void free_hot_cold_page(struct page *page, int cold);
-extern void free_hot_cold_page_list(struct list_head *list, int cold);
+extern unsigned long free_hot_cold_page_list(struct list_head *list, int cold);
extern void __free_memcg_kmem_pages(struct page *page, unsigned int order);
extern void free_memcg_kmem_pages(unsigned long addr, unsigned int order);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8fcced7..a5f3952 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1360,14 +1360,18 @@ out:
/*
* Free a list of 0-order pages
*/
-void free_hot_cold_page_list(struct list_head *list, int cold)
+unsigned long free_hot_cold_page_list(struct list_head *list, int cold)
{
+ unsigned long nr_reclaimed = 0;
struct page *page, *next;
list_for_each_entry_safe(page, next, list, lru) {
trace_mm_page_free_batched(page, cold);
free_hot_cold_page(page, cold);
+ nr_reclaimed++;
}
+
+ return nr_reclaimed;
}
/*
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 88c5fed..eff2927 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -915,7 +915,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
*/
__clear_page_locked(page);
free_it:
- nr_reclaimed++;
/*
* Is there need to periodically free_page_list? It would
@@ -954,7 +953,7 @@ keep:
if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc))
zone_set_flag(zone, ZONE_CONGESTED);
- free_hot_cold_page_list(&free_pages, 1);
+ nr_reclaimed += free_hot_cold_page_list(&free_pages, 1);
list_splice(&ret_pages, page_list);
count_vm_events(PGACTIVATE, pgactivate);
@@ -1321,7 +1320,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
if (nr_taken == 0)
return 0;
- nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
+ nr_reclaimed += shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
&nr_dirty, &nr_writeback, false);
spin_lock_irq(&zone->lru_lock);
@@ -1343,7 +1342,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
spin_unlock_irq(&zone->lru_lock);
- free_hot_cold_page_list(&page_list, 1);
+ nr_reclaimed += free_hot_cold_page_list(&page_list, 1);
/*
* If reclaim is isolating dirty pages under writeback, it implies
@@ -1438,7 +1437,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
__count_vm_events(PGDEACTIVATE, pgmoved);
}
-static void shrink_active_list(unsigned long nr_to_scan,
+static unsigned long shrink_active_list(unsigned long nr_to_scan,
struct lruvec *lruvec,
struct scan_control *sc,
enum lru_list lru)
@@ -1534,7 +1533,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&zone->lru_lock);
- free_hot_cold_page_list(&l_hold, 1);
+ return free_hot_cold_page_list(&l_hold, 1);
}
#ifdef CONFIG_SWAP
@@ -1617,7 +1616,8 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
{
if (is_active_lru(lru)) {
if (inactive_list_is_low(lruvec, lru))
- shrink_active_list(nr_to_scan, lruvec, sc, lru);
+ return shrink_active_list(nr_to_scan, lruvec, sc, lru);
+
return 0;
}
@@ -1861,8 +1861,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
* rebalance the anon lru active/inactive ratio.
*/
if (inactive_anon_is_low(lruvec))
- shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
- sc, LRU_ACTIVE_ANON);
+ sc->nr_reclaimed += shrink_active_list(SWAP_CLUSTER_MAX,
+ lruvec, sc, LRU_ACTIVE_ANON);
throttle_vm_writeout(sc->gfp_mask);
}
@@ -2470,23 +2470,27 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
}
#endif
-static void age_active_anon(struct zone *zone, struct scan_control *sc)
+static unsigned long age_active_anon(struct zone *zone,
+ struct scan_control *sc)
{
+ unsigned long nr_reclaimed = 0;
struct mem_cgroup *memcg;
if (!total_swap_pages)
- return;
+ return 0;
memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
if (inactive_anon_is_low(lruvec))
- shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
- sc, LRU_ACTIVE_ANON);
+ nr_reclaimed += shrink_active_list(SWAP_CLUSTER_MAX,
+ lruvec, sc, LRU_ACTIVE_ANON);
memcg = mem_cgroup_iter(NULL, memcg, NULL);
} while (memcg);
+
+ return nr_reclaimed;
}
static bool zone_balanced(struct zone *zone, int order,
@@ -2666,7 +2670,7 @@ loop_again:
* Do some background aging of the anon list, to give
* pages a chance to be referenced before reclaiming.
*/
- age_active_anon(zone, &sc);
+ sc.nr_reclaimed += age_active_anon(zone, &sc);
/*
* If the number of buffer_heads in the machine
--
1.7.9.5

* [PATCH 2/3] mm, slub: count freed pages via rcu as this task's reclaimed_slab
From: Joonsoo Kim @ 2013-04-09 1:21 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-kernel, linux-mm, Mel Gorman, Hugh Dickins, Rik van Riel,
Minchan Kim, Joonsoo Kim, Christoph Lameter, Pekka Enberg, Matt Mackall

Currently, freed pages via rcu is not counted for reclaimed_slab, because
it is freed in rcu context, not current task context. But, this free is
initiated by this task, so counting this into this task's reclaimed_slab
is meaningful to decide whether we continue reclaim, or not.
So change code to count these pages for this task's reclaimed_slab.

Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>

diff --git a/mm/slub.c b/mm/slub.c
index 4aec537..16fd2d5 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1409,8 +1409,6 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
memcg_release_pages(s, order);
page_mapcount_reset(page);
- if (current->reclaim_state)
- current->reclaim_state->reclaimed_slab += pages;
__free_memcg_kmem_pages(page, order);
}
@@ -1431,6 +1429,8 @@ static void rcu_free_slab(struct rcu_head *h)
static void free_slab(struct kmem_cache *s, struct page *page)
{
+ int pages = 1 << compound_order(page);
+
if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
struct rcu_head *head;
@@ -1450,6 +1450,9 @@ static void free_slab(struct kmem_cache *s, struct page *page)
call_rcu(head, rcu_free_slab);
} else
__free_slab(s, page);
+
+ if (current->reclaim_state)
+ current->reclaim_state->reclaimed_slab += pages;
}
static void discard_slab(struct kmem_cache *s, struct page *page)
--
1.7.9.5

* Re: [PATCH 2/3] mm, slub: count freed pages via rcu as this task's reclaimed_slab
From: Simon Jeons @ 2013-04-09 9:38 UTC (permalink / raw)
To: Joonsoo Kim
Cc: Andrew Morton, linux-kernel, linux-mm, Mel Gorman, Hugh Dickins,
Rik van Riel, Minchan Kim, Christoph Lameter, Pekka Enberg, Matt Mackall

Hi Joonsoo,

On 04/09/2013 09:21 AM, Joonsoo Kim wrote:
> Currently, freed pages via rcu is not counted for reclaimed_slab, because
> it is freed in rcu context, not current task context. But, this free is
> initiated by this task, so counting this into this task's reclaimed_slab
> is meaningful to decide whether we continue reclaim, or not.
> So change code to count these pages for this task's reclaimed_slab.
>
> [...]
>
> static void free_slab(struct kmem_cache *s, struct page *page)
> {
> + int pages = 1 << compound_order(page);

One question irrelevant this patch. Why slab cache can use compound
page(hugetlbfs pages/thp pages)? They are just used by app to optimize tlb
miss, is it?

> +
> if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
> struct rcu_head *head;
>
> [...]

* Re: [PATCH 2/3] mm, slub: count freed pages via rcu as this task's reclaimed_slab
From: Christoph Lameter @ 2013-04-09 14:32 UTC (permalink / raw)
To: Simon Jeons
Cc: Joonsoo Kim, Andrew Morton, linux-kernel, linux-mm, Mel Gorman,
Hugh Dickins, Rik van Riel, Minchan Kim, Pekka Enberg, Matt Mackall

On Tue, 9 Apr 2013, Simon Jeons wrote:

> > + int pages = 1 << compound_order(page);
>
> One question irrelevant this patch. Why slab cache can use compound
> page(hugetlbfs pages/thp pages)? They are just used by app to optimize tlb
> miss, is it?

Slab caches can use any order pages because these pages are never on
the LRU and are not part of the page cache. Large continuous physical
memory means that objects can be arranged in a more efficient way in the
page. This is particularly useful for larger objects where we might use a
lot of memory because objects do not fit well into a 4k page.

It also reduces the slab page management if higher order pages are used.
In the case of slub the page size also determines the number of objects
that can be allocated/freed without the need for some form of
synchronization.

* Re: [PATCH 2/3] mm, slub: count freed pages via rcu as this task's reclaimed_slab
From: Simon Jeons @ 2013-04-10 3:20 UTC (permalink / raw)
To: Christoph Lameter
Cc: Joonsoo Kim, Andrew Morton, linux-kernel, linux-mm, Mel Gorman,
Hugh Dickins, Rik van Riel, Minchan Kim, Pekka Enberg, Matt Mackall

Hi Christoph,

On 04/09/2013 10:32 PM, Christoph Lameter wrote:
> On Tue, 9 Apr 2013, Simon Jeons wrote:
>
>>> + int pages = 1 << compound_order(page);
>> One question irrelevant this patch. Why slab cache can use compound
>> page(hugetlbfs pages/thp pages)? They are just used by app to optimize tlb
>> miss, is it?
> Slab caches can use any order pages because these pages are never on
> the LRU and are not part of the page cache. Large continuous physical
> memory means that objects can be arranged in a more efficient way in the
> page. This is particularly useful for larger objects where we might use a
> lot of memory because objects do not fit well into a 4k page.
>
> It also reduces the slab page management if higher order pages are used.
> In the case of slub the page size also determines the number of objects
> that can be allocated/freed without the need for some form of
> synchronization.

It seems that you misunderstand my question. I don't doubt slab/slub can
use high order pages. However, what I focus on is why slab/slub can use
compound page, PageCompound() just on behalf of hugetlbfs pages or thp
pages which should used by apps, isn't it?

* Re: [PATCH 2/3] mm, slub: count freed pages via rcu as this task's reclaimed_slab
From: Christoph Lameter @ 2013-04-10 13:54 UTC (permalink / raw)
To: Simon Jeons
Cc: Joonsoo Kim, Andrew Morton, linux-kernel, linux-mm, Mel Gorman,
Hugh Dickins, Rik van Riel, Minchan Kim, Pekka Enberg, Matt Mackall

On Wed, 10 Apr 2013, Simon Jeons wrote:

> It seems that you misunderstand my question. I don't doubt slab/slub can use
> high order pages. However, what I focus on is why slab/slub can use compound
> page, PageCompound() just on behalf of hugetlbfs pages or thp pages which
> should used by apps, isn't it?

I am not entirely clear on what you are asking for. The following gives a
couple of answers to what I guess the question was.

THP pages and user pages are on the lru and are managed differently.
The slab allocators cannot work with those pages.

Slab allocators *can* allocate higher order pages therefore they could
allocate a page of the same order as huge pages and manage it that way.

However there is no way that these pages could be handled like THP pages
since they cannot be broken up (unless we add the capability to move slab
objects which I wanted to do for a long time).

You can boot a Linux system that uses huge pages for slab allocation
by specifying the following parameter on the kernel command line.

slub_min_order=9

The slub allocator will start using huge pages for all its storage
needs. You need a large number of huge pages to do this. Lots of memory
is going to be lost due to fragmentation but its going to be fast since
the slowpaths are rarely used. OOMs due to reclaim failure become much
more likely ;-).

* Re: [PATCH 2/3] mm, slub: count freed pages via rcu as this task's reclaimed_slab
From: Simon Jeons @ 2013-04-11 3:46 UTC (permalink / raw)
To: Christoph Lameter
Cc: Joonsoo Kim, Andrew Morton, linux-kernel, linux-mm, Mel Gorman,
Hugh Dickins, Rik van Riel, Minchan Kim, Pekka Enberg, Matt Mackall

Hi Christoph,

On 04/10/2013 09:54 PM, Christoph Lameter wrote:
> On Wed, 10 Apr 2013, Simon Jeons wrote:
>
>> It seems that you misunderstand my question. I don't doubt slab/slub can use
>> high order pages. However, what I focus on is why slab/slub can use compound
>> page, PageCompound() just on behalf of hugetlbfs pages or thp pages which
>> should used by apps, isn't it?
> I am not entirely clear on what you are asking for. The following gives a
> couple of answers to what I guess the question was.
>
> THP pages and user pages are on the lru and are managed differently.
> The slab allocators cannot work with those pages.
>
> Slab allocators *can* allocate higher order pages therefore they could
> allocate a page of the same order as huge pages and manage it that way.
>
> [...]

It seems that I need to simple my question.
All pages which order >=1 are compound pages?

* Re: [PATCH 2/3] mm, slub: count freed pages via rcu as this task's reclaimed_slab
From: Christoph Lameter @ 2013-04-11 15:03 UTC (permalink / raw)
To: Simon Jeons
Cc: Joonsoo Kim, Andrew Morton, linux-kernel, linux-mm, Mel Gorman,
Hugh Dickins, Rik van Riel, Minchan Kim, Pekka Enberg, Matt Mackall

On Thu, 11 Apr 2013, Simon Jeons wrote:

> It seems that I need to simple my question.
> All pages which order >=1 are compound pages?

In the slub allocator that is true. One can request and free a series of
contiguous pages that are not compound pages from the page allocator and a
couple of subsystems use those.
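
The distinction above can be illustrated with a minimal, hypothetical
sketch. The function name and the WARN_ON checks are made up for
illustration and are not part of the patches in this thread; only
alloc_pages(), __GFP_COMP, PageCompound(), compound_order() and
__free_pages() are real kernel APIs. A plain higher-order allocation is
not a compound page unless __GFP_COMP is passed, which is what slub does
for its order > 0 slab pages.

/*
 * Hypothetical sketch, not from the patches in this thread: contrast a
 * plain higher-order allocation with a compound one of the same order.
 */
#include <linux/gfp.h>
#include <linux/mm.h>

static void order_vs_compound_demo(void)
{
	/* order-2 block (4 contiguous pages), not a compound page */
	struct page *plain = alloc_pages(GFP_KERNEL, 2);

	/* the same order, but allocated as a compound page */
	struct page *comp = alloc_pages(GFP_KERNEL | __GFP_COMP, 2);

	if (plain) {
		WARN_ON(PageCompound(plain));
		__free_pages(plain, 2);
	}
	if (comp) {
		WARN_ON(!PageCompound(comp));
		WARN_ON(compound_order(comp) != 2);
		__free_pages(comp, 2);
	}
}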

* Re: [PATCH 2/3] mm, slub: count freed pages via rcu as this task's reclaimed_slab
From: Christoph Lameter @ 2013-04-09 14:28 UTC (permalink / raw)
To: Joonsoo Kim
Cc: Andrew Morton, linux-kernel, linux-mm, Mel Gorman, Hugh Dickins,
Rik van Riel, Minchan Kim, Pekka Enberg, Matt Mackall

On Tue, 9 Apr 2013, Joonsoo Kim wrote:

> Currently, freed pages via rcu is not counted for reclaimed_slab, because
> it is freed in rcu context, not current task context. But, this free is
> initiated by this task, so counting this into this task's reclaimed_slab
> is meaningful to decide whether we continue reclaim, or not.
> So change code to count these pages for this task's reclaimed_slab.

slab->reclaim_state guides the reclaim actions in vmscan.c. With this
patch slab->reclaim_state could get quite a high value without new pages
being available for allocation. slab->reclaim_state will only be updated
when the RCU period ends.
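
To make the timing issue concrete, here is a small, hypothetical sketch
(not the actual slub code; the struct, function names and counter are
invented, while call_rcu() and container_of() are real kernel APIs) of why
crediting the reclaimer when the RCU free is queued is optimistic: the
memory only becomes reusable once the callback runs after a grace period.

/*
 * Hypothetical illustration only -- not the slub code.  The pages handed
 * to call_rcu() become reusable when the callback runs after a grace
 * period, so crediting them to the reclaimer at queueing time counts
 * memory that is not yet available for allocation.
 */
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct deferred_free {
	struct rcu_head rcu;
	unsigned long nr_pages;		/* pages that will be freed later */
};

static unsigned long pages_actually_freed;	/* illustrative counter */

static void deferred_free_cb(struct rcu_head *head)
{
	struct deferred_free *df = container_of(head, struct deferred_free, rcu);

	/* Only here, after the grace period, is the memory really free. */
	pages_actually_freed += df->nr_pages;
	kfree(df);
}

static void queue_deferred_free(struct deferred_free *df)
{
	/*
	 * Crediting current->reclaim_state->reclaimed_slab at this point
	 * (as the patch does) runs ahead of the actual free above.
	 */
	call_rcu(&df->rcu, deferred_free_cb);
}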

* Re: [PATCH 2/3] mm, slub: count freed pages via rcu as this task's reclaimed_slab
From: Joonsoo Kim @ 2013-04-10 5:26 UTC (permalink / raw)
To: Christoph Lameter
Cc: Andrew Morton, linux-kernel, linux-mm, Mel Gorman, Hugh Dickins,
Rik van Riel, Minchan Kim, Pekka Enberg, Matt Mackall

Hello, Christoph.

On Tue, Apr 09, 2013 at 02:28:06PM +0000, Christoph Lameter wrote:
> On Tue, 9 Apr 2013, Joonsoo Kim wrote:
>
> > Currently, freed pages via rcu is not counted for reclaimed_slab, because
> > it is freed in rcu context, not current task context. But, this free is
> > initiated by this task, so counting this into this task's reclaimed_slab
> > is meaningful to decide whether we continue reclaim, or not.
> > So change code to count these pages for this task's reclaimed_slab.
>
> slab->reclaim_state guides the reclaim actions in vmscan.c. With this
> patch slab->reclaim_state could get quite a high value without new pages being
> available for allocation. slab->reclaim_state will only be updated
> when the RCU period ends.

Okay.

In addition, there is a little place who use SLAB_DESTROY_BY_RCU.
I will drop this patch[2/3] and [3/3] for next spin.

Thanks.

* Re: [PATCH 2/3] mm, slub: count freed pages via rcu as this task's reclaimed_slab
From: Christoph Lameter @ 2013-04-10 13:57 UTC (permalink / raw)
To: Joonsoo Kim
Cc: Andrew Morton, linux-kernel, linux-mm, Mel Gorman, Hugh Dickins,
Rik van Riel, Minchan Kim, Pekka Enberg, Matt Mackall

On Wed, 10 Apr 2013, Joonsoo Kim wrote:

> Hello, Christoph.
>
> On Tue, Apr 09, 2013 at 02:28:06PM +0000, Christoph Lameter wrote:
> > On Tue, 9 Apr 2013, Joonsoo Kim wrote:
> >
> > > Currently, freed pages via rcu is not counted for reclaimed_slab, because
> > > it is freed in rcu context, not current task context. But, this free is
> > > initiated by this task, so counting this into this task's reclaimed_slab
> > > is meaningful to decide whether we continue reclaim, or not.
> > > So change code to count these pages for this task's reclaimed_slab.
> >
> > slab->reclaim_state guides the reclaim actions in vmscan.c. With this
> > patch slab->reclaim_state could get quite a high value without new pages being
> > available for allocation. slab->reclaim_state will only be updated
> > when the RCU period ends.
>
> Okay.
>
> In addition, there is a little place who use SLAB_DESTROY_BY_RCU.
> I will drop this patch[2/3] and [3/3] for next spin.

What you have discoverd is an issue that we have so far overlooked. Could
you add comments to both places explaining the situation? RCU is used for
some inode and the dentry cache. Failing to account for these frees could
pose a problem. One solution would be to ensure that we get through an RCU
quiescent period in the slabs reclaim. If we can ensure that then your
patch may be ok.

* Re: [PATCH 2/3] mm, slub: count freed pages via rcu as this task's reclaimed_slab
From: JoonSoo Kim @ 2013-04-10 14:24 UTC (permalink / raw)
To: Christoph Lameter
Cc: Joonsoo Kim, Andrew Morton, LKML, Linux Memory Management List,
Mel Gorman, Hugh Dickins, Rik van Riel, Minchan Kim, Pekka Enberg,
Matt Mackall

2013/4/10 Christoph Lameter <cl@linux.com>:
> On Wed, 10 Apr 2013, Joonsoo Kim wrote:
>
>> [...]
>>
>> In addition, there is a little place who use SLAB_DESTROY_BY_RCU.
>> I will drop this patch[2/3] and [3/3] for next spin.
>
> What you have discoverd is an issue that we have so far overlooked. Could
> you add comments to both places explaining the situation?

Yes, I can.

> RCU is used for
> some inode and the dentry cache. Failing to account for these frees could
> pose a problem. One solution would be to ensure that we get through an RCU
> quiescent period in the slabs reclaim. If we can ensure that then your
> patch may be ok.

Hmm... I don't perfectly understand RCU code and it's quiescent period.
But, yes, it can be one of possible solutions in my quick thought.
Currently, I have no ability to do that, so I skip to think about this.

Thanks.

* [PATCH 3/3] mm, slab: count freed pages via rcu as this task's reclaimed_slab
From: Joonsoo Kim @ 2013-04-09 1:21 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-kernel, linux-mm, Mel Gorman, Hugh Dickins, Rik van Riel,
Minchan Kim, Joonsoo Kim, Christoph Lameter, Pekka Enberg, Matt Mackall

Currently, freed pages via rcu is not counted for reclaimed_slab, because
it is freed in rcu context, not current task context. But, this free is
initiated by this task, so counting this into this task's reclaimed_slab
is meaningful to decide whether we continue reclaim, or not.
So change code to count these pages for this task's reclaimed_slab.

Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>

diff --git a/mm/slab.c b/mm/slab.c
index 856e4a1..4d94bcb 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1934,8 +1934,6 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
}
memcg_release_pages(cachep, cachep->gfporder);
- if (current->reclaim_state)
- current->reclaim_state->reclaimed_slab += nr_freed;
free_memcg_kmem_pages((unsigned long)addr, cachep->gfporder);
}
@@ -2165,6 +2163,7 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slab
*/
static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
{
+ unsigned long nr_freed = (1 << cachep->gfporder);
void *addr = slabp->s_mem - slabp->colouroff;
slab_destroy_debugcheck(cachep, slabp);
@@ -2180,6 +2179,9 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
if (OFF_SLAB(cachep))
kmem_cache_free(cachep->slabp_cache, slabp);
}
+
+ if (current->reclaim_state)
+ current->reclaim_state->reclaimed_slab += nr_freed;
}
/**
--
1.7.9.5

* Re: [PATCH 1/3] mm, vmscan: count accidental reclaimed pages failed to put into lru
From: Minchan Kim @ 2013-04-09 5:55 UTC (permalink / raw)
To: Joonsoo Kim
Cc: Andrew Morton, linux-kernel, linux-mm, Mel Gorman, Hugh Dickins,
Rik van Riel

Hello Joonsoo,

On Tue, Apr 09, 2013 at 10:21:16AM +0900, Joonsoo Kim wrote:
> In shrink_(in)active_list(), we can fail to put pages back onto the lru,
> and these pages end up being reclaimed as a side effect. Currently, such
> pages are not counted in sc->nr_reclaimed, but with this information we
> can stop reclaiming earlier and so reduce the overhead of reclaim.
>
> Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>

Nice catch!

But this patch handles very corner case and makes reclaim function's name
rather stupid so I'd like to see text size change after we apply this patch.
Other nipicks below.

> [...]
> -void free_hot_cold_page_list(struct list_head *list, int cold)
> +unsigned long free_hot_cold_page_list(struct list_head *list, int cold)
> {
> + unsigned long nr_reclaimed = 0;

How about nr_free or nr_freed for consistent with function title?

> [...]
> - free_hot_cold_page_list(&free_pages, 1);
> + nr_reclaimed += free_hot_cold_page_list(&free_pages, 1);

Nice cleanup.

> [...]
> - nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
> + nr_reclaimed += shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
> &nr_dirty, &nr_writeback, false);

Do you have any reason to change?
To me, '=' is more clear to initialize the variable.
When I see above, I have to look through above lines to catch where code
used the nr_reclaimed.

> [...]
> - free_hot_cold_page_list(&page_list, 1);
> + nr_reclaimed += free_hot_cold_page_list(&page_list, 1);

How about considering vmstat, too?
It could be minor but you are considering freed page as
reclaim context. (ie, sc->nr_reclaimed) so it would be more appropriate.

> [...]
> - free_hot_cold_page_list(&l_hold, 1);
> + return free_hot_cold_page_list(&l_hold, 1);

It would be better to add comment about return value.
Otherwise, people could confuse with the number of pages moved from
active to inactive.

> [...]
> if (is_active_lru(lru)) {
> if (inactive_list_is_low(lruvec, lru))
> - shrink_active_list(nr_to_scan, lruvec, sc, lru);
> + return shrink_active_list(nr_to_scan, lruvec, sc, lru);
> +

Unnecessary change.

> return 0;
> }
>
> [...]
> -static void age_active_anon(struct zone *zone, struct scan_control *sc)

Comment about return value.
or rename but I have no idea. Sorry.

> [...]

--
Kind regards,
Minchan Kim

* Re: [PATCH 1/3] mm, vmscan: count accidental reclaimed pages failed to put into lru
From: Joonsoo Kim @ 2013-04-10 5:48 UTC (permalink / raw)
To: Minchan Kim
Cc: Andrew Morton, linux-kernel, linux-mm, Mel Gorman, Hugh Dickins,
Rik van Riel

Hello, Minchan.

On Tue, Apr 09, 2013 at 02:55:14PM +0900, Minchan Kim wrote:
> Hello Joonsoo,
>
> On Tue, Apr 09, 2013 at 10:21:16AM +0900, Joonsoo Kim wrote:
> > In shrink_(in)active_list(), we can fail to put pages back onto the lru,
> > [...]
>
> Nice catch!
>
> But this patch handles very corner case and makes reclaim function's name
> rather stupid so I'd like to see text size change after we apply this patch.
> Other nipicks below.

Ah... Yes.
I can re-work it to add number to sc->nr_reclaimed directly for both cases,
shrink_active_list() and age_active_anon().

> > -void free_hot_cold_page_list(struct list_head *list, int cold)
> > +unsigned long free_hot_cold_page_list(struct list_head *list, int cold)
> > {
> > + unsigned long nr_reclaimed = 0;
>
> How about nr_free or nr_freed for consistent with function title?

Okay.

> > [...]
> > - nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
> > + nr_reclaimed += shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
> > &nr_dirty, &nr_writeback, false);
>
> Do you have any reason to change?
> To me, '=' is more clear to initialize the variable.
> When I see above, I have to look through above lines to catch where code
> used the nr_reclaimed.

There is no reason, I will change it.

> > [...]
> > - free_hot_cold_page_list(&page_list, 1);
> > + nr_reclaimed += free_hot_cold_page_list(&page_list, 1);
>
> How about considering vmstat, too?
> It could be minor but you are considering freed page as
> reclaim context. (ie, sc->nr_reclaimed) so it would be more appropriate.

I don't understand what you mean.
Please explain more what you have in mind :)

> > [...]
> > - free_hot_cold_page_list(&l_hold, 1);
> > + return free_hot_cold_page_list(&l_hold, 1);
>
> It would be better to add comment about return value.
> Otherwise, people could confuse with the number of pages moved from
> active to inactive.

In next spin, I will not change return type.
So above problem will be disappreared.

> > [...]
> > if (inactive_list_is_low(lruvec, lru))
> > - shrink_active_list(nr_to_scan, lruvec, sc, lru);
> > + return shrink_active_list(nr_to_scan, lruvec, sc, lru);
> > +
>
> Unnecessary change.

Why?

> > [...]
> > -static void age_active_anon(struct zone *zone, struct scan_control *sc)
>
> Comment about return value.
> or rename but I have no idea. Sorry.

This will be disappreared in next spin.

Thanks for detailed review.

> [...]

* Re: [PATCH 1/3] mm, vmscan: count accidental reclaimed pages failed to put into lru
From: Minchan Kim @ 2013-04-10 6:03 UTC (permalink / raw)
To: Joonsoo Kim
Cc: Andrew Morton, linux-kernel, linux-mm, Mel Gorman, Hugh Dickins,
Rik van Riel

On Wed, Apr 10, 2013 at 02:48:22PM +0900, Joonsoo Kim wrote:
> Hello, Minchan.
>
> On Tue, Apr 09, 2013 at 02:55:14PM +0900, Minchan Kim wrote:
> > [...]
> > Nice catch!
> >
> > But this patch handles very corner case and makes reclaim function's name
> > rather stupid so I'd like to see text size change after we apply this patch.
> > Other nipicks below.
>
> Ah... Yes.
> I can re-work it to add number to sc->nr_reclaimed directly for both cases,
> shrink_active_list() and age_active_anon().

Sounds better.

> [...]
>
> > > - free_hot_cold_page_list(&page_list, 1);
> > > + nr_reclaimed += free_hot_cold_page_list(&page_list, 1);
> >
> > How about considering vmstat, too?
> > It could be minor but you are considering freed page as
> > reclaim context. (ie, sc->nr_reclaimed) so it would be more appropriate.
>
> I don't understand what you mean.
> Please explain more what you have in mind :)

We are accouting PGSTEAL_[KSWAPD|DIRECT] with nr_reclaimed and nr_reclaimed
are contributing for sc->nr_reclaimed accumulation but your patch doesn't
consider that so vmstat cound be not exact. Of course, it's not exact
inherently so no big deal that's why I said "minor".
I just want that you think over about that.

Thanks!

> [...]
>
> > > - shrink_active_list(nr_to_scan, lruvec, sc, lru);
> > > + return shrink_active_list(nr_to_scan, lruvec, sc, lru);
> > > +
> >
> > Unnecessary change.
>
> Why?

You are adding unnecessary newline.

> [...]

--
Kind regards,
Minchan Kim