* [PATCH 1/3] mm, vmscan: count accidental reclaimed pages failed to put into lru
@ 2013-04-09 1:21 Joonsoo Kim
From: Joonsoo Kim @ 2013-04-09 1:21 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-kernel, linux-mm, Mel Gorman, Hugh Dickins, Rik van Riel,
Minchan Kim, Joonsoo Kim
In shrink_(in)active_list(), we can fail to put pages back onto the lru,
and these pages end up being reclaimed as a side effect. Currently, such
pages are not counted in sc->nr_reclaimed, but with this information we
can stop reclaiming earlier and so reduce the overhead of reclaim.
Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 0f615eb..5d60ae0 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -365,7 +365,7 @@ void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask);
extern void __free_pages(struct page *page, unsigned int order);
extern void free_pages(unsigned long addr, unsigned int order);
extern void free_hot_cold_page(struct page *page, int cold);
-extern void free_hot_cold_page_list(struct list_head *list, int cold);
+extern unsigned long free_hot_cold_page_list(struct list_head *list, int cold);
extern void __free_memcg_kmem_pages(struct page *page, unsigned int order);
extern void free_memcg_kmem_pages(unsigned long addr, unsigned int order);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8fcced7..a5f3952 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1360,14 +1360,18 @@ out:
/*
* Free a list of 0-order pages
*/
-void free_hot_cold_page_list(struct list_head *list, int cold)
+unsigned long free_hot_cold_page_list(struct list_head *list, int cold)
{
+ unsigned long nr_reclaimed = 0;
struct page *page, *next;
list_for_each_entry_safe(page, next, list, lru) {
trace_mm_page_free_batched(page, cold);
free_hot_cold_page(page, cold);
+ nr_reclaimed++;
}
+
+ return nr_reclaimed;
}
/*
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 88c5fed..eff2927 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -915,7 +915,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
*/
__clear_page_locked(page);
free_it:
- nr_reclaimed++;
/*
* Is there need to periodically free_page_list? It would
@@ -954,7 +953,7 @@ keep:
if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc))
zone_set_flag(zone, ZONE_CONGESTED);
- free_hot_cold_page_list(&free_pages, 1);
+ nr_reclaimed += free_hot_cold_page_list(&free_pages, 1);
list_splice(&ret_pages, page_list);
count_vm_events(PGACTIVATE, pgactivate);
@@ -1321,7 +1320,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
if (nr_taken == 0)
return 0;
- nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
+ nr_reclaimed += shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
&nr_dirty, &nr_writeback, false);
spin_lock_irq(&zone->lru_lock);
@@ -1343,7 +1342,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
spin_unlock_irq(&zone->lru_lock);
- free_hot_cold_page_list(&page_list, 1);
+ nr_reclaimed += free_hot_cold_page_list(&page_list, 1);
/*
* If reclaim is isolating dirty pages under writeback, it implies
@@ -1438,7 +1437,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
__count_vm_events(PGDEACTIVATE, pgmoved);
}
-static void shrink_active_list(unsigned long nr_to_scan,
+static unsigned long shrink_active_list(unsigned long nr_to_scan,
struct lruvec *lruvec,
struct scan_control *sc,
enum lru_list lru)
@@ -1534,7 +1533,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&zone->lru_lock);
- free_hot_cold_page_list(&l_hold, 1);
+ return free_hot_cold_page_list(&l_hold, 1);
}
#ifdef CONFIG_SWAP
@@ -1617,7 +1616,8 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
{
if (is_active_lru(lru)) {
if (inactive_list_is_low(lruvec, lru))
- shrink_active_list(nr_to_scan, lruvec, sc, lru);
+ return shrink_active_list(nr_to_scan, lruvec, sc, lru);
+
return 0;
}
@@ -1861,8 +1861,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
* rebalance the anon lru active/inactive ratio.
*/
if (inactive_anon_is_low(lruvec))
- shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
- sc, LRU_ACTIVE_ANON);
+ sc->nr_reclaimed += shrink_active_list(SWAP_CLUSTER_MAX,
+ lruvec, sc, LRU_ACTIVE_ANON);
throttle_vm_writeout(sc->gfp_mask);
}
@@ -2470,23 +2470,27 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
}
#endif
-static void age_active_anon(struct zone *zone, struct scan_control *sc)
+static unsigned long age_active_anon(struct zone *zone,
+ struct scan_control *sc)
{
+ unsigned long nr_reclaimed = 0;
struct mem_cgroup *memcg;
if (!total_swap_pages)
- return;
+ return 0;
memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
if (inactive_anon_is_low(lruvec))
- shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
- sc, LRU_ACTIVE_ANON);
+ nr_reclaimed += shrink_active_list(SWAP_CLUSTER_MAX,
+ lruvec, sc, LRU_ACTIVE_ANON);
memcg = mem_cgroup_iter(NULL, memcg, NULL);
} while (memcg);
+
+ return nr_reclaimed;
}
static bool zone_balanced(struct zone *zone, int order,
@@ -2666,7 +2670,7 @@ loop_again:
* Do some background aging of the anon list, to give
* pages a chance to be referenced before reclaiming.
*/
- age_active_anon(zone, &sc);
+ sc.nr_reclaimed += age_active_anon(zone, &sc);
/*
* If the number of buffer_heads in the machine
--
1.7.9.5

* [PATCH 2/3] mm, slub: count freed pages via rcu as this task's reclaimed_slab
From: Joonsoo Kim @ 2013-04-09 1:21 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-kernel, linux-mm, Mel Gorman, Hugh Dickins, Rik van Riel,
Minchan Kim, Joonsoo Kim, Christoph Lameter, Pekka Enberg, Matt Mackall

Currently, freed pages via rcu is not counted for reclaimed_slab, because
it is freed in rcu context, not current task context. But, this free is
initiated by this task, so counting this into this task's reclaimed_slab
is meaningful to decide whether we continue reclaim, or not.
So change code to count these pages for this task's reclaimed_slab.

Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>

diff --git a/mm/slub.c b/mm/slub.c
index 4aec537..16fd2d5 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1409,8 +1409,6 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
memcg_release_pages(s, order);
page_mapcount_reset(page);
- if (current->reclaim_state)
- current->reclaim_state->reclaimed_slab += pages;
__free_memcg_kmem_pages(page, order);
}
@@ -1431,6 +1429,8 @@ static void rcu_free_slab(struct rcu_head *h)
static void free_slab(struct kmem_cache *s, struct page *page)
{
+ int pages = 1 << compound_order(page);
+
if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
struct rcu_head *head;
@@ -1450,6 +1450,9 @@ static void free_slab(struct kmem_cache *s, struct page *page)
call_rcu(head, rcu_free_slab);
} else
__free_slab(s, page);
+
+ if (current->reclaim_state)
+ current->reclaim_state->reclaimed_slab += pages;
}
static void discard_slab(struct kmem_cache *s, struct page *page)
--
1.7.9.5

* Re: [PATCH 2/3] mm, slub: count freed pages via rcu as this task's reclaimed_slab
From: Simon Jeons @ 2013-04-09 9:38 UTC (permalink / raw)
To: Joonsoo Kim
Cc: Andrew Morton, linux-kernel, linux-mm, Mel Gorman, Hugh Dickins,
Rik van Riel, Minchan Kim, Christoph Lameter, Pekka Enberg, Matt Mackall

Hi Joonsoo,

On 04/09/2013 09:21 AM, Joonsoo Kim wrote:
> Currently, freed pages via rcu is not counted for reclaimed_slab, because
> it is freed in rcu context, not current task context. But, this free is
> initiated by this task, so counting this into this task's reclaimed_slab
> is meaningful to decide whether we continue reclaim, or not.
> So change code to count these pages for this task's reclaimed_slab.
>
> [...]
>
> static void free_slab(struct kmem_cache *s, struct page *page)
> {
> + int pages = 1 << compound_order(page);

One question irrelevant this patch. Why slab cache can use compound
page(hugetlbfs pages/thp pages)? They are just used by app to optimize tlb
miss, is it?

> +
> if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
> struct rcu_head *head;
>
> [...]

* Re: [PATCH 2/3] mm, slub: count freed pages via rcu as this task's reclaimed_slab
From: Christoph Lameter @ 2013-04-09 14:32 UTC (permalink / raw)
To: Simon Jeons
Cc: Joonsoo Kim, Andrew Morton, linux-kernel, linux-mm, Mel Gorman,
Hugh Dickins, Rik van Riel, Minchan Kim, Pekka Enberg, Matt Mackall

On Tue, 9 Apr 2013, Simon Jeons wrote:

> > + int pages = 1 << compound_order(page);
>
> One question irrelevant this patch. Why slab cache can use compound
> page(hugetlbfs pages/thp pages)? They are just used by app to optimize tlb
> miss, is it?

Slab caches can use any order pages because these pages are never on
the LRU and are not part of the page cache. Large continuous physical
memory means that objects can be arranged in a more efficient way in the
page. This is particularly useful for larger objects where we might use a
lot of memory because objects do not fit well into a 4k page.

It also reduces the slab page management if higher order pages are used.
In the case of slub the page size also determines the number of objects
that can be allocated/freed without the need for some form of
synchronization.

* Re: [PATCH 2/3] mm, slub: count freed pages via rcu as this task's reclaimed_slab
From: Simon Jeons @ 2013-04-10 3:20 UTC (permalink / raw)
To: Christoph Lameter
Cc: Joonsoo Kim, Andrew Morton, linux-kernel, linux-mm, Mel Gorman,
Hugh Dickins, Rik van Riel, Minchan Kim, Pekka Enberg, Matt Mackall

Hi Christoph,

On 04/09/2013 10:32 PM, Christoph Lameter wrote:
> On Tue, 9 Apr 2013, Simon Jeons wrote:
>
>>> + int pages = 1 << compound_order(page);
>> One question irrelevant this patch. Why slab cache can use compound
>> page(hugetlbfs pages/thp pages)? They are just used by app to optimize tlb
>> miss, is it?
> Slab caches can use any order pages because these pages are never on
> the LRU and are not part of the page cache. Large continuous physical
> memory means that objects can be arranged in a more efficient way in the
> page. This is particularly useful for larger objects where we might use a
> lot of memory because objects do not fit well into a 4k page.
>
> It also reduces the slab page management if higher order pages are used.
> In the case of slub the page size also determines the number of objects
> that can be allocated/freed without the need for some form of
> synchronization.

It seems that you misunderstand my question. I don't doubt slab/slub can
use high order pages. However, what I focus on is why slab/slub can use
compound page, PageCompound() just on behalf of hugetlbfs pages or thp
pages which should used by apps, isn't it?

* Re: [PATCH 2/3] mm, slub: count freed pages via rcu as this task's reclaimed_slab
From: Christoph Lameter @ 2013-04-10 13:54 UTC (permalink / raw)
To: Simon Jeons
Cc: Joonsoo Kim, Andrew Morton, linux-kernel, linux-mm, Mel Gorman,
Hugh Dickins, Rik van Riel, Minchan Kim, Pekka Enberg, Matt Mackall

On Wed, 10 Apr 2013, Simon Jeons wrote:

> It seems that you misunderstand my question. I don't doubt slab/slub can use
> high order pages. However, what I focus on is why slab/slub can use compound
> page, PageCompound() just on behalf of hugetlbfs pages or thp pages which
> should used by apps, isn't it?

I am not entirely clear on what you are asking for. The following gives a
couple of answers to what I guess the question was.

THP pages and user pages are on the lru and are managed differently.
The slab allocators cannot work with those pages.

Slab allocators *can* allocate higher order pages therefore they could
allocate a page of the same order as huge pages and manage it that way.

However there is no way that these pages could be handled like THP pages
since they cannot be broken up (unless we add the capability to move slab
objects which I wanted to do for a long time).

You can boot a Linux system that uses huge pages for slab allocation
by specifying the following parameter on the kernel command line.

slub_min_order=9

The slub allocator will start using huge pages for all its storage
needs. You need a large number of huge pages to do this. Lots of memory
is going to be lost due to fragmentation but its going to be fast since
the slowpaths are rarely used. OOMs due to reclaim failure become much
more likely ;-).

* Re: [PATCH 2/3] mm, slub: count freed pages via rcu as this task's reclaimed_slab
From: Simon Jeons @ 2013-04-11 3:46 UTC (permalink / raw)
To: Christoph Lameter
Cc: Joonsoo Kim, Andrew Morton, linux-kernel, linux-mm, Mel Gorman,
Hugh Dickins, Rik van Riel, Minchan Kim, Pekka Enberg, Matt Mackall

Hi Christoph,

On 04/10/2013 09:54 PM, Christoph Lameter wrote:
> On Wed, 10 Apr 2013, Simon Jeons wrote:
>
>> It seems that you misunderstand my question. I don't doubt slab/slub can use
>> high order pages. However, what I focus on is why slab/slub can use compound
>> page, PageCompound() just on behalf of hugetlbfs pages or thp pages which
>> should used by apps, isn't it?
> I am not entirely clear on what you are asking for. The following gives a
> couple of answers to what I guess the question was.
>
> THP pages and user pages are on the lru and are managed differently.
> The slab allocators cannot work with those pages.
>
> Slab allocators *can* allocate higher order pages therefore they could
> allocate a page of the same order as huge pages and manage it that way.
>
> [...]

It seems that I need to simple my question.
All pages which order >=1 are compound pages?

* Re: [PATCH 2/3] mm, slub: count freed pages via rcu as this task's reclaimed_slab
From: Christoph Lameter @ 2013-04-11 15:03 UTC (permalink / raw)
To: Simon Jeons
Cc: Joonsoo Kim, Andrew Morton, linux-kernel, linux-mm, Mel Gorman,
Hugh Dickins, Rik van Riel, Minchan Kim, Pekka Enberg, Matt Mackall

On Thu, 11 Apr 2013, Simon Jeons wrote:

> It seems that I need to simple my question.
> All pages which order >=1 are compound pages?

In the slub allocator that is true. One can request and free a series of
contiguous pages that are not compound pages from the page allocator and a
couple of subsystems use those.
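
The distinction above can be illustrated with a minimal, hypothetical
sketch. The function name and the WARN_ON checks are made up for
illustration and are not part of the patches in this thread; only
alloc_pages(), __GFP_COMP, PageCompound(), compound_order() and
__free_pages() are real kernel APIs. A plain higher-order allocation is
not a compound page unless __GFP_COMP is passed, which is what slub does
for its order > 0 slab pages.

/*
 * Hypothetical sketch, not from the patches in this thread: contrast a
 * plain higher-order allocation with a compound one of the same order.
 */
#include <linux/gfp.h>
#include <linux/mm.h>

static void order_vs_compound_demo(void)
{
	/* order-2 block (4 contiguous pages), not a compound page */
	struct page *plain = alloc_pages(GFP_KERNEL, 2);

	/* the same order, but allocated as a compound page */
	struct page *comp = alloc_pages(GFP_KERNEL | __GFP_COMP, 2);

	if (plain) {
		WARN_ON(PageCompound(plain));
		__free_pages(plain, 2);
	}
	if (comp) {
		WARN_ON(!PageCompound(comp));
		WARN_ON(compound_order(comp) != 2);
		__free_pages(comp, 2);
	}
}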

* Re: [PATCH 2/3] mm, slub: count freed pages via rcu as this task's reclaimed_slab
From: Christoph Lameter @ 2013-04-09 14:28 UTC (permalink / raw)
To: Joonsoo Kim
Cc: Andrew Morton, linux-kernel, linux-mm, Mel Gorman, Hugh Dickins,
Rik van Riel, Minchan Kim, Pekka Enberg, Matt Mackall

On Tue, 9 Apr 2013, Joonsoo Kim wrote:

> Currently, freed pages via rcu is not counted for reclaimed_slab, because
> it is freed in rcu context, not current task context. But, this free is
> initiated by this task, so counting this into this task's reclaimed_slab
> is meaningful to decide whether we continue reclaim, or not.
> So change code to count these pages for this task's reclaimed_slab.

slab->reclaim_state guides the reclaim actions in vmscan.c. With this
patch slab->reclaim_state could get quite a high value without new pages
being available for allocation. slab->reclaim_state will only be updated
when the RCU period ends.
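
To make the timing issue concrete, here is a small, hypothetical sketch
(not the actual slub code; the struct, function names and counter are
invented, while call_rcu() and container_of() are real kernel APIs) of why
crediting the reclaimer when the RCU free is queued is optimistic: the
memory only becomes reusable once the callback runs after a grace period.

/*
 * Hypothetical illustration only -- not the slub code.  The pages handed
 * to call_rcu() become reusable when the callback runs after a grace
 * period, so crediting them to the reclaimer at queueing time counts
 * memory that is not yet available for allocation.
 */
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct deferred_free {
	struct rcu_head rcu;
	unsigned long nr_pages;		/* pages that will be freed later */
};

static unsigned long pages_actually_freed;	/* illustrative counter */

static void deferred_free_cb(struct rcu_head *head)
{
	struct deferred_free *df = container_of(head, struct deferred_free, rcu);

	/* Only here, after the grace period, is the memory really free. */
	pages_actually_freed += df->nr_pages;
	kfree(df);
}

static void queue_deferred_free(struct deferred_free *df)
{
	/*
	 * Crediting current->reclaim_state->reclaimed_slab at this point
	 * (as the patch does) runs ahead of the actual free above.
	 */
	call_rcu(&df->rcu, deferred_free_cb);
}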

* Re: [PATCH 2/3] mm, slub: count freed pages via rcu as this task's reclaimed_slab
From: Joonsoo Kim @ 2013-04-10 5:26 UTC (permalink / raw)
To: Christoph Lameter
Cc: Andrew Morton, linux-kernel, linux-mm, Mel Gorman, Hugh Dickins,
Rik van Riel, Minchan Kim, Pekka Enberg, Matt Mackall

Hello, Christoph.

On Tue, Apr 09, 2013 at 02:28:06PM +0000, Christoph Lameter wrote:
> On Tue, 9 Apr 2013, Joonsoo Kim wrote:
>
> > Currently, freed pages via rcu is not counted for reclaimed_slab, because
> > it is freed in rcu context, not current task context. But, this free is
> > initiated by this task, so counting this into this task's reclaimed_slab
> > is meaningful to decide whether we continue reclaim, or not.
> > So change code to count these pages for this task's reclaimed_slab.
>
> slab->reclaim_state guides the reclaim actions in vmscan.c. With this
> patch slab->reclaim_state could get quite a high value without new pages being
> available for allocation. slab->reclaim_state will only be updated
> when the RCU period ends.

Okay.

In addition, there is a little place who use SLAB_DESTROY_BY_RCU.
I will drop this patch[2/3] and [3/3] for next spin.

Thanks.

* Re: [PATCH 2/3] mm, slub: count freed pages via rcu as this task's reclaimed_slab
From: Christoph Lameter @ 2013-04-10 13:57 UTC (permalink / raw)
To: Joonsoo Kim
Cc: Andrew Morton, linux-kernel, linux-mm, Mel Gorman, Hugh Dickins,
Rik van Riel, Minchan Kim, Pekka Enberg, Matt Mackall

On Wed, 10 Apr 2013, Joonsoo Kim wrote:

> Hello, Christoph.
>
> On Tue, Apr 09, 2013 at 02:28:06PM +0000, Christoph Lameter wrote:
> > On Tue, 9 Apr 2013, Joonsoo Kim wrote:
> >
> > > Currently, freed pages via rcu is not counted for reclaimed_slab, because
> > > it is freed in rcu context, not current task context. But, this free is
> > > initiated by this task, so counting this into this task's reclaimed_slab
> > > is meaningful to decide whether we continue reclaim, or not.
> > > So change code to count these pages for this task's reclaimed_slab.
> >
> > slab->reclaim_state guides the reclaim actions in vmscan.c. With this
> > patch slab->reclaim_state could get quite a high value without new pages being
> > available for allocation. slab->reclaim_state will only be updated
> > when the RCU period ends.
>
> Okay.
>
> In addition, there is a little place who use SLAB_DESTROY_BY_RCU.
> I will drop this patch[2/3] and [3/3] for next spin.

What you have discoverd is an issue that we have so far overlooked. Could
you add comments to both places explaining the situation? RCU is used for
some inode and the dentry cache. Failing to account for these frees could
pose a problem. One solution would be to ensure that we get through an RCU
quiescent period in the slabs reclaim. If we can ensure that then your
patch may be ok.

* Re: [PATCH 2/3] mm, slub: count freed pages via rcu as this task's reclaimed_slab
From: JoonSoo Kim @ 2013-04-10 14:24 UTC (permalink / raw)
To: Christoph Lameter
Cc: Joonsoo Kim, Andrew Morton, LKML, Linux Memory Management List,
Mel Gorman, Hugh Dickins, Rik van Riel, Minchan Kim, Pekka Enberg,
Matt Mackall

2013/4/10 Christoph Lameter <cl@linux.com>:
> On Wed, 10 Apr 2013, Joonsoo Kim wrote:
>
>> [...]
>>
>> In addition, there is a little place who use SLAB_DESTROY_BY_RCU.
>> I will drop this patch[2/3] and [3/3] for next spin.
>
> What you have discoverd is an issue that we have so far overlooked. Could
> you add comments to both places explaining the situation?

Yes, I can.

> RCU is used for
> some inode and the dentry cache. Failing to account for these frees could
> pose a problem. One solution would be to ensure that we get through an RCU
> quiescent period in the slabs reclaim. If we can ensure that then your
> patch may be ok.

Hmm... I don't perfectly understand RCU code and it's quiescent period.
But, yes, it can be one of possible solutions in my quick thought.
Currently, I have no ability to do that, so I skip to think about this.

Thanks.

* [PATCH 3/3] mm, slab: count freed pages via rcu as this task's reclaimed_slab
From: Joonsoo Kim @ 2013-04-09 1:21 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-kernel, linux-mm, Mel Gorman, Hugh Dickins, Rik van Riel,
Minchan Kim, Joonsoo Kim, Christoph Lameter, Pekka Enberg, Matt Mackall

Currently, freed pages via rcu is not counted for reclaimed_slab, because
it is freed in rcu context, not current task context. But, this free is
initiated by this task, so counting this into this task's reclaimed_slab
is meaningful to decide whether we continue reclaim, or not.
So change code to count these pages for this task's reclaimed_slab.

Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>

diff --git a/mm/slab.c b/mm/slab.c
index 856e4a1..4d94bcb 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1934,8 +1934,6 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
}
memcg_release_pages(cachep, cachep->gfporder);
- if (current->reclaim_state)
- current->reclaim_state->reclaimed_slab += nr_freed;
free_memcg_kmem_pages((unsigned long)addr, cachep->gfporder);
}
@@ -2165,6 +2163,7 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slab
*/
static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
{
+ unsigned long nr_freed = (1 << cachep->gfporder);
void *addr = slabp->s_mem - slabp->colouroff;
slab_destroy_debugcheck(cachep, slabp);
@@ -2180,6 +2179,9 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
if (OFF_SLAB(cachep))
kmem_cache_free(cachep->slabp_cache, slabp);
}
+
+ if (current->reclaim_state)
+ current->reclaim_state->reclaimed_slab += nr_freed;
}
/**
--
1.7.9.5

* Re: [PATCH 1/3] mm, vmscan: count accidental reclaimed pages failed to put into lru
From: Minchan Kim @ 2013-04-09 5:55 UTC (permalink / raw)
To: Joonsoo Kim
Cc: Andrew Morton, linux-kernel, linux-mm, Mel Gorman, Hugh Dickins,
Rik van Riel

Hello Joonsoo,

On Tue, Apr 09, 2013 at 10:21:16AM +0900, Joonsoo Kim wrote:
> In shrink_(in)active_list(), we can fail to put pages back onto the lru,
> and these pages end up being reclaimed as a side effect. Currently, such
> pages are not counted in sc->nr_reclaimed, but with this information we
> can stop reclaiming earlier and so reduce the overhead of reclaim.
>
> Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>

Nice catch!

But this patch handles very corner case and makes reclaim function's name
rather stupid so I'd like to see text size change after we apply this patch.
Other nipicks below.

> [...]
> -void free_hot_cold_page_list(struct list_head *list, int cold)
> +unsigned long free_hot_cold_page_list(struct list_head *list, int cold)
> {
> + unsigned long nr_reclaimed = 0;

How about nr_free or nr_freed for consistent with function title?

> [...]
> - free_hot_cold_page_list(&free_pages, 1);
> + nr_reclaimed += free_hot_cold_page_list(&free_pages, 1);

Nice cleanup.

> [...]
> - nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
> + nr_reclaimed += shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
> &nr_dirty, &nr_writeback, false);

Do you have any reason to change?
To me, '=' is more clear to initialize the variable.
When I see above, I have to look through above lines to catch where code
used the nr_reclaimed.

> [...]
> - free_hot_cold_page_list(&page_list, 1);
> + nr_reclaimed += free_hot_cold_page_list(&page_list, 1);

How about considering vmstat, too?
It could be minor but you are considering freed page as
reclaim context. (ie, sc->nr_reclaimed) so it would be more appropriate.

> [...]
> - free_hot_cold_page_list(&l_hold, 1);
> + return free_hot_cold_page_list(&l_hold, 1);

It would be better to add comment about return value.
Otherwise, people could confuse with the number of pages moved from
active to inactive.

> [...]
> if (is_active_lru(lru)) {
> if (inactive_list_is_low(lruvec, lru))
> - shrink_active_list(nr_to_scan, lruvec, sc, lru);
> + return shrink_active_list(nr_to_scan, lruvec, sc, lru);
> +

Unnecessary change.

> return 0;
> }
>
> [...]
> -static void age_active_anon(struct zone *zone, struct scan_control *sc)

Comment about return value.
or rename but I have no idea. Sorry.

> [...]

--
Kind regards,
Minchan Kim

* Re: [PATCH 1/3] mm, vmscan: count accidental reclaimed pages failed to put into lru
From: Joonsoo Kim @ 2013-04-10 5:48 UTC (permalink / raw)
To: Minchan Kim
Cc: Andrew Morton, linux-kernel, linux-mm, Mel Gorman, Hugh Dickins,
Rik van Riel

Hello, Minchan.

On Tue, Apr 09, 2013 at 02:55:14PM +0900, Minchan Kim wrote:
> Hello Joonsoo,
>
> On Tue, Apr 09, 2013 at 10:21:16AM +0900, Joonsoo Kim wrote:
> > In shrink_(in)active_list(), we can fail to put pages back onto the lru,
> > [...]
>
> Nice catch!
>
> But this patch handles very corner case and makes reclaim function's name
> rather stupid so I'd like to see text size change after we apply this patch.
> Other nipicks below.

Ah... Yes.
I can re-work it to add number to sc->nr_reclaimed directly for both cases,
shrink_active_list() and age_active_anon().

> > -void free_hot_cold_page_list(struct list_head *list, int cold)
> > +unsigned long free_hot_cold_page_list(struct list_head *list, int cold)
> > {
> > + unsigned long nr_reclaimed = 0;
>
> How about nr_free or nr_freed for consistent with function title?

Okay.

> > [...]
> > - nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
> > + nr_reclaimed += shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
> > &nr_dirty, &nr_writeback, false);
>
> Do you have any reason to change?
> To me, '=' is more clear to initialize the variable.
> When I see above, I have to look through above lines to catch where code
> used the nr_reclaimed.

There is no reason, I will change it.

> > [...]
> > - free_hot_cold_page_list(&page_list, 1);
> > + nr_reclaimed += free_hot_cold_page_list(&page_list, 1);
>
> How about considering vmstat, too?
> It could be minor but you are considering freed page as
> reclaim context. (ie, sc->nr_reclaimed) so it would be more appropriate.

I don't understand what you mean.
Please explain more what you have in mind :)

> > [...]
> > - free_hot_cold_page_list(&l_hold, 1);
> > + return free_hot_cold_page_list(&l_hold, 1);
>
> It would be better to add comment about return value.
> Otherwise, people could confuse with the number of pages moved from
> active to inactive.

In next spin, I will not change return type.
So above problem will be disappreared.

> > [...]
> > if (inactive_list_is_low(lruvec, lru))
> > - shrink_active_list(nr_to_scan, lruvec, sc, lru);
> > + return shrink_active_list(nr_to_scan, lruvec, sc, lru);
> > +
>
> Unnecessary change.

Why?

> > [...]
> > -static void age_active_anon(struct zone *zone, struct scan_control *sc)
>
> Comment about return value.
> or rename but I have no idea. Sorry.

This will be disappreared in next spin.

Thanks for detailed review.

> [...]

* Re: [PATCH 1/3] mm, vmscan: count accidental reclaimed pages failed to put into lru
From: Minchan Kim @ 2013-04-10 6:03 UTC (permalink / raw)
To: Joonsoo Kim
Cc: Andrew Morton, linux-kernel, linux-mm, Mel Gorman, Hugh Dickins,
Rik van Riel

On Wed, Apr 10, 2013 at 02:48:22PM +0900, Joonsoo Kim wrote:
> Hello, Minchan.
>
> On Tue, Apr 09, 2013 at 02:55:14PM +0900, Minchan Kim wrote:
> > [...]
> > Nice catch!
> >
> > But this patch handles very corner case and makes reclaim function's name
> > rather stupid so I'd like to see text size change after we apply this patch.
> > Other nipicks below.
>
> Ah... Yes.
> I can re-work it to add number to sc->nr_reclaimed directly for both cases,
> shrink_active_list() and age_active_anon().

Sounds better.

> [...]
>
> > > - free_hot_cold_page_list(&page_list, 1);
> > > + nr_reclaimed += free_hot_cold_page_list(&page_list, 1);
> >
> > How about considering vmstat, too?
> > It could be minor but you are considering freed page as
> > reclaim context. (ie, sc->nr_reclaimed) so it would be more appropriate.
>
> I don't understand what you mean.
> Please explain more what you have in mind :)

We are accouting PGSTEAL_[KSWAPD|DIRECT] with nr_reclaimed and nr_reclaimed
are contributing for sc->nr_reclaimed accumulation but your patch doesn't
consider that so vmstat cound be not exact. Of course, it's not exact
inherently so no big deal that's why I said "minor".
I just want that you think over about that.

Thanks!

> [...]
>
> > > - shrink_active_list(nr_to_scan, lruvec, sc, lru);
> > > + return shrink_active_list(nr_to_scan, lruvec, sc, lru);
> > > +
> >
> > Unnecessary change.
>
> Why?

You are adding unnecessary newline.

> [...]

--
Kind regards,
Minchan Kim