* [PATCH -mm v2] vmscan: move reclaim_state handling to shrink_slab
@ 2015-01-15 8:37 Vladimir Davydov
2015-01-15 12:58 ` Michal Hocko
0 siblings, 1 reply; 8+ messages in thread
From: Vladimir Davydov @ 2015-01-15 8:37 UTC (permalink / raw)
To: Andrew Morton
Cc: Johannes Weiner, Michal Hocko, Vlastimil Babka, Mel Gorman,
Rik van Riel, linux-mm, linux-kernel
current->reclaim_state is only used to count the number of slab pages
reclaimed by shrink_slab(). So instead of initializing it before we are
going to call try_to_free_pages() or shrink_zone(), let's set it
directly in shrink_slab().
Note that after this patch try_to_free_mem_cgroup_pages() will count not
only reclaimed user pages, but also slab pages, which is expected,
because it can reclaim kmem from kmem-active sub cgroups.
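For reference, the accounting this patch consolidates happens in the slab
allocators' page-freeing paths. A simplified sketch of that mechanism
(paraphrasing what mm/slab.c and mm/slub.c did around this time; the helper
name below is made up and details may differ):

#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/swap.h>

/*
 * Hypothetical helper: when a slab page goes back to the page allocator,
 * credit it to whoever is currently reclaiming, if anyone.
 */
static void free_slab_pages(struct page *page, int order)
{
	if (current->reclaim_state)
		current->reclaim_state->reclaimed_slab += 1 << order;

	__free_pages(page, order);
}

With this patch, shrink_slab() installs current->reclaim_state only around the
shrinker walk, so the pages counted above are attributed to slab reclaim
rather than to whichever reclaim path happened to set reclaim_state.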
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
---
Changes in v2:
- do not change shrink_slab() return value to the number of reclaimed
slab pages, because it can make drop_slab() abort prematurely (Andrew)
mm/page_alloc.c | 4 ----
mm/vmscan.c | 43 +++++++++++++++++--------------------------
2 files changed, 17 insertions(+), 30 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e1963ea0684a..f528e4ba91b5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2448,7 +2448,6 @@ static int
__perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
nodemask_t *nodemask)
{
- struct reclaim_state reclaim_state;
int progress;
cond_resched();
@@ -2457,12 +2456,9 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
cpuset_memory_pressure_bump();
current->flags |= PF_MEMALLOC;
lockdep_set_current_reclaim_state(gfp_mask);
- reclaim_state.reclaimed_slab = 0;
- current->reclaim_state = &reclaim_state;
progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
- current->reclaim_state = NULL;
lockdep_clear_current_reclaim_state();
current->flags &= ~PF_MEMALLOC;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 16f3e45742d6..26fdcc6c747d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -367,13 +367,18 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
* the ->seeks setting of the shrink function, which indicates the
* cost to recreate an object relative to that of an LRU page.
*
- * Returns the number of reclaimed slab objects.
+ * Returns the number of reclaimed slab objects. The number of reclaimed slab
+ * pages is added to *@ret_nr_reclaimed.
*/
static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
struct mem_cgroup *memcg,
unsigned long nr_scanned,
- unsigned long nr_eligible)
+ unsigned long nr_eligible,
+ unsigned long *ret_nr_reclaimed)
{
+ struct reclaim_state reclaim_state = {
+ .reclaimed_slab = 0,
+ };
struct shrinker *shrinker;
unsigned long freed = 0;
@@ -394,6 +399,8 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
goto out;
}
+ current->reclaim_state = &reclaim_state;
+
list_for_each_entry(shrinker, &shrinker_list, list) {
struct shrink_control sc = {
.gfp_mask = gfp_mask,
@@ -410,6 +417,9 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible);
}
+ current->reclaim_state = NULL;
+ *ret_nr_reclaimed += reclaim_state.reclaimed_slab;
+
up_read(&shrinker_rwsem);
out:
cond_resched();
@@ -419,6 +429,7 @@ out:
void drop_slab_node(int nid)
{
unsigned long freed;
+ unsigned long nr_reclaimed = 0;
do {
struct mem_cgroup *memcg = NULL;
@@ -426,9 +437,9 @@ void drop_slab_node(int nid)
freed = 0;
do {
freed += shrink_slab(GFP_KERNEL, nid, memcg,
- 1000, 1000);
+ 1000, 1000, &nr_reclaimed);
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
- } while (freed > 10);
+ } while (freed);
}
void drop_slab(void)
@@ -2339,7 +2350,6 @@ static inline bool should_continue_reclaim(struct zone *zone,
static bool shrink_zone(struct zone *zone, struct scan_control *sc,
bool is_classzone)
{
- struct reclaim_state *reclaim_state = current->reclaim_state;
unsigned long nr_reclaimed, nr_scanned;
bool reclaimable = false;
@@ -2371,7 +2381,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
if (memcg && is_classzone)
shrink_slab(sc->gfp_mask, zone_to_nid(zone),
memcg, sc->nr_scanned - scanned,
- lru_pages);
+ lru_pages, &sc->nr_reclaimed);
/*
* Direct reclaim and kswapd have to scan all memory
@@ -2398,12 +2408,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
if (global_reclaim(sc) && is_classzone)
shrink_slab(sc->gfp_mask, zone_to_nid(zone), NULL,
sc->nr_scanned - nr_scanned,
- zone_lru_pages);
-
- if (reclaim_state) {
- sc->nr_reclaimed += reclaim_state->reclaimed_slab;
- reclaim_state->reclaimed_slab = 0;
- }
+ zone_lru_pages, &sc->nr_reclaimed);
vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
sc->nr_scanned - nr_scanned,
@@ -3367,17 +3372,12 @@ static int kswapd(void *p)
int balanced_classzone_idx;
pg_data_t *pgdat = (pg_data_t*)p;
struct task_struct *tsk = current;
-
- struct reclaim_state reclaim_state = {
- .reclaimed_slab = 0,
- };
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
lockdep_set_current_reclaim_state(GFP_KERNEL);
if (!cpumask_empty(cpumask))
set_cpus_allowed_ptr(tsk, cpumask);
- current->reclaim_state = &reclaim_state;
/*
* Tell the memory management that we're a "memory allocator",
@@ -3449,7 +3449,6 @@ static int kswapd(void *p)
}
tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
- current->reclaim_state = NULL;
lockdep_clear_current_reclaim_state();
return 0;
@@ -3492,7 +3491,6 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
*/
unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
{
- struct reclaim_state reclaim_state;
struct scan_control sc = {
.nr_to_reclaim = nr_to_reclaim,
.gfp_mask = GFP_HIGHUSER_MOVABLE,
@@ -3508,12 +3506,9 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
p->flags |= PF_MEMALLOC;
lockdep_set_current_reclaim_state(sc.gfp_mask);
- reclaim_state.reclaimed_slab = 0;
- p->reclaim_state = &reclaim_state;
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
- p->reclaim_state = NULL;
lockdep_clear_current_reclaim_state();
p->flags &= ~PF_MEMALLOC;
@@ -3678,7 +3673,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
/* Minimum pages needed in order to stay on node */
const unsigned long nr_pages = 1 << order;
struct task_struct *p = current;
- struct reclaim_state reclaim_state;
struct scan_control sc = {
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
.gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
@@ -3697,8 +3691,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
*/
p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
lockdep_set_current_reclaim_state(gfp_mask);
- reclaim_state.reclaimed_slab = 0;
- p->reclaim_state = &reclaim_state;
if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
/*
@@ -3710,7 +3702,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
}
- p->reclaim_state = NULL;
current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
lockdep_clear_current_reclaim_state();
return sc.nr_reclaimed >= nr_pages;
--
1.7.10.4
* Re: [PATCH -mm v2] vmscan: move reclaim_state handling to shrink_slab
@ 2015-01-15 11:43 Hillf Danton
0 siblings, 0 replies; 8+ messages in thread
From: Hillf Danton @ 2015-01-15 11:43 UTC (permalink / raw)
To: 'Vladimir Davydov'
Cc: Andrew Morton, Johannes Weiner, Michal Hocko, Vlastimil Babka,
Mel Gorman, linux-kernel, linux-mm, 'Rik van Riel'
> current->reclaim_state is only used to count the number of slab pages
> reclaimed by shrink_slab(). So instead of initializing it before we are
> going to call try_to_free_pages() or shrink_zone(), let's set in
> directly in shrink_slab().
>
> Note that after this patch try_to_free_mem_cgroup_pages() will count not
> only reclaimed user pages, but also slab pages, which is expected,
> because it can reclaim kmem from kmem-active sub cgroups.
>
> Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
> ---
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
> Changes in v2:
> - do not change shrink_slab() return value to the number of reclaimed
> slab pages, because it can make drop_slab() abort beforehand (Andrew)
>
> mm/page_alloc.c | 4 ----
> mm/vmscan.c | 43 +++++++++++++++++--------------------------
> 2 files changed, 17 insertions(+), 30 deletions(-)
>
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index e1963ea0684a..f528e4ba91b5 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -2448,7 +2448,6 @@ static int
> __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
> nodemask_t *nodemask)
> {
> - struct reclaim_state reclaim_state;
> int progress;
>
> cond_resched();
> @@ -2457,12 +2456,9 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
> cpuset_memory_pressure_bump();
> current->flags |= PF_MEMALLOC;
> lockdep_set_current_reclaim_state(gfp_mask);
> - reclaim_state.reclaimed_slab = 0;
> - current->reclaim_state = &reclaim_state;
>
> progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
>
> - current->reclaim_state = NULL;
> lockdep_clear_current_reclaim_state();
> current->flags &= ~PF_MEMALLOC;
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 16f3e45742d6..26fdcc6c747d 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -367,13 +367,18 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
> * the ->seeks setting of the shrink function, which indicates the
> * cost to recreate an object relative to that of an LRU page.
> *
> - * Returns the number of reclaimed slab objects.
> + * Returns the number of reclaimed slab objects. The number of reclaimed slab
> + * pages is added to *@ret_nr_reclaimed.
> */
> static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
> struct mem_cgroup *memcg,
> unsigned long nr_scanned,
> - unsigned long nr_eligible)
> + unsigned long nr_eligible,
> + unsigned long *ret_nr_reclaimed)
> {
> + struct reclaim_state reclaim_state = {
> + .reclaimed_slab = 0,
> + };
> struct shrinker *shrinker;
> unsigned long freed = 0;
>
> @@ -394,6 +399,8 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
> goto out;
> }
>
> + current->reclaim_state = &reclaim_state;
> +
> list_for_each_entry(shrinker, &shrinker_list, list) {
> struct shrink_control sc = {
> .gfp_mask = gfp_mask,
> @@ -410,6 +417,9 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
> freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible);
> }
>
> + current->reclaim_state = NULL;
> + *ret_nr_reclaimed += reclaim_state.reclaimed_slab;
> +
> up_read(&shrinker_rwsem);
> out:
> cond_resched();
> @@ -419,6 +429,7 @@ out:
> void drop_slab_node(int nid)
> {
> unsigned long freed;
> + unsigned long nr_reclaimed = 0;
>
> do {
> struct mem_cgroup *memcg = NULL;
> @@ -426,9 +437,9 @@ void drop_slab_node(int nid)
> freed = 0;
> do {
> freed += shrink_slab(GFP_KERNEL, nid, memcg,
> - 1000, 1000);
> + 1000, 1000, &nr_reclaimed);
> } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
> - } while (freed > 10);
> + } while (freed);
> }
>
> void drop_slab(void)
> @@ -2339,7 +2350,6 @@ static inline bool should_continue_reclaim(struct zone *zone,
> static bool shrink_zone(struct zone *zone, struct scan_control *sc,
> bool is_classzone)
> {
> - struct reclaim_state *reclaim_state = current->reclaim_state;
> unsigned long nr_reclaimed, nr_scanned;
> bool reclaimable = false;
>
> @@ -2371,7 +2381,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
> if (memcg && is_classzone)
> shrink_slab(sc->gfp_mask, zone_to_nid(zone),
> memcg, sc->nr_scanned - scanned,
> - lru_pages);
> + lru_pages, &sc->nr_reclaimed);
>
> /*
> * Direct reclaim and kswapd have to scan all memory
> @@ -2398,12 +2408,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
> if (global_reclaim(sc) && is_classzone)
> shrink_slab(sc->gfp_mask, zone_to_nid(zone), NULL,
> sc->nr_scanned - nr_scanned,
> - zone_lru_pages);
> -
> - if (reclaim_state) {
> - sc->nr_reclaimed += reclaim_state->reclaimed_slab;
> - reclaim_state->reclaimed_slab = 0;
> - }
> + zone_lru_pages, &sc->nr_reclaimed);
>
> vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
> sc->nr_scanned - nr_scanned,
> @@ -3367,17 +3372,12 @@ static int kswapd(void *p)
> int balanced_classzone_idx;
> pg_data_t *pgdat = (pg_data_t*)p;
> struct task_struct *tsk = current;
> -
> - struct reclaim_state reclaim_state = {
> - .reclaimed_slab = 0,
> - };
> const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
>
> lockdep_set_current_reclaim_state(GFP_KERNEL);
>
> if (!cpumask_empty(cpumask))
> set_cpus_allowed_ptr(tsk, cpumask);
> - current->reclaim_state = &reclaim_state;
>
> /*
> * Tell the memory management that we're a "memory allocator",
> @@ -3449,7 +3449,6 @@ static int kswapd(void *p)
> }
>
> tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
> - current->reclaim_state = NULL;
> lockdep_clear_current_reclaim_state();
>
> return 0;
> @@ -3492,7 +3491,6 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
> */
> unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
> {
> - struct reclaim_state reclaim_state;
> struct scan_control sc = {
> .nr_to_reclaim = nr_to_reclaim,
> .gfp_mask = GFP_HIGHUSER_MOVABLE,
> @@ -3508,12 +3506,9 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
>
> p->flags |= PF_MEMALLOC;
> lockdep_set_current_reclaim_state(sc.gfp_mask);
> - reclaim_state.reclaimed_slab = 0;
> - p->reclaim_state = &reclaim_state;
>
> nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
>
> - p->reclaim_state = NULL;
> lockdep_clear_current_reclaim_state();
> p->flags &= ~PF_MEMALLOC;
>
> @@ -3678,7 +3673,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
> /* Minimum pages needed in order to stay on node */
> const unsigned long nr_pages = 1 << order;
> struct task_struct *p = current;
> - struct reclaim_state reclaim_state;
> struct scan_control sc = {
> .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
> .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
> @@ -3697,8 +3691,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
> */
> p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
> lockdep_set_current_reclaim_state(gfp_mask);
> - reclaim_state.reclaimed_slab = 0;
> - p->reclaim_state = &reclaim_state;
>
> if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
> /*
> @@ -3710,7 +3702,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
> } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
> }
>
> - p->reclaim_state = NULL;
> current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
> lockdep_clear_current_reclaim_state();
> return sc.nr_reclaimed >= nr_pages;
> --
> 1.7.10.4
>
* Re: [PATCH -mm v2] vmscan: move reclaim_state handling to shrink_slab
2015-01-15 8:37 [PATCH -mm v2] vmscan: move reclaim_state handling to shrink_slab Vladimir Davydov
@ 2015-01-15 12:58 ` Michal Hocko
2015-01-15 13:25 ` Vladimir Davydov
0 siblings, 1 reply; 8+ messages in thread
From: Michal Hocko @ 2015-01-15 12:58 UTC (permalink / raw)
To: Vladimir Davydov
Cc: Andrew Morton, Johannes Weiner, Vlastimil Babka, Mel Gorman,
Rik van Riel, linux-mm, linux-kernel
On Thu 15-01-15 11:37:53, Vladimir Davydov wrote:
> current->reclaim_state is only used to count the number of slab pages
> reclaimed by shrink_slab(). So instead of initializing it before we are
>
> Note that after this patch try_to_free_mem_cgroup_pages() will count not
> only reclaimed user pages, but also slab pages, which is expected,
> because it can reclaim kmem from kmem-active sub cgroups.
Except that reclaim_state counts all slab pages freed while
current->reclaim_state != NULL, AFAIR. This also includes pages kfreed
from interrupt context and who knows what else, and those pages might be
from different memcgs, no?
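To make that concrete, a hypothetical illustration (driver and field names are
made up): any slab page released while the interrupted task happens to have
reclaim_state set gets credited to that task, regardless of which memcg the
memory was charged to.

#include <linux/interrupt.h>
#include <linux/slab.h>

struct foo_dev {
	void *done_buf;
};

static irqreturn_t foo_irq_handler(int irq, void *data)
{
	struct foo_dev *dev = data;

	/*
	 * If this kfree() ends up returning a slab page to the page
	 * allocator, that page is added to reclaimed_slab of whatever task
	 * the interrupt preempted - even if that task is doing targeted
	 * memcg reclaim and this buffer belongs to a different memcg.
	 */
	kfree(dev->done_buf);
	dev->done_buf = NULL;

	return IRQ_HANDLED;
}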
Besides that I am not sure this makes any difference in the end. No
try_to_free_mem_cgroup_pages caller really cares about the exact
number of reclaimed pages. We care only about whether there was any
progress done - and even that not exactly (e.g. try_charge checks
mem_cgroup_margin before retry/oom so if sufficient kmem pages were
uncharged then we will notice that).
That being said, I haven't read the patch yet, but the above assumption
doesn't sound correct to me. reclaim_state is nasty and relying on it for
something that is targeted might lead to unexpected results.
> Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
> ---
> Changes in v2:
> - do not change shrink_slab() return value to the number of reclaimed
> slab pages, because it can make drop_slab() abort beforehand (Andrew)
>
> mm/page_alloc.c | 4 ----
> mm/vmscan.c | 43 +++++++++++++++++--------------------------
> 2 files changed, 17 insertions(+), 30 deletions(-)
>
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index e1963ea0684a..f528e4ba91b5 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -2448,7 +2448,6 @@ static int
> __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
> nodemask_t *nodemask)
> {
> - struct reclaim_state reclaim_state;
> int progress;
>
> cond_resched();
> @@ -2457,12 +2456,9 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
> cpuset_memory_pressure_bump();
> current->flags |= PF_MEMALLOC;
> lockdep_set_current_reclaim_state(gfp_mask);
> - reclaim_state.reclaimed_slab = 0;
> - current->reclaim_state = &reclaim_state;
>
> progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
>
> - current->reclaim_state = NULL;
> lockdep_clear_current_reclaim_state();
> current->flags &= ~PF_MEMALLOC;
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 16f3e45742d6..26fdcc6c747d 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -367,13 +367,18 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
> * the ->seeks setting of the shrink function, which indicates the
> * cost to recreate an object relative to that of an LRU page.
> *
> - * Returns the number of reclaimed slab objects.
> + * Returns the number of reclaimed slab objects. The number of reclaimed slab
> + * pages is added to *@ret_nr_reclaimed.
> */
> static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
> struct mem_cgroup *memcg,
> unsigned long nr_scanned,
> - unsigned long nr_eligible)
> + unsigned long nr_eligible,
> + unsigned long *ret_nr_reclaimed)
> {
> + struct reclaim_state reclaim_state = {
> + .reclaimed_slab = 0,
> + };
> struct shrinker *shrinker;
> unsigned long freed = 0;
>
> @@ -394,6 +399,8 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
> goto out;
> }
>
> + current->reclaim_state = &reclaim_state;
> +
> list_for_each_entry(shrinker, &shrinker_list, list) {
> struct shrink_control sc = {
> .gfp_mask = gfp_mask,
> @@ -410,6 +417,9 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
> freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible);
> }
>
> + current->reclaim_state = NULL;
> + *ret_nr_reclaimed += reclaim_state.reclaimed_slab;
> +
> up_read(&shrinker_rwsem);
> out:
> cond_resched();
> @@ -419,6 +429,7 @@ out:
> void drop_slab_node(int nid)
> {
> unsigned long freed;
> + unsigned long nr_reclaimed = 0;
>
> do {
> struct mem_cgroup *memcg = NULL;
> @@ -426,9 +437,9 @@ void drop_slab_node(int nid)
> freed = 0;
> do {
> freed += shrink_slab(GFP_KERNEL, nid, memcg,
> - 1000, 1000);
> + 1000, 1000, &nr_reclaimed);
> } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
> - } while (freed > 10);
> + } while (freed);
> }
>
> void drop_slab(void)
> @@ -2339,7 +2350,6 @@ static inline bool should_continue_reclaim(struct zone *zone,
> static bool shrink_zone(struct zone *zone, struct scan_control *sc,
> bool is_classzone)
> {
> - struct reclaim_state *reclaim_state = current->reclaim_state;
> unsigned long nr_reclaimed, nr_scanned;
> bool reclaimable = false;
>
> @@ -2371,7 +2381,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
> if (memcg && is_classzone)
> shrink_slab(sc->gfp_mask, zone_to_nid(zone),
> memcg, sc->nr_scanned - scanned,
> - lru_pages);
> + lru_pages, &sc->nr_reclaimed);
>
> /*
> * Direct reclaim and kswapd have to scan all memory
> @@ -2398,12 +2408,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
> if (global_reclaim(sc) && is_classzone)
> shrink_slab(sc->gfp_mask, zone_to_nid(zone), NULL,
> sc->nr_scanned - nr_scanned,
> - zone_lru_pages);
> -
> - if (reclaim_state) {
> - sc->nr_reclaimed += reclaim_state->reclaimed_slab;
> - reclaim_state->reclaimed_slab = 0;
> - }
> + zone_lru_pages, &sc->nr_reclaimed);
>
> vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
> sc->nr_scanned - nr_scanned,
> @@ -3367,17 +3372,12 @@ static int kswapd(void *p)
> int balanced_classzone_idx;
> pg_data_t *pgdat = (pg_data_t*)p;
> struct task_struct *tsk = current;
> -
> - struct reclaim_state reclaim_state = {
> - .reclaimed_slab = 0,
> - };
> const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
>
> lockdep_set_current_reclaim_state(GFP_KERNEL);
>
> if (!cpumask_empty(cpumask))
> set_cpus_allowed_ptr(tsk, cpumask);
> - current->reclaim_state = &reclaim_state;
>
> /*
> * Tell the memory management that we're a "memory allocator",
> @@ -3449,7 +3449,6 @@ static int kswapd(void *p)
> }
>
> tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
> - current->reclaim_state = NULL;
> lockdep_clear_current_reclaim_state();
>
> return 0;
> @@ -3492,7 +3491,6 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
> */
> unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
> {
> - struct reclaim_state reclaim_state;
> struct scan_control sc = {
> .nr_to_reclaim = nr_to_reclaim,
> .gfp_mask = GFP_HIGHUSER_MOVABLE,
> @@ -3508,12 +3506,9 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
>
> p->flags |= PF_MEMALLOC;
> lockdep_set_current_reclaim_state(sc.gfp_mask);
> - reclaim_state.reclaimed_slab = 0;
> - p->reclaim_state = &reclaim_state;
>
> nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
>
> - p->reclaim_state = NULL;
> lockdep_clear_current_reclaim_state();
> p->flags &= ~PF_MEMALLOC;
>
> @@ -3678,7 +3673,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
> /* Minimum pages needed in order to stay on node */
> const unsigned long nr_pages = 1 << order;
> struct task_struct *p = current;
> - struct reclaim_state reclaim_state;
> struct scan_control sc = {
> .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
> .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
> @@ -3697,8 +3691,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
> */
> p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
> lockdep_set_current_reclaim_state(gfp_mask);
> - reclaim_state.reclaimed_slab = 0;
> - p->reclaim_state = &reclaim_state;
>
> if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
> /*
> @@ -3710,7 +3702,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
> } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
> }
>
> - p->reclaim_state = NULL;
> current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
> lockdep_clear_current_reclaim_state();
> return sc.nr_reclaimed >= nr_pages;
> --
> 1.7.10.4
>
--
Michal Hocko
SUSE Labs
* Re: [PATCH -mm v2] vmscan: move reclaim_state handling to shrink_slab
2015-01-15 12:58 ` Michal Hocko
@ 2015-01-15 13:25 ` Vladimir Davydov
2015-01-15 14:48 ` Michal Hocko
0 siblings, 1 reply; 8+ messages in thread
From: Vladimir Davydov @ 2015-01-15 13:25 UTC (permalink / raw)
To: Michal Hocko
Cc: Andrew Morton, Johannes Weiner, Vlastimil Babka, Mel Gorman,
Rik van Riel, linux-mm, linux-kernel
On Thu, Jan 15, 2015 at 01:58:20PM +0100, Michal Hocko wrote:
> On Thu 15-01-15 11:37:53, Vladimir Davydov wrote:
> > current->reclaim_state is only used to count the number of slab pages
> > reclaimed by shrink_slab(). So instead of initializing it before we are
> >
> > Note that after this patch try_to_free_mem_cgroup_pages() will count not
> > only reclaimed user pages, but also slab pages, which is expected,
> > because it can reclaim kmem from kmem-active sub cgroups.
>
> Except that reclaim_state counts all freed slab objects that have
> current->reclaim_state != NULL AFAIR. This includes also kfreed pages
> from interrupt context and who knows what else and those pages might be
> from a different memcgs, no?
Hmm, true, good point. Can an interrupt handler free a lot of memory
though? Does RCU free objects from irq or soft irq context?
> Besides that I am not sure this makes any difference in the end. No
> try_to_free_mem_cgroup_pages caller really cares about the exact
> number of reclaimed pages. We care only about whether there was any
> progress done - and even that not exactly (e.g. try_charge checks
> mem_cgroup_margin before retry/oom so if sufficient kmem pages were
> uncharged then we will notice that).
Frankly, I thought exactly the same initially, that's why I dropped
reclaim_state handling from the initial memcg shrinkers patch set.
However, then Hillf noticed that nr_reclaimed is checked right after
calling shrink_slab() in the memcg iteration loop in shrink_zone():
memcg = mem_cgroup_iter(root, NULL, &reclaim);
do {
[...]
if (memcg && is_classzone)
shrink_slab(sc->gfp_mask, zone_to_nid(zone),
memcg, sc->nr_scanned - scanned,
lru_pages);
/*
* Direct reclaim and kswapd have to scan all memory
* cgroups to fulfill the overall scan target for the
* zone.
*
* Limit reclaim, on the other hand, only cares about
* nr_to_reclaim pages to be reclaimed and it will
* retry with decreasing priority if one round over the
* whole hierarchy is not sufficient.
*/
if (!global_reclaim(sc) &&
sc->nr_reclaimed >= sc->nr_to_reclaim) {
mem_cgroup_iter_break(root, memcg);
break;
}
memcg = mem_cgroup_iter(root, memcg, &reclaim);
} while (memcg);
If we can ignore reclaimed slab pages here (?), let's drop this patch.
Thanks,
Vladimir
* Re: [PATCH -mm v2] vmscan: move reclaim_state handling to shrink_slab
2015-01-15 13:25 ` Vladimir Davydov
@ 2015-01-15 14:48 ` Michal Hocko
2015-01-15 17:07 ` Vladimir Davydov
2015-01-20 7:35 ` Paul E. McKenney
0 siblings, 2 replies; 8+ messages in thread
From: Michal Hocko @ 2015-01-15 14:48 UTC (permalink / raw)
To: Vladimir Davydov
Cc: Andrew Morton, Johannes Weiner, Vlastimil Babka, Mel Gorman,
Rik van Riel, linux-mm, linux-kernel
On Thu 15-01-15 16:25:16, Vladimir Davydov wrote:
> On Thu, Jan 15, 2015 at 01:58:20PM +0100, Michal Hocko wrote:
> > On Thu 15-01-15 11:37:53, Vladimir Davydov wrote:
> > > current->reclaim_state is only used to count the number of slab pages
> > > reclaimed by shrink_slab(). So instead of initializing it before we are
> > >
> > > Note that after this patch try_to_free_mem_cgroup_pages() will count not
> > > only reclaimed user pages, but also slab pages, which is expected,
> > > because it can reclaim kmem from kmem-active sub cgroups.
> >
> > Except that reclaim_state counts all freed slab objects that have
> > current->reclaim_state != NULL AFAIR. This includes also kfreed pages
> > from interrupt context and who knows what else and those pages might be
> > from a different memcgs, no?
>
> Hmm, true, good point. Can an interrupt handler free a lot of memory
> though?
it is drivers so who knows...
> Does RCU free objects from irq or soft irq context?
and this is another part which I didn't consider at all. RCU callbacks
are normally processed from kthread context but rcu_init also does
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks)
so something is clearly processed from softirq as well. I am not
familiar with RCU details enough to tell how many callbacks are
processed this way. Tiny RCU, on the other hand, seems to be processing
all callbacks via __rcu_process_callbacks and that seems to be processed
from softirq only.
> > Besides that I am not sure this makes any difference in the end. No
> > try_to_free_mem_cgroup_pages caller really cares about the exact
> > number of reclaimed pages. We care only about whether there was any
> > progress done - and even that not exactly (e.g. try_charge checks
> > mem_cgroup_margin before retry/oom so if sufficient kmem pages were
> > uncharged then we will notice that).
>
> Frankly, I thought exactly the same initially, that's why I dropped
> reclaim_state handling from the initial memcg shrinkers patch set.
> However, then Hillf noticed that nr_reclaimed is checked right after
> calling shrink_slab() in the memcg iteration loop in shrink_zone():
>
>
> memcg = mem_cgroup_iter(root, NULL, &reclaim);
> do {
> [...]
> if (memcg && is_classzone)
> shrink_slab(sc->gfp_mask, zone_to_nid(zone),
> memcg, sc->nr_scanned - scanned,
> lru_pages);
>
> /*
> * Direct reclaim and kswapd have to scan all memory
> * cgroups to fulfill the overall scan target for the
> * zone.
> *
> * Limit reclaim, on the other hand, only cares about
> * nr_to_reclaim pages to be reclaimed and it will
> * retry with decreasing priority if one round over the
> * whole hierarchy is not sufficient.
> */
> if (!global_reclaim(sc) &&
> sc->nr_reclaimed >= sc->nr_to_reclaim) {
> mem_cgroup_iter_break(root, memcg);
> break;
> }
> memcg = mem_cgroup_iter(root, memcg, &reclaim);
> } while (memcg);
>
>
> If we can ignore reclaimed slab pages here (?), let's drop this patch.
I see what you are trying to achieve but can this lead to a serious
over-reclaim? We should be reclaiming mostly user pages and kmem should
be only a small portion I would expect.
--
Michal Hocko
SUSE Labs
* Re: [PATCH -mm v2] vmscan: move reclaim_state handling to shrink_slab
2015-01-15 14:48 ` Michal Hocko
@ 2015-01-15 17:07 ` Vladimir Davydov
2015-01-20 7:35 ` Paul E. McKenney
1 sibling, 0 replies; 8+ messages in thread
From: Vladimir Davydov @ 2015-01-15 17:07 UTC (permalink / raw)
To: Michal Hocko
Cc: Andrew Morton, Johannes Weiner, Vlastimil Babka, Mel Gorman,
Rik van Riel, linux-mm, linux-kernel
On Thu, Jan 15, 2015 at 03:48:38PM +0100, Michal Hocko wrote:
> On Thu 15-01-15 16:25:16, Vladimir Davydov wrote:
> > memcg = mem_cgroup_iter(root, NULL, &reclaim);
> > do {
> > [...]
> > if (memcg && is_classzone)
> > shrink_slab(sc->gfp_mask, zone_to_nid(zone),
> > memcg, sc->nr_scanned - scanned,
> > lru_pages);
> >
> > /*
> > * Direct reclaim and kswapd have to scan all memory
> > * cgroups to fulfill the overall scan target for the
> > * zone.
> > *
> > * Limit reclaim, on the other hand, only cares about
> > * nr_to_reclaim pages to be reclaimed and it will
> > * retry with decreasing priority if one round over the
> > * whole hierarchy is not sufficient.
> > */
> > if (!global_reclaim(sc) &&
> > sc->nr_reclaimed >= sc->nr_to_reclaim) {
> > mem_cgroup_iter_break(root, memcg);
> > break;
> > }
> > memcg = mem_cgroup_iter(root, memcg, &reclaim);
> > } while (memcg);
> >
> >
> > If we can ignore reclaimed slab pages here (?), let's drop this patch.
>
> I see what you are trying to achieve but can this lead to a serious
> over-reclaim?
I think it can, but only if we shrink an inode with lots of pages
attached to its address space (they also count to reclaim_state). In
this case, we overreclaim anyway though.
I agree that this is a high risk for a vague benefit. Let's drop it
until we see this problem in real life.
Thanks,
Vladimir
* Re: [PATCH -mm v2] vmscan: move reclaim_state handling to shrink_slab
2015-01-15 14:48 ` Michal Hocko
2015-01-15 17:07 ` Vladimir Davydov
@ 2015-01-20 7:35 ` Paul E. McKenney
2015-01-20 10:11 ` Michal Hocko
1 sibling, 1 reply; 8+ messages in thread
From: Paul E. McKenney @ 2015-01-20 7:35 UTC (permalink / raw)
To: Michal Hocko
Cc: Vladimir Davydov, Andrew Morton, Johannes Weiner, Vlastimil Babka,
Mel Gorman, Rik van Riel, linux-mm, linux-kernel
On Thu, Jan 15, 2015 at 03:48:38PM +0100, Michal Hocko wrote:
> On Thu 15-01-15 16:25:16, Vladimir Davydov wrote:
> > On Thu, Jan 15, 2015 at 01:58:20PM +0100, Michal Hocko wrote:
> > > On Thu 15-01-15 11:37:53, Vladimir Davydov wrote:
> > > > current->reclaim_state is only used to count the number of slab pages
> > > > reclaimed by shrink_slab(). So instead of initializing it before we are
> > > >
> > > > Note that after this patch try_to_free_mem_cgroup_pages() will count not
> > > > only reclaimed user pages, but also slab pages, which is expected,
> > > > because it can reclaim kmem from kmem-active sub cgroups.
> > >
> > > Except that reclaim_state counts all freed slab objects that have
> > > current->reclaim_state != NULL AFAIR. This includes also kfreed pages
> > > from interrupt context and who knows what else and those pages might be
> > > from a different memcgs, no?
> >
> > Hmm, true, good point. Can an interrupt handler free a lot of memory
> > though?
>
> it is drivers so who knows...
>
> > Does RCU free objects from irq or soft irq context?
>
> and this is another part which I didn't consider at all. RCU callbacks
> are normally processed from kthread context but rcu_init also does
> open_softirq(RCU_SOFTIRQ, rcu_process_callbacks)
> so something is clearly processed from softirq as well. I am not
> familiar with RCU details enough to tell how many callbacks are
> processed this way. Tiny RCU, on the other hand, seem to be processing
> all callbacks via __rcu_process_callbacks and that seems to be processed
> from softirq only.
RCU invokes all its callbacks with BH disabled, either because they
are running in softirq context or because the rcuo kthreads disable
BH while invoking each callback. When running in softirq context,
RCU will normally invoke only ten callbacks before letting the other
softirq vectors run. However, if there are more than 10,000 callbacks
queued on a given CPU (which can happen!), RCU will go into panic mode
and just invoke the callbacks as quickly as it can.
You can of course have your callback schedule a work-queue item or
wake up a kthread to avoid this tradeoff.
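A minimal sketch of that deferral pattern (hypothetical type and function
names; the RCU callback only hands the object off, and the actual kfree()
then runs from process context):

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct foo {
	struct rcu_head rcu;
	struct work_struct free_work;
	/* ... payload ... */
};

static void foo_free_workfn(struct work_struct *work)
{
	struct foo *f = container_of(work, struct foo, free_work);

	kfree(f);		/* runs from a kworker, not with BH disabled */
}

static void foo_rcu_cb(struct rcu_head *head)
{
	struct foo *f = container_of(head, struct foo, rcu);

	INIT_WORK(&f->free_work, foo_free_workfn);
	schedule_work(&f->free_work);
}

/* Callers would then release objects with call_rcu(&f->rcu, foo_rcu_cb). */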
Thanx, Paul
> > > Besides that I am not sure this makes any difference in the end. No
> > > try_to_free_mem_cgroup_pages caller really cares about the exact
> > > number of reclaimed pages. We care only about whether there was any
> > > progress done - and even that not exactly (e.g. try_charge checks
> > > mem_cgroup_margin before retry/oom so if sufficient kmem pages were
> > > uncharged then we will notice that).
> >
> > Frankly, I thought exactly the same initially, that's why I dropped
> > reclaim_state handling from the initial memcg shrinkers patch set.
> > However, then Hillf noticed that nr_reclaimed is checked right after
> > calling shrink_slab() in the memcg iteration loop in shrink_zone():
> >
> >
> > memcg = mem_cgroup_iter(root, NULL, &reclaim);
> > do {
> > [...]
> > if (memcg && is_classzone)
> > shrink_slab(sc->gfp_mask, zone_to_nid(zone),
> > memcg, sc->nr_scanned - scanned,
> > lru_pages);
> >
> > /*
> > * Direct reclaim and kswapd have to scan all memory
> > * cgroups to fulfill the overall scan target for the
> > * zone.
> > *
> > * Limit reclaim, on the other hand, only cares about
> > * nr_to_reclaim pages to be reclaimed and it will
> > * retry with decreasing priority if one round over the
> > * whole hierarchy is not sufficient.
> > */
> > if (!global_reclaim(sc) &&
> > sc->nr_reclaimed >= sc->nr_to_reclaim) {
> > mem_cgroup_iter_break(root, memcg);
> > break;
> > }
> > memcg = mem_cgroup_iter(root, memcg, &reclaim);
> > } while (memcg);
> >
> >
> > If we can ignore reclaimed slab pages here (?), let's drop this patch.
>
> I see what you are trying to achieve but can this lead to a serious
> over-reclaim? We should be reclaiming mostly user pages and kmem should
> be only a small portion I would expect.
> --
> Michal Hocko
> SUSE Labs
* Re: [PATCH -mm v2] vmscan: move reclaim_state handling to shrink_slab
2015-01-20 7:35 ` Paul E. McKenney
@ 2015-01-20 10:11 ` Michal Hocko
0 siblings, 0 replies; 8+ messages in thread
From: Michal Hocko @ 2015-01-20 10:11 UTC (permalink / raw)
To: Paul E. McKenney
Cc: Vladimir Davydov, Andrew Morton, Johannes Weiner, Vlastimil Babka,
Mel Gorman, Rik van Riel, linux-mm, linux-kernel
On Mon 19-01-15 23:35:50, Paul E. McKenney wrote:
> On Thu, Jan 15, 2015 at 03:48:38PM +0100, Michal Hocko wrote:
> > On Thu 15-01-15 16:25:16, Vladimir Davydov wrote:
[...]
> > > Does RCU free objects from irq or soft irq context?
> >
> > and this is another part which I didn't consider at all. RCU callbacks
> > are normally processed from kthread context but rcu_init also does
> > open_softirq(RCU_SOFTIRQ, rcu_process_callbacks)
> > so something is clearly processed from softirq as well. I am not
> > familiar with RCU details enough to tell how many callbacks are
> > processed this way. Tiny RCU, on the other hand, seem to be processing
> > all callbacks via __rcu_process_callbacks and that seems to be processed
> > from softirq only.
>
> RCU invokes all its callbacks with BH disabled, either because they
> are running in softirq context or because the rcuo kthreads disable
> BH while invoking each callback. When running in softirq context,
> RCU will normally invoke only ten callbacks before letting the other
> softirq vectors run. However, if there are more than 10,000 callbacks
> queued on a given CPU (which can happen!), RCU will go into panic mode
> and just invoke the callbacks as quickly as it can.
Thanks for the clarification, Paul! This means that not only drivers
might free some memory but also kfree called from RCU context would do
so, which adds potentially even more memcg-unrelated noise.
> You can of course have your callback schedule a work-queue item or
> wake up a kthread to avoid this tradeoff.
>
> Thanx, Paul
--
Michal Hocko
SUSE Labs