From: Ying Han <yinghan@google.com>
To: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: "linux-mm@kvack.org" <linux-mm@kvack.org>,
"kosaki.motohiro@jp.fujitsu.com" <kosaki.motohiro@jp.fujitsu.com>,
"balbir@linux.vnet.ibm.com" <balbir@linux.vnet.ibm.com>,
"nishimura@mxp.nes.nec.co.jp" <nishimura@mxp.nes.nec.co.jp>,
"akpm@linux-foundation.org" <akpm@linux-foundation.org>,
Johannes Weiner <jweiner@redhat.com>,
"minchan.kim@gmail.com" <minchan.kim@gmail.com>,
Michal Hocko <mhocko@suse.cz>
Subject: Re: [PATCH 5/7] memcg bgreclaim core.
Date: Mon, 25 Apr 2011 21:59:06 -0700
Message-ID: <BANLkTinn5Cs8F5beX6od41xhH4qQuRR5Rw@mail.gmail.com>
In-Reply-To: <20110425183629.144d3f19.kamezawa.hiroyu@jp.fujitsu.com>
On Mon, Apr 25, 2011 at 2:36 AM, KAMEZAWA Hiroyuki
<kamezawa.hiroyu@jp.fujitsu.com> wrote:
> The following patch will change the logic. This is the core.
> ==
> This is the main loop of per-memcg background reclaim, implemented in the
> function shrink_mem_cgroup().
>
> The function performs a priority loop similar to global reclaim. During each
> iteration it frees memory from a selected victim node.
> After reclaiming enough pages or scanning enough pages, it returns and finds
> the next work in round-robin fashion.
>
> changelog v8b..v7
> 1. reworked to use a workqueue rather than threads.
> 2. changed the shrink_mem_cgroup algorithm to fit the workqueue. In short,
> avoid long-running work and unnecessary page writeback, and allow quick
> round-robin. When a thread dirties pages continuously, writing them back
> via the flusher threads is far faster than writeback from background
> reclaim. This detail will be fixed once dirty_ratio is implemented; the
> logic around this will be revisited in a following patch.
>
> Signed-off-by: Ying Han <yinghan@google.com>
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> ---
> include/linux/memcontrol.h | 11 ++++
> mm/memcontrol.c | 44 ++++++++++++++---
> mm/vmscan.c | 115 +++++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 162 insertions(+), 8 deletions(-)
>
> Index: memcg/include/linux/memcontrol.h
> ===================================================================
> --- memcg.orig/include/linux/memcontrol.h
> +++ memcg/include/linux/memcontrol.h
> @@ -89,6 +89,8 @@ extern int mem_cgroup_last_scanned_node(
> extern int mem_cgroup_select_victim_node(struct mem_cgroup *mem,
> const nodemask_t *nodes);
>
> +unsigned long shrink_mem_cgroup(struct mem_cgroup *mem);
> +
> static inline
> int mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *cgroup)
> {
> @@ -112,6 +114,9 @@ extern void mem_cgroup_end_migration(str
> */
> int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg);
> int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg);
> +unsigned int mem_cgroup_swappiness(struct mem_cgroup *memcg);
> +unsigned long mem_cgroup_zone_reclaimable_pages(struct mem_cgroup *memcg,
> + int nid, int zone_idx);
> unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
> struct zone *zone,
> enum lru_list lru);
> @@ -310,6 +315,12 @@ mem_cgroup_inactive_file_is_low(struct m
> }
>
> static inline unsigned long
> +mem_cgroup_zone_reclaimable_pages(struct mem_cgroup *memcg, int nid, int zone_idx)
> +{
> + return 0;
> +}
> +
> +static inline unsigned long
> mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, struct zone *zone,
> enum lru_list lru)
> {
> Index: memcg/mm/memcontrol.c
> ===================================================================
> --- memcg.orig/mm/memcontrol.c
> +++ memcg/mm/memcontrol.c
> @@ -1166,6 +1166,23 @@ int mem_cgroup_inactive_file_is_low(stru
> return (active > inactive);
> }
>
> +unsigned long mem_cgroup_zone_reclaimable_pages(struct mem_cgroup *memcg,
> + int nid, int zone_idx)
> +{
> + int nr;
> + struct mem_cgroup_per_zone *mz =
> + mem_cgroup_zoneinfo(memcg, nid, zone_idx);
> +
> + nr = MEM_CGROUP_ZSTAT(mz, NR_ACTIVE_FILE) +
> + MEM_CGROUP_ZSTAT(mz, NR_INACTIVE_FILE);
> +
> + if (nr_swap_pages > 0)
> + nr += MEM_CGROUP_ZSTAT(mz, NR_ACTIVE_ANON) +
> + MEM_CGROUP_ZSTAT(mz, NR_INACTIVE_ANON);
> +
> + return nr;
> +}
> +
> unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
> struct zone *zone,
> enum lru_list lru)
> @@ -1286,7 +1303,7 @@ static unsigned long mem_cgroup_margin(s
> return margin >> PAGE_SHIFT;
> }
>
> -static unsigned int get_swappiness(struct mem_cgroup *memcg)
> +unsigned int mem_cgroup_swappiness(struct mem_cgroup *memcg)
> {
> struct cgroup *cgrp = memcg->css.cgroup;
>
> @@ -1595,14 +1612,15 @@ static int mem_cgroup_hierarchical_recla
> /* we use swappiness of local cgroup */
> if (check_soft) {
> ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
> - noswap, get_swappiness(victim), zone,
> + noswap, mem_cgroup_swappiness(victim), zone,
> &nr_scanned);
> *total_scanned += nr_scanned;
> mem_cgroup_soft_steal(victim, ret);
> mem_cgroup_soft_scan(victim, nr_scanned);
> } else
> ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
> - noswap, get_swappiness(victim));
> + noswap,
> + mem_cgroup_swappiness(victim));
> css_put(&victim->css);
> /*
> * At shrinking usage, we can't check we should stop here or
> @@ -1628,15 +1646,25 @@ static int mem_cgroup_hierarchical_recla
> int
> mem_cgroup_select_victim_node(struct mem_cgroup *mem, const nodemask_t *nodes)
> {
> - int next_nid;
> + int next_nid, i;
> int last_scanned;
>
> last_scanned = mem->last_scanned_node;
> - next_nid = next_node(last_scanned, *nodes);
> + next_nid = last_scanned;
> +rescan:
> + next_nid = next_node(next_nid, *nodes);
>
> if (next_nid == MAX_NUMNODES)
> next_nid = first_node(*nodes);
>
> + /* If no page on this node, skip */
> + for (i = 0; i < MAX_NR_ZONES; i++)
> + if (mem_cgroup_zone_reclaimable_pages(mem, next_nid, i))
> + break;
> +
> + if (next_nid != last_scanned && (i == MAX_NR_ZONES))
> + goto rescan;
> +
> mem->last_scanned_node = next_nid;
>
> return next_nid;
> @@ -3649,7 +3677,7 @@ try_to_free:
> goto out;
> }
> progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
> - false, get_swappiness(mem));
> + false, mem_cgroup_swappiness(mem));
> if (!progress) {
> nr_retries--;
> /* maybe some writeback is necessary */
> @@ -4073,7 +4101,7 @@ static u64 mem_cgroup_swappiness_read(st
> {
> struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
>
> - return get_swappiness(memcg);
> + return mem_cgroup_swappiness(memcg);
> }
>
> static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
> @@ -4849,7 +4877,7 @@ mem_cgroup_create(struct cgroup_subsys *
> INIT_LIST_HEAD(&mem->oom_notify);
>
> if (parent)
> - mem->swappiness = get_swappiness(parent);
> + mem->swappiness = mem_cgroup_swappiness(parent);
> atomic_set(&mem->refcnt, 1);
> mem->move_charge_at_immigrate = 0;
> mutex_init(&mem->thresholds_lock);
> Index: memcg/mm/vmscan.c
> ===================================================================
> --- memcg.orig/mm/vmscan.c
> +++ memcg/mm/vmscan.c
> @@ -42,6 +42,7 @@
> #include <linux/delayacct.h>
> #include <linux/sysctl.h>
> #include <linux/oom.h>
> +#include <linux/res_counter.h>
>
> #include <asm/tlbflush.h>
> #include <asm/div64.h>
> @@ -2308,6 +2309,120 @@ static bool sleeping_prematurely(pg_data
> return !all_zones_ok;
> }
>
> +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +/*
> + * The function is used for the per-memcg LRU. It scans all the zones of the
> + * node and returns the nr_scanned and nr_reclaimed.
> + */
> +/*
> + * Limit of scanning per iteration. For round-robin.
> + */
> +#define MEMCG_BGSCAN_LIMIT (2048)
> +
> +static void
> +shrink_memcg_node(int nid, int priority, struct scan_control *sc)
> +{
> + unsigned long total_scanned = 0;
> + struct mem_cgroup *mem_cont = sc->mem_cgroup;
> + int i;
> +
> + /*
> + * This dma->highmem order is consistent with global reclaim.
> + * We do this because the page allocator works in the opposite
> + * direction although memcg user pages are mostly allocated at
> + * highmem.
> + */
> + for (i = 0;
> + (i < NODE_DATA(nid)->nr_zones) &&
> + (total_scanned < MEMCG_BGSCAN_LIMIT);
> + i++) {
> + struct zone *zone = NODE_DATA(nid)->node_zones + i;
> + struct zone_reclaim_stat *zrs;
> + unsigned long scan, rotate;
> +
> + if (!populated_zone(zone))
> + continue;
> + scan = mem_cgroup_zone_reclaimable_pages(mem_cont, nid, i);
> + if (!scan)
> + continue;
> + /* If recent reclaim on this zone made poor progress, allow writepage */
> + zrs = get_reclaim_stat(zone, sc);
> + scan = zrs->recent_scanned[0] + zrs->recent_scanned[1];
> + rotate = zrs->recent_rotated[0] + zrs->recent_rotated[1];
> +
> + if (rotate > scan/2)
> + sc->may_writepage = 1;
> +
> + sc->nr_scanned = 0;
> + shrink_zone(priority, zone, sc);
> + total_scanned += sc->nr_scanned;
> + sc->may_writepage = 0;
> + }
> + sc->nr_scanned = total_scanned;
> +}
I see that MEMCG_BGSCAN_LIMIT is a newly defined macro compared to the
previous post. So the number of pages to scan is now capped at 2048 per
invocation for each memcg; does that make a difference between big and
small cgroups? My reading of the budget arithmetic is sketched below.
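A minimal user-space sketch, illustrative only and not part of the patch:
it just mirrors the arithmetic in shrink_mem_cgroup() above, assuming
SWAP_CLUSTER_MAX is 32.

/*
 * Illustrative sketch only (not part of the patch): mirrors the scan
 * budget arithmetic of shrink_mem_cgroup(), assuming SWAP_CLUSTER_MAX
 * is 32.
 */
#include <stdio.h>

#define MEMCG_BGSCAN_LIMIT	2048
#define SWAP_CLUSTER_MAX	32

int main(void)
{
	int nr_nodes;

	for (nr_nodes = 1; nr_nodes <= 8; nr_nodes *= 2) {
		unsigned long next_prio =
			(unsigned long)SWAP_CLUSTER_MAX * nr_nodes;

		if (next_prio > MEMCG_BGSCAN_LIMIT / 8)
			next_prio = MEMCG_BGSCAN_LIMIT / 8;

		/*
		 * One call scans at most MEMCG_BGSCAN_LIMIT pages no matter
		 * how big the memcg is; priority only starts dropping once
		 * next_prio pages have been scanned.
		 */
		printf("%d node(s): priority drops after %lu pages, hard cap %d pages per call\n",
		       nr_nodes, next_prio, MEMCG_BGSCAN_LIMIT);
	}
	return 0;
}

If that reading is right, a multi-GB cgroup gets exactly the same per-call
budget as a tiny one, and the difference between them only shows up in how
often the background work has to be re-queued.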
--Ying
> +/*
> + * Per cgroup background reclaim.
> + */
> +unsigned long shrink_mem_cgroup(struct mem_cgroup *mem)
> +{
> + int nid, priority, next_prio;
> + nodemask_t nodes;
> + unsigned long total_scanned;
> + struct scan_control sc = {
> + .gfp_mask = GFP_HIGHUSER_MOVABLE,
> + .may_unmap = 1,
> + .may_swap = 1,
> + .nr_to_reclaim = SWAP_CLUSTER_MAX,
> + .order = 0,
> + .mem_cgroup = mem,
> + };
> +
> + sc.may_writepage = 0;
> + sc.nr_reclaimed = 0;
> + total_scanned = 0;
> + nodes = node_states[N_HIGH_MEMORY];
> + sc.swappiness = mem_cgroup_swappiness(mem);
> +
> + current->flags |= PF_SWAPWRITE;
> + /*
> + * Unlike kswapd, we need to traverse cgroups one by one. So, we don't
> + * use full priority. Just scan small number of pages and visit next.
> + * Now, we scan up to MEMCG_BGSCAN_LIMIT pages per invocation and emulate
> + * priority from the number of pages scanned so far.
> + */
> + next_prio = min(SWAP_CLUSTER_MAX * num_node_state(N_HIGH_MEMORY),
> + MEMCG_BGSCAN_LIMIT/8);
> + priority = DEF_PRIORITY;
> + while ((total_scanned < MEMCG_BGSCAN_LIMIT) &&
> + !nodes_empty(nodes) &&
> + (sc.nr_to_reclaim > sc.nr_reclaimed)) {
> +
> + nid = mem_cgroup_select_victim_node(mem, &nodes);
> + shrink_memcg_node(nid, priority, &sc);
> + /*
> + * the node seems to have no pages.
> + * skip this for a while
> + */
> + if (!sc.nr_scanned)
> + node_clear(nid, nodes);
> + total_scanned += sc.nr_scanned;
> + if (mem_cgroup_watermark_ok(mem, CHARGE_WMARK_HIGH))
> + break;
> + /* emulate priority */
> + if (total_scanned > next_prio) {
> + priority--;
> + next_prio <<= 1;
> + }
> + if (sc.nr_scanned &&
> + total_scanned > sc.nr_reclaimed * 2)
> + congestion_wait(WRITE, HZ/10);
> + }
> + current->flags &= ~PF_SWAPWRITE;
> + return sc.nr_reclaimed;
> +}
> +#endif
> +
> /*
> * For kswapd, balance_pgdat() will work across all this node's zones until
> * they are all at high_wmark_pages(zone).
>
>