From: Ying Han <yinghan@google.com>
To: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>,
Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>,
Andrew Morton <akpm@linux-foundation.org>,
Mel Gorman <mel@csn.ul.ie>, Johannes Weiner <hannes@cmpxchg.org>,
Christoph Lameter <cl@linux.com>,
Wu Fengguang <fengguang.wu@intel.com>,
Andi Kleen <ak@linux.intel.com>, Hugh Dickins <hughd@google.com>,
Rik van Riel <riel@redhat.com>,
KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>,
Tejun Heo <tj@kernel.org>,
linux-mm@kvack.org
Subject: Re: [PATCH 3/4] Per cgroup background reclaim.
Date: Mon, 6 Dec 2010 18:25:55 -0800
Message-ID: <AANLkTimm1NrMJ2owLnKbeRe2=p-dHQpJGd7j+A2Sixap@mail.gmail.com>
In-Reply-To: <20101130165142.bff427b0.kamezawa.hiroyu@jp.fujitsu.com>
On Mon, Nov 29, 2010 at 11:51 PM, KAMEZAWA Hiroyuki
<kamezawa.hiroyu@jp.fujitsu.com> wrote:
> On Mon, 29 Nov 2010 22:49:44 -0800
> Ying Han <yinghan@google.com> wrote:
>
>> The current implementation of memcg only supports direct reclaim; this
>> patch adds support for background reclaim. Per cgroup background reclaim
>> is needed because it spreads the memory pressure out over a longer period
>> of time and smooths out system performance.
>>
>> There is a kswapd kernel thread for each memory node. We add a separate
>> kswapd thread for each cgroup. The kswapd thread sleeps on the wait queue
>> headed at the kswapd_wait field of a kswapd descriptor.
>>
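(For readers without patch 1/4 at hand: the kswapd descriptor referred to here
looks roughly like the sketch below. It is based only on the fields actually
used in this patch -- kswapd_task and kswapd_wait -- plus assumed back-pointer
fields for the node and memcg cases; the real definition is in patch 1/4.)

struct kswapd {
        struct task_struct      *kswapd_task;  /* NULL until the thread is started */
        wait_queue_head_t       kswapd_wait;   /* the thread sleeps on this queue */
        pg_data_t               *kswapd_pgdat; /* set for the global per-node kswapd */
        struct mem_cgroup       *kswapd_mem;   /* set for a per-cgroup kswapd */
};
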
>> The kswapd() function is now shared between the global and per cgroup kswapd
>> threads. It is passed the kswapd descriptor, which contains the information
>> for either the node or the cgroup. If it is a per cgroup kswapd thread, the
>> new function balance_mem_cgroup_pgdat is invoked. balance_mem_cgroup_pgdat
>> performs a priority loop similar to global reclaim. In each iteration it
>> invokes balance_pgdat_node, a new function that performs background reclaim
>> per node, for all nodes on the system. After reclaiming each node, it checks
>> mem_cgroup_watermark_ok() and breaks out of the priority loop if that
>> returns true. A per memcg zone will be marked "unreclaimable" if the
>> scanning rate is much greater than the reclaiming rate on the per cgroup
>> LRU. The bit is cleared when a page charged to the cgroup is freed. Kswapd
>> breaks out of the priority loop if all the zones are marked "unreclaimable".
>>
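(To make the control flow above easier to follow, the priority loop is roughly
structured as in the sketch below. This is a simplified illustration only, not
the code from the mm/vmscan.c hunk; balance_pgdat_node()'s signature, the
CHARGE_WMARK_HIGH argument, and the "all zones unreclaimable" helper name are
assumptions made for the example.)

static void balance_mem_cgroup_pgdat(struct mem_cgroup *mem, int order)
{
        int priority, nid;

        for (priority = DEF_PRIORITY; priority >= 0; priority--) {
                /* One background-reclaim pass over every node. */
                for_each_online_node(nid)
                        balance_pgdat_node(NODE_DATA(nid), order,
                                           priority, mem);

                /* Done once usage drops back below the high watermark. */
                if (mem_cgroup_watermark_ok(mem, CHARGE_WMARK_HIGH))
                        break;

                /* Give up if every per-memcg zone is marked unreclaimable. */
                if (mem_cgroup_all_zones_unreclaimable(mem))
                        break;
        }
}
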
>> Signed-off-by: Ying Han <yinghan@google.com>
>> ---
>> include/linux/memcontrol.h | 30 +++++++
>> mm/memcontrol.c | 182 ++++++++++++++++++++++++++++++++++++++-
>> mm/page_alloc.c | 2 +
>> mm/vmscan.c | 205 +++++++++++++++++++++++++++++++++++++++++++-
>> 4 files changed, 416 insertions(+), 3 deletions(-)
>>
>> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
>> index 90fe7fe..dbed45d 100644
>> --- a/include/linux/memcontrol.h
>> +++ b/include/linux/memcontrol.h
>> @@ -127,6 +127,12 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
>> gfp_t gfp_mask);
>> u64 mem_cgroup_get_limit(struct mem_cgroup *mem);
>>
>> +void mem_cgroup_clear_unreclaimable(struct page *page, struct zone *zone);
>> +bool mem_cgroup_zone_reclaimable(struct mem_cgroup *mem, int nid, int zid);
>> +bool mem_cgroup_mz_unreclaimable(struct mem_cgroup *mem, struct zone *zone);
>> +void mem_cgroup_mz_set_unreclaimable(struct mem_cgroup *mem, struct zone *zone);
>> +void mem_cgroup_mz_pages_scanned(struct mem_cgroup *mem, struct zone* zone,
>> + unsigned long nr_scanned);
>> #else /* CONFIG_CGROUP_MEM_RES_CTLR */
>> struct mem_cgroup;
>>
>> @@ -299,6 +305,25 @@ static inline void mem_cgroup_update_file_mapped(struct page *page,
>> {
>> }
>>
>> +static inline void mem_cgroup_mz_pages_scanned(struct mem_cgroup *mem,
>> + struct zone *zone,
>> + unsigned long nr_scanned)
>> +{
>> +}
>> +
>> +static inline void mem_cgroup_clear_unreclaimable(struct page *page,
>> + struct zone *zone)
>> +{
>> +}
>> +static inline void mem_cgroup_mz_set_unreclaimable(struct mem_cgroup *mem,
>> + struct zone *zone)
>> +{
>> +}
>> +static inline bool mem_cgroup_mz_unreclaimable(struct mem_cgroup *mem,
>> + struct zone *zone)
>> +{
>> +}
>> +
>> static inline
>> unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
>> gfp_t gfp_mask)
>> @@ -312,6 +337,11 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *mem)
>> return 0;
>> }
>>
>> +static inline bool mem_cgroup_zone_reclaimable(struct mem_cgroup *mem, int nid,
>> + int zid)
>> +{
>> + return false;
>> +}
>> #endif /* CONFIG_CGROUP_MEM_CONT */
>>
>> #endif /* _LINUX_MEMCONTROL_H */
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index a0c6ed9..1d39b65 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -48,6 +48,8 @@
>> #include <linux/page_cgroup.h>
>> #include <linux/cpu.h>
>> #include <linux/oom.h>
>> +#include <linux/kthread.h>
>> +
>> #include "internal.h"
>>
>> #include <asm/uaccess.h>
>> @@ -118,7 +120,10 @@ struct mem_cgroup_per_zone {
>> bool on_tree;
>> struct mem_cgroup *mem; /* Back pointer, we cannot */
>> /* use container_of */
>> + unsigned long pages_scanned; /* since last reclaim */
>> + int all_unreclaimable; /* All pages pinned */
>> };
>> +
>> /* Macro for accessing counter */
>> #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
>>
>> @@ -372,6 +377,7 @@ static void mem_cgroup_put(struct mem_cgroup *mem);
>> static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
>> static void drain_all_stock_async(void);
>> static unsigned long get_min_free_kbytes(struct mem_cgroup *mem);
>> +static inline void wake_memcg_kswapd(struct mem_cgroup *mem);
>>
>> static struct mem_cgroup_per_zone *
>> mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
>> @@ -1086,6 +1092,106 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
>> return &mz->reclaim_stat;
>> }
>>
>> +unsigned long mem_cgroup_zone_reclaimable_pages(
>> + struct mem_cgroup_per_zone *mz)
>> +{
>> + int nr;
>> + nr = MEM_CGROUP_ZSTAT(mz, LRU_ACTIVE_FILE) +
>> + MEM_CGROUP_ZSTAT(mz, LRU_INACTIVE_FILE);
>> +
>> + if (nr_swap_pages > 0)
>> + nr += MEM_CGROUP_ZSTAT(mz, LRU_ACTIVE_ANON) +
>> + MEM_CGROUP_ZSTAT(mz, LRU_INACTIVE_ANON);
>> +
>> + return nr;
>> +}
>> +
>> +void mem_cgroup_mz_pages_scanned(struct mem_cgroup *mem, struct zone* zone,
>> + unsigned long nr_scanned)
>> +{
>> + struct mem_cgroup_per_zone *mz = NULL;
>> + int nid = zone_to_nid(zone);
>> + int zid = zone_idx(zone);
>> +
>> + if (!mem)
>> + return;
>> +
>> + mz = mem_cgroup_zoneinfo(mem, nid, zid);
>> + if (mz)
>> + mz->pages_scanned += nr_scanned;
>> +}
>> +
>> +bool mem_cgroup_zone_reclaimable(struct mem_cgroup *mem, int nid, int zid)
>> +{
>> + struct mem_cgroup_per_zone *mz = NULL;
>> +
>> + if (!mem)
>> + return 0;
>> +
>> + mz = mem_cgroup_zoneinfo(mem, nid, zid);
>> + if (mz)
>> + return mz->pages_scanned <
>> + mem_cgroup_zone_reclaimable_pages(mz) * 6;
>> + return 0;
>> +}
>> +
>> +bool mem_cgroup_mz_unreclaimable(struct mem_cgroup *mem, struct zone *zone)
>> +{
>> + struct mem_cgroup_per_zone *mz = NULL;
>> + int nid = zone_to_nid(zone);
>> + int zid = zone_idx(zone);
>> +
>> + if (!mem)
>> + return 0;
>> +
>> + mz = mem_cgroup_zoneinfo(mem, nid, zid);
>> + if (mz)
>> + return mz->all_unreclaimable;
>> +
>> + return 0;
>> +}
>> +
>> +void mem_cgroup_mz_set_unreclaimable(struct mem_cgroup *mem, struct zone *zone)
>> +{
>> + struct mem_cgroup_per_zone *mz = NULL;
>> + int nid = zone_to_nid(zone);
>> + int zid = zone_idx(zone);
>> +
>> + if (!mem)
>> + return;
>> +
>> + mz = mem_cgroup_zoneinfo(mem, nid, zid);
>> + if (mz)
>> + mz->all_unreclaimable = 1;
>> +}
>> +
>> +void mem_cgroup_clear_unreclaimable(struct page *page, struct zone *zone)
>> +{
>> + struct mem_cgroup_per_zone *mz = NULL;
>> + struct mem_cgroup *mem = NULL;
>> + int nid = zone_to_nid(zone);
>> + int zid = zone_idx(zone);
>> + struct page_cgroup *pc = lookup_page_cgroup(page);
>> +
>> + if (unlikely(!pc))
>> + return;
>> +
>> + rcu_read_lock();
>> + mem = pc->mem_cgroup;
>
> This is incorrect. You have to do css_tryget(&mem->css) before rcu_read_unlock.
>
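For reference, the suggested pattern would look roughly like this (an
illustrative sketch, not final code): take a reference before leaving the RCU
section so the memcg cannot be freed while we touch its per-zone state, and
drop it when done.

        rcu_read_lock();
        mem = pc->mem_cgroup;
        if (!mem || !css_tryget(&mem->css)) {
                rcu_read_unlock();
                return;
        }
        rcu_read_unlock();

        mz = mem_cgroup_zoneinfo(mem, nid, zid);
        if (mz) {
                mz->pages_scanned = 0;
                mz->all_unreclaimable = 0;
        }
        css_put(&mem->css);
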
>> + rcu_read_unlock();
>> +
>> + if (!mem)
>> + return;
>> +
>> + mz = mem_cgroup_zoneinfo(mem, nid, zid);
>> + if (mz) {
>> + mz->pages_scanned = 0;
>> + mz->all_unreclaimable = 0;
>> + }
>> +
>> + return;
>> +}
>> +
>> unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
>> struct list_head *dst,
>> unsigned long *scanned, int order,
>> @@ -1887,6 +1993,20 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
>> struct res_counter *fail_res;
>> unsigned long flags = 0;
>> int ret;
>> + unsigned long min_free_kbytes = 0;
>> +
>> + min_free_kbytes = get_min_free_kbytes(mem);
>> + if (min_free_kbytes) {
>> + ret = res_counter_charge(&mem->res, csize, CHARGE_WMARK_LOW,
>> + &fail_res);
>> + if (likely(!ret)) {
>> + return CHARGE_OK;
>> + } else {
>> + mem_over_limit = mem_cgroup_from_res_counter(fail_res,
>> + res);
>> + wake_memcg_kswapd(mem_over_limit);
>> + }
>> + }
>
> I think this check can be moved out to a periodic check, using threshold notifiers.
Yes. This will be changed in V2.
>
>
>
>>
>> ret = res_counter_charge(&mem->res, csize, CHARGE_WMARK_MIN, &fail_res);
>>
>> @@ -3037,6 +3157,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
>> else
>> memcg->memsw_is_minimum = false;
>> }
>> + setup_per_memcg_wmarks(memcg);
>> mutex_unlock(&set_limit_mutex);
>>
>> if (!ret)
>> @@ -3046,7 +3167,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
>> MEM_CGROUP_RECLAIM_SHRINK);
>> curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
>> /* Usage is reduced ? */
>> - if (curusage >= oldusage)
>> + if (curusage >= oldusage)
>> retry_count--;
>> else
>> oldusage = curusage;
>
> What's changed here?
>
>> @@ -3096,6 +3217,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
>> else
>> memcg->memsw_is_minimum = false;
>> }
>> + setup_per_memcg_wmarks(memcg);
>> mutex_unlock(&set_limit_mutex);
>>
>> if (!ret)
>> @@ -4352,6 +4474,8 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
>> static void __mem_cgroup_free(struct mem_cgroup *mem)
>> {
>> int node;
>> + struct kswapd *kswapd_p;
>> + wait_queue_head_t *wait;
>>
>> mem_cgroup_remove_from_trees(mem);
>> free_css_id(&mem_cgroup_subsys, &mem->css);
>> @@ -4360,6 +4484,15 @@ static void __mem_cgroup_free(struct mem_cgroup *mem)
>> free_mem_cgroup_per_zone_info(mem, node);
>>
>> free_percpu(mem->stat);
>> +
>> + wait = mem->kswapd_wait;
>> + kswapd_p = container_of(wait, struct kswapd, kswapd_wait);
>> + if (kswapd_p) {
>> + if (kswapd_p->kswapd_task)
>> + kthread_stop(kswapd_p->kswapd_task);
>> + kfree(kswapd_p);
>> + }
>> +
>> if (sizeof(struct mem_cgroup) < PAGE_SIZE)
>> kfree(mem);
>> else
>> @@ -4421,6 +4554,39 @@ int mem_cgroup_watermark_ok(struct mem_cgroup *mem,
>> return ret;
>> }
>>
>> +static inline
>> +void wake_memcg_kswapd(struct mem_cgroup *mem)
>> +{
>> + wait_queue_head_t *wait;
>> + struct kswapd *kswapd_p;
>> + struct task_struct *thr;
>> + static char memcg_name[PATH_MAX];
>> +
>> + if (!mem)
>> + return;
>> +
>> + wait = mem->kswapd_wait;
>> + kswapd_p = container_of(wait, struct kswapd, kswapd_wait);
>> + if (!kswapd_p->kswapd_task) {
>> + if (mem->css.cgroup)
>> + cgroup_path(mem->css.cgroup, memcg_name, PATH_MAX);
>> + else
>> + sprintf(memcg_name, "no_name");
>> +
>> + thr = kthread_run(kswapd, kswapd_p, "kswapd%s", memcg_name);
>
> I don't think reusing the name "kswapd" is a good idea, and the name cannot
> be as long as PATH_MAX... IIUC, it goes into the comm[] field, which is
> 16 bytes long.
>
> So, how about naming this as
>
> "memcg%d", mem->css.id ?
>
> Exporting css.id will be okay if necessary.
I am not sure that will work, since mem->css hasn't been initialized yet
during mem_cgroup_create(). That is one of the reasons I create the kswapd
thread when the watermark is first triggered rather than at cgroup creation
time: by then, all of that information is available.
However, I agree that doing it at cgroup creation is better from a
performance perspective, since we avoid adding overhead to the page
allocation path (even though it is only on the first watermark trigger). Any
suggestions?
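For reference, my reading of the suggested naming is something like the line
below (illustrative only, and it assumes css_id() is usable by the time the
thread is created):

        /* "memcg_<id>" fits easily in the 16-byte task comm[] field. */
        thr = kthread_run(kswapd, kswapd_p, "memcg_%d", css_id(&mem->css));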
--Ying
>
>
>
>> + if (IS_ERR(thr))
>> + printk(KERN_INFO "Failed to start kswapd on memcg %d\n",
>> + 0);
>> + else
>> + kswapd_p->kswapd_task = thr;
>> + }
>
> Hmm, ok, then, kswapd-for-memcg is created when someone goes over the watermark
Thread overview: 52+ messages
2010-11-30 6:49 [RFC][PATCH 0/4] memcg: per cgroup background reclaim Ying Han
2010-11-30 6:49 ` [PATCH 1/4] Add kswapd descriptor Ying Han
2010-11-30 7:08 ` KAMEZAWA Hiroyuki
2010-11-30 8:15 ` Minchan Kim
2010-11-30 8:27 ` KAMEZAWA Hiroyuki
2010-11-30 8:54 ` KAMEZAWA Hiroyuki
2010-11-30 20:40 ` Ying Han
2010-11-30 23:46 ` KAMEZAWA Hiroyuki
2010-12-07 6:15 ` Balbir Singh
2010-12-07 6:24 ` KAMEZAWA Hiroyuki
2010-12-07 6:59 ` Balbir Singh
2010-12-07 8:00 ` KAMEZAWA Hiroyuki
2010-11-30 20:26 ` Ying Han
2010-11-30 20:17 ` Ying Han
2010-12-01 0:12 ` KAMEZAWA Hiroyuki
2010-12-07 6:52 ` Balbir Singh
2010-12-07 19:21 ` Ying Han
2010-12-07 12:33 ` Mel Gorman
2010-12-07 17:28 ` Ying Han
2010-12-08 0:39 ` KAMEZAWA Hiroyuki
2010-12-08 1:24 ` Ying Han
2010-12-08 1:28 ` KAMEZAWA Hiroyuki
2010-12-08 2:10 ` Ying Han
2010-12-08 2:13 ` KAMEZAWA Hiroyuki
2010-12-08 12:19 ` Mel Gorman
2010-12-08 7:21 ` KOSAKI Motohiro
2010-12-07 18:50 ` Ying Han
2010-12-08 7:22 ` KOSAKI Motohiro
2010-12-08 7:37 ` KAMEZAWA Hiroyuki
2010-11-30 6:49 ` [PATCH 2/4] Add per cgroup reclaim watermarks Ying Han
2010-11-30 7:21 ` KAMEZAWA Hiroyuki
2010-11-30 20:44 ` Ying Han
2010-12-01 0:27 ` KAMEZAWA Hiroyuki
2010-12-07 14:56 ` Mel Gorman
2010-11-30 6:49 ` [PATCH 3/4] Per cgroup background reclaim Ying Han
2010-11-30 7:51 ` KAMEZAWA Hiroyuki
2010-11-30 8:07 ` KAMEZAWA Hiroyuki
2010-11-30 22:01 ` Ying Han
2010-11-30 22:00 ` Ying Han
2010-12-07 2:25 ` Ying Han [this message]
2010-12-07 5:21 ` KAMEZAWA Hiroyuki
2010-12-01 2:18 ` KOSAKI Motohiro
2010-12-01 2:16 ` KAMEZAWA Hiroyuki
2010-11-30 6:49 ` [PATCH 4/4] Add more per memcg stats Ying Han
2010-11-30 7:53 ` KAMEZAWA Hiroyuki
2010-11-30 18:22 ` Ying Han
2010-11-30 6:54 ` [RFC][PATCH 0/4] memcg: per cgroup background reclaim KOSAKI Motohiro
2010-11-30 7:03 ` Ying Han
2010-12-02 14:41 ` Balbir Singh
2010-12-07 2:29 ` Ying Han
2010-11-30 7:00 ` KAMEZAWA Hiroyuki
2010-11-30 9:05 ` Ying Han