From: Ying Han <yinghan@google.com>
To: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>,
Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>,
Andrew Morton <akpm@linux-foundation.org>,
Mel Gorman <mel@csn.ul.ie>, Johannes Weiner <hannes@cmpxchg.org>,
Christoph Lameter <cl@linux.com>,
Wu Fengguang <fengguang.wu@intel.com>,
Andi Kleen <ak@linux.intel.com>, Hugh Dickins <hughd@google.com>,
Rik van Riel <riel@redhat.com>,
KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>,
Tejun Heo <tj@kernel.org>,
linux-mm@kvack.org
Subject: Re: [PATCH 3/4] Per cgroup background reclaim.
Date: Mon, 6 Dec 2010 18:25:55 -0800
Message-ID: <AANLkTimm1NrMJ2owLnKbeRe2=p-dHQpJGd7j+A2Sixap@mail.gmail.com>
In-Reply-To: <20101130165142.bff427b0.kamezawa.hiroyu@jp.fujitsu.com>
On Mon, Nov 29, 2010 at 11:51 PM, KAMEZAWA Hiroyuki
<kamezawa.hiroyu@jp.fujitsu.com> wrote:
> On Mon, 29 Nov 2010 22:49:44 -0800
> Ying Han <yinghan@google.com> wrote:
>
>> The current implementation of memcg only supports direct reclaim; this
>> patch adds support for background reclaim. Per cgroup background reclaim
>> is needed because it spreads the memory pressure out over a longer period
>> of time and smooths out system performance.
>>
>> There is a kswapd kernel thread for each memory node. We add a separate
>> kswapd thread for each cgroup. The kswapd thread sleeps on the wait queue
>> headed at the kswapd_wait field of a kswapd descriptor.
>>
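(For readers without patch 1/4 at hand: the kswapd descriptor referred to here
looks roughly like the sketch below. It is based only on the fields actually
used in this patch -- kswapd_task and kswapd_wait -- plus assumed back-pointer
fields for the node and memcg cases; the real definition is in patch 1/4.)

struct kswapd {
        struct task_struct      *kswapd_task;  /* NULL until the thread is started */
        wait_queue_head_t       kswapd_wait;   /* the thread sleeps on this queue */
        pg_data_t               *kswapd_pgdat; /* set for the global per-node kswapd */
        struct mem_cgroup       *kswapd_mem;   /* set for a per-cgroup kswapd */
};
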
>> The kswapd() function is now shared between the global and per cgroup kswapd
>> threads. It is passed the kswapd descriptor, which contains the information
>> for either the node or the cgroup. If it is a per cgroup kswapd thread, the
>> new function balance_mem_cgroup_pgdat is invoked. balance_mem_cgroup_pgdat
>> performs a priority loop similar to global reclaim. In each iteration it
>> invokes balance_pgdat_node, a new function that performs background reclaim
>> per node, for all nodes on the system. After reclaiming each node, it checks
>> mem_cgroup_watermark_ok() and breaks out of the priority loop if that
>> returns true. A per memcg zone will be marked "unreclaimable" if the
>> scanning rate is much greater than the reclaiming rate on the per cgroup
>> LRU. The bit is cleared when a page charged to the cgroup is freed. Kswapd
>> breaks out of the priority loop if all the zones are marked "unreclaimable".
>>
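(To make the control flow above easier to follow, the priority loop is roughly
structured as in the sketch below. This is a simplified illustration only, not
the code from the mm/vmscan.c hunk; balance_pgdat_node()'s signature, the
CHARGE_WMARK_HIGH argument, and the "all zones unreclaimable" helper name are
assumptions made for the example.)

static void balance_mem_cgroup_pgdat(struct mem_cgroup *mem, int order)
{
        int priority, nid;

        for (priority = DEF_PRIORITY; priority >= 0; priority--) {
                /* One background-reclaim pass over every node. */
                for_each_online_node(nid)
                        balance_pgdat_node(NODE_DATA(nid), order,
                                           priority, mem);

                /* Done once usage drops back below the high watermark. */
                if (mem_cgroup_watermark_ok(mem, CHARGE_WMARK_HIGH))
                        break;

                /* Give up if every per-memcg zone is marked unreclaimable. */
                if (mem_cgroup_all_zones_unreclaimable(mem))
                        break;
        }
}
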
>> Signed-off-by: Ying Han <yinghan@google.com>
>> ---
>> include/linux/memcontrol.h | 30 +++++++
>> mm/memcontrol.c | 182 ++++++++++++++++++++++++++++++++++++++-
>> mm/page_alloc.c | 2 +
>> mm/vmscan.c | 205 +++++++++++++++++++++++++++++++++++++++++++-
>> 4 files changed, 416 insertions(+), 3 deletions(-)
>>
>> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
>> index 90fe7fe..dbed45d 100644
>> --- a/include/linux/memcontrol.h
>> +++ b/include/linux/memcontrol.h
>> @@ -127,6 +127,12 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
>> gfp_t gfp_mask);
>> u64 mem_cgroup_get_limit(struct mem_cgroup *mem);
>>
>> +void mem_cgroup_clear_unreclaimable(struct page *page, struct zone *zone);
>> +bool mem_cgroup_zone_reclaimable(struct mem_cgroup *mem, int nid, int zid);
>> +bool mem_cgroup_mz_unreclaimable(struct mem_cgroup *mem, struct zone *zone);
>> +void mem_cgroup_mz_set_unreclaimable(struct mem_cgroup *mem, struct zone *zone);
>> +void mem_cgroup_mz_pages_scanned(struct mem_cgroup *mem, struct zone* zone,
>> + unsigned long nr_scanned);
>> #else /* CONFIG_CGROUP_MEM_RES_CTLR */
>> struct mem_cgroup;
>>
>> @@ -299,6 +305,25 @@ static inline void mem_cgroup_update_file_mapped(struct page *page,
>> {
>> }
>>
>> +static inline void mem_cgroup_mz_pages_scanned(struct mem_cgroup *mem,
>> + struct zone *zone,
>> + unsigned long nr_scanned)
>> +{
>> +}
>> +
>> +static inline void mem_cgroup_clear_unreclaimable(struct page *page,
>> + struct zone *zone)
>> +{
>> +}
>> +static inline void mem_cgroup_mz_set_unreclaimable(struct mem_cgroup *mem,
>> + struct zone *zone)
>> +{
>> +}
>> +static inline bool mem_cgroup_mz_unreclaimable(struct mem_cgroup *mem,
>> + struct zone *zone)
>> +{
>> +}
>> +
>> static inline
>> unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
>> gfp_t gfp_mask)
>> @@ -312,6 +337,11 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *mem)
>> return 0;
>> }
>>
>> +static inline bool mem_cgroup_zone_reclaimable(struct mem_cgroup *mem, int nid,
>> + int zid)
>> +{
>> + return false;
>> +}
>> #endif /* CONFIG_CGROUP_MEM_CONT */
>>
>> #endif /* _LINUX_MEMCONTROL_H */
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index a0c6ed9..1d39b65 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -48,6 +48,8 @@
>> #include <linux/page_cgroup.h>
>> #include <linux/cpu.h>
>> #include <linux/oom.h>
>> +#include <linux/kthread.h>
>> +
>> #include "internal.h"
>>
>> #include <asm/uaccess.h>
>> @@ -118,7 +120,10 @@ struct mem_cgroup_per_zone {
>> bool on_tree;
>> struct mem_cgroup *mem; /* Back pointer, we cannot */
>> /* use container_of */
>> + unsigned long pages_scanned; /* since last reclaim */
>> + int all_unreclaimable; /* All pages pinned */
>> };
>> +
>> /* Macro for accessing counter */
>> #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
>>
>> @@ -372,6 +377,7 @@ static void mem_cgroup_put(struct mem_cgroup *mem);
>> static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
>> static void drain_all_stock_async(void);
>> static unsigned long get_min_free_kbytes(struct mem_cgroup *mem);
>> +static inline void wake_memcg_kswapd(struct mem_cgroup *mem);
>>
>> static struct mem_cgroup_per_zone *
>> mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
>> @@ -1086,6 +1092,106 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
>> return &mz->reclaim_stat;
>> }
>>
>> +unsigned long mem_cgroup_zone_reclaimable_pages(
>> + struct mem_cgroup_per_zone *mz)
>> +{
>> + int nr;
>> + nr = MEM_CGROUP_ZSTAT(mz, LRU_ACTIVE_FILE) +
>> + MEM_CGROUP_ZSTAT(mz, LRU_INACTIVE_FILE);
>> +
>> + if (nr_swap_pages > 0)
>> + nr += MEM_CGROUP_ZSTAT(mz, LRU_ACTIVE_ANON) +
>> + MEM_CGROUP_ZSTAT(mz, LRU_INACTIVE_ANON);
>> +
>> + return nr;
>> +}
>> +
>> +void mem_cgroup_mz_pages_scanned(struct mem_cgroup *mem, struct zone* zone,
>> + unsigned long nr_scanned)
>> +{
>> + struct mem_cgroup_per_zone *mz = NULL;
>> + int nid = zone_to_nid(zone);
>> + int zid = zone_idx(zone);
>> +
>> + if (!mem)
>> + return;
>> +
>> + mz = mem_cgroup_zoneinfo(mem, nid, zid);
>> + if (mz)
>> + mz->pages_scanned += nr_scanned;
>> +}
>> +
>> +bool mem_cgroup_zone_reclaimable(struct mem_cgroup *mem, int nid, int zid)
>> +{
>> + struct mem_cgroup_per_zone *mz = NULL;
>> +
>> + if (!mem)
>> + return 0;
>> +
>> + mz = mem_cgroup_zoneinfo(mem, nid, zid);
>> + if (mz)
>> + return mz->pages_scanned <
>> + mem_cgroup_zone_reclaimable_pages(mz) * 6;
>> + return 0;
>> +}
>> +
>> +bool mem_cgroup_mz_unreclaimable(struct mem_cgroup *mem, struct zone *zone)
>> +{
>> + struct mem_cgroup_per_zone *mz = NULL;
>> + int nid = zone_to_nid(zone);
>> + int zid = zone_idx(zone);
>> +
>> + if (!mem)
>> + return 0;
>> +
>> + mz = mem_cgroup_zoneinfo(mem, nid, zid);
>> + if (mz)
>> + return mz->all_unreclaimable;
>> +
>> + return 0;
>> +}
>> +
>> +void mem_cgroup_mz_set_unreclaimable(struct mem_cgroup *mem, struct zone *zone)
>> +{
>> + struct mem_cgroup_per_zone *mz = NULL;
>> + int nid = zone_to_nid(zone);
>> + int zid = zone_idx(zone);
>> +
>> + if (!mem)
>> + return;
>> +
>> + mz = mem_cgroup_zoneinfo(mem, nid, zid);
>> + if (mz)
>> + mz->all_unreclaimable = 1;
>> +}
>> +
>> +void mem_cgroup_clear_unreclaimable(struct page *page, struct zone *zone)
>> +{
>> + struct mem_cgroup_per_zone *mz = NULL;
>> + struct mem_cgroup *mem = NULL;
>> + int nid = zone_to_nid(zone);
>> + int zid = zone_idx(zone);
>> + struct page_cgroup *pc = lookup_page_cgroup(page);
>> +
>> + if (unlikely(!pc))
>> + return;
>> +
>> + rcu_read_lock();
>> + mem = pc->mem_cgroup;
>
> This is incorrect. You have to do css_tryget(&mem->css) before rcu_read_unlock.
>
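For reference, the suggested pattern would look roughly like this (an
illustrative sketch, not final code): take a reference before leaving the RCU
section so the memcg cannot be freed while we touch its per-zone state, and
drop it when done.

        rcu_read_lock();
        mem = pc->mem_cgroup;
        if (!mem || !css_tryget(&mem->css)) {
                rcu_read_unlock();
                return;
        }
        rcu_read_unlock();

        mz = mem_cgroup_zoneinfo(mem, nid, zid);
        if (mz) {
                mz->pages_scanned = 0;
                mz->all_unreclaimable = 0;
        }
        css_put(&mem->css);
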
>> + rcu_read_unlock();
>> +
>> + if (!mem)
>> + return;
>> +
>> + mz = mem_cgroup_zoneinfo(mem, nid, zid);
>> + if (mz) {
>> + mz->pages_scanned = 0;
>> + mz->all_unreclaimable = 0;
>> + }
>> +
>> + return;
>> +}
>> +
>> unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
>> struct list_head *dst,
>> unsigned long *scanned, int order,
>> @@ -1887,6 +1993,20 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
>> struct res_counter *fail_res;
>> unsigned long flags = 0;
>> int ret;
>> + unsigned long min_free_kbytes = 0;
>> +
>> + min_free_kbytes = get_min_free_kbytes(mem);
>> + if (min_free_kbytes) {
>> + ret = res_counter_charge(&mem->res, csize, CHARGE_WMARK_LOW,
>> + &fail_res);
>> + if (likely(!ret)) {
>> + return CHARGE_OK;
>> + } else {
>> + mem_over_limit = mem_cgroup_from_res_counter(fail_res,
>> + res);
>> + wake_memcg_kswapd(mem_over_limit);
>> + }
>> + }
>
> I think this check can be moved out to a periodic check, using threshold notifiers.
Yes. This will be changed in V2.
>
>
>
>>
>> ret = res_counter_charge(&mem->res, csize, CHARGE_WMARK_MIN, &fail_res);
>>
>> @@ -3037,6 +3157,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
>> else
>> memcg->memsw_is_minimum = false;
>> }
>> + setup_per_memcg_wmarks(memcg);
>> mutex_unlock(&set_limit_mutex);
>>
>> if (!ret)
>> @@ -3046,7 +3167,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
>> MEM_CGROUP_RECLAIM_SHRINK);
>> curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
>> /* Usage is reduced ? */
>> - if (curusage >= oldusage)
>> + if (curusage >= oldusage)
>> retry_count--;
>> else
>> oldusage = curusage;
>
> What's changed here?
>
>> @@ -3096,6 +3217,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
>> else
>> memcg->memsw_is_minimum = false;
>> }
>> + setup_per_memcg_wmarks(memcg);
>> mutex_unlock(&set_limit_mutex);
>>
>> if (!ret)
>> @@ -4352,6 +4474,8 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
>> static void __mem_cgroup_free(struct mem_cgroup *mem)
>> {
>> int node;
>> + struct kswapd *kswapd_p;
>> + wait_queue_head_t *wait;
>>
>> mem_cgroup_remove_from_trees(mem);
>> free_css_id(&mem_cgroup_subsys, &mem->css);
>> @@ -4360,6 +4484,15 @@ static void __mem_cgroup_free(struct mem_cgroup *mem)
>> free_mem_cgroup_per_zone_info(mem, node);
>>
>> free_percpu(mem->stat);
>> +
>> + wait = mem->kswapd_wait;
>> + kswapd_p = container_of(wait, struct kswapd, kswapd_wait);
>> + if (kswapd_p) {
>> + if (kswapd_p->kswapd_task)
>> + kthread_stop(kswapd_p->kswapd_task);
>> + kfree(kswapd_p);
>> + }
>> +
>> if (sizeof(struct mem_cgroup) < PAGE_SIZE)
>> kfree(mem);
>> else
>> @@ -4421,6 +4554,39 @@ int mem_cgroup_watermark_ok(struct mem_cgroup *mem,
>> return ret;
>> }
>>
>> +static inline
>> +void wake_memcg_kswapd(struct mem_cgroup *mem)
>> +{
>> + wait_queue_head_t *wait;
>> + struct kswapd *kswapd_p;
>> + struct task_struct *thr;
>> + static char memcg_name[PATH_MAX];
>> +
>> + if (!mem)
>> + return;
>> +
>> + wait = mem->kswapd_wait;
>> + kswapd_p = container_of(wait, struct kswapd, kswapd_wait);
>> + if (!kswapd_p->kswapd_task) {
>> + if (mem->css.cgroup)
>> + cgroup_path(mem->css.cgroup, memcg_name, PATH_MAX);
>> + else
>> + sprintf(memcg_name, "no_name");
>> +
>> + thr = kthread_run(kswapd, kswapd_p, "kswapd%s", memcg_name);
>
> I don't think reusing the name "kswapd" is a good idea, and the name cannot
> be as long as PATH_MAX... IIUC, it goes into the comm[] field, which is
> 16 bytes long.
>
> So, how about naming this as
>
> "memcg%d", mem->css.id ?
>
> Exporting css.id will be okay if necessary.
I am not sure that will work, since mem->css hasn't been initialized yet
during mem_cgroup_create(). That is one of the reasons I create the kswapd
thread when the watermark is first triggered rather than at cgroup creation
time: by then, all of that information is available.
However, I agree that doing it at cgroup creation is better from a
performance perspective, since we avoid adding overhead to the page
allocation path (even though it is only on the first watermark trigger). Any
suggestions?
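For reference, my reading of the suggested naming is something like the line
below (illustrative only, and it assumes css_id() is usable by the time the
thread is created):

        /* "memcg_<id>" fits easily in the 16-byte task comm[] field. */
        thr = kthread_run(kswapd, kswapd_p, "memcg_%d", css_id(&mem->css));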
--Ying
>
>
>
>> + if (IS_ERR(thr))
>> + printk(KERN_INFO "Failed to start kswapd on memcg %d\n",
>> + 0);
>> + else
>> + kswapd_p->kswapd_task = thr;
>> + }
>
> Hmm, ok, then, kswapd-for-memcg is created when someone goes over the watermark
Thread overview: 52+ messages
2010-11-30 6:49 [RFC][PATCH 0/4] memcg: per cgroup background reclaim Ying Han
2010-11-30 6:49 ` [PATCH 1/4] Add kswapd descriptor Ying Han
2010-11-30 7:08 ` KAMEZAWA Hiroyuki
2010-11-30 8:15 ` Minchan Kim
2010-11-30 8:27 ` KAMEZAWA Hiroyuki
2010-11-30 8:54 ` KAMEZAWA Hiroyuki
2010-11-30 20:40 ` Ying Han
2010-11-30 23:46 ` KAMEZAWA Hiroyuki
2010-12-07 6:15 ` Balbir Singh
2010-12-07 6:24 ` KAMEZAWA Hiroyuki
2010-12-07 6:59 ` Balbir Singh
2010-12-07 8:00 ` KAMEZAWA Hiroyuki
2010-11-30 20:26 ` Ying Han
2010-11-30 20:17 ` Ying Han
2010-12-01 0:12 ` KAMEZAWA Hiroyuki
2010-12-07 6:52 ` Balbir Singh
2010-12-07 19:21 ` Ying Han
2010-12-07 12:33 ` Mel Gorman
2010-12-07 17:28 ` Ying Han
2010-12-08 0:39 ` KAMEZAWA Hiroyuki
2010-12-08 1:24 ` Ying Han
2010-12-08 1:28 ` KAMEZAWA Hiroyuki
2010-12-08 2:10 ` Ying Han
2010-12-08 2:13 ` KAMEZAWA Hiroyuki
2010-12-08 12:19 ` Mel Gorman
2010-12-08 7:21 ` KOSAKI Motohiro
2010-12-07 18:50 ` Ying Han
2010-12-08 7:22 ` KOSAKI Motohiro
2010-12-08 7:37 ` KAMEZAWA Hiroyuki
2010-11-30 6:49 ` [PATCH 2/4] Add per cgroup reclaim watermarks Ying Han
2010-11-30 7:21 ` KAMEZAWA Hiroyuki
2010-11-30 20:44 ` Ying Han
2010-12-01 0:27 ` KAMEZAWA Hiroyuki
2010-12-07 14:56 ` Mel Gorman
2010-11-30 6:49 ` [PATCH 3/4] Per cgroup background reclaim Ying Han
2010-11-30 7:51 ` KAMEZAWA Hiroyuki
2010-11-30 8:07 ` KAMEZAWA Hiroyuki
2010-11-30 22:01 ` Ying Han
2010-11-30 22:00 ` Ying Han
2010-12-07 2:25 ` Ying Han [this message]
2010-12-07 5:21 ` KAMEZAWA Hiroyuki
2010-12-01 2:18 ` KOSAKI Motohiro
2010-12-01 2:16 ` KAMEZAWA Hiroyuki
2010-11-30 6:49 ` [PATCH 4/4] Add more per memcg stats Ying Han
2010-11-30 7:53 ` KAMEZAWA Hiroyuki
2010-11-30 18:22 ` Ying Han
2010-11-30 6:54 ` [RFC][PATCH 0/4] memcg: per cgroup background reclaim KOSAKI Motohiro
2010-11-30 7:03 ` Ying Han
2010-12-02 14:41 ` Balbir Singh
2010-12-07 2:29 ` Ying Han
2010-11-30 7:00 ` KAMEZAWA Hiroyuki
2010-11-30 9:05 ` Ying Han