Re: memcg swappiness (Re: memo: mem+swap controller)

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Balbir Singh <balbir@linux.vnet.ibm.com>
To: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
Cc: nishimura@mxp.nes.nec.co.jp, kamezawa.hiroyu@jp.fujitsu.com,
	hugh@veritas.com, linux-mm@kvack.org, menage@google.com,
	akpm@linux-foundation.org
Subject: Re: memcg swappiness (Re: memo: mem+swap controller)
Date: Fri, 01 Aug 2008 10:55:52 +0530	[thread overview]
Message-ID: <48929E60.6050608@linux.vnet.ibm.com> (raw)
In-Reply-To: <20080801050718.A13CE5A64@siro.lan>

YAMAMOTO Takashi wrote:
> hi,
> 
>>>> I do intend to add the swappiness feature soon for control groups.
>>>>
>>> How does it work?
>>> Does it affect global page reclaim?
>>>
>> We have a swappiness parameter in scan_control. Each control group indicates
>> what it wants it swappiness to be when the control group is over it's limit and
>> reclaim kicks in.
> 
> the following is an untested work-in-progress patch i happen to have.
> i'd appreciate it if you take care of it.
> 

Looks very similar to the patch I have. You seemed to have made much more
progress than me, I am yet to look at the recent_* statistics. How are the test
results? Are they close to what you expect?  Some comments below

> 
> Signed-off-by: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
> ---
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index ee1b2fc..7618944 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -47,6 +47,7 @@ extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
>  					int active, int file);
>  extern void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask);
>  int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem);
> +extern int mem_cgroup_swappiness(struct mem_cgroup *mem);
> 
>  extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index fcfa8b4..e1eeb09 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -129,6 +129,7 @@ struct mem_cgroup {
>  	struct mem_cgroup_lru_info info;
> 
>  	int	prev_priority;	/* for recording reclaim priority */
> +	unsigned int swappiness;	/* swappiness */
>  	/*
>  	 * statistics.
>  	 */
> @@ -437,14 +438,11 @@ void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
>  long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone,
>  					int priority, enum lru_list lru)
>  {
> -	long nr_pages;
>  	int nid = zone->zone_pgdat->node_id;
>  	int zid = zone_idx(zone);
>  	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
> 
> -	nr_pages = MEM_CGROUP_ZSTAT(mz, lru);
> -
> -	return (nr_pages >> priority);
> +	return MEM_CGROUP_ZSTAT(mz, lru);
>  }
> 
>  unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
> @@ -963,6 +961,24 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
>  	return 0;
>  }
> 
> +static int mem_cgroup_swappiness_write(struct cgroup *cont, struct cftype *cft,
> +				       u64 val)
> +{
> +	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
> +
> +	if (val > 100)
> +		return -EINVAL;
> +	mem->swappiness = val;
> +	return 0;
> +}
> +
> +static u64 mem_cgroup_swappiness_read(struct cgroup *cont, struct cftype *cft)
> +{
> +	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
> +  
> +	return mem->swappiness;
> +}
> +
>  static struct cftype mem_cgroup_files[] = {
>  	{
>  		.name = "usage_in_bytes",
> @@ -995,8 +1011,21 @@ static struct cftype mem_cgroup_files[] = {
>  		.name = "stat",
>  		.read_map = mem_control_stat_show,
>  	},
> +	{
> +		.name = "swappiness",
> +		.write_u64 = mem_cgroup_swappiness_write,
> +		.read_u64 = mem_cgroup_swappiness_read,
> +	},
>  };
> 
> +/* XXX probably it's better to move try_to_free_mem_cgroup_pages to
> +  memcontrol.c and kill this */
> +int mem_cgroup_swappiness(struct mem_cgroup *mem)
> +{
> +
> +	return mem->swappiness;
> +}
> +
>  static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
>  {
>  	struct mem_cgroup_per_node *pn;
> @@ -1072,6 +1101,8 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
>  			return ERR_PTR(-ENOMEM);
>  	}
> 
> +	mem->swappiness = 60; /* XXX probably should inherit a value from
> +				either parent cgroup or global vm_swappiness */

Precisely, we need hierarchy support to inherit from parent. vm_swappiness is
exported, so we can use it here.

>  	res_counter_init(&mem->res);
> 
>  	for_each_node_state(node, N_POSSIBLE)
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 6527e04..9f2ddbc 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1364,15 +1364,10 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
>  static void get_scan_ratio(struct zone *zone, struct scan_control * sc,
>  					unsigned long *percent)
>  {
> -	unsigned long anon, file, free;
>  	unsigned long anon_prio, file_prio;
>  	unsigned long ap, fp;
> -
> -	anon  = zone_page_state(zone, NR_ACTIVE_ANON) +
> -		zone_page_state(zone, NR_INACTIVE_ANON);
> -	file  = zone_page_state(zone, NR_ACTIVE_FILE) +
> -		zone_page_state(zone, NR_INACTIVE_FILE);
> -	free  = zone_page_state(zone, NR_FREE_PAGES);
> +	unsigned long recent_scanned[2];
> +	unsigned long recent_rotated[2];
> 
>  	/* If we have no swap space, do not bother scanning anon pages. */
>  	if (nr_swap_pages <= 0) {
> @@ -1381,36 +1376,59 @@ static void get_scan_ratio(struct zone *zone, struct scan_control * sc,
>  		return;
>  	}
> 
> -	/* If we have very few page cache pages, force-scan anon pages. */
> -	if (unlikely(file + free <= zone->pages_high)) {
> -		percent[0] = 100;
> -		percent[1] = 0;
> -		return;
> -	}
> +	if (scan_global_lru(sc)) {
> +		unsigned long anon, file, free;
> 
> -	/*
> -         * OK, so we have swap space and a fair amount of page cache
> -         * pages.  We use the recently rotated / recently scanned
> -         * ratios to determine how valuable each cache is.
> -         *
> -         * Because workloads change over time (and to avoid overflow)
> -         * we keep these statistics as a floating average, which ends
> -         * up weighing recent references more than old ones.
> -         *
> -         * anon in [0], file in [1]
> -         */
> -	if (unlikely(zone->recent_scanned[0] > anon / 4)) {
> -		spin_lock_irq(&zone->lru_lock);
> -		zone->recent_scanned[0] /= 2;
> -		zone->recent_rotated[0] /= 2;
> -		spin_unlock_irq(&zone->lru_lock);
> -	}
> +		anon  = zone_page_state(zone, NR_ACTIVE_ANON) +
> +			zone_page_state(zone, NR_INACTIVE_ANON);
> +		file  = zone_page_state(zone, NR_ACTIVE_FILE) +
> +			zone_page_state(zone, NR_INACTIVE_FILE);
> +		free  = zone_page_state(zone, NR_FREE_PAGES);
> 
> -	if (unlikely(zone->recent_scanned[1] > file / 4)) {
> -		spin_lock_irq(&zone->lru_lock);
> -		zone->recent_scanned[1] /= 2;
> -		zone->recent_rotated[1] /= 2;
> -		spin_unlock_irq(&zone->lru_lock);
> +		/*
> +		 * If we have very few page cache pages, force-scan anon pages.
> +		 */
> +		if (unlikely(file + free <= zone->pages_high)) {
> +			percent[0] = 100;
> +			percent[1] = 0;
> +			return;
> +		}
> +
> +		/*
> +		 * OK, so we have swap space and a fair amount of page cache
> +		 * pages.  We use the recently rotated / recently scanned
> +		 * ratios to determine how valuable each cache is.
> +		 *
> +		 * Because workloads change over time (and to avoid overflow)
> +		 * we keep these statistics as a floating average, which ends
> +		 * up weighing recent references more than old ones.
> +		 *
> +		 * anon in [0], file in [1]
> +		 */
> +		if (unlikely(zone->recent_scanned[0] > anon / 4)) {
> +			spin_lock_irq(&zone->lru_lock);
> +			zone->recent_scanned[0] /= 2;
> +			zone->recent_rotated[0] /= 2;
> +			spin_unlock_irq(&zone->lru_lock);
> +		}
> +
> +		if (unlikely(zone->recent_scanned[1] > file / 4)) {
> +			spin_lock_irq(&zone->lru_lock);
> +			zone->recent_scanned[1] /= 2;
> +			zone->recent_rotated[1] /= 2;
> +			spin_unlock_irq(&zone->lru_lock);
> +		}
> +
> +		recent_scanned[0] = zone->recent_scanned[0];
> +		recent_scanned[1] = zone->recent_scanned[1];
> +		recent_rotated[0] = zone->recent_rotated[0];
> +		recent_rotated[1] = zone->recent_rotated[1];
> +	} else {
> +		/* XXX */

OK, so this needs to be completed for swappiness to work.

> +		recent_scanned[0] = 0;
> +		recent_scanned[1] = 0;
> +		recent_rotated[0] = 0;
> +		recent_rotated[1] = 0;
>  	}
> 
>  	/*
> @@ -1425,11 +1443,11 @@ static void get_scan_ratio(struct zone *zone, struct scan_control * sc,
>  	 * %anon = 100 * ----------- / ----------------- * IO cost
>  	 *               anon + file      rotate_sum
>  	 */
> -	ap = (anon_prio + 1) * (zone->recent_scanned[0] + 1);
> -	ap /= zone->recent_rotated[0] + 1;
> +	ap = (anon_prio + 1) * (recent_scanned[0] + 1);
> +	ap /= recent_rotated[0] + 1;
> 
> -	fp = (file_prio + 1) * (zone->recent_scanned[1] + 1);
> -	fp /= zone->recent_rotated[1] + 1;
> +	fp = (file_prio + 1) * (recent_scanned[1] + 1);
> +	fp /= recent_rotated[1] + 1;
> 
>  	/* Normalize to percentages */
>  	percent[0] = 100 * ap / (ap + fp + 1);
> @@ -1452,9 +1470,10 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
>  	get_scan_ratio(zone, sc, percent);
> 
>  	for_each_evictable_lru(l) {
> +		int file = is_file_lru(l);
> +		unsigned long scan;
> +
>  		if (scan_global_lru(sc)) {
> -			int file = is_file_lru(l);
> -			int scan;
>  			/*
>  			 * Add one to nr_to_scan just to make sure that the
>  			 * kernel will slowly sift through each list.
> @@ -1476,8 +1495,13 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
>  			 * but because memory controller hits its limit.
>  			 * Don't modify zone reclaim related data.
>  			 */
> -			nr[l] = mem_cgroup_calc_reclaim(sc->mem_cgroup, zone,
> +			scan = mem_cgroup_calc_reclaim(sc->mem_cgroup, zone,
>  								priority, l);
> +			if (priority) {
> +				scan >>= priority;
> +				scan = (scan * percent[file]) / 100;
> +			}
> +			nr[l] = scan + 1;
>  		}
>  	}
> 
> @@ -1704,7 +1728,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
>  		.may_writepage = !laptop_mode,
>  		.may_swap = 1,
>  		.swap_cluster_max = SWAP_CLUSTER_MAX,
> -		.swappiness = vm_swappiness,
> +		.swappiness = mem_cgroup_swappiness(mem_cont),
>  		.order = 0,
>  		.mem_cgroup = mem_cont,
>  		.isolate_pages = mem_cgroup_isolate_pages,


-- 
	Warm Regards,
	Balbir Singh
	Linux Technology Center
	IBM, ISTL

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

next prev parent reply	other threads:[~2008-08-01  5:24 UTC|newest]

Thread overview: 19+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2008-07-31  1:15 memo: mem+swap controller KAMEZAWA Hiroyuki
2008-07-31  6:18 ` KOSAKI Motohiro
2008-07-31  6:25 ` Daisuke Nishimura
2008-07-31  6:51   ` KAMEZAWA Hiroyuki
2008-07-31 13:03     ` Daisuke Nishimura
2008-07-31 16:31       ` kamezawa.hiroyu
2008-08-01  3:05         ` Daisuke Nishimura
2008-08-01  3:28   ` Balbir Singh
2008-08-01  4:02     ` Daisuke Nishimura
2008-08-01  4:13       ` Balbir Singh
2008-08-01  4:57         ` Daisuke Nishimura
2008-08-01  5:07         ` memcg swappiness (Re: memo: mem+swap controller) YAMAMOTO Takashi
2008-08-01  5:25           ` Balbir Singh [this message]
2008-08-01  6:37             ` YAMAMOTO Takashi
2008-08-01  6:46               ` Balbir Singh
2008-09-09  9:17                 ` YAMAMOTO Takashi
2008-09-09 14:07                   ` Balbir Singh
2008-08-01  3:20 ` memo: mem+swap controller Balbir Singh
2008-08-01  3:45   ` KAMEZAWA Hiroyuki

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=48929E60.6050608@linux.vnet.ibm.com \
    --to=balbir@linux.vnet.ibm.com \
    --cc=akpm@linux-foundation.org \
    --cc=hugh@veritas.com \
    --cc=kamezawa.hiroyu@jp.fujitsu.com \
    --cc=linux-mm@kvack.org \
    --cc=menage@google.com \
    --cc=nishimura@mxp.nes.nec.co.jp \
    --cc=yamamoto@valinux.co.jp \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.