* [PATCH] memcg: implement low limits
@ 2013-02-27 8:02 Roman Gushchin
2013-02-27 8:20 ` Greg Thelen
[not found] ` <8121361952156-UZ+4eo27dRL0t1ZyImeKKJZIWCK8hONu@public.gmane.org>
0 siblings, 2 replies; 10+ messages in thread
From: Roman Gushchin @ 2013-02-27 8:02 UTC (permalink / raw)
To: Johannes Weiner-Arquette, Michal Hocko, bsingharora,
kamezawa.hiroyu, akpm, kosaki.motohiro, Rik van Riel, mel, gregkh,
linux-kernel, cgroups, linux-mm
Hi, all!
I've implemented low limits for memory cgroups. The primary goal was to add an ability
to protect some memory from reclaiming without using mlock(). A kind of "soft mlock()".
I think this patch will be helpful when it's necessary to protect production processes from
memory-wasting backup processes.
--
Low limits for memory cgroup can be used to limit memory pressure on it.
If memory usage of a cgroup is under its low limit, it will not be
affected by global reclaim. If it reaches its low limit from above,
the reclaiming speed will be dropped exponentially.
Low limits don't affect soft reclaim.
Also, it's possible that a cgroup with memory usage under low limit
will be reclaimed slowly on very low scanning priorities.
Signed-off-by: Roman Gushchin <klamm@yandex-team.ru>
---
include/linux/memcontrol.h | 7 +++++
include/linux/res_counter.h | 17 +++++++++++
kernel/res_counter.c | 2 ++
mm/memcontrol.c | 67 +++++++++++++++++++++++++++++++++++++++++++
mm/vmscan.c | 5 ++++
5 files changed, 98 insertions(+)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index d6183f0..33e233f 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -183,6 +183,8 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
gfp_t gfp_mask,
unsigned long *total_scanned);
+unsigned int mem_cgroup_low_limit_scale(struct lruvec *lruvec);
+
void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
static inline void mem_cgroup_count_vm_event(struct mm_struct *mm,
enum vm_event_item idx)
@@ -365,6 +367,11 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
return 0;
}
+static inline unsigned int mem_cgroup_low_limit_scale(struct lruvec *lruvec)
+{
+ return 0;
+}
+
static inline void mem_cgroup_split_huge_fixup(struct page *head)
{
}
diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h
index 5ae8456..df3510d 100644
--- a/include/linux/res_counter.h
+++ b/include/linux/res_counter.h
@@ -39,6 +39,10 @@ struct res_counter {
*/
unsigned long long soft_limit;
/*
+ * the secured guaranteed minimal limit of resource
+ */
+ unsigned long long low_limit;
+ /*
* the number of unsuccessful attempts to consume the resource
*/
unsigned long long failcnt;
@@ -87,6 +91,7 @@ enum {
RES_LIMIT,
RES_FAILCNT,
RES_SOFT_LIMIT,
+ RES_LOW_LIMIT,
};
/*
@@ -223,4 +228,16 @@ res_counter_set_soft_limit(struct res_counter *cnt,
return 0;
}
+static inline int
+res_counter_set_low_limit(struct res_counter *cnt,
+ unsigned long long low_limit)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&cnt->lock, flags);
+ cnt->low_limit = low_limit;
+ spin_unlock_irqrestore(&cnt->lock, flags);
+ return 0;
+}
+
#endif
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index ff55247..ebfefc1 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -135,6 +135,8 @@ res_counter_member(struct res_counter *counter, int member)
return &counter->failcnt;
case RES_SOFT_LIMIT:
return &counter->soft_limit;
+ case RES_LOW_LIMIT:
+ return &counter->low_limit;
};
BUG();
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 53b8201..d8e6ee6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1743,6 +1743,53 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
NULL, "Memory cgroup out of memory");
}
+/*
+ * If a cgroup is under low limit or enough close to it,
+ * decrease speed of page scanning.
+ *
+ * mem_cgroup_low_limit_scale() returns a number
+ * from range [0, DEF_PRIORITY - 2], which is used
+ * in the reclaim code as a scanning priority modifier.
+ *
+ * If the low limit is not set, it returns 0;
+ *
+ * usage - low_limit > usage / 8 => 0
+ * usage - low_limit > usage / 16 => 1
+ * usage - low_limit > usage / 32 => 2
+ * ...
+ * usage - low_limit > usage / (2 ^ DEF_PRIORITY - 3) => DEF_PRIORITY - 3
+ * usage < low_limit => DEF_PRIORITY - 2
+ *
+ */
+unsigned int mem_cgroup_low_limit_scale(struct lruvec *lruvec)
+{
+ struct mem_cgroup_per_zone *mz;
+ struct mem_cgroup *memcg;
+ unsigned long long low_limit;
+ unsigned long long usage;
+ unsigned int i;
+
+ mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
+ memcg = mz->memcg;
+ if (!memcg)
+ return 0;
+
+ low_limit = res_counter_read_u64(&memcg->res, RES_LOW_LIMIT);
+ if (!low_limit)
+ return 0;
+
+ usage = res_counter_read_u64(&memcg->res, RES_USAGE);
+
+ if (usage < low_limit)
+ return DEF_PRIORITY - 2;
+
+ for (i = 0; i < DEF_PRIORITY - 2; i++)
+ if (usage - low_limit > (usage >> (i + 3)))
+ break;
+
+ return i;
+}
+
static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
gfp_t gfp_mask,
unsigned long flags)
@@ -5116,6 +5163,20 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
else
ret = -EINVAL;
break;
+ case RES_LOW_LIMIT:
+ ret = res_counter_memparse_write_strategy(buffer, &val);
+ if (ret)
+ break;
+ /*
+ * For memsw, low limits (as also soft limits, see upper)
+ * are hard to implement in terms of semantics,
+ * for now, we support soft limits for control without swap
+ */
+ if (type == _MEM)
+ ret = res_counter_set_low_limit(&memcg->res, val);
+ else
+ ret = -EINVAL;
+ break;
default:
ret = -EINVAL; /* should be BUG() ? */
break;
@@ -5798,6 +5859,12 @@ static struct cftype mem_cgroup_files[] = {
.read = mem_cgroup_read,
},
{
+ .name = "low_limit_in_bytes",
+ .private = MEMFILE_PRIVATE(_MEM, RES_LOW_LIMIT),
+ .write_string = mem_cgroup_write,
+ .read = mem_cgroup_read,
+ },
+ {
.name = "failcnt",
.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
.trigger = mem_cgroup_reset,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 88c5fed..9c1c702 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1660,6 +1660,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
bool force_scan = false;
unsigned long ap, fp;
enum lru_list lru;
+ unsigned int low_limit_scale = 0;
/*
* If the zone or memcg is small, nr[l] can be 0. This
@@ -1779,6 +1780,9 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
fraction[1] = fp;
denominator = ap + fp + 1;
out:
+ if (global_reclaim(sc))
+ low_limit_scale = mem_cgroup_low_limit_scale(lruvec);
+
for_each_evictable_lru(lru) {
int file = is_file_lru(lru);
unsigned long size;
@@ -1786,6 +1790,7 @@ out:
size = get_lru_size(lruvec, lru);
scan = size >> sc->priority;
+ scan >>= low_limit_scale;
if (!scan && force_scan)
scan = min(size, SWAP_CLUSTER_MAX);
--
1.7.9.5
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related [flat|nested] 10+ messages in thread* Re: [PATCH] memcg: implement low limits
2013-02-27 8:02 [PATCH] memcg: implement low limits Roman Gushchin
@ 2013-02-27 8:20 ` Greg Thelen
[not found] ` <xr93y5eacgmj.fsf-aSPv4SP+Du0KgorLzL7FmE7CuiCeIGUxQQ4Iyu8u01E@public.gmane.org>
[not found] ` <8121361952156-UZ+4eo27dRL0t1ZyImeKKJZIWCK8hONu@public.gmane.org>
1 sibling, 1 reply; 10+ messages in thread
From: Greg Thelen @ 2013-02-27 8:20 UTC (permalink / raw)
To: Roman Gushchin
Cc: Johannes Weiner-Arquette, Michal Hocko, bsingharora,
kamezawa.hiroyu, akpm, kosaki.motohiro, Rik van Riel, mel, gregkh,
linux-kernel, cgroups, linux-mm
On Wed, Feb 27 2013, Roman Gushchin wrote:
> Hi, all!
>
> I've implemented low limits for memory cgroups. The primary goal was to add an ability
> to protect some memory from reclaiming without using mlock(). A kind of "soft mlock()".
>
> I think this patch will be helpful when it's necessary to protect production processes from
> memory-wasting backup processes.
>
> --
>
> Low limits for memory cgroup can be used to limit memory pressure on it.
> If memory usage of a cgroup is under its low limit, it will not be
> affected by global reclaim. If it reaches its low limit from above,
> the reclaiming speed will be dropped exponentially.
>
> Low limits don't affect soft reclaim.
> Also, it's possible that a cgroup with memory usage under low limit
> will be reclaimed slowly on very low scanning priorities.
So the new low limit is not a rigid limit. Global reclaim can reclaim
from a cgroup when its usage is below low_limit_in_bytes although such
reclaim is less aggressive than when usage is above low_limit_in_bytes.
Correct?
Why doesn't memcg reclaim (i.e. !global_reclaim) also consider
low_limit_in_bytes?
Do you have demonstration of how this improves system operation?
Why is soft_limit insufficient?
> Signed-off-by: Roman Gushchin <klamm@yandex-team.ru>
> ---
> include/linux/memcontrol.h | 7 +++++
> include/linux/res_counter.h | 17 +++++++++++
> kernel/res_counter.c | 2 ++
> mm/memcontrol.c | 67 +++++++++++++++++++++++++++++++++++++++++++
> mm/vmscan.c | 5 ++++
> 5 files changed, 98 insertions(+)
Need to update Documentation/cgroups/memory.txt explaining the external
behavior of this new knob and how it interacts with soft_limit_in_bytes.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 10+ messages in thread[parent not found: <8121361952156-UZ+4eo27dRL0t1ZyImeKKJZIWCK8hONu@public.gmane.org>]
* Re: [PATCH] memcg: implement low limits
[not found] ` <8121361952156-UZ+4eo27dRL0t1ZyImeKKJZIWCK8hONu@public.gmane.org>
@ 2013-02-27 9:40 ` Michal Hocko
[not found] ` <20130227094054.GC16719-2MMpYkNvuYDjFM9bn6wA6Q@public.gmane.org>
2013-02-27 14:57 ` Roman Gushchin
0 siblings, 2 replies; 10+ messages in thread
From: Michal Hocko @ 2013-02-27 9:40 UTC (permalink / raw)
To: Roman Gushchin
Cc: Johannes Weiner-Arquette, bsingharora-Re5JQEeQqe8AvxtiuMwx3w,
kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A,
akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
kosaki.motohiro-+CUm20s59erQFUHtdCDX3A, Rik van Riel,
mel-wPRd99KPJ+uzQB+pC5nmwQ,
gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r,
linux-kernel-u79uwXL29TY76Z2rM5mHXA,
cgroups-u79uwXL29TY76Z2rM5mHXA, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
Ying Han
On Wed 27-02-13 12:02:36, Roman Gushchin wrote:
> Hi, all!
>
> I've implemented low limits for memory cgroups. The primary goal was
> to add an ability to protect some memory from reclaiming without using
> mlock(). A kind of "soft mlock()".
Let me restate what I have already mentioned in the private
communication.
We already have soft limit which can be implemented to achieve the
same/similar functionality and in fact this is a long term objective (at
least for me). I hope I will be able to post my code soon. The last post
by Ying Han (cc-ing her) was here:
http://comments.gmane.org/gmane.linux.kernel.mm/83499
To be honest I do not like introduction of a new limit because we have
two already and the situation would get over complicated.
More comments on the code below.
[...]
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 53b8201..d8e6ee6 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1743,6 +1743,53 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
> NULL, "Memory cgroup out of memory");
> }
>
> +/*
> + * If a cgroup is under low limit or enough close to it,
> + * decrease speed of page scanning.
> + *
> + * mem_cgroup_low_limit_scale() returns a number
> + * from range [0, DEF_PRIORITY - 2], which is used
> + * in the reclaim code as a scanning priority modifier.
> + *
> + * If the low limit is not set, it returns 0;
> + *
> + * usage - low_limit > usage / 8 => 0
> + * usage - low_limit > usage / 16 => 1
> + * usage - low_limit > usage / 32 => 2
> + * ...
> + * usage - low_limit > usage / (2 ^ DEF_PRIORITY - 3) => DEF_PRIORITY - 3
> + * usage < low_limit => DEF_PRIORITY - 2
Could you clarify why you have used this calculation. The comment
explains _what_ is done but not _why_ it is done.
It is also strange (and unexplained) that the low limit will work
differently depending on the memcg memory usage - bigger groups have a
bigger chance to be reclaimed even if they are under the limit.
> + *
> + */
> +unsigned int mem_cgroup_low_limit_scale(struct lruvec *lruvec)
> +{
> + struct mem_cgroup_per_zone *mz;
> + struct mem_cgroup *memcg;
> + unsigned long long low_limit;
> + unsigned long long usage;
> + unsigned int i;
> +
> + mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
> + memcg = mz->memcg;
> + if (!memcg)
> + return 0;
> +
> + low_limit = res_counter_read_u64(&memcg->res, RES_LOW_LIMIT);
> + if (!low_limit)
> + return 0;
> +
> + usage = res_counter_read_u64(&memcg->res, RES_USAGE);
> +
> + if (usage < low_limit)
> + return DEF_PRIORITY - 2;
> +
> + for (i = 0; i < DEF_PRIORITY - 2; i++)
> + if (usage - low_limit > (usage >> (i + 3)))
> + break;
why this doesn't depend in the current reclaim priority?
> +
> + return i;
> +}
> +
> static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
> gfp_t gfp_mask,
> unsigned long flags)
[...]
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 88c5fed..9c1c702 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1660,6 +1660,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
> bool force_scan = false;
> unsigned long ap, fp;
> enum lru_list lru;
> + unsigned int low_limit_scale = 0;
>
> /*
> * If the zone or memcg is small, nr[l] can be 0. This
> @@ -1779,6 +1780,9 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
> fraction[1] = fp;
> denominator = ap + fp + 1;
> out:
> + if (global_reclaim(sc))
> + low_limit_scale = mem_cgroup_low_limit_scale(lruvec);
What if the group is reclaimed as a result from parent hitting its
limit?
> +
> for_each_evictable_lru(lru) {
> int file = is_file_lru(lru);
> unsigned long size;
> @@ -1786,6 +1790,7 @@ out:
>
> size = get_lru_size(lruvec, lru);
> scan = size >> sc->priority;
> + scan >>= low_limit_scale;
>
> if (!scan && force_scan)
> scan = min(size, SWAP_CLUSTER_MAX);
Thanks!
--
Michal Hocko
SUSE Labs
^ permalink raw reply [flat|nested] 10+ messages in thread[parent not found: <20130227094054.GC16719-2MMpYkNvuYDjFM9bn6wA6Q@public.gmane.org>]
* Re: [PATCH] memcg: implement low limits
[not found] ` <20130227094054.GC16719-2MMpYkNvuYDjFM9bn6wA6Q@public.gmane.org>
@ 2013-02-27 10:39 ` Roman Gushchin
[not found] ` <17521361961576-UZ+4eo27dRL0t1ZyImeKKJZIWCK8hONu@public.gmane.org>
0 siblings, 1 reply; 10+ messages in thread
From: Roman Gushchin @ 2013-02-27 10:39 UTC (permalink / raw)
To: Michal Hocko
Cc: Johannes Weiner-Arquette,
bsingharora-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org,
kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A@public.gmane.org,
akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org,
kosaki.motohiro-+CUm20s59erQFUHtdCDX3A@public.gmane.org,
Rik van Riel, mel-wPRd99KPJ+uzQB+pC5nmwQ@public.gmane.org,
gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r@public.gmane.org,
linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
cgroups-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
linux-mm-Bw31MaZKKs3YtjvyW6yDsg@public.gmane.org, Ying Han
27.02.2013, 13:41, "Michal Hocko" <mhocko-AlSwsSmVLrQ@public.gmane.org>:
> Let me restate what I have already mentioned in the private
> communication.
>
> We already have soft limit which can be implemented to achieve the
> same/similar functionality and in fact this is a long term objective (at
> least for me). I hope I will be able to post my code soon. The last post
> by Ying Han (cc-ing her) was here:
> http://comments.gmane.org/gmane.linux.kernel.mm/83499
>
> To be honest I do not like introduction of a new limit because we have
> two already and the situation would get over complicated.
I think, there are three different tasks:
1) keeping cgroups below theirs hard limit to avoid direct reclaim (for performance reasons),
2) cgroup's prioritization during global reclaim,
3) granting some amount of memory to a selected cgroup (and protecting it from reclaim without significant reasons)
IMHO, combining them all in one limit will simplify a kernel code, but will also make a user's (or administrator's)
life much more complicated. Introducing low limits can make the situation simpler.
>
> More comments on the code below.
Thank you very much!
I'll address them in an other letter.
--
Regards,
Roman
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH] memcg: implement low limits
2013-02-27 9:40 ` Michal Hocko
[not found] ` <20130227094054.GC16719-2MMpYkNvuYDjFM9bn6wA6Q@public.gmane.org>
@ 2013-02-27 14:57 ` Roman Gushchin
[not found] ` <38951361977052-uV6RMHoE7x/0t1ZyImeKKJZIWCK8hONu@public.gmane.org>
1 sibling, 1 reply; 10+ messages in thread
From: Roman Gushchin @ 2013-02-27 14:57 UTC (permalink / raw)
To: Michal Hocko
Cc: Johannes Weiner-Arquette, bsingharora@gmail.com,
kamezawa.hiroyu@jp.fujitsu.com, akpm@linux-foundation.org,
kosaki.motohiro@jp.fujitsu.com, Rik van Riel, mel@csn.ul.ie,
gregkh@linuxfoundation.org, linux-kernel@vger.kernel.org,
cgroups@vger.kernel.org, linux-mm@kvack.org, Ying Han
[-- Attachment #1: Type: text/plain, Size: 4345 bytes --]
Please find my comments below.
> More comments on the code below.
>
> [...]
>
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index 53b8201..d8e6ee6 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -1743,6 +1743,53 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
>> NULL, "Memory cgroup out of memory");
>> }
>>
>> +/*
>> + * If a cgroup is under low limit or enough close to it,
>> + * decrease speed of page scanning.
>> + *
>> + * mem_cgroup_low_limit_scale() returns a number
>> + * from range [0, DEF_PRIORITY - 2], which is used
>> + * in the reclaim code as a scanning priority modifier.
>> + *
>> + * If the low limit is not set, it returns 0;
>> + *
>> + * usage - low_limit > usage / 8 => 0
>> + * usage - low_limit > usage / 16 => 1
>> + * usage - low_limit > usage / 32 => 2
>> + * ...
>> + * usage - low_limit > usage / (2 ^ DEF_PRIORITY - 3) => DEF_PRIORITY - 3
>> + * usage < low_limit => DEF_PRIORITY - 2
>
> Could you clarify why you have used this calculation. The comment
> explains _what_ is done but not _why_ it is done.
>
> It is also strange (and unexplained) that the low limit will work
> differently depending on the memcg memory usage - bigger groups have a
> bigger chance to be reclaimed even if they are under the limit.
The idea is to decrease scanning speed smoothly.
It's hard to explain why I used exactly these numbers. It's like why DEF_PRIORITY is 12?
Just because it works :). Of course, these numbers are an object for discussion/change.
There is a picture in attachment that illustrates how low limits work:
red line - memory usage of cgroup with low_limit set to 1Gb,
blue line - memory usage of another cgroup, where I ran cat <large file> > /dev/null.
>> + *
>> + */
>> +unsigned int mem_cgroup_low_limit_scale(struct lruvec *lruvec)
>> +{
>> + struct mem_cgroup_per_zone *mz;
>> + struct mem_cgroup *memcg;
>> + unsigned long long low_limit;
>> + unsigned long long usage;
>> + unsigned int i;
>> +
>> + mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
>> + memcg = mz->memcg;
>> + if (!memcg)
>> + return 0;
>> +
>> + low_limit = res_counter_read_u64(&memcg->res, RES_LOW_LIMIT);
>> + if (!low_limit)
>> + return 0;
>> +
>> + usage = res_counter_read_u64(&memcg->res, RES_USAGE);
>> +
>> + if (usage < low_limit)
>> + return DEF_PRIORITY - 2;
>> +
>> + for (i = 0; i < DEF_PRIORITY - 2; i++)
>> + if (usage - low_limit > (usage >> (i + 3)))
>> + break;
>
> why this doesn't depend in the current reclaim priority?
How do you want to use reclaim priority here?
I don't like an idea to start ignoring low limit on some priorities.
In my implementation low_limit_scale just "increases" scanning priority,
but no more than for 10 (DEF_PRIORITY - 2). So, if priority is 0-2,
the reclaim works as if the priority were 10-12, that means "normal" slow reclaim.
>
>> +
>> + return i;
>> +}
>> +
>> static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
>> gfp_t gfp_mask,
>> unsigned long flags)
>
> [...]
>
>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> index 88c5fed..9c1c702 100644
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -1660,6 +1660,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
>> bool force_scan = false;
>> unsigned long ap, fp;
>> enum lru_list lru;
>> + unsigned int low_limit_scale = 0;
>>
>> /*
>> * If the zone or memcg is small, nr[l] can be 0. This
>> @@ -1779,6 +1780,9 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
>> fraction[1] = fp;
>> denominator = ap + fp + 1;
>> out:
>> + if (global_reclaim(sc))
>> + low_limit_scale = mem_cgroup_low_limit_scale(lruvec);
>
> What if the group is reclaimed as a result from parent hitting its
> limit?
For now, low limits will work only for global reclaim. Enabling them for target reclaim will require some additional checks.
I plan to do this as a separate change.
Thank you for your comments!
--
Regards,
Roman
[-- Attachment #2: low_limit_memcg.gif --]
[-- Type: image/gif, Size: 11206 bytes --]
^ permalink raw reply [flat|nested] 10+ messages in thread
end of thread, other threads:[~2013-02-28 14:30 UTC | newest]
Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2013-02-27 8:02 [PATCH] memcg: implement low limits Roman Gushchin
2013-02-27 8:20 ` Greg Thelen
[not found] ` <xr93y5eacgmj.fsf-aSPv4SP+Du0KgorLzL7FmE7CuiCeIGUxQQ4Iyu8u01E@public.gmane.org>
2013-02-27 10:11 ` Roman Gushchin
[not found] ` <8121361952156-UZ+4eo27dRL0t1ZyImeKKJZIWCK8hONu@public.gmane.org>
2013-02-27 9:40 ` Michal Hocko
[not found] ` <20130227094054.GC16719-2MMpYkNvuYDjFM9bn6wA6Q@public.gmane.org>
2013-02-27 10:39 ` Roman Gushchin
[not found] ` <17521361961576-UZ+4eo27dRL0t1ZyImeKKJZIWCK8hONu@public.gmane.org>
2013-02-27 16:13 ` Michal Hocko
[not found] ` <20130227161352.GF16719-2MMpYkNvuYDjFM9bn6wA6Q@public.gmane.org>
2013-02-28 11:13 ` Roman Gushchin
2013-02-28 13:02 ` Michal Hocko
2013-02-27 14:57 ` Roman Gushchin
[not found] ` <38951361977052-uV6RMHoE7x/0t1ZyImeKKJZIWCK8hONu@public.gmane.org>
2013-02-28 14:30 ` Michal Hocko
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox