[RFC][PATCH 3/4] Memory controller soft limit organize cgroups

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Balbir Singh <balbir@linux.vnet.ibm.com>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: Sudhir Kumar <skumar@linux.vnet.ibm.com>,
	YAMAMOTO Takashi <yamamoto@valinux.co.jp>,
	Paul Menage <menage@google.com>,
	lizf@cn.fujitsu.com, linux-kernel@vger.kernel.org,
	linux-mm@kvack.org, David Rientjes <rientjes@google.com>,
	Pavel Emelianov <xemul@openvz.org>,
	Balbir Singh <balbir@linux.vnet.ibm.com>,
	KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Subject: [RFC][PATCH 3/4] Memory controller soft limit organize cgroups
Date: Thu, 08 Jan 2009 00:11:28 +0530	[thread overview]
Message-ID: <20090107184128.18062.96016.sendpatchset@localhost.localdomain> (raw)
In-Reply-To: <20090107184110.18062.41459.sendpatchset@localhost.localdomain>

From: Balbir Singh <balbir@linux.vnet.ibm.com>

This patch introduces a RB-Tree for storing memory cgroups that are over their
soft limit. The overall goal is to

1. Add a memory cgroup to the RB-Tree when the soft limit is exceeded.
   We are careful about updates, updates take place only after a particular
   time interval has passed
2. We remove the node from the RB-Tree when the usage goes below the soft
   limit

The next set of patches will exploit the RB-Tree to get the group that is
over its soft limit by the largest amount and reclaim from it, when we
face memory contention.

Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
---

 mm/memcontrol.c |   78 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 78 insertions(+)

diff -puN mm/memcontrol.c~memcg-organize-over-soft-limit-groups mm/memcontrol.c
--- a/mm/memcontrol.c~memcg-organize-over-soft-limit-groups
+++ a/mm/memcontrol.c
@@ -28,6 +28,7 @@
 #include <linux/bit_spinlock.h>
 #include <linux/rcupdate.h>
 #include <linux/mutex.h>
+#include <linux/rbtree.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/spinlock.h>
@@ -119,6 +120,13 @@ struct mem_cgroup_lru_info {
 };
 
 /*
+ * Cgroups above their limits are maintained in a RB-Tree, independent of
+ * their hierarchy representation
+ */
+static struct rb_root mem_cgroup_soft_limit_exceeded_groups;
+static DEFINE_MUTEX(memcg_soft_limit_tree_mutex);
+
+/*
  * The memory controller data structure. The memory controller controls both
  * page cache and RSS per cgroup. We would eventually like to provide
  * statistics based on the statistics developed by Rik Van Riel for clock-pro,
@@ -166,12 +174,18 @@ struct mem_cgroup {
 
 	unsigned int	swappiness;
 
+	struct rb_node mem_cgroup_node;
+	unsigned long long usage_in_excess;
+	unsigned long last_tree_update;
+
 	/*
 	 * statistics. This must be placed at the end of memcg.
 	 */
 	struct mem_cgroup_stat stat;
 };
 
+#define	MEM_CGROUP_TREE_UPDATE_INTERVAL		(HZ)
+
 enum charge_type {
 	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
 	MEM_CGROUP_CHARGE_TYPE_MAPPED,
@@ -203,6 +217,39 @@ pcg_default_flags[NR_CHARGE_TYPE] = {
 static void mem_cgroup_get(struct mem_cgroup *mem);
 static void mem_cgroup_put(struct mem_cgroup *mem);
 
+static void mem_cgroup_insert_exceeded(struct mem_cgroup *mem)
+{
+	struct rb_node **p = &mem_cgroup_soft_limit_exceeded_groups.rb_node;
+	struct rb_node *parent = NULL;
+	struct mem_cgroup *mem_node;
+
+	mutex_lock(&memcg_soft_limit_tree_mutex);
+	while (*p) {
+		parent = *p;
+		mem_node = rb_entry(parent, struct mem_cgroup, mem_cgroup_node);
+		if (mem->usage_in_excess < mem_node->usage_in_excess)
+			p = &(*p)->rb_left;
+		/*
+		 * We can't avoid mem cgroups that are over their soft
+		 * limit by the same amount
+		 */
+		else if (mem->usage_in_excess >= mem_node->usage_in_excess)
+			p = &(*p)->rb_right;
+	}
+	rb_link_node(&mem->mem_cgroup_node, parent, p);
+	rb_insert_color(&mem->mem_cgroup_node,
+			&mem_cgroup_soft_limit_exceeded_groups);
+	mem->last_tree_update = jiffies;
+	mutex_unlock(&memcg_soft_limit_tree_mutex);
+}
+
+static void mem_cgroup_remove_exceeded(struct mem_cgroup *mem)
+{
+	mutex_lock(&memcg_soft_limit_tree_mutex);
+	rb_erase(&mem->mem_cgroup_node, &mem_cgroup_soft_limit_exceeded_groups);
+	mutex_unlock(&memcg_soft_limit_tree_mutex);
+}
+
 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
 					 struct page_cgroup *pc,
 					 bool charge)
@@ -917,6 +964,10 @@ static void __mem_cgroup_commit_charge(s
 				     struct page_cgroup *pc,
 				     enum charge_type ctype)
 {
+	unsigned long long prev_usage_in_excess, new_usage_in_excess;
+	bool updated_tree = false;
+	unsigned long next_update;
+
 	/* try_charge() can return NULL to *memcg, taking care of it. */
 	if (!mem)
 		return;
@@ -937,6 +988,30 @@ static void __mem_cgroup_commit_charge(s
 	mem_cgroup_charge_statistics(mem, pc, true);
 
 	unlock_page_cgroup(pc);
+
+	mem_cgroup_get(mem);
+	prev_usage_in_excess = mem->usage_in_excess;
+	new_usage_in_excess = res_counter_soft_limit_excess(&mem->res);
+
+	next_update = mem->last_tree_update + MEM_CGROUP_TREE_UPDATE_INTERVAL;
+	if (new_usage_in_excess && time_after(jiffies, next_update)) {
+		if (prev_usage_in_excess)
+			mem_cgroup_remove_exceeded(mem);
+		mem_cgroup_insert_exceeded(mem);
+		updated_tree = true;
+	} else if (prev_usage_in_excess && !new_usage_in_excess) {
+		mem_cgroup_remove_exceeded(mem);
+		updated_tree = true;
+	}
+
+	if (updated_tree) {
+		mutex_lock(&memcg_soft_limit_tree_mutex);
+		mem->last_tree_update = jiffies;
+		mem->usage_in_excess = new_usage_in_excess;
+		mutex_unlock(&memcg_soft_limit_tree_mutex);
+	}
+	mem_cgroup_put(mem);
+
 }
 
 /**
@@ -2218,6 +2293,7 @@ mem_cgroup_create(struct cgroup_subsys *
 	if (cont->parent == NULL) {
 		enable_swap_cgroup();
 		parent = NULL;
+		mem_cgroup_soft_limit_exceeded_groups = RB_ROOT;
 	} else {
 		parent = mem_cgroup_from_cont(cont->parent);
 		mem->use_hierarchy = parent->use_hierarchy;
@@ -2231,6 +2307,8 @@ mem_cgroup_create(struct cgroup_subsys *
 		res_counter_init(&mem->memsw, NULL);
 	}
 	mem->last_scanned_child = NULL;
+	mem->usage_in_excess = 0;
+	mem->last_tree_update = 0;	/* Yes, time begins at 0 here */
 	spin_lock_init(&mem->reclaim_param_lock);
 
 	if (parent)
_

-- 
	Balbir

WARNING: multiple messages have this Message-ID (diff)

From: Balbir Singh <balbir@linux.vnet.ibm.com>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: Sudhir Kumar <skumar@linux.vnet.ibm.com>,
	YAMAMOTO Takashi <yamamoto@valinux.co.jp>,
	Paul Menage <menage@google.com>,
	lizf@cn.fujitsu.com, linux-kernel@vger.kernel.org,
	linux-mm@kvack.org, David Rientjes <rientjes@google.com>,
	Pavel Emelianov <xemul@openvz.org>,
	Balbir Singh <balbir@linux.vnet.ibm.com>,
	KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Subject: [RFC][PATCH 3/4] Memory controller soft limit organize cgroups
Date: Thu, 08 Jan 2009 00:11:28 +0530	[thread overview]
Message-ID: <20090107184128.18062.96016.sendpatchset@localhost.localdomain> (raw)
In-Reply-To: <20090107184110.18062.41459.sendpatchset@localhost.localdomain>

From: Balbir Singh <balbir@linux.vnet.ibm.com>

This patch introduces a RB-Tree for storing memory cgroups that are over their
soft limit. The overall goal is to

1. Add a memory cgroup to the RB-Tree when the soft limit is exceeded.
   We are careful about updates, updates take place only after a particular
   time interval has passed
2. We remove the node from the RB-Tree when the usage goes below the soft
   limit

The next set of patches will exploit the RB-Tree to get the group that is
over its soft limit by the largest amount and reclaim from it, when we
face memory contention.

Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
---

 mm/memcontrol.c |   78 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 78 insertions(+)

diff -puN mm/memcontrol.c~memcg-organize-over-soft-limit-groups mm/memcontrol.c
--- a/mm/memcontrol.c~memcg-organize-over-soft-limit-groups
+++ a/mm/memcontrol.c
@@ -28,6 +28,7 @@
 #include <linux/bit_spinlock.h>
 #include <linux/rcupdate.h>
 #include <linux/mutex.h>
+#include <linux/rbtree.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/spinlock.h>
@@ -119,6 +120,13 @@ struct mem_cgroup_lru_info {
 };
 
 /*
+ * Cgroups above their limits are maintained in a RB-Tree, independent of
+ * their hierarchy representation
+ */
+static struct rb_root mem_cgroup_soft_limit_exceeded_groups;
+static DEFINE_MUTEX(memcg_soft_limit_tree_mutex);
+
+/*
  * The memory controller data structure. The memory controller controls both
  * page cache and RSS per cgroup. We would eventually like to provide
  * statistics based on the statistics developed by Rik Van Riel for clock-pro,
@@ -166,12 +174,18 @@ struct mem_cgroup {
 
 	unsigned int	swappiness;
 
+	struct rb_node mem_cgroup_node;
+	unsigned long long usage_in_excess;
+	unsigned long last_tree_update;
+
 	/*
 	 * statistics. This must be placed at the end of memcg.
 	 */
 	struct mem_cgroup_stat stat;
 };
 
+#define	MEM_CGROUP_TREE_UPDATE_INTERVAL		(HZ)
+
 enum charge_type {
 	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
 	MEM_CGROUP_CHARGE_TYPE_MAPPED,
@@ -203,6 +217,39 @@ pcg_default_flags[NR_CHARGE_TYPE] = {
 static void mem_cgroup_get(struct mem_cgroup *mem);
 static void mem_cgroup_put(struct mem_cgroup *mem);
 
+static void mem_cgroup_insert_exceeded(struct mem_cgroup *mem)
+{
+	struct rb_node **p = &mem_cgroup_soft_limit_exceeded_groups.rb_node;
+	struct rb_node *parent = NULL;
+	struct mem_cgroup *mem_node;
+
+	mutex_lock(&memcg_soft_limit_tree_mutex);
+	while (*p) {
+		parent = *p;
+		mem_node = rb_entry(parent, struct mem_cgroup, mem_cgroup_node);
+		if (mem->usage_in_excess < mem_node->usage_in_excess)
+			p = &(*p)->rb_left;
+		/*
+		 * We can't avoid mem cgroups that are over their soft
+		 * limit by the same amount
+		 */
+		else if (mem->usage_in_excess >= mem_node->usage_in_excess)
+			p = &(*p)->rb_right;
+	}
+	rb_link_node(&mem->mem_cgroup_node, parent, p);
+	rb_insert_color(&mem->mem_cgroup_node,
+			&mem_cgroup_soft_limit_exceeded_groups);
+	mem->last_tree_update = jiffies;
+	mutex_unlock(&memcg_soft_limit_tree_mutex);
+}
+
+static void mem_cgroup_remove_exceeded(struct mem_cgroup *mem)
+{
+	mutex_lock(&memcg_soft_limit_tree_mutex);
+	rb_erase(&mem->mem_cgroup_node, &mem_cgroup_soft_limit_exceeded_groups);
+	mutex_unlock(&memcg_soft_limit_tree_mutex);
+}
+
 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
 					 struct page_cgroup *pc,
 					 bool charge)
@@ -917,6 +964,10 @@ static void __mem_cgroup_commit_charge(s
 				     struct page_cgroup *pc,
 				     enum charge_type ctype)
 {
+	unsigned long long prev_usage_in_excess, new_usage_in_excess;
+	bool updated_tree = false;
+	unsigned long next_update;
+
 	/* try_charge() can return NULL to *memcg, taking care of it. */
 	if (!mem)
 		return;
@@ -937,6 +988,30 @@ static void __mem_cgroup_commit_charge(s
 	mem_cgroup_charge_statistics(mem, pc, true);
 
 	unlock_page_cgroup(pc);
+
+	mem_cgroup_get(mem);
+	prev_usage_in_excess = mem->usage_in_excess;
+	new_usage_in_excess = res_counter_soft_limit_excess(&mem->res);
+
+	next_update = mem->last_tree_update + MEM_CGROUP_TREE_UPDATE_INTERVAL;
+	if (new_usage_in_excess && time_after(jiffies, next_update)) {
+		if (prev_usage_in_excess)
+			mem_cgroup_remove_exceeded(mem);
+		mem_cgroup_insert_exceeded(mem);
+		updated_tree = true;
+	} else if (prev_usage_in_excess && !new_usage_in_excess) {
+		mem_cgroup_remove_exceeded(mem);
+		updated_tree = true;
+	}
+
+	if (updated_tree) {
+		mutex_lock(&memcg_soft_limit_tree_mutex);
+		mem->last_tree_update = jiffies;
+		mem->usage_in_excess = new_usage_in_excess;
+		mutex_unlock(&memcg_soft_limit_tree_mutex);
+	}
+	mem_cgroup_put(mem);
+
 }
 
 /**
@@ -2218,6 +2293,7 @@ mem_cgroup_create(struct cgroup_subsys *
 	if (cont->parent == NULL) {
 		enable_swap_cgroup();
 		parent = NULL;
+		mem_cgroup_soft_limit_exceeded_groups = RB_ROOT;
 	} else {
 		parent = mem_cgroup_from_cont(cont->parent);
 		mem->use_hierarchy = parent->use_hierarchy;
@@ -2231,6 +2307,8 @@ mem_cgroup_create(struct cgroup_subsys *
 		res_counter_init(&mem->memsw, NULL);
 	}
 	mem->last_scanned_child = NULL;
+	mem->usage_in_excess = 0;
+	mem->last_tree_update = 0;	/* Yes, time begins at 0 here */
 	spin_lock_init(&mem->reclaim_param_lock);
 
 	if (parent)
_

-- 
	Balbir

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

next prev parent reply	other threads:[~2009-01-07 18:42 UTC|newest]

Thread overview: 38+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-01-07 18:41 [RFC][PATCH 0/4] Memory controller soft limit patches Balbir Singh
2009-01-07 18:41 ` Balbir Singh
2009-01-07 18:41 ` [RFC][PATCH 1/4] Memory controller soft limit documentation Balbir Singh
2009-01-07 18:41   ` Balbir Singh
2009-01-14  1:45   ` Paul Menage
2009-01-14  1:45     ` Paul Menage
2009-01-14  5:30     ` Balbir Singh
2009-01-14  5:30       ` Balbir Singh
2009-01-07 18:41 ` [RFC][PATCH 2/4] Memory controller soft limit interface Balbir Singh
2009-01-07 18:41   ` Balbir Singh
2009-01-07 18:41 ` Balbir Singh [this message]
2009-01-07 18:41   ` [RFC][PATCH 3/4] Memory controller soft limit organize cgroups Balbir Singh
2009-01-08  1:11   ` KAMEZAWA Hiroyuki
2009-01-08  1:11     ` KAMEZAWA Hiroyuki
2009-01-08  4:25     ` Balbir Singh
2009-01-08  4:25       ` Balbir Singh
2009-01-08  4:28       ` KAMEZAWA Hiroyuki
2009-01-08  4:28         ` KAMEZAWA Hiroyuki
2009-01-08  4:41         ` Balbir Singh
2009-01-08  4:41           ` Balbir Singh
2009-01-08  4:57           ` KAMEZAWA Hiroyuki
2009-01-08  4:57             ` KAMEZAWA Hiroyuki
2009-01-07 18:41 ` [RFC][PATCH 4/4] Memory controller soft limit reclaim on contention Balbir Singh
2009-01-07 18:41   ` Balbir Singh
2009-01-07 18:56 ` [RFC][PATCH 0/4] Memory controller soft limit patches Dhaval Giani
2009-01-07 18:56   ` Dhaval Giani
2009-01-08  0:37   ` KAMEZAWA Hiroyuki
2009-01-08  0:37     ` KAMEZAWA Hiroyuki
2009-01-08  3:46     ` Balbir Singh
2009-01-08  3:46       ` Balbir Singh
2009-01-08  0:30 ` KAMEZAWA Hiroyuki
2009-01-08  0:30   ` KAMEZAWA Hiroyuki
2009-01-08  3:59   ` Balbir Singh
2009-01-08  3:59     ` Balbir Singh
2009-01-08  4:21     ` KAMEZAWA Hiroyuki
2009-01-08  4:21       ` KAMEZAWA Hiroyuki
2009-01-08  4:41     ` Daisuke Nishimura
2009-01-08  4:41       ` Daisuke Nishimura

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20090107184128.18062.96016.sendpatchset@localhost.localdomain \
    --to=balbir@linux.vnet.ibm.com \
    --cc=akpm@linux-foundation.org \
    --cc=kamezawa.hiroyu@jp.fujitsu.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=lizf@cn.fujitsu.com \
    --cc=menage@google.com \
    --cc=rientjes@google.com \
    --cc=skumar@linux.vnet.ibm.com \
    --cc=xemul@openvz.org \
    --cc=yamamoto@valinux.co.jp \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.