All of lore.kernel.org
 help / color / mirror / Atom feed
From: Roman Gushchin <guro@fb.com>
To: linux-mm@kvack.org
Cc: Roman Gushchin <guro@fb.com>, Michal Hocko <mhocko@kernel.org>,
	Vladimir Davydov <vdavydov.dev@gmail.com>,
	Johannes Weiner <hannes@cmpxchg.org>,
	David Rientjes <rientjes@google.com>, Tejun Heo <tj@kernel.org>,
	Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>,
	kernel-team@fb.com, cgroups@vger.kernel.org,
	linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: [v4 3/4] mm, oom: introduce oom_priority for memory cgroups
Date: Wed, 26 Jul 2017 14:27:17 +0100	[thread overview]
Message-ID: <20170726132718.14806-4-guro@fb.com> (raw)
In-Reply-To: <20170726132718.14806-1-guro@fb.com>

Introduce a per-memory-cgroup oom_priority setting: an integer number
within the [-10000, 10000] range, which defines the order in which
the OOM killer selects victim memory cgroups.

OOM killer prefers memory cgroups with larger priority if they are
populated with eligible tasks.

The oom_priority value is compared within sibling cgroups.

The root cgroup has the oom_priority 0, which cannot be changed.

Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: kernel-team@fb.com
Cc: cgroups@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: linux-mm@kvack.org
---
 include/linux/memcontrol.h |  3 +++
 mm/memcontrol.c            | 55 ++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 56 insertions(+), 2 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index b21bbb0edc72..d31ac58e08ad 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -206,6 +206,9 @@ struct mem_cgroup {
 	/* cached OOM score */
 	long oom_score;
 
+	/* OOM killer priority */
+	short oom_priority;
+
 	/* handle for "memory.events" */
 	struct cgroup_file events_file;
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ba72d1cf73d0..2c1566995077 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2710,12 +2710,21 @@ static void select_victim_memcg(struct mem_cgroup *root, struct oom_control *oc)
 	for (;;) {
 		struct cgroup_subsys_state *css;
 		struct mem_cgroup *memcg = NULL;
+		short prio = SHRT_MIN;
 		long score = LONG_MIN;
 
 		css_for_each_child(css, &root->css) {
 			struct mem_cgroup *iter = mem_cgroup_from_css(css);
 
-			if (iter->oom_score > score) {
+			if (iter->oom_score == 0)
+				continue;
+
+			if (iter->oom_priority > prio) {
+				memcg = iter;
+				prio = iter->oom_priority;
+				score = iter->oom_score;
+			} else if (iter->oom_priority == prio &&
+				   iter->oom_score > score) {
 				memcg = iter;
 				score = iter->oom_score;
 			}
@@ -2782,7 +2791,15 @@ bool mem_cgroup_select_oom_victim(struct oom_control *oc)
 	 * For system-wide OOMs we should consider tasks in the root cgroup
 	 * with oom_score larger than oc->chosen_points.
 	 */
-	if (!oc->memcg) {
+	if (!oc->memcg && !(oc->chosen_memcg &&
+			    oc->chosen_memcg->oom_priority > 0)) {
+		/*
+		 * Root memcg has priority 0, so if chosen memcg has lower
+		 * priority, any task in root cgroup is preferable.
+		 */
+		if (oc->chosen_memcg && oc->chosen_memcg->oom_priority < 0)
+			oc->chosen_points = 0;
+
 		select_victim_root_cgroup_task(oc);
 
 		if (oc->chosen && oc->chosen_memcg) {
@@ -5373,6 +5390,34 @@ static ssize_t memory_oom_kill_all_tasks_write(struct kernfs_open_file *of,
 	return nbytes;
 }
 
+static int memory_oom_priority_show(struct seq_file *m, void *v)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+
+	seq_printf(m, "%d\n", memcg->oom_priority);
+
+	return 0;
+}
+
+static ssize_t memory_oom_priority_write(struct kernfs_open_file *of,
+				char *buf, size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	int oom_priority;
+	int err;
+
+	err = kstrtoint(strstrip(buf), 0, &oom_priority);
+	if (err)
+		return err;
+
+	if (oom_priority < -10000 || oom_priority > 10000)
+		return -EINVAL;
+
+	memcg->oom_priority = (short)oom_priority;
+
+	return nbytes;
+}
+
 static int memory_events_show(struct seq_file *m, void *v)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
@@ -5499,6 +5544,12 @@ static struct cftype memory_files[] = {
 		.write = memory_oom_kill_all_tasks_write,
 	},
 	{
+		.name = "oom_priority",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = memory_oom_priority_show,
+		.write = memory_oom_priority_write,
+	},
+	{
 		.name = "events",
 		.flags = CFTYPE_NOT_ON_ROOT,
 		.file_offset = offsetof(struct mem_cgroup, events_file),
-- 
2.13.3

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

WARNING: multiple messages have this Message-ID (diff)
From: Roman Gushchin <guro@fb.com>
To: <linux-mm@kvack.org>
Cc: Roman Gushchin <guro@fb.com>, Michal Hocko <mhocko@kernel.org>,
	Vladimir Davydov <vdavydov.dev@gmail.com>,
	Johannes Weiner <hannes@cmpxchg.org>,
	David Rientjes <rientjes@google.com>, Tejun Heo <tj@kernel.org>,
	Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>,
	<kernel-team@fb.com>, <cgroups@vger.kernel.org>,
	<linux-doc@vger.kernel.org>, <linux-kernel@vger.kernel.org>
Subject: [v4 3/4] mm, oom: introduce oom_priority for memory cgroups
Date: Wed, 26 Jul 2017 14:27:17 +0100	[thread overview]
Message-ID: <20170726132718.14806-4-guro@fb.com> (raw)
In-Reply-To: <20170726132718.14806-1-guro@fb.com>

Introduce a per-memory-cgroup oom_priority setting: an integer number
within the [-10000, 10000] range, which defines the order in which
the OOM killer selects victim memory cgroups.

OOM killer prefers memory cgroups with larger priority if they are
populated with eligible tasks.

The oom_priority value is compared within sibling cgroups.

The root cgroup has the oom_priority 0, which cannot be changed.

Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: kernel-team@fb.com
Cc: cgroups@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: linux-mm@kvack.org
---
 include/linux/memcontrol.h |  3 +++
 mm/memcontrol.c            | 55 ++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 56 insertions(+), 2 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index b21bbb0edc72..d31ac58e08ad 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -206,6 +206,9 @@ struct mem_cgroup {
 	/* cached OOM score */
 	long oom_score;
 
+	/* OOM killer priority */
+	short oom_priority;
+
 	/* handle for "memory.events" */
 	struct cgroup_file events_file;
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ba72d1cf73d0..2c1566995077 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2710,12 +2710,21 @@ static void select_victim_memcg(struct mem_cgroup *root, struct oom_control *oc)
 	for (;;) {
 		struct cgroup_subsys_state *css;
 		struct mem_cgroup *memcg = NULL;
+		short prio = SHRT_MIN;
 		long score = LONG_MIN;
 
 		css_for_each_child(css, &root->css) {
 			struct mem_cgroup *iter = mem_cgroup_from_css(css);
 
-			if (iter->oom_score > score) {
+			if (iter->oom_score == 0)
+				continue;
+
+			if (iter->oom_priority > prio) {
+				memcg = iter;
+				prio = iter->oom_priority;
+				score = iter->oom_score;
+			} else if (iter->oom_priority == prio &&
+				   iter->oom_score > score) {
 				memcg = iter;
 				score = iter->oom_score;
 			}
@@ -2782,7 +2791,15 @@ bool mem_cgroup_select_oom_victim(struct oom_control *oc)
 	 * For system-wide OOMs we should consider tasks in the root cgroup
 	 * with oom_score larger than oc->chosen_points.
 	 */
-	if (!oc->memcg) {
+	if (!oc->memcg && !(oc->chosen_memcg &&
+			    oc->chosen_memcg->oom_priority > 0)) {
+		/*
+		 * Root memcg has priority 0, so if chosen memcg has lower
+		 * priority, any task in root cgroup is preferable.
+		 */
+		if (oc->chosen_memcg && oc->chosen_memcg->oom_priority < 0)
+			oc->chosen_points = 0;
+
 		select_victim_root_cgroup_task(oc);
 
 		if (oc->chosen && oc->chosen_memcg) {
@@ -5373,6 +5390,34 @@ static ssize_t memory_oom_kill_all_tasks_write(struct kernfs_open_file *of,
 	return nbytes;
 }
 
+static int memory_oom_priority_show(struct seq_file *m, void *v)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+
+	seq_printf(m, "%d\n", memcg->oom_priority);
+
+	return 0;
+}
+
+static ssize_t memory_oom_priority_write(struct kernfs_open_file *of,
+				char *buf, size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	int oom_priority;
+	int err;
+
+	err = kstrtoint(strstrip(buf), 0, &oom_priority);
+	if (err)
+		return err;
+
+	if (oom_priority < -10000 || oom_priority > 10000)
+		return -EINVAL;
+
+	memcg->oom_priority = (short)oom_priority;
+
+	return nbytes;
+}
+
 static int memory_events_show(struct seq_file *m, void *v)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
@@ -5499,6 +5544,12 @@ static struct cftype memory_files[] = {
 		.write = memory_oom_kill_all_tasks_write,
 	},
 	{
+		.name = "oom_priority",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = memory_oom_priority_show,
+		.write = memory_oom_priority_write,
+	},
+	{
 		.name = "events",
 		.flags = CFTYPE_NOT_ON_ROOT,
 		.file_offset = offsetof(struct mem_cgroup, events_file),
-- 
2.13.3

  parent reply	other threads:[~2017-07-26 13:27 UTC|newest]

Thread overview: 55+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-07-26 13:27 [v4 0/4] cgroup-aware OOM killer Roman Gushchin
2017-07-26 13:27 ` [v4 1/4] mm, oom: refactor the TIF_MEMDIE usage Roman Gushchin
2017-07-26 13:27   ` Roman Gushchin
2017-07-26 13:56   ` Michal Hocko
2017-07-26 13:56     ` Michal Hocko
2017-07-26 14:06     ` Roman Gushchin
2017-07-26 14:06       ` Roman Gushchin
2017-07-26 14:24       ` Michal Hocko
2017-07-26 14:24         ` Michal Hocko
2017-07-26 14:44         ` Michal Hocko
2017-07-26 14:44           ` Michal Hocko
     [not found]           ` <20170726144408.GU2981-2MMpYkNvuYDjFM9bn6wA6Q@public.gmane.org>
2017-07-26 14:50             ` Roman Gushchin
2017-07-26 14:50               ` Roman Gushchin
2017-07-26 14:50               ` Roman Gushchin
2017-07-26 13:27 ` [v4 2/4] mm, oom: cgroup-aware OOM killer Roman Gushchin
2017-07-26 13:27   ` Roman Gushchin
     [not found]   ` <20170726132718.14806-3-guro-b10kYP2dOMg@public.gmane.org>
2017-07-27 21:41     ` kbuild test robot
2017-07-27 21:41       ` kbuild test robot
2017-07-27 21:41       ` kbuild test robot
2017-08-01 14:54   ` Michal Hocko
2017-08-01 14:54     ` Michal Hocko
2017-08-01 15:25     ` Roman Gushchin
2017-08-01 15:25       ` Roman Gushchin
2017-08-01 17:03       ` Michal Hocko
2017-08-01 17:03         ` Michal Hocko
2017-08-01 18:13         ` Roman Gushchin
2017-08-01 18:13           ` Roman Gushchin
2017-08-01 18:13           ` Roman Gushchin
2017-08-02  7:29           ` Michal Hocko
2017-08-02  7:29             ` Michal Hocko
2017-08-03 12:47             ` Roman Gushchin
2017-08-03 12:47               ` Roman Gushchin
     [not found]               ` <20170803124751.GA24563-2xczL/1GIl5a1dPMsufgnw2O0Ztt9esIQQ4Iyu8u01E@public.gmane.org>
2017-08-03 13:01                 ` Michal Hocko
2017-08-03 13:01                   ` Michal Hocko
2017-08-03 13:01                   ` Michal Hocko
2017-08-08 23:06       ` David Rientjes
2017-08-08 23:06         ` David Rientjes
     [not found]         ` <alpine.DEB.2.10.1708081559001.54505-X6Q0R45D7oAcqpCFd4KODRPsWskHk0ljAL8bYrjMMd8@public.gmane.org>
2017-08-14 12:03           ` Roman Gushchin
2017-08-14 12:03             ` Roman Gushchin
2017-08-14 12:03             ` Roman Gushchin
2017-07-26 13:27 ` Roman Gushchin [this message]
2017-07-26 13:27   ` [v4 3/4] mm, oom: introduce oom_priority for memory cgroups Roman Gushchin
     [not found]   ` <20170726132718.14806-4-guro-b10kYP2dOMg@public.gmane.org>
2017-08-08 23:14     ` David Rientjes
2017-08-08 23:14       ` David Rientjes
2017-08-08 23:14       ` David Rientjes
2017-08-14 12:39       ` Roman Gushchin
2017-08-14 12:39         ` Roman Gushchin
2017-07-26 13:27 ` [v4 4/4] mm, oom, docs: describe the cgroup-aware OOM killer Roman Gushchin
2017-07-26 13:27   ` Roman Gushchin
     [not found]   ` <20170726132718.14806-5-guro-b10kYP2dOMg@public.gmane.org>
2017-08-08 23:24     ` David Rientjes
2017-08-08 23:24       ` David Rientjes
2017-08-08 23:24       ` David Rientjes
2017-08-14 12:28       ` Roman Gushchin
2017-08-14 12:28         ` Roman Gushchin
2017-08-14 12:28         ` Roman Gushchin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20170726132718.14806-4-guro@fb.com \
    --to=guro@fb.com \
    --cc=cgroups@vger.kernel.org \
    --cc=hannes@cmpxchg.org \
    --cc=kernel-team@fb.com \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mhocko@kernel.org \
    --cc=penguin-kernel@I-love.SAKURA.ne.jp \
    --cc=rientjes@google.com \
    --cc=tj@kernel.org \
    --cc=vdavydov.dev@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.