linux-api.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC v2] prctl: prctl(PR_SET_IDLE, PR_IDLE_MODE_KILLME), for stateless idle loops
       [not found] <20171101053244.5218-1-slandden@gmail.com>
@ 2017-11-03  6:35 ` Shawn Landden
  2017-11-03  9:09   ` Michal Hocko
                     ` (2 more replies)
  0 siblings, 3 replies; 14+ messages in thread
From: Shawn Landden @ 2017-11-03  6:35 UTC (permalink / raw)
  Cc: linux-kernel, linux-fsdevel, linux-mm, linux-api, Shawn Landden

It is common for services to be stateless around their main event loop.
If a process sets PR_SET_IDLE to PR_IDLE_MODE_KILLME then it
signals to the kernel that epoll_wait() and friends may not complete,
and the kernel may send SIGKILL if resources get tight.

See my systemd patch: https://github.com/shawnl/systemd/tree/prctl

Android uses this memory model for all programs, and having it in the
kernel will enable integration with the page cache (not in this
series).

16 bytes per process is kinda spendy, but I want to keep
lru behavior, which mem_score_adj does not allow. When a supervisor,
like Android's user input is keeping track this can be done in user-space.
It could be pulled out of task_struct if an cross-indexing additional
red-black tree is added to support pid-based lookup.

v2
switch to prctl, memcg support
---
 fs/eventpoll.c             | 17 +++++++++++++
 fs/proc/array.c            |  7 ++++++
 include/linux/memcontrol.h |  3 +++
 include/linux/oom.h        |  4 ++++
 include/linux/sched.h      |  4 ++++
 include/uapi/linux/prctl.h |  4 ++++
 kernel/cgroup/cgroup.c     | 12 ++++++++++
 kernel/exit.c              |  2 ++
 kernel/sys.c               |  9 +++++++
 mm/memcontrol.c            |  4 ++++
 mm/oom_kill.c              | 60 ++++++++++++++++++++++++++++++++++++++++++++++
 11 files changed, 126 insertions(+)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 2fabd19cdeea..04011fca038b 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -43,6 +43,7 @@
 #include <linux/compat.h>
 #include <linux/rculist.h>
 #include <net/busy_poll.h>
+#include <linux/oom.h>
 
 /*
  * LOCKING:
@@ -1762,6 +1763,14 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 	wait_queue_entry_t wait;
 	ktime_t expires, *to = NULL;
 
+	if (current->oom_target) {
+		spin_lock(oom_target_get_spinlock(current));
+		list_add(&current->se.oom_target_queue,
+			 oom_target_get_queue(current));
+		current->se.oom_target_on_queue = 1;
+		spin_unlock(oom_target_get_spinlock(current));
+	}
+
 	if (timeout > 0) {
 		struct timespec64 end_time = ep_set_mstimeout(timeout);
 
@@ -1783,6 +1792,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 	if (!ep_events_available(ep))
 		ep_busy_loop(ep, timed_out);
 
+
 	spin_lock_irqsave(&ep->lock, flags);
 
 	if (!ep_events_available(ep)) {
@@ -1850,6 +1860,13 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 	    !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
 		goto fetch_events;
 
+	if (current->oom_target) {
+		spin_lock(oom_target_get_spinlock(current));
+		list_del(&current->se.oom_target_queue);
+		current->se.oom_target_on_queue = 0;
+		spin_unlock(oom_target_get_spinlock(current));
+	}
+
 	return res;
 }
 
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 77a8eacbe032..cab009727a7f 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -349,6 +349,12 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
 	seq_putc(m, '\n');
 }
 
+static inline void task_idle(struct seq_file *m, struct task_struct *p)
+{
+	seq_put_decimal_ull(m, "Idle:\t", p->oom_target);
+	seq_putc(m, '\n');
+}
+
 static inline void task_context_switch_counts(struct seq_file *m,
 						struct task_struct *p)
 {
@@ -380,6 +386,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
 	task_sig(m, task);
 	task_cap(m, task);
 	task_seccomp(m, task);
+	task_idle(m, task);
 	task_cpus_allowed(m, task);
 	cpuset_task_status_allowed(m, task);
 	task_context_switch_counts(m, task);
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 69966c461d1c..40a2db8ae522 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -261,6 +261,9 @@ struct mem_cgroup {
 	struct list_head event_list;
 	spinlock_t event_list_lock;
 
+	struct list_head	oom_target_queue;
+	spinlock_t		oom_target_spinlock;
+
 	struct mem_cgroup_per_node *nodeinfo[0];
 	/* WARNING: nodeinfo must be the last member here */
 };
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 76aac4ce39bc..a5d16eb05297 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -101,6 +101,10 @@ extern void oom_killer_enable(void);
 
 extern struct task_struct *find_lock_task_mm(struct task_struct *p);
 
+extern void exit_oom_target(void);
+struct list_head *oom_target_get_queue(struct task_struct *ts);
+spinlock_t *oom_target_get_spinlock(struct task_struct *ts);
+
 /* sysctls */
 extern int sysctl_oom_dump_tasks;
 extern int sysctl_oom_kill_allocating_task;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 26a7df4e558c..2b110c4d7357 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -380,6 +380,9 @@ struct sched_entity {
 	struct list_head		group_node;
 	unsigned int			on_rq;
 
+	unsigned			oom_target_on_queue:1;
+	struct list_head		oom_target_queue;
+
 	u64				exec_start;
 	u64				sum_exec_runtime;
 	u64				vruntime;
@@ -651,6 +654,7 @@ struct task_struct {
 	/* disallow userland-initiated cgroup migration */
 	unsigned			no_cgroup_migration:1;
 #endif
+	unsigned			oom_target:1;
 
 	unsigned long			atomic_flags; /* Flags requiring atomic access. */
 
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index a8d0759a9e40..eba3c3c8375b 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -197,4 +197,8 @@ struct prctl_mm_map {
 # define PR_CAP_AMBIENT_LOWER		3
 # define PR_CAP_AMBIENT_CLEAR_ALL	4
 
+#define PR_SET_IDLE		48
+#define PR_GET_IDLE		49
+# define PR_IDLE_MODE_KILLME	1
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 44857278eb8a..bd48b84d9565 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -55,6 +55,7 @@
 #include <linux/nsproxy.h>
 #include <linux/file.h>
 #include <net/sock.h>
+#include <linux/oom.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/cgroup.h>
@@ -779,6 +780,11 @@ static void css_set_move_task(struct task_struct *task,
 				css_task_iter_advance(it);
 
 		list_del_init(&task->cg_list);
+		if (task->se.oom_target_on_queue) {
+			spin_lock(oom_target_get_spinlock(task));
+			list_del_init(&task->se.oom_target_queue);
+			spin_unlock(oom_target_get_spinlock(task));
+		}
 		if (!css_set_populated(from_cset))
 			css_set_update_populated(from_cset, false);
 	} else {
@@ -797,6 +803,12 @@ static void css_set_move_task(struct task_struct *task,
 		rcu_assign_pointer(task->cgroups, to_cset);
 		list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
 							     &to_cset->tasks);
+		if (task->se.oom_target_on_queue) {
+			spin_lock(oom_target_get_spinlock(task));
+			list_add_tail(&task->se.oom_target_queue,
+					oom_target_get_queue(task));
+			spin_unlock(oom_target_get_spinlock(task));
+		}
 	}
 }
 
diff --git a/kernel/exit.c b/kernel/exit.c
index f6cad39f35df..bb13a359b5e7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -62,6 +62,7 @@
 #include <linux/random.h>
 #include <linux/rcuwait.h>
 #include <linux/compat.h>
+#include <linux/eventpoll.h>
 
 #include <linux/uaccess.h>
 #include <asm/unistd.h>
@@ -917,6 +918,7 @@ void __noreturn do_exit(long code)
 		__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
 	exit_rcu();
 	exit_tasks_rcu_finish();
+	exit_oom_target();
 
 	lockdep_free_task(tsk);
 	do_task_dead();
diff --git a/kernel/sys.c b/kernel/sys.c
index 9aebc2935013..f949b193f126 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2385,6 +2385,15 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 	case PR_GET_FP_MODE:
 		error = GET_FP_MODE(me);
 		break;
+	case PR_SET_IDLE:
+		if (!((arg2 == 0) || (arg2 == PR_IDLE_MODE_KILLME)))
+			return -EINVAL;
+		me->oom_target = arg2;
+		error = 0;
+		break;
+	case PR_GET_IDLE:
+		error = me->oom_target;
+		break;
 	default:
 		error = -EINVAL;
 		break;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 661f046ad318..f6ea5adac586 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4300,6 +4300,10 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 			memory_cgrp_subsys.broken_hierarchy = true;
 	}
 
+	INIT_LIST_HEAD(&memcg->oom_target_queue);
+	memcg->oom_target_spinlock = __SPIN_LOCK_UNLOCKED(
+					&memcg->oom_target_spinlock);
+
 	/* The following stuff does not apply to the root */
 	if (!parent) {
 		root_mem_cgroup = memcg;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index dee0f75c3013..05394f0bd6ab 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -41,6 +41,7 @@
 #include <linux/kthread.h>
 #include <linux/init.h>
 #include <linux/mmu_notifier.h>
+#include <linux/eventpoll.h>
 
 #include <asm/tlb.h>
 #include "internal.h"
@@ -54,6 +55,46 @@ int sysctl_oom_dump_tasks = 1;
 
 DEFINE_MUTEX(oom_lock);
 
+static DEFINE_SPINLOCK(oom_target_spinlock);
+static LIST_HEAD(oom_target_global_queue);
+
+/* Clean up after a EPOLL_KILLME process quits.
+ * Called by kernel/exit.c.
+ */
+void exit_oom_target(void)
+{
+	if (current->se.oom_target_on_queue) {
+		spin_lock(&oom_target_spinlock);
+		current->se.oom_target_on_queue = 0;
+		list_del(&current->se.oom_target_queue);
+		spin_unlock(&oom_target_spinlock);
+	}
+}
+
+inline struct list_head *oom_target_get_queue(struct task_struct *ts)
+{
+#ifdef CONFIG_MEMCG
+	struct mem_cgroup *mcg;
+
+	mcg = mem_cgroup_from_task(ts);
+	if (mcg)
+		return &mcg->oom_target_queue;
+#endif
+	return &oom_target_global_queue;
+}
+
+inline spinlock_t *oom_target_get_spinlock(struct task_struct *ts)
+{
+#ifdef CONFIG_MEMCG
+	struct mem_cgroup *mcg;
+
+	mcg = mem_cgroup_from_task(ts);
+	if (mcg)
+		return &mcg->oom_target_spinlock;
+#endif
+	return &oom_target_spinlock;
+}
+
 #ifdef CONFIG_NUMA
 /**
  * has_intersects_mems_allowed() - check task eligiblity for kill
@@ -1007,6 +1048,7 @@ bool out_of_memory(struct oom_control *oc)
 {
 	unsigned long freed = 0;
 	enum oom_constraint constraint = CONSTRAINT_NONE;
+	struct list_head *l;
 
 	if (oom_killer_disabled)
 		return false;
@@ -1018,6 +1060,24 @@ bool out_of_memory(struct oom_control *oc)
 			return true;
 	}
 
+	/*
+	 * Check death row for current memcg or global.
+	 */
+	l = oom_target_get_queue(current);
+	if (!list_empty(l)) {
+		struct task_struct *ts = list_first_entry(l,
+				struct task_struct, se.oom_target_queue);
+
+		pr_debug("Killing pid %u from EPOLL_KILLME death row.",
+			 ts->pid);
+
+		/* We use SIGKILL instead of the oom killer
+		 * so as to cleanly interrupt ep_poll()
+		 */
+		send_sig(SIGKILL, ts, 1);
+		return true;
+	}
+
 	/*
 	 * If current has a pending SIGKILL or is exiting, then automatically
 	 * select it.  The goal is to allow it to allocate so that it may
-- 
2.15.0.rc2

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [RFC v2] prctl: prctl(PR_SET_IDLE, PR_IDLE_MODE_KILLME), for stateless idle loops
  2017-11-03  6:35 ` [RFC v2] prctl: prctl(PR_SET_IDLE, PR_IDLE_MODE_KILLME), for stateless idle loops Shawn Landden
@ 2017-11-03  9:09   ` Michal Hocko
  2017-11-18  4:45     ` Shawn Landden
  2017-11-18 20:33     ` Shawn Landden
  2017-11-21  4:49   ` [RFC v3] It is common for services to be stateless around their main event loop. If a process sets PR_SET_IDLE to PR_IDLE_MODE_KILLME then it signals to the kernel that epoll_wait() and friends may not complete, and the kernel may send SIGKILL if resources get tight Shawn Landden
  2017-11-22 10:29   ` [RFC v2] prctl: prctl(PR_SET_IDLE, PR_IDLE_MODE_KILLME), for stateless idle loops peter enderborg
  2 siblings, 2 replies; 14+ messages in thread
From: Michal Hocko @ 2017-11-03  9:09 UTC (permalink / raw)
  To: Shawn Landden; +Cc: linux-kernel, linux-fsdevel, linux-mm, linux-api

On Thu 02-11-17 23:35:44, Shawn Landden wrote:
> It is common for services to be stateless around their main event loop.
> If a process sets PR_SET_IDLE to PR_IDLE_MODE_KILLME then it
> signals to the kernel that epoll_wait() and friends may not complete,
> and the kernel may send SIGKILL if resources get tight.
> 
> See my systemd patch: https://github.com/shawnl/systemd/tree/prctl
> 
> Android uses this memory model for all programs, and having it in the
> kernel will enable integration with the page cache (not in this
> series).
> 
> 16 bytes per process is kinda spendy, but I want to keep
> lru behavior, which mem_score_adj does not allow. When a supervisor,
> like Android's user input is keeping track this can be done in user-space.
> It could be pulled out of task_struct if an cross-indexing additional
> red-black tree is added to support pid-based lookup.

This is still an abuse and the patch is wrong. We really do have an API
to use I fail to see why you do not use it.

[...]
> @@ -1018,6 +1060,24 @@ bool out_of_memory(struct oom_control *oc)
>  			return true;
>  	}
>  
> +	/*
> +	 * Check death row for current memcg or global.
> +	 */
> +	l = oom_target_get_queue(current);
> +	if (!list_empty(l)) {
> +		struct task_struct *ts = list_first_entry(l,
> +				struct task_struct, se.oom_target_queue);
> +
> +		pr_debug("Killing pid %u from EPOLL_KILLME death row.",
> +			 ts->pid);
> +
> +		/* We use SIGKILL instead of the oom killer
> +		 * so as to cleanly interrupt ep_poll()
> +		 */
> +		send_sig(SIGKILL, ts, 1);
> +		return true;
> +	}

Still not NUMA aware and completely backwards. If this is a memcg OOM
then it is _memcg_ to evaluate not the current. The oom might happen up
the hierarchy due to hard limit.

But still, you should be very clear _why_ the existing oom tuning is not
appropropriate and we can think of a way to hanle it better but cramming
the oom selection this way is simply not acceptable.
-- 
Michal Hocko
SUSE Labs

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC v2] prctl: prctl(PR_SET_IDLE, PR_IDLE_MODE_KILLME), for stateless idle loops
  2017-11-03  9:09   ` Michal Hocko
@ 2017-11-18  4:45     ` Shawn Landden
       [not found]       ` <CA+49okqZ8CME0EN1xS_cCTc5Q-fGRreg0makhzNNuRpGs3mjfw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  2017-11-20  8:35       ` Michal Hocko
  2017-11-18 20:33     ` Shawn Landden
  1 sibling, 2 replies; 14+ messages in thread
From: Shawn Landden @ 2017-11-18  4:45 UTC (permalink / raw)
  To: Michal Hocko; +Cc: linux-kernel, linux-fsdevel, linux-mm, linux-api

[-- Attachment #1: Type: text/plain, Size: 2369 bytes --]

On Fri, Nov 3, 2017 at 2:09 AM, Michal Hocko <mhocko@kernel.org> wrote:

> On Thu 02-11-17 23:35:44, Shawn Landden wrote:
> > It is common for services to be stateless around their main event loop.
> > If a process sets PR_SET_IDLE to PR_IDLE_MODE_KILLME then it
> > signals to the kernel that epoll_wait() and friends may not complete,
> > and the kernel may send SIGKILL if resources get tight.
> >
> > See my systemd patch: https://github.com/shawnl/systemd/tree/prctl
> >
> > Android uses this memory model for all programs, and having it in the
> > kernel will enable integration with the page cache (not in this
> > series).
> >
> > 16 bytes per process is kinda spendy, but I want to keep
> > lru behavior, which mem_score_adj does not allow. When a supervisor,
> > like Android's user input is keeping track this can be done in
> user-space.
> > It could be pulled out of task_struct if an cross-indexing additional
> > red-black tree is added to support pid-based lookup.
>
> This is still an abuse and the patch is wrong. We really do have an API
> to use I fail to see why you do not use it.
>
When I looked at wait_queue_head_t it was 20 byes.

>
> [...]
> > @@ -1018,6 +1060,24 @@ bool out_of_memory(struct oom_control *oc)
> >                       return true;
> >       }
> >
> > +     /*
> > +      * Check death row for current memcg or global.
> > +      */
> > +     l = oom_target_get_queue(current);
> > +     if (!list_empty(l)) {
> > +             struct task_struct *ts = list_first_entry(l,
> > +                             struct task_struct, se.oom_target_queue);
> > +
> > +             pr_debug("Killing pid %u from EPOLL_KILLME death row.",
> > +                      ts->pid);
> > +
> > +             /* We use SIGKILL instead of the oom killer
> > +              * so as to cleanly interrupt ep_poll()
> > +              */
> > +             send_sig(SIGKILL, ts, 1);
> > +             return true;
> > +     }
>
> Still not NUMA aware and completely backwards. If this is a memcg OOM
> then it is _memcg_ to evaluate not the current. The oom might happen up
> the hierarchy due to hard limit.
>
> But still, you should be very clear _why_ the existing oom tuning is not
> appropropriate and we can think of a way to hanle it better but cramming
> the oom selection this way is simply not acceptable.
> --
> Michal Hocko
> SUSE Labs
>

[-- Attachment #2: Type: text/html, Size: 3382 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC v2] prctl: prctl(PR_SET_IDLE, PR_IDLE_MODE_KILLME), for stateless idle loops
  2017-11-03  9:09   ` Michal Hocko
  2017-11-18  4:45     ` Shawn Landden
@ 2017-11-18 20:33     ` Shawn Landden
  1 sibling, 0 replies; 14+ messages in thread
From: Shawn Landden @ 2017-11-18 20:33 UTC (permalink / raw)
  To: Michal Hocko; +Cc: linux-kernel, linux-fsdevel, linux-mm, linux-api

On Fri, Nov 3, 2017 at 2:09 AM, Michal Hocko <mhocko@kernel.org> wrote:
> On Thu 02-11-17 23:35:44, Shawn Landden wrote:
>> 16 bytes per process is kinda spendy, but I want to keep
>> lru behavior, which mem_score_adj does not allow. When a supervisor,
>> like Android's user input is keeping track this can be done in user-space.
>> It could be pulled out of task_struct if an cross-indexing additional
>> red-black tree is added to support pid-based lookup.
>
> This is still an abuse and the patch is wrong. We really do have an API
> to use I fail to see why you do not use it.
When I looked at wait_queue_head_t it was 20 bytes.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC v2] prctl: prctl(PR_SET_IDLE, PR_IDLE_MODE_KILLME), for stateless idle loops
       [not found]       ` <CA+49okqZ8CME0EN1xS_cCTc5Q-fGRreg0makhzNNuRpGs3mjfw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-11-19  4:19         ` Matthew Wilcox
  0 siblings, 0 replies; 14+ messages in thread
From: Matthew Wilcox @ 2017-11-19  4:19 UTC (permalink / raw)
  To: Shawn Landden
  Cc: Michal Hocko, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg, linux-api-u79uwXL29TY76Z2rM5mHXA

On Fri, Nov 17, 2017 at 08:45:03PM -0800, Shawn Landden wrote:
> On Fri, Nov 3, 2017 at 2:09 AM, Michal Hocko <mhocko-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org> wrote:
> > On Thu 02-11-17 23:35:44, Shawn Landden wrote:
> > > 16 bytes per process is kinda spendy, but I want to keep
> > > lru behavior, which mem_score_adj does not allow. When a supervisor,
> > > like Android's user input is keeping track this can be done in
> > user-space.
> > > It could be pulled out of task_struct if an cross-indexing additional
> > > red-black tree is added to support pid-based lookup.
> >
> > This is still an abuse and the patch is wrong. We really do have an API
> > to use I fail to see why you do not use it.
> >
> When I looked at wait_queue_head_t it was 20 byes.

24 bytes actually; the compiler will add 4 bytes of padding between
the spinlock and the list_head.  But there's one for the entire system.
Then you add a 40 byte structure (wait_queue_entry) on the stack for each
sleeping process.  There's no per-process cost.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC v2] prctl: prctl(PR_SET_IDLE, PR_IDLE_MODE_KILLME), for stateless idle loops
  2017-11-18  4:45     ` Shawn Landden
       [not found]       ` <CA+49okqZ8CME0EN1xS_cCTc5Q-fGRreg0makhzNNuRpGs3mjfw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-11-20  8:35       ` Michal Hocko
  2017-11-21  4:48         ` Shawn Landden
  1 sibling, 1 reply; 14+ messages in thread
From: Michal Hocko @ 2017-11-20  8:35 UTC (permalink / raw)
  To: Shawn Landden; +Cc: linux-kernel, linux-fsdevel, linux-mm, linux-api

On Fri 17-11-17 20:45:03, Shawn Landden wrote:
> On Fri, Nov 3, 2017 at 2:09 AM, Michal Hocko <mhocko@kernel.org> wrote:
> 
> > On Thu 02-11-17 23:35:44, Shawn Landden wrote:
> > > It is common for services to be stateless around their main event loop.
> > > If a process sets PR_SET_IDLE to PR_IDLE_MODE_KILLME then it
> > > signals to the kernel that epoll_wait() and friends may not complete,
> > > and the kernel may send SIGKILL if resources get tight.
> > >
> > > See my systemd patch: https://github.com/shawnl/systemd/tree/prctl
> > >
> > > Android uses this memory model for all programs, and having it in the
> > > kernel will enable integration with the page cache (not in this
> > > series).
> > >
> > > 16 bytes per process is kinda spendy, but I want to keep
> > > lru behavior, which mem_score_adj does not allow. When a supervisor,
> > > like Android's user input is keeping track this can be done in
> > user-space.
> > > It could be pulled out of task_struct if an cross-indexing additional
> > > red-black tree is added to support pid-based lookup.
> >
> > This is still an abuse and the patch is wrong. We really do have an API
> > to use I fail to see why you do not use it.
> >
> When I looked at wait_queue_head_t it was 20 byes.

I do not understand. What I meant to say is that we do have a proper
user api to hint OOM killer decisions.
-- 
Michal Hocko
SUSE Labs

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC v2] prctl: prctl(PR_SET_IDLE, PR_IDLE_MODE_KILLME), for stateless idle loops
  2017-11-20  8:35       ` Michal Hocko
@ 2017-11-21  4:48         ` Shawn Landden
  2017-11-21  7:05           ` Michal Hocko
  0 siblings, 1 reply; 14+ messages in thread
From: Shawn Landden @ 2017-11-21  4:48 UTC (permalink / raw)
  To: Michal Hocko; +Cc: linux-kernel, linux-fsdevel, linux-mm, linux-api

On Mon, Nov 20, 2017 at 12:35 AM, Michal Hocko <mhocko@kernel.org> wrote:
> On Fri 17-11-17 20:45:03, Shawn Landden wrote:
>> On Fri, Nov 3, 2017 at 2:09 AM, Michal Hocko <mhocko@kernel.org> wrote:
>>
>> > On Thu 02-11-17 23:35:44, Shawn Landden wrote:
>> > > It is common for services to be stateless around their main event loop.
>> > > If a process sets PR_SET_IDLE to PR_IDLE_MODE_KILLME then it
>> > > signals to the kernel that epoll_wait() and friends may not complete,
>> > > and the kernel may send SIGKILL if resources get tight.
>> > >
>> > > See my systemd patch: https://github.com/shawnl/systemd/tree/prctl
>> > >
>> > > Android uses this memory model for all programs, and having it in the
>> > > kernel will enable integration with the page cache (not in this
>> > > series).
>> > >
>> > > 16 bytes per process is kinda spendy, but I want to keep
>> > > lru behavior, which mem_score_adj does not allow. When a supervisor,
>> > > like Android's user input is keeping track this can be done in
>> > user-space.
>> > > It could be pulled out of task_struct if an cross-indexing additional
>> > > red-black tree is added to support pid-based lookup.
>> >
>> > This is still an abuse and the patch is wrong. We really do have an API
>> > to use I fail to see why you do not use it.
>> >
>> When I looked at wait_queue_head_t it was 20 byes.
>
> I do not understand. What I meant to say is that we do have a proper
> user api to hint OOM killer decisions.
This is a FIFO queue, rather than a heuristic, which is all you get
with the current API.
> --
> Michal Hocko
> SUSE Labs

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [RFC v3] It is common for services to be stateless around their main event loop. If a process sets PR_SET_IDLE to PR_IDLE_MODE_KILLME then it signals to the kernel that epoll_wait() and friends may not complete, and the kernel may send SIGKILL if resources get tight.
  2017-11-03  6:35 ` [RFC v2] prctl: prctl(PR_SET_IDLE, PR_IDLE_MODE_KILLME), for stateless idle loops Shawn Landden
  2017-11-03  9:09   ` Michal Hocko
@ 2017-11-21  4:49   ` Shawn Landden
  2017-11-21  4:56     ` Shawn Landden
  2017-11-21  5:16     ` [RFC v4] " Shawn Landden
  2017-11-22 10:29   ` [RFC v2] prctl: prctl(PR_SET_IDLE, PR_IDLE_MODE_KILLME), for stateless idle loops peter enderborg
  2 siblings, 2 replies; 14+ messages in thread
From: Shawn Landden @ 2017-11-21  4:49 UTC (permalink / raw)
  Cc: linux-kernel, linux-fsdevel, linux-mm, linux-api, mhocko, willy,
	Shawn Landden

See my systemd patch: https://github.com/shawnl/systemd/tree/prctl

Android uses this memory model for all programs, and having it in the
kernel will enable integration with the page cache (not in this
series).

v2
switch to prctl, memcg support

v3
use <linux/wait.h>
put OOM after constraint checking
---
 fs/eventpoll.c             | 27 ++++++++++++++++++++
 fs/proc/array.c            |  7 ++++++
 include/linux/memcontrol.h |  3 +++
 include/linux/oom.h        |  4 +++
 include/linux/sched.h      |  1 +
 include/uapi/linux/prctl.h |  4 +++
 kernel/cgroup/cgroup.c     | 61 ++++++++++++++++++++++++++++++++++++++++++++++
 kernel/exit.c              |  1 +
 kernel/sys.c               |  9 +++++++
 mm/memcontrol.c            |  2 ++
 mm/oom_kill.c              | 47 +++++++++++++++++++++++++++++++++++
 11 files changed, 166 insertions(+)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 2fabd19cdeea..745662f9a7e1 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -43,6 +43,8 @@
 #include <linux/compat.h>
 #include <linux/rculist.h>
 #include <net/busy_poll.h>
+#include <linux/memcontrol.h>
+#include <linux/oom.h>
 
 /*
  * LOCKING:
@@ -1761,6 +1763,19 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 	u64 slack = 0;
 	wait_queue_entry_t wait;
 	ktime_t expires, *to = NULL;
+	DEFINE_WAIT_FUNC(oom_target_wait, oom_target_callback);
+	DEFINE_WAIT_FUNC(oom_target_wait_mcg, oom_target_callback);
+
+	if (current->oom_target) {
+#ifdef CONFIG_MEMCG
+		struct mem_cgroup *mcg;
+
+		mcg = mem_cgroup_from_task(current);
+		if (mcg)
+			add_wait_queue(&mcg->oom_target, &oom_target_wait_mcg);
+#endif
+		add_wait_queue(oom_target_get_wait(), &oom_target_wait);
+	}
 
 	if (timeout > 0) {
 		struct timespec64 end_time = ep_set_mstimeout(timeout);
@@ -1850,6 +1865,18 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 	    !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
 		goto fetch_events;
 
+	if (current->oom_target) {
+#ifdef CONFIG_MEMCG
+		struct mem_cgroup *mcg;
+
+		mcg = mem_cgroup_from_task(current);
+		if (mcg)
+			remove_wait_queue(&mcg->oom_target,
+					&oom_target_wait_mcg);
+#endif
+		remove_wait_queue(oom_target_get_wait(), &oom_target_wait);
+	}
+
 	return res;
 }
 
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 9390032a11e1..1954ae87cb88 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -350,6 +350,12 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
 	seq_putc(m, '\n');
 }
 
+static inline void task_idle(struct seq_file *m, struct task_struct *p)
+{
+	seq_put_decimal_ull(m, "Idle:\t", p->oom_target);
+	seq_putc(m, '\n');
+}
+
 static inline void task_context_switch_counts(struct seq_file *m,
 						struct task_struct *p)
 {
@@ -381,6 +387,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
 	task_sig(m, task);
 	task_cap(m, task);
 	task_seccomp(m, task);
+	task_idle(m, task);
 	task_cpus_allowed(m, task);
 	cpuset_task_status_allowed(m, task);
 	task_context_switch_counts(m, task);
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 69966c461d1c..02eb92e7eff5 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -30,6 +30,7 @@
 #include <linux/vmstat.h>
 #include <linux/writeback.h>
 #include <linux/page-flags.h>
+#include <linux/wait.h>
 
 struct mem_cgroup;
 struct page;
@@ -261,6 +262,8 @@ struct mem_cgroup {
 	struct list_head event_list;
 	spinlock_t event_list_lock;
 
+	wait_queue_head_t	oom_target;
+
 	struct mem_cgroup_per_node *nodeinfo[0];
 	/* WARNING: nodeinfo must be the last member here */
 };
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 01c91d874a57..88acea9e0a59 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -102,6 +102,10 @@ extern void oom_killer_enable(void);
 
 extern struct task_struct *find_lock_task_mm(struct task_struct *p);
 
+extern void exit_oom_target(void);
+struct wait_queue_head *oom_target_get_wait(void);
+int oom_target_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key);
+
 /* sysctls */
 extern int sysctl_oom_dump_tasks;
 extern int sysctl_oom_kill_allocating_task;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index fdf74f27acf1..51b0e5987e8c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -652,6 +652,7 @@ struct task_struct {
 	/* disallow userland-initiated cgroup migration */
 	unsigned			no_cgroup_migration:1;
 #endif
+	unsigned			oom_target:1;
 
 	unsigned long			atomic_flags; /* Flags requiring atomic access. */
 
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index b640071421f7..94868317c6f2 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -198,4 +198,8 @@ struct prctl_mm_map {
 # define PR_CAP_AMBIENT_LOWER		3
 # define PR_CAP_AMBIENT_CLEAR_ALL	4
 
+#define PR_SET_IDLE		48
+#define PR_GET_IDLE		49
+# define PR_IDLE_MODE_KILLME	1
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 44857278eb8a..081bcd84a8d0 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -55,6 +55,8 @@
 #include <linux/nsproxy.h>
 #include <linux/file.h>
 #include <net/sock.h>
+#include <linux/oom.h>
+#include <linux/memcontrol.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/cgroup.h>
@@ -756,6 +758,9 @@ static void css_set_move_task(struct task_struct *task,
 			      struct css_set *from_cset, struct css_set *to_cset,
 			      bool use_mg_tasks)
 {
+#ifdef CONFIG_MEMCG
+	struct mem_cgroup *mcg;
+#endif
 	lockdep_assert_held(&css_set_lock);
 
 	if (to_cset && !css_set_populated(to_cset))
@@ -779,6 +784,35 @@ static void css_set_move_task(struct task_struct *task,
 				css_task_iter_advance(it);
 
 		list_del_init(&task->cg_list);
+#ifdef CONFIG_MEMCG
+		/* dequeue from memcg->oom_target
+		 * TODO: this is O(n), add rb-tree to make it O(logn)
+		 */
+		mcg = mem_cgroup_from_task(task);
+		if (mcg) {
+			struct wait_queue_entry *wait;
+
+			spin_lock(&mcg->oom_target.lock);
+			if (!waitqueue_active(&mcg->oom_target))
+				goto empty_from;
+			wait = list_first_entry(&mcg->oom_target.head,
+						wait_queue_entry_t, entry);
+			do {
+				struct list_head *list;
+
+				if (wait->private == task)
+					__remove_wait_queue(&mcg->oom_target,
+							  wait);
+				list = wait->entry.next;
+				if (list_is_last(list, &mcg->oom_target.head))
+					break;
+				wait = list_entry(list,
+					struct wait_queue_entry, entry);
+			} while (1);
+empty_from:
+			spin_unlock(&mcg->oom_target.lock);
+		}
+#endif
 		if (!css_set_populated(from_cset))
 			css_set_update_populated(from_cset, false);
 	} else {
@@ -797,6 +831,33 @@ static void css_set_move_task(struct task_struct *task,
 		rcu_assign_pointer(task->cgroups, to_cset);
 		list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
 							     &to_cset->tasks);
+#ifdef CONFIG_MEMCG
+		/* dequeue from memcg->oom_target */
+		mcg = mem_cgroup_from_task(task);
+		if (mcg) {
+			struct wait_queue_entry *wait;
+
+			spin_lock(&mcg->oom_target.lock);
+			if (!waitqueue_active(&mcg->oom_target))
+				goto empty_to;
+			wait = list_first_entry(&mcg->oom_target.head,
+						wait_queue_entry_t, entry);
+			do {
+				struct list_head *list;
+
+				if (wait->private == task)
+					__add_wait_queue(&mcg->oom_target,
+							  wait);
+				list = wait->entry.next;
+				if (list_is_last(list, &mcg->oom_target.head))
+					break;
+				wait = list_entry(list,
+					struct wait_queue_entry, entry);
+			} while (1);
+empty_to:
+			spin_unlock(&mcg->oom_target.lock);
+		}
+#endif
 	}
 }
 
diff --git a/kernel/exit.c b/kernel/exit.c
index f6cad39f35df..2788fbdae267 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -62,6 +62,7 @@
 #include <linux/random.h>
 #include <linux/rcuwait.h>
 #include <linux/compat.h>
+#include <linux/eventpoll.h>
 
 #include <linux/uaccess.h>
 #include <asm/unistd.h>
diff --git a/kernel/sys.c b/kernel/sys.c
index 524a4cb9bbe2..e1eb049a85e6 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2386,6 +2386,15 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 	case PR_GET_FP_MODE:
 		error = GET_FP_MODE(me);
 		break;
+	case PR_SET_IDLE:
+		if (!((arg2 == 0) || (arg2 == PR_IDLE_MODE_KILLME)))
+			return -EINVAL;
+		me->oom_target = arg2;
+		error = 0;
+		break;
+	case PR_GET_IDLE:
+		error = me->oom_target;
+		break;
 	default:
 		error = -EINVAL;
 		break;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 661f046ad318..a4e3b93aeccd 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4300,6 +4300,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 			memory_cgrp_subsys.broken_hierarchy = true;
 	}
 
+	init_waitqueue_head(&memcg->oom_target);
+
 	/* The following stuff does not apply to the root */
 	if (!parent) {
 		root_mem_cgroup = memcg;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index dee0f75c3013..c5d8f5a716bc 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -41,6 +41,9 @@
 #include <linux/kthread.h>
 #include <linux/init.h>
 #include <linux/mmu_notifier.h>
+#include <linux/eventpoll.h>
+#include <linux/wait.h>
+#include <linux/memcontrol.h>
 
 #include <asm/tlb.h>
 #include "internal.h"
@@ -54,6 +57,23 @@ int sysctl_oom_dump_tasks = 1;
 
 DEFINE_MUTEX(oom_lock);
 
+static DECLARE_WAIT_QUEUE_HEAD(oom_target);
+
+/* Clean up after a EPOLL_KILLME process quits.
+ * Called by kernel/exit.c.
+ */
+void exit_oom_target(void)
+{
+	DECLARE_WAITQUEUE(wait, current);
+
+	remove_wait_queue(&oom_target, &wait);
+}
+
+inline struct wait_queue_head *oom_target_get_wait()
+{
+	return &oom_target;
+}
+
 #ifdef CONFIG_NUMA
 /**
  * has_intersects_mems_allowed() - check task eligiblity for kill
@@ -994,6 +1014,18 @@ int unregister_oom_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL_GPL(unregister_oom_notifier);
 
+int oom_target_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
+{
+	struct task_struct *ts = wait->private;
+
+	/* We use SIGKILL instead of the oom killer
+	 * so as to cleanly interrupt ep_poll()
+	 */
+	pr_info("Killing pid %u from prctl(PR_SET_IDLE) death row.\n", ts->pid);
+	send_sig(SIGKILL, ts, 1);
+	return 0;
+}
+
 /**
  * out_of_memory - kill the "best" process when we run out of memory
  * @oc: pointer to struct oom_control
@@ -1007,6 +1039,7 @@ bool out_of_memory(struct oom_control *oc)
 {
 	unsigned long freed = 0;
 	enum oom_constraint constraint = CONSTRAINT_NONE;
+	wait_queue_head_t *w;
 
 	if (oom_killer_disabled)
 		return false;
@@ -1056,6 +1089,20 @@ bool out_of_memory(struct oom_control *oc)
 		return true;
 	}
 
+	/*
+	 * Check death row for current memcg or global.
+	 */
+#ifdef CONFIG_MEMCG
+	if (is_memcg_oom(oc))
+		w = &oc->memcg->oom_target;
+	else
+#endif
+		w = oom_target_get_wait();
+	if (waitqueue_active(w)) {
+		wake_up(w);
+		return true;
+	}
+
 	select_bad_process(oc);
 	/* Found nothing?!?! Either we hang forever, or we panic. */
 	if (!oc->chosen && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) {
-- 
2.14.1

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [RFC v3] It is common for services to be stateless around their main event loop. If a process sets PR_SET_IDLE to PR_IDLE_MODE_KILLME then it signals to the kernel that epoll_wait() and friends may not complete, and the kernel may send SIGKILL if resources get tight.
  2017-11-21  4:49   ` [RFC v3] It is common for services to be stateless around their main event loop. If a process sets PR_SET_IDLE to PR_IDLE_MODE_KILLME then it signals to the kernel that epoll_wait() and friends may not complete, and the kernel may send SIGKILL if resources get tight Shawn Landden
@ 2017-11-21  4:56     ` Shawn Landden
  2017-11-21  5:16     ` [RFC v4] " Shawn Landden
  1 sibling, 0 replies; 14+ messages in thread
From: Shawn Landden @ 2017-11-21  4:56 UTC (permalink / raw)
  To: Shawn Landden
  Cc: linux-kernel, linux-fsdevel, linux-mm, linux-api, Michal Hocko,
	willy

On Mon, Nov 20, 2017 at 8:49 PM, Shawn Landden <slandden@gmail.com> wrote:
> See my systemd patch: https://github.com/shawnl/systemd/tree/prctl
>
> Android uses this memory model for all programs, and having it in the
> kernel will enable integration with the page cache (not in this
> series).
>
> v2
> switch to prctl, memcg support
>
> v3
> use <linux/wait.h>
> put OOM after constraint checking
> ---
>  fs/eventpoll.c             | 27 ++++++++++++++++++++
>  fs/proc/array.c            |  7 ++++++
>  include/linux/memcontrol.h |  3 +++
>  include/linux/oom.h        |  4 +++
>  include/linux/sched.h      |  1 +
>  include/uapi/linux/prctl.h |  4 +++
>  kernel/cgroup/cgroup.c     | 61 ++++++++++++++++++++++++++++++++++++++++++++++
>  kernel/exit.c              |  1 +
>  kernel/sys.c               |  9 +++++++
>  mm/memcontrol.c            |  2 ++
>  mm/oom_kill.c              | 47 +++++++++++++++++++++++++++++++++++
>  11 files changed, 166 insertions(+)
>
> diff --git a/fs/eventpoll.c b/fs/eventpoll.c
> index 2fabd19cdeea..745662f9a7e1 100644
> --- a/fs/eventpoll.c
> +++ b/fs/eventpoll.c
> @@ -43,6 +43,8 @@
>  #include <linux/compat.h>
>  #include <linux/rculist.h>
>  #include <net/busy_poll.h>
> +#include <linux/memcontrol.h>
> +#include <linux/oom.h>
>
>  /*
>   * LOCKING:
> @@ -1761,6 +1763,19 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
>         u64 slack = 0;
>         wait_queue_entry_t wait;
>         ktime_t expires, *to = NULL;
> +       DEFINE_WAIT_FUNC(oom_target_wait, oom_target_callback);
> +       DEFINE_WAIT_FUNC(oom_target_wait_mcg, oom_target_callback);
> +
> +       if (current->oom_target) {
> +#ifdef CONFIG_MEMCG
> +               struct mem_cgroup *mcg;
> +
> +               mcg = mem_cgroup_from_task(current);
> +               if (mcg)
> +                       add_wait_queue(&mcg->oom_target, &oom_target_wait_mcg);
> +#endif
> +               add_wait_queue(oom_target_get_wait(), &oom_target_wait);
> +       }
>
>         if (timeout > 0) {
>                 struct timespec64 end_time = ep_set_mstimeout(timeout);
> @@ -1850,6 +1865,18 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
>             !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
>                 goto fetch_events;
>
> +       if (current->oom_target) {
> +#ifdef CONFIG_MEMCG
> +               struct mem_cgroup *mcg;
> +
> +               mcg = mem_cgroup_from_task(current);
> +               if (mcg)
> +                       remove_wait_queue(&mcg->oom_target,
> +                                       &oom_target_wait_mcg);
> +#endif
> +               remove_wait_queue(oom_target_get_wait(), &oom_target_wait);
> +       }
> +
>         return res;
>  }
>
> diff --git a/fs/proc/array.c b/fs/proc/array.c
> index 9390032a11e1..1954ae87cb88 100644
> --- a/fs/proc/array.c
> +++ b/fs/proc/array.c
> @@ -350,6 +350,12 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
>         seq_putc(m, '\n');
>  }
>
> +static inline void task_idle(struct seq_file *m, struct task_struct *p)
> +{
> +       seq_put_decimal_ull(m, "Idle:\t", p->oom_target);
> +       seq_putc(m, '\n');
> +}
> +
>  static inline void task_context_switch_counts(struct seq_file *m,
>                                                 struct task_struct *p)
>  {
> @@ -381,6 +387,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
>         task_sig(m, task);
>         task_cap(m, task);
>         task_seccomp(m, task);
> +       task_idle(m, task);
>         task_cpus_allowed(m, task);
>         cpuset_task_status_allowed(m, task);
>         task_context_switch_counts(m, task);
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 69966c461d1c..02eb92e7eff5 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -30,6 +30,7 @@
>  #include <linux/vmstat.h>
>  #include <linux/writeback.h>
>  #include <linux/page-flags.h>
> +#include <linux/wait.h>
>
>  struct mem_cgroup;
>  struct page;
> @@ -261,6 +262,8 @@ struct mem_cgroup {
>         struct list_head event_list;
>         spinlock_t event_list_lock;
>
> +       wait_queue_head_t       oom_target;
> +
>         struct mem_cgroup_per_node *nodeinfo[0];
>         /* WARNING: nodeinfo must be the last member here */
>  };
> diff --git a/include/linux/oom.h b/include/linux/oom.h
> index 01c91d874a57..88acea9e0a59 100644
> --- a/include/linux/oom.h
> +++ b/include/linux/oom.h
> @@ -102,6 +102,10 @@ extern void oom_killer_enable(void);
>
>  extern struct task_struct *find_lock_task_mm(struct task_struct *p);
>
> +extern void exit_oom_target(void);
> +struct wait_queue_head *oom_target_get_wait(void);
> +int oom_target_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key);
> +
>  /* sysctls */
>  extern int sysctl_oom_dump_tasks;
>  extern int sysctl_oom_kill_allocating_task;
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index fdf74f27acf1..51b0e5987e8c 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -652,6 +652,7 @@ struct task_struct {
>         /* disallow userland-initiated cgroup migration */
>         unsigned                        no_cgroup_migration:1;
>  #endif
> +       unsigned                        oom_target:1;
>
>         unsigned long                   atomic_flags; /* Flags requiring atomic access. */
>
> diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
> index b640071421f7..94868317c6f2 100644
> --- a/include/uapi/linux/prctl.h
> +++ b/include/uapi/linux/prctl.h
> @@ -198,4 +198,8 @@ struct prctl_mm_map {
>  # define PR_CAP_AMBIENT_LOWER          3
>  # define PR_CAP_AMBIENT_CLEAR_ALL      4
>
> +#define PR_SET_IDLE            48
> +#define PR_GET_IDLE            49
> +# define PR_IDLE_MODE_KILLME   1
> +
>  #endif /* _LINUX_PRCTL_H */
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index 44857278eb8a..081bcd84a8d0 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -55,6 +55,8 @@
>  #include <linux/nsproxy.h>
>  #include <linux/file.h>
>  #include <net/sock.h>
> +#include <linux/oom.h>
> +#include <linux/memcontrol.h>
>
>  #define CREATE_TRACE_POINTS
>  #include <trace/events/cgroup.h>
> @@ -756,6 +758,9 @@ static void css_set_move_task(struct task_struct *task,
>                               struct css_set *from_cset, struct css_set *to_cset,
>                               bool use_mg_tasks)
>  {
> +#ifdef CONFIG_MEMCG
> +       struct mem_cgroup *mcg;
> +#endif
>         lockdep_assert_held(&css_set_lock);
>
>         if (to_cset && !css_set_populated(to_cset))
> @@ -779,6 +784,35 @@ static void css_set_move_task(struct task_struct *task,
>                                 css_task_iter_advance(it);
>
>                 list_del_init(&task->cg_list);
> +#ifdef CONFIG_MEMCG

> +               /* dequeue from memcg->oom_target
Ahh this is all shitty here. Sorry for the noise of this shit.
> +                * TODO: this is O(n), add rb-tree to make it O(logn)
> +                */
> +               mcg = mem_cgroup_from_task(task);
> +               if (mcg) {
> +                       struct wait_queue_entry *wait;
> +
> +                       spin_lock(&mcg->oom_target.lock);
> +                       if (!waitqueue_active(&mcg->oom_target))
> +                               goto empty_from;
> +                       wait = list_first_entry(&mcg->oom_target.head,
> +                                               wait_queue_entry_t, entry);
> +                       do {
> +                               struct list_head *list;
> +
> +                               if (wait->private == task)
> +                                       __remove_wait_queue(&mcg->oom_target,
> +                                                         wait);
> +                               list = wait->entry.next;
> +                               if (list_is_last(list, &mcg->oom_target.head))
> +                                       break;
> +                               wait = list_entry(list,
> +                                       struct wait_queue_entry, entry);
> +                       } while (1);
> +empty_from:
> +                       spin_unlock(&mcg->oom_target.lock);
> +               }
> +#endif
>                 if (!css_set_populated(from_cset))
>                         css_set_update_populated(from_cset, false);
>         } else {
> @@ -797,6 +831,33 @@ static void css_set_move_task(struct task_struct *task,
>                 rcu_assign_pointer(task->cgroups, to_cset);
>                 list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
>                                                              &to_cset->tasks);
> +#ifdef CONFIG_MEMCG
> +               /* dequeue from memcg->oom_target */
> +               mcg = mem_cgroup_from_task(task);
> +               if (mcg) {
> +                       struct wait_queue_entry *wait;
> +
> +                       spin_lock(&mcg->oom_target.lock);
> +                       if (!waitqueue_active(&mcg->oom_target))
> +                               goto empty_to;
> +                       wait = list_first_entry(&mcg->oom_target.head,
> +                                               wait_queue_entry_t, entry);
> +                       do {
> +                               struct list_head *list;
> +
> +                               if (wait->private == task)
> +                                       __add_wait_queue(&mcg->oom_target,
> +                                                         wait);
> +                               list = wait->entry.next;
> +                               if (list_is_last(list, &mcg->oom_target.head))
> +                                       break;
> +                               wait = list_entry(list,
> +                                       struct wait_queue_entry, entry);
> +                       } while (1);
> +empty_to:
> +                       spin_unlock(&mcg->oom_target.lock);
> +               }
> +#endif
>         }
>  }
>
> diff --git a/kernel/exit.c b/kernel/exit.c
> index f6cad39f35df..2788fbdae267 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -62,6 +62,7 @@
>  #include <linux/random.h>
>  #include <linux/rcuwait.h>
>  #include <linux/compat.h>
> +#include <linux/eventpoll.h>
>
>  #include <linux/uaccess.h>
>  #include <asm/unistd.h>
> diff --git a/kernel/sys.c b/kernel/sys.c
> index 524a4cb9bbe2..e1eb049a85e6 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -2386,6 +2386,15 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
>         case PR_GET_FP_MODE:
>                 error = GET_FP_MODE(me);
>                 break;
> +       case PR_SET_IDLE:
> +               if (!((arg2 == 0) || (arg2 == PR_IDLE_MODE_KILLME)))
> +                       return -EINVAL;
> +               me->oom_target = arg2;
> +               error = 0;
> +               break;
> +       case PR_GET_IDLE:
> +               error = me->oom_target;
> +               break;
>         default:
>                 error = -EINVAL;
>                 break;
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 661f046ad318..a4e3b93aeccd 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -4300,6 +4300,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
>                         memory_cgrp_subsys.broken_hierarchy = true;
>         }
>
> +       init_waitqueue_head(&memcg->oom_target);
> +
>         /* The following stuff does not apply to the root */
>         if (!parent) {
>                 root_mem_cgroup = memcg;
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index dee0f75c3013..c5d8f5a716bc 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -41,6 +41,9 @@
>  #include <linux/kthread.h>
>  #include <linux/init.h>
>  #include <linux/mmu_notifier.h>
> +#include <linux/eventpoll.h>
> +#include <linux/wait.h>
> +#include <linux/memcontrol.h>
>
>  #include <asm/tlb.h>
>  #include "internal.h"
> @@ -54,6 +57,23 @@ int sysctl_oom_dump_tasks = 1;
>
>  DEFINE_MUTEX(oom_lock);
>
> +static DECLARE_WAIT_QUEUE_HEAD(oom_target);
> +
> +/* Clean up after a EPOLL_KILLME process quits.
> + * Called by kernel/exit.c.
> + */
> +void exit_oom_target(void)
> +{
> +       DECLARE_WAITQUEUE(wait, current);
> +
> +       remove_wait_queue(&oom_target, &wait);
> +}
> +
> +inline struct wait_queue_head *oom_target_get_wait()
> +{
> +       return &oom_target;
> +}
> +
>  #ifdef CONFIG_NUMA
>  /**
>   * has_intersects_mems_allowed() - check task eligiblity for kill
> @@ -994,6 +1014,18 @@ int unregister_oom_notifier(struct notifier_block *nb)
>  }
>  EXPORT_SYMBOL_GPL(unregister_oom_notifier);
>
> +int oom_target_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
> +{
> +       struct task_struct *ts = wait->private;
> +
> +       /* We use SIGKILL instead of the oom killer
> +        * so as to cleanly interrupt ep_poll()
> +        */
> +       pr_info("Killing pid %u from prctl(PR_SET_IDLE) death row.\n", ts->pid);
> +       send_sig(SIGKILL, ts, 1);
> +       return 0;
> +}
> +
>  /**
>   * out_of_memory - kill the "best" process when we run out of memory
>   * @oc: pointer to struct oom_control
> @@ -1007,6 +1039,7 @@ bool out_of_memory(struct oom_control *oc)
>  {
>         unsigned long freed = 0;
>         enum oom_constraint constraint = CONSTRAINT_NONE;
> +       wait_queue_head_t *w;
>
>         if (oom_killer_disabled)
>                 return false;
> @@ -1056,6 +1089,20 @@ bool out_of_memory(struct oom_control *oc)
>                 return true;
>         }
>
> +       /*
> +        * Check death row for current memcg or global.
> +        */
> +#ifdef CONFIG_MEMCG
> +       if (is_memcg_oom(oc))
> +               w = &oc->memcg->oom_target;
> +       else
> +#endif
> +               w = oom_target_get_wait();
> +       if (waitqueue_active(w)) {
> +               wake_up(w);
> +               return true;
> +       }
> +
>         select_bad_process(oc);
>         /* Found nothing?!?! Either we hang forever, or we panic. */
>         if (!oc->chosen && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) {
> --
> 2.14.1
>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [RFC v4] It is common for services to be stateless around their main event loop. If a process sets PR_SET_IDLE to PR_IDLE_MODE_KILLME then it signals to the kernel that epoll_wait() and friends may not complete, and the kernel may send SIGKILL if resources get tight.
  2017-11-21  4:49   ` [RFC v3] It is common for services to be stateless around their main event loop. If a process sets PR_SET_IDLE to PR_IDLE_MODE_KILLME then it signals to the kernel that epoll_wait() and friends may not complete, and the kernel may send SIGKILL if resources get tight Shawn Landden
  2017-11-21  4:56     ` Shawn Landden
@ 2017-11-21  5:16     ` Shawn Landden
  2017-11-21  5:26       ` Shawn Landden
  2017-11-21  9:14       ` Thomas Gleixner
  1 sibling, 2 replies; 14+ messages in thread
From: Shawn Landden @ 2017-11-21  5:16 UTC (permalink / raw)
  Cc: linux-kernel, linux-fsdevel, linux-mm, linux-api, mhocko, willy,
	Shawn Landden

See my systemd patch: https://github.com/shawnl/systemd/tree/prctl

Android uses this memory model for all programs, and having it in the
kernel will enable integration with the page cache (not in this
series).

v2
switch to prctl, memcg support

v3
use <linux/wait.h>
put OOM after constraint checking

v4
ignore memcg OOMs as should have been all along (sry for the noise)
---
 fs/eventpoll.c             |  9 +++++++++
 fs/proc/array.c            |  7 +++++++
 include/linux/memcontrol.h |  1 +
 include/linux/oom.h        |  4 ++++
 include/linux/sched.h      |  1 +
 include/uapi/linux/prctl.h |  4 ++++
 kernel/exit.c              |  1 +
 kernel/sys.c               |  9 +++++++++
 mm/oom_kill.c              | 43 +++++++++++++++++++++++++++++++++++++++++++
 9 files changed, 79 insertions(+)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 2fabd19cdeea..5b3f084b22d5 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -43,6 +43,8 @@
 #include <linux/compat.h>
 #include <linux/rculist.h>
 #include <net/busy_poll.h>
+#include <linux/memcontrol.h>
+#include <linux/oom.h>
 
 /*
  * LOCKING:
@@ -1761,6 +1763,10 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 	u64 slack = 0;
 	wait_queue_entry_t wait;
 	ktime_t expires, *to = NULL;
+	DEFINE_WAIT_FUNC(oom_target_wait, oom_target_callback);
+
+	if (current->oom_target)
+		add_wait_queue(oom_target_get_wait(), &oom_target_wait);
 
 	if (timeout > 0) {
 		struct timespec64 end_time = ep_set_mstimeout(timeout);
@@ -1850,6 +1856,9 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 	    !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
 		goto fetch_events;
 
+	if (current->oom_target)
+		remove_wait_queue(oom_target_get_wait(), &oom_target_wait);
+
 	return res;
 }
 
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 9390032a11e1..1954ae87cb88 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -350,6 +350,12 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
 	seq_putc(m, '\n');
 }
 
+static inline void task_idle(struct seq_file *m, struct task_struct *p)
+{
+	seq_put_decimal_ull(m, "Idle:\t", p->oom_target);
+	seq_putc(m, '\n');
+}
+
 static inline void task_context_switch_counts(struct seq_file *m,
 						struct task_struct *p)
 {
@@ -381,6 +387,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
 	task_sig(m, task);
 	task_cap(m, task);
 	task_seccomp(m, task);
+	task_idle(m, task);
 	task_cpus_allowed(m, task);
 	cpuset_task_status_allowed(m, task);
 	task_context_switch_counts(m, task);
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 69966c461d1c..471d1d52ae72 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -30,6 +30,7 @@
 #include <linux/vmstat.h>
 #include <linux/writeback.h>
 #include <linux/page-flags.h>
+#include <linux/wait.h>
 
 struct mem_cgroup;
 struct page;
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 01c91d874a57..88acea9e0a59 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -102,6 +102,10 @@ extern void oom_killer_enable(void);
 
 extern struct task_struct *find_lock_task_mm(struct task_struct *p);
 
+extern void exit_oom_target(void);
+struct wait_queue_head *oom_target_get_wait(void);
+int oom_target_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key);
+
 /* sysctls */
 extern int sysctl_oom_dump_tasks;
 extern int sysctl_oom_kill_allocating_task;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index fdf74f27acf1..51b0e5987e8c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -652,6 +652,7 @@ struct task_struct {
 	/* disallow userland-initiated cgroup migration */
 	unsigned			no_cgroup_migration:1;
 #endif
+	unsigned			oom_target:1;
 
 	unsigned long			atomic_flags; /* Flags requiring atomic access. */
 
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index b640071421f7..94868317c6f2 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -198,4 +198,8 @@ struct prctl_mm_map {
 # define PR_CAP_AMBIENT_LOWER		3
 # define PR_CAP_AMBIENT_CLEAR_ALL	4
 
+#define PR_SET_IDLE		48
+#define PR_GET_IDLE		49
+# define PR_IDLE_MODE_KILLME	1
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/exit.c b/kernel/exit.c
index f6cad39f35df..2788fbdae267 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -62,6 +62,7 @@
 #include <linux/random.h>
 #include <linux/rcuwait.h>
 #include <linux/compat.h>
+#include <linux/eventpoll.h>
 
 #include <linux/uaccess.h>
 #include <asm/unistd.h>
diff --git a/kernel/sys.c b/kernel/sys.c
index 524a4cb9bbe2..e1eb049a85e6 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2386,6 +2386,15 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 	case PR_GET_FP_MODE:
 		error = GET_FP_MODE(me);
 		break;
+	case PR_SET_IDLE:
+		if (!((arg2 == 0) || (arg2 == PR_IDLE_MODE_KILLME)))
+			return -EINVAL;
+		me->oom_target = arg2;
+		error = 0;
+		break;
+	case PR_GET_IDLE:
+		error = me->oom_target;
+		break;
 	default:
 		error = -EINVAL;
 		break;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index dee0f75c3013..73ad7ee47c8e 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -41,6 +41,8 @@
 #include <linux/kthread.h>
 #include <linux/init.h>
 #include <linux/mmu_notifier.h>
+#include <linux/eventpoll.h>
+#include <linux/wait.h>
 
 #include <asm/tlb.h>
 #include "internal.h"
@@ -54,6 +56,23 @@ int sysctl_oom_dump_tasks = 1;
 
 DEFINE_MUTEX(oom_lock);
 
+static DECLARE_WAIT_QUEUE_HEAD(oom_target);
+
+/* Clean up after a EPOLL_KILLME process quits.
+ * Called by kernel/exit.c.
+ */
+void exit_oom_target(void)
+{
+	DECLARE_WAITQUEUE(wait, current);
+
+	remove_wait_queue(&oom_target, &wait);
+}
+
+inline struct wait_queue_head *oom_target_get_wait()
+{
+	return &oom_target;
+}
+
 #ifdef CONFIG_NUMA
 /**
  * has_intersects_mems_allowed() - check task eligiblity for kill
@@ -994,6 +1013,18 @@ int unregister_oom_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL_GPL(unregister_oom_notifier);
 
+int oom_target_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
+{
+	struct task_struct *ts = wait->private;
+
+	/* We use SIGKILL instead of the oom killer
+	 * so as to cleanly interrupt ep_poll()
+	 */
+	pr_debug("Killing pid %u from prctl(PR_SET_IDLE) death row.\n", ts->pid);
+	send_sig(SIGKILL, ts, 1);
+	return 0;
+}
+
 /**
  * out_of_memory - kill the "best" process when we run out of memory
  * @oc: pointer to struct oom_control
@@ -1007,6 +1038,7 @@ bool out_of_memory(struct oom_control *oc)
 {
 	unsigned long freed = 0;
 	enum oom_constraint constraint = CONSTRAINT_NONE;
+	wait_queue_head_t *w;
 
 	if (oom_killer_disabled)
 		return false;
@@ -1056,6 +1088,17 @@ bool out_of_memory(struct oom_control *oc)
 		return true;
 	}
 
+	/*
+	 * Check death row for current memcg or global.
+	 */
+	if (!is_memcg_oom(oc)) {
+		w = oom_target_get_wait();
+		if (waitqueue_active(w)) {
+			wake_up(w);
+			return true;
+		}
+	}
+
 	select_bad_process(oc);
 	/* Found nothing?!?! Either we hang forever, or we panic. */
 	if (!oc->chosen && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) {
-- 
2.14.1

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [RFC v4] It is common for services to be stateless around their main event loop. If a process sets PR_SET_IDLE to PR_IDLE_MODE_KILLME then it signals to the kernel that epoll_wait() and friends may not complete, and the kernel may send SIGKILL if resources get tight.
  2017-11-21  5:16     ` [RFC v4] " Shawn Landden
@ 2017-11-21  5:26       ` Shawn Landden
  2017-11-21  9:14       ` Thomas Gleixner
  1 sibling, 0 replies; 14+ messages in thread
From: Shawn Landden @ 2017-11-21  5:26 UTC (permalink / raw)
  To: Shawn Landden
  Cc: linux-kernel, linux-fsdevel, linux-mm, linux-api, Michal Hocko,
	willy

On Mon, Nov 20, 2017 at 9:16 PM, Shawn Landden <slandden@gmail.com> wrote:
> See my systemd patch: https://github.com/shawnl/systemd/tree/prctl
>
> Android uses this memory model for all programs, and having it in the
> kernel will enable integration with the page cache (not in this
> series).
What about having a dedicated way to kill these type of processes,
instead of overloading the OOM killer? This was suggested by
Colin Walters <walters@verbum.org>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC v2] prctl: prctl(PR_SET_IDLE, PR_IDLE_MODE_KILLME), for stateless idle loops
  2017-11-21  4:48         ` Shawn Landden
@ 2017-11-21  7:05           ` Michal Hocko
  0 siblings, 0 replies; 14+ messages in thread
From: Michal Hocko @ 2017-11-21  7:05 UTC (permalink / raw)
  To: Shawn Landden; +Cc: linux-kernel, linux-fsdevel, linux-mm, linux-api

On Mon 20-11-17 20:48:10, Shawn Landden wrote:
> On Mon, Nov 20, 2017 at 12:35 AM, Michal Hocko <mhocko@kernel.org> wrote:
> > On Fri 17-11-17 20:45:03, Shawn Landden wrote:
> >> On Fri, Nov 3, 2017 at 2:09 AM, Michal Hocko <mhocko@kernel.org> wrote:
> >>
> >> > On Thu 02-11-17 23:35:44, Shawn Landden wrote:
> >> > > It is common for services to be stateless around their main event loop.
> >> > > If a process sets PR_SET_IDLE to PR_IDLE_MODE_KILLME then it
> >> > > signals to the kernel that epoll_wait() and friends may not complete,
> >> > > and the kernel may send SIGKILL if resources get tight.
> >> > >
> >> > > See my systemd patch: https://github.com/shawnl/systemd/tree/prctl
> >> > >
> >> > > Android uses this memory model for all programs, and having it in the
> >> > > kernel will enable integration with the page cache (not in this
> >> > > series).
> >> > >
> >> > > 16 bytes per process is kinda spendy, but I want to keep
> >> > > lru behavior, which mem_score_adj does not allow. When a supervisor,
> >> > > like Android's user input is keeping track this can be done in
> >> > user-space.
> >> > > It could be pulled out of task_struct if an cross-indexing additional
> >> > > red-black tree is added to support pid-based lookup.
> >> >
> >> > This is still an abuse and the patch is wrong. We really do have an API
> >> > to use I fail to see why you do not use it.
> >> >
> >> When I looked at wait_queue_head_t it was 20 byes.
> >
> > I do not understand. What I meant to say is that we do have a proper
> > user api to hint OOM killer decisions.
> This is a FIFO queue, rather than a heuristic, which is all you get
> with the current API.

Yes I can read the code. All I am saing is that we already have an API
to achieve what you want or at least very similar.

Let me be explicit.
Nacked-by: Michal Hocko <mhocko@suse.com>
until it is sufficiently explained that the oom_score_adj is not
suitable and there are no other means to achieve what you need.
-- 
Michal Hocko
SUSE Labs

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC v4] It is common for services to be stateless around their main event loop. If a process sets PR_SET_IDLE to PR_IDLE_MODE_KILLME then it signals to the kernel that epoll_wait() and friends may not complete, and the kernel may send SIGKILL if resources get tight.
  2017-11-21  5:16     ` [RFC v4] " Shawn Landden
  2017-11-21  5:26       ` Shawn Landden
@ 2017-11-21  9:14       ` Thomas Gleixner
  1 sibling, 0 replies; 14+ messages in thread
From: Thomas Gleixner @ 2017-11-21  9:14 UTC (permalink / raw)
  To: Shawn Landden
  Cc: linux-kernel, linux-fsdevel, linux-mm, linux-api, mhocko, willy

On Mon, 20 Nov 2017, Shawn Landden wrote:

Please use a short and comprehensible subject line and do not pack a full
sentence into it. The sentence wants to be in the change log body.

> +static DECLARE_WAIT_QUEUE_HEAD(oom_target);
> +
> +/* Clean up after a EPOLL_KILLME process quits.
> + * Called by kernel/exit.c.

It's hardly called by kernel/exit.c and aside of that multi line comments
are formatted like this:

/*
 * ....
 * ....
 */

> + */
> +void exit_oom_target(void)
> +{
> +	DECLARE_WAITQUEUE(wait, current);
> +
> +	remove_wait_queue(&oom_target, &wait);

This is completely pointless, really. It does:

     	INIT_LIST_HEAD(&wait.entry);

	spin_lock_irqsave(&oom_target->lock, flags);
	list_del(&wait->entry);
	spin_lock_irqrestore(&oom_target->lock, flags);

IOW. It's a NOOP. What are you trying to achieve?

> +}
> +
> +inline struct wait_queue_head *oom_target_get_wait()
> +{
> +	return &oom_target;

This wrapper is useless.

> +}
> +
>  #ifdef CONFIG_NUMA
>  /**
>   * has_intersects_mems_allowed() - check task eligiblity for kill
> @@ -994,6 +1013,18 @@ int unregister_oom_notifier(struct notifier_block *nb)
>  }
>  EXPORT_SYMBOL_GPL(unregister_oom_notifier);
>  
> +int oom_target_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
> +{
> +	struct task_struct *ts = wait->private;
> +
> +	/* We use SIGKILL instead of the oom killer
> +	 * so as to cleanly interrupt ep_poll()

Huch? oom_killer uses SIGKILL as well, it just does it correctly.

> +	 */
> +	pr_debug("Killing pid %u from prctl(PR_SET_IDLE) death row.\n", ts->pid);
> +	send_sig(SIGKILL, ts, 1);
> +	return 0;
> +}
> +
>  /**
>   * out_of_memory - kill the "best" process when we run out of memory
>   * @oc: pointer to struct oom_control
> @@ -1007,6 +1038,7 @@ bool out_of_memory(struct oom_control *oc)
>  {
>  	unsigned long freed = 0;
>  	enum oom_constraint constraint = CONSTRAINT_NONE;
> +	wait_queue_head_t *w;
>  
>  	if (oom_killer_disabled)
>  		return false;
> @@ -1056,6 +1088,17 @@ bool out_of_memory(struct oom_control *oc)
>  		return true;
>  	}
>  
> +	/*
> +	 * Check death row for current memcg or global.
> +	 */
> +	if (!is_memcg_oom(oc)) {
> +		w = oom_target_get_wait();
> +		if (waitqueue_active(w)) {
> +			wake_up(w);
> +			return true;
> +		}
> +	}

Why on earth do you need that extra wait_queue magic?

You completely fail to explain in your empty changelog why the existing
oom hinting infrastructure is not sufficient.

If you can explain why, then there is no reason to have this side
channel. Extend/fix the current hinting mechanism and be done with it.

Thanks,

	tglx

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC v2] prctl: prctl(PR_SET_IDLE, PR_IDLE_MODE_KILLME), for stateless idle loops
  2017-11-03  6:35 ` [RFC v2] prctl: prctl(PR_SET_IDLE, PR_IDLE_MODE_KILLME), for stateless idle loops Shawn Landden
  2017-11-03  9:09   ` Michal Hocko
  2017-11-21  4:49   ` [RFC v3] It is common for services to be stateless around their main event loop. If a process sets PR_SET_IDLE to PR_IDLE_MODE_KILLME then it signals to the kernel that epoll_wait() and friends may not complete, and the kernel may send SIGKILL if resources get tight Shawn Landden
@ 2017-11-22 10:29   ` peter enderborg
  2 siblings, 0 replies; 14+ messages in thread
From: peter enderborg @ 2017-11-22 10:29 UTC (permalink / raw)
  To: Shawn Landden; +Cc: linux-kernel, linux-fsdevel, linux-mm, linux-api

On 11/03/2017 07:35 AM, Shawn Landden wrote:
> It is common for services to be stateless around their main event loop.
> If a process sets PR_SET_IDLE to PR_IDLE_MODE_KILLME then it
> signals to the kernel that epoll_wait() and friends may not complete,
> and the kernel may send SIGKILL if resources get tight.
>
> See my systemd patch: https://github.com/shawnl/systemd/tree/prctl
>
> Android uses this memory model for all programs, and having it in the
> kernel will enable integration with the page cache (not in this
> series).
>
> 16 bytes per process is kinda spendy, but I want to keep
> lru behavior, which mem_score_adj does not allow. When a supervisor,
> like Android's user input is keeping track this can be done in user-space.
> It could be pulled out of task_struct if an cross-indexing additional
> red-black tree is added to support pid-based lookup.
What android version is using systemd?
In android there is a OnTrimMemory that is sent from activitymanager that
you can listen on and make a nice exit.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2017-11-22 10:29 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <20171101053244.5218-1-slandden@gmail.com>
2017-11-03  6:35 ` [RFC v2] prctl: prctl(PR_SET_IDLE, PR_IDLE_MODE_KILLME), for stateless idle loops Shawn Landden
2017-11-03  9:09   ` Michal Hocko
2017-11-18  4:45     ` Shawn Landden
     [not found]       ` <CA+49okqZ8CME0EN1xS_cCTc5Q-fGRreg0makhzNNuRpGs3mjfw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-11-19  4:19         ` Matthew Wilcox
2017-11-20  8:35       ` Michal Hocko
2017-11-21  4:48         ` Shawn Landden
2017-11-21  7:05           ` Michal Hocko
2017-11-18 20:33     ` Shawn Landden
2017-11-21  4:49   ` [RFC v3] It is common for services to be stateless around their main event loop. If a process sets PR_SET_IDLE to PR_IDLE_MODE_KILLME then it signals to the kernel that epoll_wait() and friends may not complete, and the kernel may send SIGKILL if resources get tight Shawn Landden
2017-11-21  4:56     ` Shawn Landden
2017-11-21  5:16     ` [RFC v4] " Shawn Landden
2017-11-21  5:26       ` Shawn Landden
2017-11-21  9:14       ` Thomas Gleixner
2017-11-22 10:29   ` [RFC v2] prctl: prctl(PR_SET_IDLE, PR_IDLE_MODE_KILLME), for stateless idle loops peter enderborg

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).