All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC/RFT PATCH] sched: automated per tty task groups
@ 2010-10-19  9:16 Mike Galbraith
  2010-10-19  9:26 ` Peter Zijlstra
                   ` (3 more replies)
  0 siblings, 4 replies; 31+ messages in thread
From: Mike Galbraith @ 2010-10-19  9:16 UTC (permalink / raw)
  To: LKML; +Cc: Ingo Molnar, Peter Zijlstra, Linus Torvalds

Greetings,

Comments, suggestions etc highly welcome.

This patch implements an idea from Linus, to automatically create task groups
per tty, to improve desktop interactivity under hefty load such as kbuild.  The
feature is enabled from boot by default,  The default setting can be changed via
the boot option ttysched=0, and can be can be turned on or off on the fly via
echo [01] > /proc/sys/kernel/sched_tty_sched_enabled.

A 100% hog overhead measurement proggy pinned to the same CPU as a make -j10

pert/s:      229 >5484.43us:       41 min:  0.15 max:12069.42 avg:2193.81 sum/s:502382us overhead:50.24%
pert/s:      222 >5652.28us:       43 min:  0.46 max:12077.31 avg:2248.56 sum/s:499181us overhead:49.92%
pert/s:      211 >5809.38us:       43 min:  0.16 max:12064.78 avg:2381.70 sum/s:502538us overhead:50.25%
pert/s:      223 >6147.92us:       43 min:  0.15 max:16107.46 avg:2282.17 sum/s:508925us overhead:50.49%
pert/s:      218 >6252.64us:       43 min:  0.16 max:12066.13 avg:2324.11 sum/s:506656us overhead:50.27%

Signed-off-by: Mike Galbraith <efault@gmx.de>

---
 drivers/char/tty_io.c |    2 
 include/linux/sched.h |   14 +++++
 include/linux/tty.h   |    3 +
 init/Kconfig          |   13 +++++
 kernel/sched.c        |    9 +++
 kernel/sched_tty.c    |  128 ++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched_tty.h    |    7 ++
 kernel/sysctl.c       |   11 ++++
 8 files changed, 186 insertions(+), 1 deletion(-)

Index: linux-2.6.36.git/include/linux/sched.h
===================================================================
--- linux-2.6.36.git.orig/include/linux/sched.h
+++ linux-2.6.36.git/include/linux/sched.h
@@ -1900,6 +1900,20 @@ int sched_rt_handler(struct ctl_table *t
 
 extern unsigned int sysctl_sched_compat_yield;
 
+#ifdef CONFIG_SCHED_DESKTOP
+int sched_tty_sched_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp,
+		loff_t *ppos);
+
+extern unsigned int sysctl_sched_tty_sched_enabled;
+
+void tty_sched_create_group(struct tty_struct *tty);
+void tty_sched_destroy_group(struct tty_struct *tty);
+#else
+static inline void tty_sched_create_group(struct tty_struct *tty) { }
+static inline void tty_sched_destroy_group(struct tty_struct *tty) { }
+#endif
+
 #ifdef CONFIG_RT_MUTEXES
 extern int rt_mutex_getprio(struct task_struct *p);
 extern void rt_mutex_setprio(struct task_struct *p, int prio);
Index: linux-2.6.36.git/include/linux/tty.h
===================================================================
--- linux-2.6.36.git.orig/include/linux/tty.h
+++ linux-2.6.36.git/include/linux/tty.h
@@ -327,6 +327,9 @@ struct tty_struct {
 	/* If the tty has a pending do_SAK, queue it here - akpm */
 	struct work_struct SAK_work;
 	struct tty_port *port;
+#ifdef CONFIG_SCHED_DESKTOP
+	struct task_group *tg;
+#endif
 };
 
 /* Each of a tty's open files has private_data pointing to tty_file_private */
Index: linux-2.6.36.git/kernel/sched.c
===================================================================
--- linux-2.6.36.git.orig/kernel/sched.c
+++ linux-2.6.36.git/kernel/sched.c
@@ -78,6 +78,7 @@
 
 #include "sched_cpupri.h"
 #include "workqueue_sched.h"
+#include "sched_tty.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
@@ -612,11 +613,16 @@ static inline int cpu_of(struct rq *rq)
  */
 static inline struct task_group *task_group(struct task_struct *p)
 {
+	struct task_group *tg;
 	struct cgroup_subsys_state *css;
 
 	css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
 			lockdep_is_held(&task_rq(p)->lock));
-	return container_of(css, struct task_group, css);
+	tg = container_of(css, struct task_group, css);
+
+	tty_sched_check_attach(p, &tg);
+
+	return tg;
 }
 
 /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
@@ -1920,6 +1926,7 @@ static void deactivate_task(struct rq *r
 #include "sched_idletask.c"
 #include "sched_fair.c"
 #include "sched_rt.c"
+#include "sched_tty.c"
 #ifdef CONFIG_SCHED_DEBUG
 # include "sched_debug.c"
 #endif
Index: linux-2.6.36.git/drivers/char/tty_io.c
===================================================================
--- linux-2.6.36.git.orig/drivers/char/tty_io.c
+++ linux-2.6.36.git/drivers/char/tty_io.c
@@ -185,6 +185,7 @@ void free_tty_struct(struct tty_struct *
 {
 	kfree(tty->write_buf);
 	tty_buffer_free_all(tty);
+	tty_sched_destroy_group(tty);
 	kfree(tty);
 }
 
@@ -2823,6 +2824,7 @@ void initialize_tty_struct(struct tty_st
 	tty->ops = driver->ops;
 	tty->index = idx;
 	tty_line_name(driver, idx, tty->name);
+	tty_sched_create_group(tty);
 }
 
 /**
Index: linux-2.6.36.git/kernel/sched_tty.h
===================================================================
--- /dev/null
+++ linux-2.6.36.git/kernel/sched_tty.h
@@ -0,0 +1,7 @@
+#ifdef CONFIG_SCHED_DESKTOP
+static inline void
+tty_sched_check_attach(struct task_struct *p, struct task_group **tg);
+#else
+static inline void
+tty_sched_check_attach(struct task_struct *p, struct task_group **tg) { }
+#endif
Index: linux-2.6.36.git/kernel/sched_tty.c
===================================================================
--- /dev/null
+++ linux-2.6.36.git/kernel/sched_tty.c
@@ -0,0 +1,128 @@
+#ifdef CONFIG_SCHED_DESKTOP
+#include <linux/tty.h>
+
+unsigned int __read_mostly sysctl_sched_tty_sched_enabled = 1;
+
+void tty_sched_create_group(struct tty_struct *tty)
+{
+	tty->tg = sched_create_group(&init_task_group);
+	if (IS_ERR(tty->tg)) {
+		tty->tg = &init_task_group;
+		 WARN_ON(1);
+	}
+}
+EXPORT_SYMBOL(tty_sched_create_group);
+
+void tty_sched_destroy_group(struct tty_struct *tty)
+{
+	if (tty->tg && tty->tg != &init_task_group)
+		sched_destroy_group(tty->tg);
+}
+EXPORT_SYMBOL(tty_sched_destroy_group);
+
+static inline void
+tty_sched_check_attach(struct task_struct *p, struct task_group **tg)
+{
+	struct tty_struct *tty;
+	int attach = 0, enabled = sysctl_sched_tty_sched_enabled;
+
+	rcu_read_lock();
+	tty = p->signal->tty;
+	if (!tty)
+		goto out_unlock;
+
+	if (enabled && *tg == &root_task_group) {
+		*tg = p->signal->tty->tg;
+		attach = 1;
+	} else if (!enabled && *tg == tty->tg) {
+		*tg = &root_task_group;
+		attach = 1;
+	}
+
+	if (attach && !p->se.on_rq) {
+		p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
+		p->se.vruntime += (*tg)->cfs_rq[task_cpu(p)]->min_vruntime;
+	}
+
+out_unlock:
+	rcu_read_unlock();
+}
+
+void tty_sched_move_task(struct task_struct *p, struct task_group *tg)
+{
+	struct sched_entity *se = &p->se;
+	struct rq *rq;
+	unsigned long flags;
+	int on_rq, running, cpu;
+
+	rq = task_rq_lock(p, &flags);
+
+	running = task_current(rq, p);
+	on_rq = se->on_rq;
+	cpu = rq->cpu;
+
+	if (on_rq)
+		dequeue_task(rq, p, 0);
+	if (unlikely(running))
+		p->sched_class->put_prev_task(rq, p);
+
+	if (!on_rq)
+		se->vruntime -= cfs_rq_of(se)->min_vruntime;
+
+	se->cfs_rq = tg->cfs_rq[cpu];
+	se->parent = tg->se[cpu];
+
+	p->rt.rt_rq  = tg->rt_rq[cpu];
+	p->rt.parent = tg->rt_se[cpu];
+
+	if (!on_rq)
+		se->vruntime += cfs_rq_of(se)->min_vruntime;
+
+	if (unlikely(running))
+		p->sched_class->set_curr_task(rq);
+	if (on_rq)
+		enqueue_task(rq, p, 0);
+
+	task_rq_unlock(rq, &flags);
+}
+
+int sched_tty_sched_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp,
+		loff_t *ppos)
+{
+	struct task_struct *p, *t;
+	struct task_group *tg;
+	unsigned long flags;
+	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+	if (ret || !write)
+		return ret;
+
+	read_lock_irqsave(&tasklist_lock, flags);
+
+	rcu_read_lock();
+	for_each_process(p) {
+		tg = task_group(p);
+		tty_sched_move_task(p, tg);
+		list_for_each_entry_rcu(t, &p->thread_group, thread_group) {
+			tty_sched_move_task(t, tg);
+		}
+	}
+	rcu_read_unlock();
+
+	read_unlock_irqrestore(&tasklist_lock, flags);
+
+	return 0;
+}
+
+static int __init setup_tty_sched(char *str)
+{
+	unsigned long val;
+
+	val = simple_strtoul(str, NULL, 0);
+	sysctl_sched_tty_sched_enabled = val ? 1 : 0;
+
+	return 1;
+}
+__setup("ttysched=", setup_tty_sched);
+#endif
Index: linux-2.6.36.git/kernel/sysctl.c
===================================================================
--- linux-2.6.36.git.orig/kernel/sysctl.c
+++ linux-2.6.36.git/kernel/sysctl.c
@@ -384,6 +384,17 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+#ifdef CONFIG_SCHED_DESKTOP
+	{
+		.procname	= "sched_tty_sched_enabled",
+		.data		= &sysctl_sched_tty_sched_enabled,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= sched_tty_sched_handler,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+#endif
 #ifdef CONFIG_PROVE_LOCKING
 	{
 		.procname	= "prove_locking",
Index: linux-2.6.36.git/init/Kconfig
===================================================================
--- linux-2.6.36.git.orig/init/Kconfig
+++ linux-2.6.36.git/init/Kconfig
@@ -652,6 +652,19 @@ config DEBUG_BLK_CGROUP
 
 endif # CGROUPS
 
+config SCHED_DESKTOP
+	bool "Desktop centric group scheduling"
+	depends on EXPERIMENTAL
+	select CGROUPS
+	select CGROUP_SCHED
+	select FAIR_GROUP_SCHED
+	select RT_GROUP_SCHED
+	select BLK_CGROUP
+	help
+	  This option optimizes the group scheduler for common desktop workloads,
+	  by creating separate per tty groups. This separation of workloads isolates
+	  aggressive CPU burners (like build jobs) from desktop applications.
+
 config MM_OWNER
 	bool
 



^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC/RFT PATCH] sched: automated per tty task groups
  2010-10-19  9:16 Mike Galbraith
@ 2010-10-19  9:26 ` Peter Zijlstra
  2010-10-19  9:39   ` Mike Galbraith
  2010-10-19  9:29 ` Peter Zijlstra
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 31+ messages in thread
From: Peter Zijlstra @ 2010-10-19  9:26 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: LKML, Ingo Molnar, Linus Torvalds

On Tue, 2010-10-19 at 11:16 +0200, Mike Galbraith wrote:
> Greetings,
> 
> Comments, suggestions etc highly welcome.

You might be wanting to exclude RT tasks from the tty groups since there
is no interface to grant them any runtime and such :-)

Also, I think tty_sched_move_task and sched_move_task() should be
sharing lots more code -- I recently proposed a fix for
sched_move_task() because the Android people complained, but they
haven't replied back yet..


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC/RFT PATCH] sched: automated per tty task groups
  2010-10-19  9:16 Mike Galbraith
  2010-10-19  9:26 ` Peter Zijlstra
@ 2010-10-19  9:29 ` Peter Zijlstra
  2010-10-19  9:42   ` Mike Galbraith
  2010-10-19 11:29 ` Mike Galbraith
  2010-10-20 13:55 ` Markus Trippelsdorf
  3 siblings, 1 reply; 31+ messages in thread
From: Peter Zijlstra @ 2010-10-19  9:29 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: LKML, Ingo Molnar, Linus Torvalds

On Tue, 2010-10-19 at 11:16 +0200, Mike Galbraith wrote:
> +       read_lock_irqsave(&tasklist_lock, flags);
> +
> +       rcu_read_lock();
> +       for_each_process(p) {
> +               tg = task_group(p);
> +               tty_sched_move_task(p, tg);
> +               list_for_each_entry_rcu(t, &p->thread_group, thread_group) {
> +                       tty_sched_move_task(t, tg);
> +               }
> +       }
> +       rcu_read_unlock();
> +
> +       read_unlock_irqrestore(&tasklist_lock, flags); 

I don't think you need to disable IRQs for tasklist lock, nor do I think
you actually need it.

If you enable tty groups and then scan all the existing tasks you've
covered them all, new tasks will already be placed right, dying tasks we
don't care about anyway.



^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC/RFT PATCH] sched: automated per tty task groups
  2010-10-19  9:26 ` Peter Zijlstra
@ 2010-10-19  9:39   ` Mike Galbraith
  2010-10-19  9:43     ` Peter Zijlstra
  0 siblings, 1 reply; 31+ messages in thread
From: Mike Galbraith @ 2010-10-19  9:39 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: LKML, Ingo Molnar, Linus Torvalds

On Tue, 2010-10-19 at 11:26 +0200, Peter Zijlstra wrote:
> On Tue, 2010-10-19 at 11:16 +0200, Mike Galbraith wrote:
> > Greetings,
> > 
> > Comments, suggestions etc highly welcome.
> 
> You might be wanting to exclude RT tasks from the tty groups since there
> is no interface to grant them any runtime and such :-)

:)

> Also, I think tty_sched_move_task and sched_move_task() should be
> sharing lots more code -- I recently proposed a fix for
> sched_move_task() because the Android people complained, but they
> haven't replied back yet..

Yeah. I should be able to just do sched_move_task(), but it doesn't
currently work even with your patch, turning tty_sched on/off can lead
to incredible delays before you get box back.  With virgin source, it's
size infinity for all intents and purposes.

	-Mike


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC/RFT PATCH] sched: automated per tty task groups
  2010-10-19  9:29 ` Peter Zijlstra
@ 2010-10-19  9:42   ` Mike Galbraith
  0 siblings, 0 replies; 31+ messages in thread
From: Mike Galbraith @ 2010-10-19  9:42 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: LKML, Ingo Molnar, Linus Torvalds

On Tue, 2010-10-19 at 11:29 +0200, Peter Zijlstra wrote:
> On Tue, 2010-10-19 at 11:16 +0200, Mike Galbraith wrote:
> > +       read_lock_irqsave(&tasklist_lock, flags);
> > +
> > +       rcu_read_lock();
> > +       for_each_process(p) {
> > +               tg = task_group(p);
> > +               tty_sched_move_task(p, tg);
> > +               list_for_each_entry_rcu(t, &p->thread_group, thread_group) {
> > +                       tty_sched_move_task(t, tg);
> > +               }
> > +       }
> > +       rcu_read_unlock();
> > +
> > +       read_unlock_irqrestore(&tasklist_lock, flags); 
> 
> I don't think you need to disable IRQs for tasklist lock, nor do I think
> you actually need it.

OK, thanks.  (No such thing as too paranoid;)

	-Mike


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC/RFT PATCH] sched: automated per tty task groups
  2010-10-19  9:39   ` Mike Galbraith
@ 2010-10-19  9:43     ` Peter Zijlstra
  2010-10-19  9:46       ` Mike Galbraith
  2010-10-21  7:55       ` Mike Galbraith
  0 siblings, 2 replies; 31+ messages in thread
From: Peter Zijlstra @ 2010-10-19  9:43 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: LKML, Ingo Molnar, Linus Torvalds

On Tue, 2010-10-19 at 11:39 +0200, Mike Galbraith wrote:
> 
> Yeah. I should be able to just do sched_move_task(), but it doesn't
> currently work even with your patch, turning tty_sched on/off can lead
> to incredible delays before you get box back.  With virgin source, it's
> size infinity for all intents and purposes.

Ah, first feedback I've got on that patch,. but surely since you created
one that does work we can fix my patch and use the normal path? :-)

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC/RFT PATCH] sched: automated per tty task groups
  2010-10-19  9:43     ` Peter Zijlstra
@ 2010-10-19  9:46       ` Mike Galbraith
  2010-10-21  7:55       ` Mike Galbraith
  1 sibling, 0 replies; 31+ messages in thread
From: Mike Galbraith @ 2010-10-19  9:46 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: LKML, Ingo Molnar, Linus Torvalds

On Tue, 2010-10-19 at 11:43 +0200, Peter Zijlstra wrote:
> On Tue, 2010-10-19 at 11:39 +0200, Mike Galbraith wrote:
> > 
> > Yeah. I should be able to just do sched_move_task(), but it doesn't
> > currently work even with your patch, turning tty_sched on/off can lead
> > to incredible delays before you get box back.  With virgin source, it's
> > size infinity for all intents and purposes.
> 
> Ah, first feedback I've got on that patch,. but surely since you created
> one that does work we can fix my patch and use the normal path? :-)

Yeah.  I wanted to get the RFC out the door, so took a shortcut.

	-Mike


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC/RFT PATCH] sched: automated per tty task groups
  2010-10-19  9:16 Mike Galbraith
  2010-10-19  9:26 ` Peter Zijlstra
  2010-10-19  9:29 ` Peter Zijlstra
@ 2010-10-19 11:29 ` Mike Galbraith
  2010-10-19 11:56   ` Ingo Molnar
  2010-10-19 15:28   ` Linus Torvalds
  2010-10-20 13:55 ` Markus Trippelsdorf
  3 siblings, 2 replies; 31+ messages in thread
From: Mike Galbraith @ 2010-10-19 11:29 UTC (permalink / raw)
  To: LKML; +Cc: Ingo Molnar, Peter Zijlstra, Linus Torvalds, Mathieu Desnoyers

[-- Attachment #1: Type: text/plain, Size: 1962 bytes --]

It was suggested that I show a bit more info.

On Tue, 2010-10-19 at 11:16 +0200, Mike Galbraith wrote:

> A 100% hog overhead measurement proggy pinned to the same CPU as a make -j10
> 
> pert/s:      229 >5484.43us:       41 min:  0.15 max:12069.42 avg:2193.81 sum/s:502382us overhead:50.24%
> pert/s:      222 >5652.28us:       43 min:  0.46 max:12077.31 avg:2248.56 sum/s:499181us overhead:49.92%
> pert/s:      211 >5809.38us:       43 min:  0.16 max:12064.78 avg:2381.70 sum/s:502538us overhead:50.25%
> pert/s:      223 >6147.92us:       43 min:  0.15 max:16107.46 avg:2282.17 sum/s:508925us overhead:50.49%
> pert/s:      218 >6252.64us:       43 min:  0.16 max:12066.13 avg:2324.11 sum/s:506656us overhead:50.27%

The same load without per tty task groups.

pert/s:       31 >40475.37us:        3 min:  0.37 max:48103.60 avg:29573.74 sum/s:916786us overhead:90.24%
pert/s:       23 >41237.70us:       12 min:  0.36 max:56010.39 avg:40187.01 sum/s:924301us overhead:91.99%
pert/s:       24 >42150.22us:       12 min:  8.86 max:61265.91 avg:39459.91 sum/s:947038us overhead:92.20%
pert/s:       26 >42344.91us:       11 min:  3.83 max:52029.60 avg:36164.70 sum/s:940282us overhead:91.12%
pert/s:       24 >44262.90us:       14 min:  5.05 max:82735.15 avg:40314.33 sum/s:967544us overhead:92.22%
                                                      ^^^^^usecs   ^^^^^usecs                       ^^the competition got

Average service latency is an order of magnitude better with tty_sched.
(Imagine that pert is Xorg or whatnot instead)

Using Mathieu Desnoyers' wakeup-latency testcase (attached):

With taskset -c 3 make -j 10 running..

taskset -c 3 ./wakeup-latency& sleep 30;killall wakeup-latency

without:
maximum latency: 42963.2 µs
average latency: 9077.0 µs
missed timer events: 0

with:
maximum latency: 4160.7 µs
average latency: 149.4 µs
missed timer events: 0

Patch makes a big difference in desktop feel under hefty load here.

	-Mike

[-- Attachment #2: wakeup-latency.c --]
[-- Type: text/x-csrc, Size: 7307 bytes --]

/*

    Test application to test wakeup latency under different kinds of
    conditions and scheduling systems.

    Copyright (C) 2008 Nokia Corporation.
    Copyright (C) 2008 Jussi Laako <jussi@sonarnerd.net>

    Contact: Jussi Laako <jussi.laako@nokia.com>
             Mathieu Desnoyers <mathieu.desnoyers@efficios.com>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA

    Compile with:
    gcc -lm -lrt -o wakeup-latency wakeup-latency.c
*/


#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <signal.h>
#include <time.h>
#include <math.h>
#include <sched.h>
#include <pthread.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

#define WL_CLOCK_ID	CLOCK_MONOTONIC
/*#define WL_CLOCK_ID	CLOCK_REALTIME*/
#define WL_INTERVAL	10			/* in milliseconds */
#define WL_DELTA	0.01			/* same as above, in seconds */
#define WL_REPORT	0.005			/* report threshold */
#define WL_BLOCKSIZE	480
#define WL_WORKCOUNT	2


typedef struct _ctx_t
{
	timer_t tid;
	unsigned count;
	unsigned overruns;
	double prevtime;
	double maxlat;
	double avglat;
	int dowork;
} ctx_t;


static int workcount = WL_WORKCOUNT;
static float x[4 + 1] = { 0 };
static float y[4] = { 0 };
static float wrk[WL_BLOCKSIZE];

static int tracefd;

static void usertrace(char *str)
{
	write(tracefd, str, strlen(str));
}

static inline double time_as_double (void)
{
	double restime;
	struct timespec ts = { 0 };  /* zero result in case of failure */

	clock_gettime(WL_CLOCK_ID, &ts);

	restime = (double) ts.tv_sec;
	restime += (double) ts.tv_nsec * 1.0e-9;

	return restime;
}


static void flt (float *data, int count)
{
	int i;
	float by;
	float ay;
	float yn;
	static const float a[] = {
		1.0F,
		2.000399874F,
		1.777661733F,
		0.7472161963F,
		0.1247940929F
	};
	static const float b[] = {
		0.3531294936F,
		1.412517974F,
		2.118776961F,
		1.412517974F,
		0.3531294936F
	};

	while (count)
	{
		for (i = 1; i <= 4; i++)
			x[i - 1] = x[i];
		x[4] = *data;
		by = 0;
		for (i = 0; i <= 4; i++)
			by += b[i] * x[4 - i];
		ay = 0;
		for (i = 1; i <= 4; i++)
			ay += a[i] * y[4 - i];
		yn = by - ay;
		for (i = 1; i < 4; i++)
			y[i - 1] = y[i];
		y[4 - 1] = yn;
		*data++ = yn;
		count--;
	}
}


static void sig_cb (int signo)
{
	/* no-op */
}


static void event_cb (sigval_t sval)
{
	int i, w;
	int or;
	int rnd;
	float s;
	float rms;
	double curtime;
	double delta;
	double lat;
	ctx_t *lctx = (ctx_t *) sval.sival_ptr;
	char buf[256];

	curtime = time_as_double();
	or = timer_getoverrun(lctx->tid);
	if (or > 0)
	{
		snprintf(buf, 256, "overruns: %i", or);
		usertrace(buf);
		printf("%s\n", buf);
		lctx->overruns += or;
	}
	delta = curtime - lctx->prevtime;
	if (delta > WL_DELTA)
	{
		lat = delta - WL_DELTA;
		/* display only the ones exceeding the threshold */
		if (lat > WL_REPORT) {
			snprintf(buf, 256, "late by: %.1f µs", lat * 1.0e6);
			usertrace(buf);
			printf("%s\n", buf);
		}
		if (lat > lctx->maxlat)
			lctx->maxlat = lat;
		lctx->avglat += lat;
		lctx->count++;
	}
	lctx->prevtime = curtime;
	if (lctx->dowork)
	{
		for (w = 0; w < workcount; w++)
		{
			s = 1.0F / (float) (RAND_MAX / 2);
			for (i = 0; i < WL_BLOCKSIZE; i++)
			{
				rnd = rand() - (RAND_MAX >> 1);
				wrk[i] = (float) rnd * s;
			}
			flt(wrk, WL_BLOCKSIZE);
			rms = 0.0F;
			for (i = 0; i < WL_BLOCKSIZE; i++)
				rms += wrk[i];
			rms = sqrtf(rms / (float) WL_BLOCKSIZE);
		}
	}
}


static int set_scheduling (int schedpol)
{
	struct sched_param schedparm = { 0 };

	printf("min priority: %i, max priority: %i\n",
		sched_get_priority_min(schedpol),
		sched_get_priority_max(schedpol));
	if (schedpol == SCHED_FIFO || schedpol == SCHED_RR)
		schedparm.sched_priority = sched_get_priority_min(schedpol);
	else
		schedparm.sched_priority = sched_get_priority_max(schedpol);
	if (sched_setscheduler(0, schedpol, &schedparm))
	{
		perror("sched_setscheduler()");
		return -1;
	}
	return 0;
}


static int create_source (ctx_t *ctx, int schedpol)
{
	int res = 0;
	struct sigevent sev = { 0 };
	struct sched_param schedparm = { 0 };
	pthread_attr_t tattr;

	pthread_attr_init(&tattr);
	if (pthread_attr_setinheritsched(&tattr, PTHREAD_INHERIT_SCHED))
	{
		pthread_attr_setinheritsched(&tattr, PTHREAD_EXPLICIT_SCHED);
		pthread_attr_setschedpolicy(&tattr, schedpol);
		if (schedpol == SCHED_FIFO || schedpol == SCHED_RR)
			schedparm.sched_priority =
				sched_get_priority_min(schedpol);
		else
			schedparm.sched_priority =
				sched_get_priority_max(schedpol);
		pthread_attr_setschedparam(&tattr, &schedparm);
	}

	sev.sigev_notify = SIGEV_THREAD;
	sev.sigev_signo = SIGRTMIN;
	sev.sigev_value.sival_ptr = ctx;
	sev.sigev_notify_function = event_cb;
	sev.sigev_notify_attributes = &tattr;
	if (timer_create(WL_CLOCK_ID, &sev, &ctx->tid))
	{
		perror("timer_create()");
		res = -1;
	}

	pthread_attr_destroy(&tattr);

	return res;
}


static int set_timer (ctx_t *ctx)
{
	struct itimerspec ts;

	ts.it_interval.tv_sec = 0;
	ts.it_interval.tv_nsec = WL_INTERVAL * 1000000;
	ts.it_value.tv_sec = 0;
	ts.it_value.tv_nsec = WL_INTERVAL * 1000000;
	ctx->prevtime = time_as_double();
	if (timer_settime(ctx->tid, 0, &ts, NULL))
	{
		perror("timer_settime()");
		return -1;
	}
	return 0;
}


int main (int argc, char *argv[])
{
	int i;
	int w;
	int schedpol = 0;
	int sig;
	int ret = 0;
	sigset_t ss;
	ctx_t ctx = { 0 };

	tracefd = open("/debugfs/ltt/write_event", O_WRONLY);

	signal(SIGINT, sig_cb);
	signal(SIGTERM, sig_cb);
	srand(time(NULL));
	for (i = 1; i < argc; i++)
	{
		if (strcmp(argv[i], "--use-mm") == 0)
			schedpol = 6;
		else if (strcmp(argv[i], "--use-fifo") == 0)
			schedpol = SCHED_FIFO;
		else if (strcmp(argv[i], "--use-rr") == 0)
			schedpol = SCHED_RR;
		else if (strcmp(argv[i], "--do-work") == 0)
		{
			ctx.dowork = 1;
			if (i + 1 < argc)
			{
				w = atoi(argv[i + 1]);
				if (w > 0)
				{
					workcount = w;
					i++;
				}
			}
			printf("work count: %i\n", workcount);
		}
		else
		{
			printf("unknown option: %s\n", argv[i]);
			printf(
				"usage: %s [--use-mm|--use-fifo|--use-rr][--do-work [n]]\n",
				argv[0]);
			ret = 1;
			goto end;
		}
	}

	if (set_scheduling(schedpol)) {
		ret = 1;
		goto end;
	}

	if (create_source(&ctx, schedpol)) {
		ret = 1;
		goto end;
	}

	if (!set_timer(&ctx))
	{
		sigemptyset(&ss);
		sigaddset(&ss, SIGINT);
		sigaddset(&ss, SIGTERM);
		sigwait(&ss, &sig);
	}

	printf("\nmaximum latency: %.1f µs\n", ctx.maxlat * 1.0e6);
	printf("average latency: %.1f µs\n",
		ctx.avglat / (double) ctx.count * 1.0e6);
	printf("missed timer events: %u\n", ctx.overruns);

	timer_delete(ctx.tid);
end:
	close(tracefd);

	return ret;
}


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC/RFT PATCH] sched: automated per tty task groups
  2010-10-19 11:29 ` Mike Galbraith
@ 2010-10-19 11:56   ` Ingo Molnar
  2010-10-19 13:12     ` Mike Galbraith
  2010-10-19 15:28   ` Linus Torvalds
  1 sibling, 1 reply; 31+ messages in thread
From: Ingo Molnar @ 2010-10-19 11:56 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: LKML, Peter Zijlstra, Linus Torvalds, Mathieu Desnoyers


* Mike Galbraith <efault@gmx.de> wrote:

> Using Mathieu Desnoyers' wakeup-latency testcase (attached):
> 
> With taskset -c 3 make -j 10 running..
> 
> taskset -c 3 ./wakeup-latency& sleep 30;killall wakeup-latency
> 
> without:
>  maximum latency: 42963.2 µs
>  average latency: 9077.0 µs
>  missed timer events: 0
> 
> with:
>  maximum latency: 4160.7 µs
>  average latency: 149.4 µs
>  missed timer events: 0
> 
> Patch makes a big difference in desktop feel under hefty load here.

That's really nice!

Could this feature realistically do block IO isolation as well? It's always annoying 
when some big IO job is making the desktop jerky. Especially as your patch is 
selecting the block cgroup feature already:

+       select BLK_CGROUP

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC/RFT PATCH] sched: automated per tty task groups
  2010-10-19 11:56   ` Ingo Molnar
@ 2010-10-19 13:12     ` Mike Galbraith
  0 siblings, 0 replies; 31+ messages in thread
From: Mike Galbraith @ 2010-10-19 13:12 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: LKML, Peter Zijlstra, Linus Torvalds, Mathieu Desnoyers

On Tue, 2010-10-19 at 13:56 +0200, Ingo Molnar wrote:

> Could this feature realistically do block IO isolation as well? It's always annoying 
> when some big IO job is making the desktop jerky. Especially as your patch is 
> selecting the block cgroup feature already:
> 
> +       select BLK_CGROUP

I know my cgroup pgid config helps a bunch with rummaging in my email
while other IO is going on.  I've been attributing that to BLK_CGROUP,
but have no proof.

	-Mike


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC/RFT PATCH] sched: automated per tty task groups
  2010-10-19 11:29 ` Mike Galbraith
  2010-10-19 11:56   ` Ingo Molnar
@ 2010-10-19 15:28   ` Linus Torvalds
  2010-10-19 18:13     ` Mike Galbraith
  1 sibling, 1 reply; 31+ messages in thread
From: Linus Torvalds @ 2010-10-19 15:28 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: LKML, Ingo Molnar, Peter Zijlstra, Mathieu Desnoyers

On Tue, Oct 19, 2010 at 4:29 AM, Mike Galbraith <efault@gmx.de> wrote:
> It was suggested that I show a bit more info.

Yes, I was going to complain that the numbers in the commit message
made no sense without something to compare the numbers to.

> The same load without per tty task groups.

Very impressive. This definitely looks like something people will notice.

That said, I do think we should think carefully about calling this a
"tty" feature. I think we might want to leave the door open to other
heuristics than _just_ the tty group. I think the tty group approach
is wonderful for traditional Unix loads in a desktop environment, but
I suspect we might hit issues with IDE's etc too. I don't know if we
can notice things like that automatically, but I think it's worth
thinking about.

So I think the patch looks pretty good, and the numbers seem to look
just stunningly so, but I'd like to name the feature more along the
lines of "automatic process group scheduling" rather than about tty's
per se.

And you actually did that for the Kconfig option, which makes me quite happy.

The one other thing I do wonder about is how noticeable the group
scheduling overhead is. If people compare with a non-CGROUP_SCHED
kernel, will a desktop-optimized kernel suddenly have horrible pipe
latency due to much higher scheduling cost? Right now that whole
feature is hidden by EXPERIMENTAL, I don't know how much it hurts, and
I never timed it when I tried it out long ago..

                            Linus

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC/RFT PATCH] sched: automated per tty task groups
  2010-10-19 15:28   ` Linus Torvalds
@ 2010-10-19 18:13     ` Mike Galbraith
  2010-10-19 18:53       ` Mike Galbraith
  0 siblings, 1 reply; 31+ messages in thread
From: Mike Galbraith @ 2010-10-19 18:13 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: LKML, Ingo Molnar, Peter Zijlstra, Mathieu Desnoyers

On Tue, 2010-10-19 at 08:28 -0700, Linus Torvalds wrote:
> On Tue, Oct 19, 2010 at 4:29 AM, Mike Galbraith <efault@gmx.de> wrote:

> So I think the patch looks pretty good, and the numbers seem to look
> just stunningly so, but I'd like to name the feature more along the
> lines of "automatic process group scheduling" rather than about tty's
> per se.

Oh, absolutely, that's what it's all about really.  What I'd _like_ is
to get per process group scheduling working on the cheap..ish.  Your
idea of tty cgoups looked much simpler though, so I figured that would
be a great place to start.  It turned out to be much simpler than I
thought it would be, which is encouraging, and it works well in testing
(so far that is).

> And you actually did that for the Kconfig option, which makes me quite happy.

(Ingo's input.. spot on)

> The one other thing I do wonder about is how noticeable the group
> scheduling overhead is.

Very noticeable, cgroups is far from free.  It would make no sense for a
performance freak to even think about it.  I don't run cgroup enabled
kernels usually, and generally strip to the bone because I favor
throughput very very heavily, but when I look at the desktop under load,
the cost/performance trade-off ~seems to work out.

>  If people compare with a non-CGROUP_SCHED
> kernel, will a desktop-optimized kernel suddenly have horrible pipe
> latency due to much higher scheduling cost? Right now that whole
> feature is hidden by EXPERIMENTAL, I don't know how much it hurts, and
> I never timed it when I tried it out long ago..

The scheduling cost is quite high.  But realistically, the cost of a
distro kernel with full featured network stack is (much) higher.  I
seriously doubt the cost of cgroups would be noticed by the typical
_desktop_ user.  Overall latencies for any switchy microbenchmark will
certainly be considerably higher with the feature enabled.

	-Mike


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC/RFT PATCH] sched: automated per tty task groups
  2010-10-19 18:13     ` Mike Galbraith
@ 2010-10-19 18:53       ` Mike Galbraith
  2010-10-20  2:56         ` Ingo Molnar
  0 siblings, 1 reply; 31+ messages in thread
From: Mike Galbraith @ 2010-10-19 18:53 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: LKML, Ingo Molnar, Peter Zijlstra, Mathieu Desnoyers

> On Tue, 2010-10-19 at 08:28 -0700, Linus Torvalds wrote:
> >
> >  If people compare with a non-CGROUP_SCHED
> > kernel, will a desktop-optimized kernel suddenly have horrible pipe
> > latency due to much higher scheduling cost? Right now that whole
> > feature is hidden by EXPERIMENTAL, I don't know how much it hurts, and
> > I never timed it when I tried it out long ago..

Q/D test of kernels w/wo, with same .config using pipe-test (pure sched)
gives on my box ~590khz with tty_sched active, 620khz without cgroups
acitve in same kernel/config without patch.  last time I measured
stripped down config (not long ago, but not yesterday either) gave max
ctx rate ~690khz on this box.

(note: very Q, very D numbers, no variance testing, ballpark)

	-Mike


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC/RFT PATCH] sched: automated per tty task groups
  2010-10-19 18:53       ` Mike Galbraith
@ 2010-10-20  2:56         ` Ingo Molnar
  2010-10-21  8:11           ` Mike Galbraith
  0 siblings, 1 reply; 31+ messages in thread
From: Ingo Molnar @ 2010-10-20  2:56 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Linus Torvalds, LKML, Peter Zijlstra, Mathieu Desnoyers


* Mike Galbraith <efault@gmx.de> wrote:

> > On Tue, 2010-10-19 at 08:28 -0700, Linus Torvalds wrote:
> > >
> > >  If people compare with a non-CGROUP_SCHED
> > > kernel, will a desktop-optimized kernel suddenly have horrible pipe
> > > latency due to much higher scheduling cost? Right now that whole
> > > feature is hidden by EXPERIMENTAL, I don't know how much it hurts, and
> > > I never timed it when I tried it out long ago..
> 
> Q/D test of kernels w/wo, with same .config using pipe-test (pure sched) gives on 
> my box ~590khz with tty_sched active, 620khz without cgroups acitve in same 
> kernel/config without patch.  last time I measured stripped down config (not long 
> ago, but not yesterday either) gave max ctx rate ~690khz on this box.
> 
> (note: very Q, very D numbers, no variance testing, ballpark)

That's 5% overhead in context switches. Definitely not in the 'horrible' category.

This would be a rather tempting item for 2.6.37 ... especially as it really mainly 
reuses existing group scheduling functionality, in a clever way.

Mind doing more of the tty->desktop renames/generalizations as Linus suggested, and 
resend the patch?

I'd also suggest to move it out of EXPERIMENTAL - we dont really do that for core 
kernel features as most distros enable CONFIG_EXPERIMENTAL so it's a rather 
meaningless distinction. Since the feature is default-n, people will get the old 
scheduler by default but can also choose this desktop-centric scheduling mode.

I'd even argue to make it default-y, because this patch clearly cures a form of 
kbuild cancer.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC/RFT PATCH] sched: automated per tty task groups
  2010-10-19  9:16 Mike Galbraith
                   ` (2 preceding siblings ...)
  2010-10-19 11:29 ` Mike Galbraith
@ 2010-10-20 13:55 ` Markus Trippelsdorf
  2010-10-20 14:41   ` Mike Galbraith
  3 siblings, 1 reply; 31+ messages in thread
From: Markus Trippelsdorf @ 2010-10-20 13:55 UTC (permalink / raw)
  To: linux-kernel

Mike Galbraith wrote:

> Comments, suggestions etc highly welcome.

I've tested your patch and it runs smoothly on my machine.
However I had several NULL pointer dereference BUGs that happened when I 
left X or rebooted my system. I think this is caused by your patch. 
There is nothing in the logs unfortunately, but I scribbled down the 
following by hand (not the whole trace, I'm too lazy):

BUG: unable to handle NULL pointer dereference at 0..038
IP:   pick_next_task_fair 0xa7/0x1a0
...
Call Trace: schedule 
...






^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC/RFT PATCH] sched: automated per tty task groups
  2010-10-20 13:55 ` Markus Trippelsdorf
@ 2010-10-20 14:41   ` Mike Galbraith
  0 siblings, 0 replies; 31+ messages in thread
From: Mike Galbraith @ 2010-10-20 14:41 UTC (permalink / raw)
  To: Markus Trippelsdorf; +Cc: linux-kernel

On Wed, 2010-10-20 at 15:55 +0200, Markus Trippelsdorf wrote:
> Mike Galbraith wrote:
> 
> > Comments, suggestions etc highly welcome.
> 
> I've tested your patch and it runs smoothly on my machine.
> However I had several NULL pointer dereference BUGs that happened when I 
> left X or rebooted my system. I think this is caused by your patch. 
> There is nothing in the logs unfortunately, but I scribbled down the 
> following by hand (not the whole trace, I'm too lazy):
> 
> BUG: unable to handle NULL pointer dereference at 0..038
> IP:   pick_next_task_fair 0xa7/0x1a0
> ...
> Call Trace: schedule 
> ...

Hm.  Not much I can do without the trace, but thanks for testing and
reporting anyway, guess I need to do some heavy stress testing.  I'm
re-writing it as I write this anyway.

        thanks,

        -Mike


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC/RFT PATCH] sched: automated per tty task groups
  2010-10-19  9:43     ` Peter Zijlstra
  2010-10-19  9:46       ` Mike Galbraith
@ 2010-10-21  7:55       ` Mike Galbraith
  2010-10-21 10:28         ` Peter Zijlstra
  1 sibling, 1 reply; 31+ messages in thread
From: Mike Galbraith @ 2010-10-21  7:55 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: LKML, Ingo Molnar, Linus Torvalds

On Tue, 2010-10-19 at 11:43 +0200, Peter Zijlstra wrote: 
> On Tue, 2010-10-19 at 11:39 +0200, Mike Galbraith wrote:
> > 
> > Yeah. I should be able to just do sched_move_task(), but it doesn't
> > currently work even with your patch, turning tty_sched on/off can lead
> > to incredible delays before you get box back.  With virgin source, it's
> > size infinity for all intents and purposes.
> 
> Ah, first feedback I've got on that patch,. but surely since you created
> one that does work we can fix my patch and use the normal path? :-)

Actually, your patch works just peachy, my fiddling with sleepers
vruntime at attach time was a dainbramaged thing to do.

	-Mike


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC/RFT PATCH] sched: automated per tty task groups
  2010-10-20  2:56         ` Ingo Molnar
@ 2010-10-21  8:11           ` Mike Galbraith
  2010-10-21  8:31             ` Ingo Molnar
                               ` (2 more replies)
  0 siblings, 3 replies; 31+ messages in thread
From: Mike Galbraith @ 2010-10-21  8:11 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, LKML, Peter Zijlstra, Mathieu Desnoyers,
	Markus Trippelsdorf

[-- Attachment #1: Type: text/plain, Size: 11093 bytes --]

On Wed, 2010-10-20 at 04:56 +0200, Ingo Molnar wrote:

> Mind doing more of the tty->desktop renames/generalizations as Linus suggested, and 
> resend the patch?

Here she comes.  Better/Worse?

Changes:
- tty->autogroup.
- only autogroup fair class tasks.
- removed dainbramaged sleeper vruntime twiddling.
- removed paranoid locking.
- removed noop detatch code.

> I'd also suggest to move it out of EXPERIMENTAL - we dont really do that for core 
> kernel features as most distros enable CONFIG_EXPERIMENTAL so it's a rather 
> meaningless distinction. Since the feature is default-n, people will get the old 
> scheduler by default but can also choose this desktop-centric scheduling mode.
> 
> I'd even argue to make it default-y, because this patch clearly cures a form of 
> kbuild cancer.

You top dogs can make the default call.. it it's accepted that is ;-)

Marcus:  care to try the below?  Works fine for me (but so did first
cut).  It depends on the attached patch, and applied to virgin shiny new
2.6.36.

A recurring complaint from CFS users is that parallel kbuild has a negative
impact on desktop interactivity.  This patch implements an idea from Linus,
to automatically create task groups.  This patch only implements Linus' per
tty task group suggestion, and only for fair class tasks, but leaves the way
open for enhancement.

How it works: at tty alloc/dealloc time, a task group is created/destroyed,
so there is always a task group active per tty.  When we select a runqueue,
if the task has a has a tty association, and no task group, attach it to a
per tty autogroup on the fly.

The feature is enabled from boot by default if CONFIG_SCHED_AUTOGROUP is
selected, but can be disabled via the boot option noautogroup, and can be
also be turned on/off on the fly via..
   echo [01] > /proc/sys/kernel/sched_autogroup_enabled.
..which will automatically move tasks to/from the root task group.

Some numbers.

A 100% hog overhead measurement proggy pinned to the same CPU as a make -j10

About measurement proggy:
  pert/sec = perturbations/sec
  min/max/avg = scheduler service latencies in usecs
  sum/s = time accrued by the competition per sample period (1 sec here)
  overhead = %CPU received by the competition per sample period

pert/s:       31 >40475.37us:        3 min:  0.37 max:48103.60 avg:29573.74 sum/s:916786us overhead:90.24%
pert/s:       23 >41237.70us:       12 min:  0.36 max:56010.39 avg:40187.01 sum/s:924301us overhead:91.99%
pert/s:       24 >42150.22us:       12 min:  8.86 max:61265.91 avg:39459.91 sum/s:947038us overhead:92.20%
pert/s:       26 >42344.91us:       11 min:  3.83 max:52029.60 avg:36164.70 sum/s:940282us overhead:91.12%
pert/s:       24 >44262.90us:       14 min:  5.05 max:82735.15 avg:40314.33 sum/s:967544us overhead:92.22%

Same load with this patch applied.

pert/s:      229 >5484.43us:       41 min:  0.15 max:12069.42 avg:2193.81 sum/s:502382us overhead:50.24%
pert/s:      222 >5652.28us:       43 min:  0.46 max:12077.31 avg:2248.56 sum/s:499181us overhead:49.92%
pert/s:      211 >5809.38us:       43 min:  0.16 max:12064.78 avg:2381.70 sum/s:502538us overhead:50.25%
pert/s:      223 >6147.92us:       43 min:  0.15 max:16107.46 avg:2282.17 sum/s:508925us overhead:50.49%
pert/s:      218 >6252.64us:       43 min:  0.16 max:12066.13 avg:2324.11 sum/s:506656us overhead:50.27%

Average service latency is an order of magnitude better with autogroup.
(Imagine that pert were Xorg or whatnot instead)

Using Mathieu Desnoyers' wakeup-latency testcase:

With taskset -c 3 make -j 10 running..

taskset -c 3 ./wakeup-latency& sleep 30;killall wakeup-latency

without:
maximum latency: 42963.2 µs
average latency: 9077.0 µs
missed timer events: 0

with:
maximum latency: 4160.7 µs
average latency: 149.4 µs
missed timer events: 0

Signed-off-by: Mike Galbraith <efault@gmx.de>
---
 drivers/char/tty_io.c    |    2 +
 include/linux/sched.h    |   14 ++++++++
 include/linux/tty.h      |    3 +
 init/Kconfig             |   13 ++++++++
 kernel/sched.c           |    9 ++++-
 kernel/sched_autogroup.c |   76 +++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched_autogroup.h |    7 ++++
 kernel/sysctl.c          |   11 ++++++
 8 files changed, 134 insertions(+), 1 deletion(-)

Index: linux-2.6.36.git/include/linux/sched.h
===================================================================
--- linux-2.6.36.git.orig/include/linux/sched.h
+++ linux-2.6.36.git/include/linux/sched.h
@@ -1900,6 +1900,20 @@ int sched_rt_handler(struct ctl_table *t
 
 extern unsigned int sysctl_sched_compat_yield;
 
+#ifdef CONFIG_SCHED_AUTOGROUP
+int sched_autogroup_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp,
+		loff_t *ppos);
+
+extern unsigned int sysctl_sched_autogroup_enabled;
+
+void sched_autogroup_create_tty(struct tty_struct *tty);
+void sched_autogroup_destroy_tty(struct tty_struct *tty);
+#else
+static inline void sched_autogroup_create_tty(struct tty_struct *tty) { }
+static inline void sched_autogroup_destroy_tty(struct tty_struct *tty) { }
+#endif
+
 #ifdef CONFIG_RT_MUTEXES
 extern int rt_mutex_getprio(struct task_struct *p);
 extern void rt_mutex_setprio(struct task_struct *p, int prio);
Index: linux-2.6.36.git/include/linux/tty.h
===================================================================
--- linux-2.6.36.git.orig/include/linux/tty.h
+++ linux-2.6.36.git/include/linux/tty.h
@@ -327,6 +327,9 @@ struct tty_struct {
 	/* If the tty has a pending do_SAK, queue it here - akpm */
 	struct work_struct SAK_work;
 	struct tty_port *port;
+#ifdef CONFIG_SCHED_AUTOGROUP
+	struct task_group *tg;
+#endif
 };
 
 /* Each of a tty's open files has private_data pointing to tty_file_private */
Index: linux-2.6.36.git/kernel/sched.c
===================================================================
--- linux-2.6.36.git.orig/kernel/sched.c
+++ linux-2.6.36.git/kernel/sched.c
@@ -78,6 +78,7 @@
 
 #include "sched_cpupri.h"
 #include "workqueue_sched.h"
+#include "sched_autogroup.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
@@ -612,11 +613,16 @@ static inline int cpu_of(struct rq *rq)
  */
 static inline struct task_group *task_group(struct task_struct *p)
 {
+	struct task_group *tg;
 	struct cgroup_subsys_state *css;
 
 	css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
 			lockdep_is_held(&task_rq(p)->lock));
-	return container_of(css, struct task_group, css);
+	tg = container_of(css, struct task_group, css);
+
+	autogroup_check_attach(p, &tg);
+
+	return tg;
 }
 
 /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
@@ -1920,6 +1926,7 @@ static void deactivate_task(struct rq *r
 #include "sched_idletask.c"
 #include "sched_fair.c"
 #include "sched_rt.c"
+#include "sched_autogroup.c"
 #ifdef CONFIG_SCHED_DEBUG
 # include "sched_debug.c"
 #endif
Index: linux-2.6.36.git/drivers/char/tty_io.c
===================================================================
--- linux-2.6.36.git.orig/drivers/char/tty_io.c
+++ linux-2.6.36.git/drivers/char/tty_io.c
@@ -185,6 +185,7 @@ void free_tty_struct(struct tty_struct *
 {
 	kfree(tty->write_buf);
 	tty_buffer_free_all(tty);
+	sched_autogroup_destroy_tty(tty);
 	kfree(tty);
 }
 
@@ -2823,6 +2824,7 @@ void initialize_tty_struct(struct tty_st
 	tty->ops = driver->ops;
 	tty->index = idx;
 	tty_line_name(driver, idx, tty->name);
+	sched_autogroup_create_tty(tty);
 }
 
 /**
Index: linux-2.6.36.git/kernel/sched_autogroup.h
===================================================================
--- /dev/null
+++ linux-2.6.36.git/kernel/sched_autogroup.h
@@ -0,0 +1,7 @@
+#ifdef CONFIG_SCHED_AUTOGROUP
+static inline void
+autogroup_check_attach(struct task_struct *p, struct task_group **tg);
+#else
+static inline void
+autogroup_check_attach(struct task_struct *p, struct task_group **tg) { }
+#endif
Index: linux-2.6.36.git/kernel/sysctl.c
===================================================================
--- linux-2.6.36.git.orig/kernel/sysctl.c
+++ linux-2.6.36.git/kernel/sysctl.c
@@ -384,6 +384,17 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+#ifdef CONFIG_SCHED_AUTOGROUP
+	{
+		.procname	= "sched_autogroup_enabled",
+		.data		= &sysctl_sched_autogroup_enabled,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= sched_autogroup_handler,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+#endif
 #ifdef CONFIG_PROVE_LOCKING
 	{
 		.procname	= "prove_locking",
Index: linux-2.6.36.git/init/Kconfig
===================================================================
--- linux-2.6.36.git.orig/init/Kconfig
+++ linux-2.6.36.git/init/Kconfig
@@ -652,6 +652,19 @@ config DEBUG_BLK_CGROUP
 
 endif # CGROUPS
 
+config SCHED_AUTOGROUP
+	bool "Automatic process group scheduling"
+	select CGROUPS
+	select CGROUP_SCHED
+	select FAIR_GROUP_SCHED
+	select BLK_CGROUP
+	help
+	  This option optimizes the scheduler for common desktop workloads by
+	  automatically creating and populating task groups.  This separation
+	  of workloads isolates aggressive CPU burners (like build jobs) from
+	  desktop applications.  Task group autogeneration is currently based
+	  upon task tty association.
+
 config MM_OWNER
 	bool
 
Index: linux-2.6.36.git/kernel/sched_autogroup.c
===================================================================
--- /dev/null
+++ linux-2.6.36.git/kernel/sched_autogroup.c
@@ -0,0 +1,76 @@
+#ifdef CONFIG_SCHED_AUTOGROUP
+#include <linux/tty.h>
+
+unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
+
+void sched_autogroup_create_tty(struct tty_struct *tty)
+{
+	tty->tg = sched_create_group(&init_task_group);
+	if (IS_ERR(tty->tg)) {
+		tty->tg = &init_task_group;
+		 WARN_ON(1);
+	}
+}
+EXPORT_SYMBOL(sched_autogroup_create_tty);
+
+void sched_autogroup_destroy_tty(struct tty_struct *tty)
+{
+	if (tty->tg && tty->tg != &init_task_group)
+		sched_destroy_group(tty->tg);
+}
+EXPORT_SYMBOL(sched_autogroup_destroy_tty);
+
+static void
+autogroup_attach_tty(struct task_struct *p, struct task_group **tg)
+{
+	struct tty_struct *tty = p->signal->tty;
+
+	if (!tty)
+		return;
+
+	*tg = p->signal->tty->tg;
+}
+
+static inline void
+autogroup_check_attach(struct task_struct *p, struct task_group **tg)
+{
+	if (!sysctl_sched_autogroup_enabled || *tg != &root_task_group ||
+			p->sched_class != &fair_sched_class)
+		return;
+
+	rcu_read_lock();
+
+	autogroup_attach_tty(p, tg);
+
+	rcu_read_unlock();
+}
+
+int sched_autogroup_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct task_struct *p, *t;
+	struct task_group *tg;
+	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+	if (ret || !write)
+		return ret;
+
+	for_each_process(p) {
+		tg = task_group(p);
+		sched_move_task(p);
+		list_for_each_entry_rcu(t, &p->thread_group, thread_group) {
+			sched_move_task(t);
+		}
+	}
+
+	return 0;
+}
+
+static int __init setup_autogroup(char *str)
+{
+	sysctl_sched_autogroup_enabled = 0;
+
+	return 1;
+}
+__setup("noautogroup", setup_autogroup);
+#endif


[-- Attachment #2: cgroup_fixup_broken_cgroup_movement.diff --]
[-- Type: text/x-patch, Size: 3078 bytes --]

Subject: sched, cgroup: Fixup broken cgroup movement
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri Oct 15 15:24:15 CEST 2010


Reported-by: Dima Zavin <dima@android.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/sched.h |    2 +-
 kernel/sched.c        |    8 ++++----
 kernel/sched_fair.c   |   25 +++++++++++++++++++------
 3 files changed, 24 insertions(+), 11 deletions(-)

Index: linux-2.6.36.git/kernel/sched.c
===================================================================
--- linux-2.6.36.git.orig/kernel/sched.c
+++ linux-2.6.36.git/kernel/sched.c
@@ -8297,12 +8297,12 @@ void sched_move_task(struct task_struct
 	if (unlikely(running))
 		tsk->sched_class->put_prev_task(rq, tsk);
 
-	set_task_rq(tsk, task_cpu(tsk));
-
 #ifdef CONFIG_FAIR_GROUP_SCHED
-	if (tsk->sched_class->moved_group)
-		tsk->sched_class->moved_group(tsk, on_rq);
+	if (tsk->sched_class->task_move_group)
+		tsk->sched_class->task_move_group(tsk, on_rq);
+	else
 #endif
+		set_task_rq(tsk, task_cpu(tsk));
 
 	if (unlikely(running))
 		tsk->sched_class->set_curr_task(rq);
Index: linux-2.6.36.git/include/linux/sched.h
===================================================================
--- linux-2.6.36.git.orig/include/linux/sched.h
+++ linux-2.6.36.git/include/linux/sched.h
@@ -1072,7 +1072,7 @@ struct sched_class {
 					 struct task_struct *task);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-	void (*moved_group) (struct task_struct *p, int on_rq);
+	void (*task_move_group) (struct task_struct *p, int on_rq);
 #endif
 };
 
Index: linux-2.6.36.git/kernel/sched_fair.c
===================================================================
--- linux-2.6.36.git.orig/kernel/sched_fair.c
+++ linux-2.6.36.git/kernel/sched_fair.c
@@ -3824,13 +3824,26 @@ static void set_curr_task_fair(struct rq
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static void moved_group_fair(struct task_struct *p, int on_rq)
+static void task_move_group_fair(struct task_struct *p, int on_rq)
 {
-	struct cfs_rq *cfs_rq = task_cfs_rq(p);
-
-	update_curr(cfs_rq);
+	/*
+	 * If the task was not on the rq at the time of this cgroup movement
+	 * it must have been asleep, sleeping tasks keep their ->vruntime
+	 * absolute on their old rq until wakeup (needed for the fair sleeper
+	 * bonus in place_entity()).
+	 *
+	 * If it was on the rq, we've just 'preempted' it, which does convert
+	 * ->vruntime to a relative base.
+	 *
+	 * Make sure both cases convert their relative position when migrating
+	 * to another cgroup's rq. This does somewhat interfere with the
+	 * fair sleeper stuff for the first placement, but who cares.
+	 */
+	if (!on_rq)
+		p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
+	set_task_rq(p, task_cpu(p));
 	if (!on_rq)
-		place_entity(cfs_rq, &p->se, 1);
+		p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
 }
 #endif
 
@@ -3882,7 +3895,7 @@ static const struct sched_class fair_sch
 	.get_rr_interval	= get_rr_interval_fair,
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-	.moved_group		= moved_group_fair,
+	.task_move_group	= task_move_group_fair,
 #endif
 };
 

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC/RFT PATCH] sched: automated per tty task groups
  2010-10-21  8:11           ` Mike Galbraith
@ 2010-10-21  8:31             ` Ingo Molnar
  2010-10-21  8:39               ` Mike Galbraith
  2010-10-21  8:48             ` Markus Trippelsdorf
  2010-10-21 10:51             ` Mathieu Desnoyers
  2 siblings, 1 reply; 31+ messages in thread
From: Ingo Molnar @ 2010-10-21  8:31 UTC (permalink / raw)
  To: Mike Galbraith
  Cc: Linus Torvalds, LKML, Peter Zijlstra, Mathieu Desnoyers,
	Markus Trippelsdorf


* Mike Galbraith <efault@gmx.de> wrote:

> On Wed, 2010-10-20 at 04:56 +0200, Ingo Molnar wrote:
> 
> > Mind doing more of the tty->desktop renames/generalizations as Linus suggested, and 
> > resend the patch?
> 
> Here she comes.  Better/Worse?
> 
> Changes:
> - tty->autogroup.
> - only autogroup fair class tasks.
> - removed dainbramaged sleeper vruntime twiddling.
> - removed paranoid locking.
> - removed noop detatch code.

I really like the new 'autogroup scheduler' name - as we really dont want to turn 
this into anything but an intelligent grouping thing. Via the naming we can resist 
heuristics for example.

Btw., how does Xorg fare with this? Can we remove sleeper fairness for example and 
simplify other bits of the CFS logic as a side-effect?

	Ingo

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC/RFT PATCH] sched: automated per tty task groups
  2010-10-21  8:31             ` Ingo Molnar
@ 2010-10-21  8:39               ` Mike Galbraith
  0 siblings, 0 replies; 31+ messages in thread
From: Mike Galbraith @ 2010-10-21  8:39 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, LKML, Peter Zijlstra, Mathieu Desnoyers,
	Markus Trippelsdorf

On Thu, 2010-10-21 at 10:31 +0200, Ingo Molnar wrote:

> Btw., how does Xorg fare with this? Can we remove sleeper fairness for example and 
> simplify other bits of the CFS logic as a side-effect?

Works a treat for me.  As I write this in evolution, I have amarok
playing with a visualization and a make -j100 running.  Song switch is
instant, visualization is nice and smooth despite unaccelerated Xorg.

We can't whack fair sleepers though, not without inventing a new
preemption model to take it's place.

	-Mike


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC/RFT PATCH] sched: automated per tty task groups
  2010-10-21  8:11           ` Mike Galbraith
  2010-10-21  8:31             ` Ingo Molnar
@ 2010-10-21  8:48             ` Markus Trippelsdorf
  2010-10-21  8:52               ` Mike Galbraith
  2010-10-21 10:51             ` Mathieu Desnoyers
  2 siblings, 1 reply; 31+ messages in thread
From: Markus Trippelsdorf @ 2010-10-21  8:48 UTC (permalink / raw)
  To: Mike Galbraith
  Cc: Ingo Molnar, Linus Torvalds, LKML, Peter Zijlstra,
	Mathieu Desnoyers

On Thu, Oct 21, 2010 at 10:11:55AM +0200, Mike Galbraith wrote:
> On Wed, 2010-10-20 at 04:56 +0200, Ingo Molnar wrote:
> 
> > Mind doing more of the tty->desktop renames/generalizations as Linus suggested, and 
> > resend the patch?
> 
> Here she comes.  Better/Worse?
> 
> Changes:
> - tty->autogroup.
> - only autogroup fair class tasks.
> - removed dainbramaged sleeper vruntime twiddling.
> - removed paranoid locking.
> - removed noop detatch code.
> 
> > I'd also suggest to move it out of EXPERIMENTAL - we dont really do that for core 
> > kernel features as most distros enable CONFIG_EXPERIMENTAL so it's a rather 
> > meaningless distinction. Since the feature is default-n, people will get the old 
> > scheduler by default but can also choose this desktop-centric scheduling mode.
> > 
> > I'd even argue to make it default-y, because this patch clearly cures a form of 
> > kbuild cancer.
> 
> You top dogs can make the default call.. it it's accepted that is ;-)
> 
> Marcus:  care to try the below?  Works fine for me (but so did first
> cut).  It depends on the attached patch, and applied to virgin shiny new
> 2.6.36.

Works fine interactively, but I still get the same kernel BUG on reboot
time as before. Would a photo of the trace help you?

-- 
Markus

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC/RFT PATCH] sched: automated per tty task groups
  2010-10-21  8:48             ` Markus Trippelsdorf
@ 2010-10-21  8:52               ` Mike Galbraith
       [not found]                 ` <20101021115723.GA1587@arch.trippelsdorf.de>
  0 siblings, 1 reply; 31+ messages in thread
From: Mike Galbraith @ 2010-10-21  8:52 UTC (permalink / raw)
  To: Markus Trippelsdorf
  Cc: Ingo Molnar, Linus Torvalds, LKML, Peter Zijlstra,
	Mathieu Desnoyers

On Thu, 2010-10-21 at 10:48 +0200, Markus Trippelsdorf wrote:
> On Thu, Oct 21, 2010 at 10:11:55AM +0200, Mike Galbraith wrote:
> > On Wed, 2010-10-20 at 04:56 +0200, Ingo Molnar wrote:
> > 
> > > Mind doing more of the tty->desktop renames/generalizations as Linus suggested, and 
> > > resend the patch?
> > 
> > Here she comes.  Better/Worse?
> > 
> > Changes:
> > - tty->autogroup.
> > - only autogroup fair class tasks.
> > - removed dainbramaged sleeper vruntime twiddling.
> > - removed paranoid locking.
> > - removed noop detatch code.
> > 
> > > I'd also suggest to move it out of EXPERIMENTAL - we dont really do that for core 
> > > kernel features as most distros enable CONFIG_EXPERIMENTAL so it's a rather 
> > > meaningless distinction. Since the feature is default-n, people will get the old 
> > > scheduler by default but can also choose this desktop-centric scheduling mode.
> > > 
> > > I'd even argue to make it default-y, because this patch clearly cures a form of 
> > > kbuild cancer.
> > 
> > You top dogs can make the default call.. it it's accepted that is ;-)
> > 
> > Marcus:  care to try the below?  Works fine for me (but so did first
> > cut).  It depends on the attached patch, and applied to virgin shiny new
> > 2.6.36.
> 
> Works fine interactively, but I still get the same kernel BUG on reboot
> time as before. Would a photo of the trace help you?

Odd.  Yeah, please send me the photo and your .config.

	-Mike


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC/RFT PATCH] sched: automated per tty task groups
  2010-10-21  7:55       ` Mike Galbraith
@ 2010-10-21 10:28         ` Peter Zijlstra
  0 siblings, 0 replies; 31+ messages in thread
From: Peter Zijlstra @ 2010-10-21 10:28 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: LKML, Ingo Molnar, Linus Torvalds

On Thu, 2010-10-21 at 09:55 +0200, Mike Galbraith wrote:
> Actually, your patch works just peachy, my fiddling with sleepers
> vruntime at attach time was a dainbramaged thing to do. 

OK, I'll queue it with a Tested-by from you. Thanks!

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC/RFT PATCH] sched: automated per tty task groups
  2010-10-21  8:11           ` Mike Galbraith
  2010-10-21  8:31             ` Ingo Molnar
  2010-10-21  8:48             ` Markus Trippelsdorf
@ 2010-10-21 10:51             ` Mathieu Desnoyers
  2010-10-21 11:25               ` Peter Zijlstra
  2010-10-21 11:27               ` Mike Galbraith
  2 siblings, 2 replies; 31+ messages in thread
From: Mathieu Desnoyers @ 2010-10-21 10:51 UTC (permalink / raw)
  To: Mike Galbraith
  Cc: Ingo Molnar, Linus Torvalds, LKML, Peter Zijlstra,
	Markus Trippelsdorf

* Mike Galbraith (efault@gmx.de) wrote:
[...]
> +static void
> +autogroup_attach_tty(struct task_struct *p, struct task_group **tg)
> +{
> +	struct tty_struct *tty = p->signal->tty;
> +
> +	if (!tty)
> +		return;
> +
> +	*tg = p->signal->tty->tg;
> +}
> +
> +static inline void
> +autogroup_check_attach(struct task_struct *p, struct task_group **tg)
> +{
> +	if (!sysctl_sched_autogroup_enabled || *tg != &root_task_group ||
> +			p->sched_class != &fair_sched_class)
> +		return;
> +
> +	rcu_read_lock();
> +
> +	autogroup_attach_tty(p, tg);
> +
> +	rcu_read_unlock();
> +}
> +

Hi Mike,

This per-tty task grouping approach looks very promising. I'll give it a spin
when I find the time. Meanwhile, a little question about locking here: how is
the read lock supposed to protect from p->signal (and p->signal->tty)
modifications ? What's the locking scheme here ? So maybe just simple
rcu_dereference are missing, or maybe the tsk->sighand->siglock might be
required. In all cases, I feel something is missing there. 

Thanks!

Mathieu

-- 
Mathieu Desnoyers
Operating System Efficiency R&D Consultant
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC/RFT PATCH] sched: automated per tty task groups
  2010-10-21 10:51             ` Mathieu Desnoyers
@ 2010-10-21 11:25               ` Peter Zijlstra
  2010-10-21 16:29                 ` Oleg Nesterov
  2010-10-21 11:27               ` Mike Galbraith
  1 sibling, 1 reply; 31+ messages in thread
From: Peter Zijlstra @ 2010-10-21 11:25 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Mike Galbraith, Ingo Molnar, Linus Torvalds, LKML,
	Markus Trippelsdorf, Oleg Nesterov

On Thu, 2010-10-21 at 06:51 -0400, Mathieu Desnoyers wrote:
> * Mike Galbraith (efault@gmx.de) wrote:
> [...]
> > +static void
> > +autogroup_attach_tty(struct task_struct *p, struct task_group **tg)
> > +{
> > +	struct tty_struct *tty = p->signal->tty;
> > +
> > +	if (!tty)
> > +		return;
> > +
> > +	*tg = p->signal->tty->tg;
> > +}
> > +
> > +static inline void
> > +autogroup_check_attach(struct task_struct *p, struct task_group **tg)
> > +{
> > +	if (!sysctl_sched_autogroup_enabled || *tg != &root_task_group ||
> > +			p->sched_class != &fair_sched_class)
> > +		return;
> > +
> > +	rcu_read_lock();
> > +
> > +	autogroup_attach_tty(p, tg);
> > +
> > +	rcu_read_unlock();
> > +}
> > +

>  Meanwhile, a little question about locking here: how is
> the read lock supposed to protect from p->signal (and p->signal->tty)
> modifications ? What's the locking scheme here ? So maybe just simple
> rcu_dereference are missing, or maybe the tsk->sighand->siglock might be
> required. In all cases, I feel something is missing there. 

Oleg, could you comment?


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC/RFT PATCH] sched: automated per tty task groups
  2010-10-21 10:51             ` Mathieu Desnoyers
  2010-10-21 11:25               ` Peter Zijlstra
@ 2010-10-21 11:27               ` Mike Galbraith
  1 sibling, 0 replies; 31+ messages in thread
From: Mike Galbraith @ 2010-10-21 11:27 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Ingo Molnar, Linus Torvalds, LKML, Peter Zijlstra,
	Markus Trippelsdorf

On Thu, 2010-10-21 at 06:51 -0400, Mathieu Desnoyers wrote:
> * Mike Galbraith (efault@gmx.de) wrote:
> [...]
> > +static void
> > +autogroup_attach_tty(struct task_struct *p, struct task_group **tg)
> > +{
> > +	struct tty_struct *tty = p->signal->tty;
> > +
> > +	if (!tty)
> > +		return;
> > +
> > +	*tg = p->signal->tty->tg;
> > +}
> > +
> > +static inline void
> > +autogroup_check_attach(struct task_struct *p, struct task_group **tg)
> > +{
> > +	if (!sysctl_sched_autogroup_enabled || *tg != &root_task_group ||
> > +			p->sched_class != &fair_sched_class)
> > +		return;
> > +
> > +	rcu_read_lock();
> > +
> > +	autogroup_attach_tty(p, tg);
> > +
> > +	rcu_read_unlock();
> > +}
> > +
> 
> Hi Mike,
> 
> This per-tty task grouping approach looks very promising. I'll give it a spin
> when I find the time. Meanwhile, a little question about locking here: how is
> the read lock supposed to protect from p->signal (and p->signal->tty)
> modifications ? What's the locking scheme here ? So maybe just simple
> rcu_dereference are missing, or maybe the tsk->sighand->siglock might be
> required. In all cases, I feel something is missing there.

My assumption is that no additional locking is needed.  The tty is
refcounted, dropped in release_task()->__exit_signal(), at which point
the task is unhashed, is history.  The tty can't go away until the last
task referencing it goes away.

	-Mike


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC/RFT PATCH] sched: automated per tty task groups
       [not found]                 ` <20101021115723.GA1587@arch.trippelsdorf.de>
@ 2010-10-21 16:22                   ` Mathieu Desnoyers
  0 siblings, 0 replies; 31+ messages in thread
From: Mathieu Desnoyers @ 2010-10-21 16:22 UTC (permalink / raw)
  To: Markus Trippelsdorf
  Cc: Mike Galbraith, Ingo Molnar, Linus Torvalds, LKML, Peter Zijlstra

* Markus Trippelsdorf (markus@trippelsdorf.de) wrote:
> On Thu, Oct 21, 2010 at 10:52:45AM +0200, Mike Galbraith wrote:
> > On Thu, 2010-10-21 at 10:48 +0200, Markus Trippelsdorf wrote:
> > > On Thu, Oct 21, 2010 at 10:11:55AM +0200, Mike Galbraith wrote:
> > > > On Wed, 2010-10-20 at 04:56 +0200, Ingo Molnar wrote:
> > > > 
> > > > > Mind doing more of the tty->desktop renames/generalizations as Linus suggested, and 
> > > > > resend the patch?
> > > > 
> > > > Here she comes.  Better/Worse?
> > > > 
> > > > Changes:
> > > > - tty->autogroup.
> > > > - only autogroup fair class tasks.
> > > > - removed dainbramaged sleeper vruntime twiddling.
> > > > - removed paranoid locking.
> > > > - removed noop detatch code.
> > > > 
> > > > > I'd also suggest to move it out of EXPERIMENTAL - we dont really do that for core 
> > > > > kernel features as most distros enable CONFIG_EXPERIMENTAL so it's a rather 
> > > > > meaningless distinction. Since the feature is default-n, people will get the old 
> > > > > scheduler by default but can also choose this desktop-centric scheduling mode.
> > > > > 
> > > > > I'd even argue to make it default-y, because this patch clearly cures a form of 
> > > > > kbuild cancer.
> > > > 
> > > > You top dogs can make the default call.. it it's accepted that is ;-)
> > > > 
> > > > Marcus:  care to try the below?  Works fine for me (but so did first
> > > > cut).  It depends on the attached patch, and applied to virgin shiny new
> > > > 2.6.36.
> > > 
> > > Works fine interactively, but I still get the same kernel BUG on reboot
> > > time as before. Would a photo of the trace help you?
> > 
> > Odd.  Yeah, please send me the photo and your .config.
> 
> OK here you go. Both are attached.

In the backtrace, the scheduler is called from:

do_group_exit()
  __dequeue_signal()
    do_exit()

given that task_group() is called from many spots in the scheduler, I wonder if
some checks making sure that tg != NULL in task_group() would be appropriate ?
Also checking that p->signal is non-NULL in autogroup_attach_tty() might help.

Thanks,

Mathieu


-- 
Mathieu Desnoyers
Operating System Efficiency R&D Consultant
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC/RFT PATCH] sched: automated per tty task groups
  2010-10-21 11:25               ` Peter Zijlstra
@ 2010-10-21 16:29                 ` Oleg Nesterov
  2010-10-21 19:11                   ` Mike Galbraith
  0 siblings, 1 reply; 31+ messages in thread
From: Oleg Nesterov @ 2010-10-21 16:29 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Mathieu Desnoyers, Mike Galbraith, Ingo Molnar, Linus Torvalds,
	LKML, Markus Trippelsdorf

On 10/21, Peter Zijlstra wrote:
>
> On Thu, 2010-10-21 at 06:51 -0400, Mathieu Desnoyers wrote:
> > * Mike Galbraith (efault@gmx.de) wrote:
> > [...]
> > > +static void
> > > +autogroup_attach_tty(struct task_struct *p, struct task_group **tg)
> > > +{
> > > +	struct tty_struct *tty = p->signal->tty;
> > > +
> > > +	if (!tty)
> > > +		return;
> > > +
> > > +	*tg = p->signal->tty->tg;
> > > +}

minor nit, I think in theory this needs barrier(), or

	struct tty_struct *tty = ACCESS_ONCE(p->signal->tty);

	if (tty)
		*tg = tty->tg;

> > > +static inline void
> > > +autogroup_check_attach(struct task_struct *p, struct task_group **tg)
> > > +{
> > > +	if (!sysctl_sched_autogroup_enabled || *tg != &root_task_group ||
> > > +			p->sched_class != &fair_sched_class)
> > > +		return;
> > > +
> > > +	rcu_read_lock();
> > > +
> > > +	autogroup_attach_tty(p, tg);
> > > +
> > > +	rcu_read_unlock();
> > > +}
> > > +
>
> >  Meanwhile, a little question about locking here: how is
> > the read lock supposed to protect from p->signal (and p->signal->tty)
> > modifications ? What's the locking scheme here ? So maybe just simple
> > rcu_dereference are missing, or maybe the tsk->sighand->siglock might be
> > required. In all cases, I feel something is missing there.
>
> Oleg, could you comment?

No, I don't understand this ;) But I know nothig about task groups,
most probably this is OK.

It is not clear to me why do we need rcu_read_lock() and how it can help.
The tty can go away right after dereferencing signal->tty.

Even if the task doesn't exit, it (or its sub-thread) can do sys_setsid()
at any moment and free this tty. If any thread was moved to tty->sg, doesn't
this mean that, say, ->cfs_rq will point to the already freed tg->cfs_rq?

>From http://marc.info/?l=linux-kernel&m=128764874422614

	+int sched_autogroup_handler(struct ctl_table *table, int write,
	+		void __user *buffer, size_t *lenp, loff_t *ppos)
	+{
	+	struct task_struct *p, *t;
	+	struct task_group *tg;
	+	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	+
	+	if (ret || !write)
	+		return ret;
	+
	+	for_each_process(p) {

Hmm. This needs rcu lock at least?

	+		tg = task_group(p);

Why?

	+		sched_move_task(p);
	+		list_for_each_entry_rcu(t, &p->thread_group, thread_group) {
	+			sched_move_task(t);
	+		}
	+	}

Looks like, you can just do

	do_each_thread(p, t) {
		sched_move_task(t);
	} while_each_thread(p, t);

With the same effect.

Oleg.


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC/RFT PATCH] sched: automated per tty task groups
  2010-10-21 16:29                 ` Oleg Nesterov
@ 2010-10-21 19:11                   ` Mike Galbraith
  0 siblings, 0 replies; 31+ messages in thread
From: Mike Galbraith @ 2010-10-21 19:11 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Peter Zijlstra, Mathieu Desnoyers, Ingo Molnar, Linus Torvalds,
	LKML, Markus Trippelsdorf

On Thu, 2010-10-21 at 18:29 +0200, Oleg Nesterov wrote:
> On 10/21, Peter Zijlstra wrote:
> >
> > On Thu, 2010-10-21 at 06:51 -0400, Mathieu Desnoyers wrote:
> > > * Mike Galbraith (efault@gmx.de) wrote:
> > > [...]
> > > > +static void
> > > > +autogroup_attach_tty(struct task_struct *p, struct task_group **tg)
> > > > +{
> > > > +	struct tty_struct *tty = p->signal->tty;
> > > > +
> > > > +	if (!tty)
> > > > +		return;
> > > > +
> > > > +	*tg = p->signal->tty->tg;
> > > > +}
> 
> minor nit, I think in theory this needs barrier(), or
> 
> 	struct tty_struct *tty = ACCESS_ONCE(p->signal->tty);
> 
> 	if (tty)
> 		*tg = tty->tg;

Thanks.

> > > > +static inline void
> > > > +autogroup_check_attach(struct task_struct *p, struct task_group **tg)
> > > > +{
> > > > +	if (!sysctl_sched_autogroup_enabled || *tg != &root_task_group ||
> > > > +			p->sched_class != &fair_sched_class)
> > > > +		return;
> > > > +
> > > > +	rcu_read_lock();
> > > > +
> > > > +	autogroup_attach_tty(p, tg);
> > > > +
> > > > +	rcu_read_unlock();
> > > > +}
> > > > +
> >
> > >  Meanwhile, a little question about locking here: how is
> > > the read lock supposed to protect from p->signal (and p->signal->tty)
> > > modifications ? What's the locking scheme here ? So maybe just simple
> > > rcu_dereference are missing, or maybe the tsk->sighand->siglock might be
> > > required. In all cases, I feel something is missing there.
> >
> > Oleg, could you comment?
> 
> No, I don't understand this ;) But I know nothig about task groups,
> most probably this is OK.
> 
> It is not clear to me why do we need rcu_read_lock() and how it can help.
> The tty can go away right after dereferencing signal->tty.

It was inherited.

> Even if the task doesn't exit, it (or its sub-thread) can do sys_setsid()
> at any moment and free this tty. If any thread was moved to tty->sg, doesn't
> this mean that, say, ->cfs_rq will point to the already freed tg->cfs_rq?

Ah, so isn't as safe as it looked. Thanks!

> >From http://marc.info/?l=linux-kernel&m=128764874422614
> 
> 	+int sched_autogroup_handler(struct ctl_table *table, int write,
> 	+		void __user *buffer, size_t *lenp, loff_t *ppos)
> 	+{
> 	+	struct task_struct *p, *t;
> 	+	struct task_group *tg;
> 	+	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
> 	+
> 	+	if (ret || !write)
> 	+		return ret;
> 	+
> 	+	for_each_process(p) {
> 
> Hmm. This needs rcu lock at least?

(used to be paranoid locking there.. vs required locking)

> 	+		tg = task_group(p);
> 
> Why?

A cleanup leftover.

> 
> 	+		sched_move_task(p);
> 	+		list_for_each_entry_rcu(t, &p->thread_group, thread_group) {
> 	+			sched_move_task(t);
> 	+		}
> 	+	}
> 
> Looks like, you can just do
> 
> 	do_each_thread(p, t) {
> 		sched_move_task(t);
> 	} while_each_thread(p, t);
> 
> With the same effect.

Yeah.

So in theory, the tty can go away on me.  I knew this was too easy.

	-Mike



^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC/RFT PATCH] sched: automated per tty task groups
@ 2010-12-12 13:49 Tom Gundersen
  2010-12-12 14:08 ` Mike Galbraith
  0 siblings, 1 reply; 31+ messages in thread
From: Tom Gundersen @ 2010-12-12 13:49 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: LKML

Hi Mike,

I have a question about your patch:

If I understand sched-rt-group.txt correctly, a non-root user will not
be able to run rt tasks if it is in a cgroup that has not been
explicitly given non-zero bandwidth. I could not see from your patch
where the bandwidth is assigned. Care to explain how it works?

Sorry if this has already been discussed (or if it should be obvious).

Cheers,

Tom

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC/RFT PATCH] sched: automated per tty task groups
  2010-12-12 13:49 [RFC/RFT PATCH] sched: automated per tty task groups Tom Gundersen
@ 2010-12-12 14:08 ` Mike Galbraith
  0 siblings, 0 replies; 31+ messages in thread
From: Mike Galbraith @ 2010-12-12 14:08 UTC (permalink / raw)
  To: Tom Gundersen; +Cc: LKML

On Sun, 2010-12-12 at 14:49 +0100, Tom Gundersen wrote:
> Hi Mike,
> 
> I have a question about your patch:
> 
> If I understand sched-rt-group.txt correctly, a non-root user will not
> be able to run rt tasks if it is in a cgroup that has not been
> explicitly given non-zero bandwidth. I could not see from your patch
> where the bandwidth is assigned. Care to explain how it works?

It doesn't touch RT tasks.

	-Mike


^ permalink raw reply	[flat|nested] 31+ messages in thread

end of thread, other threads:[~2010-12-12 14:08 UTC | newest]

Thread overview: 31+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-12-12 13:49 [RFC/RFT PATCH] sched: automated per tty task groups Tom Gundersen
2010-12-12 14:08 ` Mike Galbraith
  -- strict thread matches above, loose matches on Subject: below --
2010-10-19  9:16 Mike Galbraith
2010-10-19  9:26 ` Peter Zijlstra
2010-10-19  9:39   ` Mike Galbraith
2010-10-19  9:43     ` Peter Zijlstra
2010-10-19  9:46       ` Mike Galbraith
2010-10-21  7:55       ` Mike Galbraith
2010-10-21 10:28         ` Peter Zijlstra
2010-10-19  9:29 ` Peter Zijlstra
2010-10-19  9:42   ` Mike Galbraith
2010-10-19 11:29 ` Mike Galbraith
2010-10-19 11:56   ` Ingo Molnar
2010-10-19 13:12     ` Mike Galbraith
2010-10-19 15:28   ` Linus Torvalds
2010-10-19 18:13     ` Mike Galbraith
2010-10-19 18:53       ` Mike Galbraith
2010-10-20  2:56         ` Ingo Molnar
2010-10-21  8:11           ` Mike Galbraith
2010-10-21  8:31             ` Ingo Molnar
2010-10-21  8:39               ` Mike Galbraith
2010-10-21  8:48             ` Markus Trippelsdorf
2010-10-21  8:52               ` Mike Galbraith
     [not found]                 ` <20101021115723.GA1587@arch.trippelsdorf.de>
2010-10-21 16:22                   ` Mathieu Desnoyers
2010-10-21 10:51             ` Mathieu Desnoyers
2010-10-21 11:25               ` Peter Zijlstra
2010-10-21 16:29                 ` Oleg Nesterov
2010-10-21 19:11                   ` Mike Galbraith
2010-10-21 11:27               ` Mike Galbraith
2010-10-20 13:55 ` Markus Trippelsdorf
2010-10-20 14:41   ` Mike Galbraith

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.