Re: [RFC patch 1/2] sched: dynamically adapt granularity with nr_running

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Peter Zijlstra <peterz@infradead.org>
To: Mike Galbraith <efault@gmx.de>
Cc: Ingo Molnar <mingo@elte.hu>,
	Mathieu Desnoyers <mathieu.desnoyers@efficios.com>,
	LKML <linux-kernel@vger.kernel.org>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Andrew Morton <akpm@linux-foundation.org>,
	Steven Rostedt <rostedt@goodmis.org>,
	Thomas Gleixner <tglx@linutronix.de>,
	Tony Lindgren <tony@atomide.com>
Subject: Re: [RFC patch 1/2] sched: dynamically adapt granularity with nr_running
Date: Mon, 13 Sep 2010 11:55:56 +0200	[thread overview]
Message-ID: <1284371756.2275.108.camel@laptop> (raw)
In-Reply-To: <1284371457.14888.9.camel@marge.simson.net>

On Mon, 2010-09-13 at 11:50 +0200, Mike Galbraith wrote:
> Perhaps lag should be negated if you've received a
> reasonable chunk or something.. but what we really want is a service
> deadline.

Hey, I've got a patch for that too :-)

Not rebased to anything current, but should be able to get frobbed onto
the last zero-lag thingy without too much grief (I think).

---
Subject: 
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue Apr 20 16:50:03 CEST 2010


Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
---
 include/linux/sched.h   |   17 +-
 kernel/sched.c          |    8 
 kernel/sched_debug.c    |   28 +--
 kernel/sched_fair.c     |  388 ++++++++++++++++++++++++++++++++++++------------
 kernel/sched_features.h |    5 
 kernel/sysctl.c         |    4 
 6 files changed, 337 insertions(+), 113 deletions(-)

Index: linux-2.6/kernel/sched_debug.c
===================================================================
--- linux-2.6.orig/kernel/sched_debug.c
+++ linux-2.6/kernel/sched_debug.c
@@ -94,20 +94,22 @@ print_task(struct seq_file *m, struct rq
 	else
 		SEQ_printf(m, " ");
 
-	SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
+	SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld.%06ld %9Ld %5d ",
 		p->comm, p->pid,
 		SPLIT_NS(p->se.vruntime),
+		entity_eligible(cfs_rq_of(&p->se), &p->se) ? 'E' : 'N',
+		SPLIT_NS(p->se.deadline),
+		SPLIT_NS(p->se.slice),
+		SPLIT_NS(p->se.sum_exec_runtime),
 		(long long)(p->nvcsw + p->nivcsw),
 		p->prio);
+	SEQ_printf(m, "%9Ld.%06ld",
 #ifdef CONFIG_SCHEDSTATS
-	SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
-		SPLIT_NS(p->se.vruntime),
-		SPLIT_NS(p->se.sum_exec_runtime),
-		SPLIT_NS(p->se.statistics.sum_sleep_runtime));
+		SPLIT_NS(p->se.statistics.sum_sleep_runtime)
 #else
-	SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
-		0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
+		0LL, 0L
 #endif
+		);
 
 #ifdef CONFIG_CGROUP_SCHED
 	{
@@ -129,10 +131,12 @@ static void print_rq(struct seq_file *m,
 
 	SEQ_printf(m,
 	"\nrunnable tasks:\n"
-	"            task   PID         tree-key  switches  prio"
-	"     exec-runtime         sum-exec        sum-sleep\n"
-	"------------------------------------------------------"
-	"----------------------------------------------------\n");
+	"            task   PID         vruntime e         deadline"
+	"            slice     exec_runtime  switches  prio"
+	"        sum-sleep\n"
+	"----------------------------------------------------------"
+	"---------------------------------"
+	"-----------------\n");
 
 	read_lock_irqsave(&tasklist_lock, flags);
 
@@ -326,7 +330,7 @@ static int sched_debug_show(struct seq_f
 	SEQ_printf(m, "  .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
 	P(jiffies);
 	PN(sysctl_sched_latency);
-	PN(sysctl_sched_min_granularity);
+	PN(sysctl_sched_slice);
 	PN(sysctl_sched_wakeup_granularity);
 	PN(sysctl_sched_child_runs_first);
 	P(sysctl_sched_features);
Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -24,21 +24,6 @@
 #include <linux/sched.h>
 
 /*
- * Targeted preemption latency for CPU-bound tasks:
- * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds)
- *
- * NOTE: this latency value is not the same as the concept of
- * 'timeslice length' - timeslices in CFS are of variable length
- * and have no persistent notion like in traditional, time-slice
- * based scheduling concepts.
- *
- * (to see the precise effective timeslice length of your workload,
- *  run vmstat and monitor the context-switches (cs) field)
- */
-unsigned int sysctl_sched_latency = 6000000ULL;
-unsigned int normalized_sysctl_sched_latency = 6000000ULL;
-
-/*
  * The initial- and re-scaling of tunables is configurable
  * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
  *
@@ -50,17 +35,14 @@ unsigned int normalized_sysctl_sched_lat
 enum sched_tunable_scaling sysctl_sched_tunable_scaling
 	= SCHED_TUNABLESCALING_LOG;
 
+unsigned int sysctl_sched_latency = 6000000ULL;
+
 /*
  * Minimal preemption granularity for CPU-bound tasks:
  * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds)
  */
-unsigned int sysctl_sched_min_granularity = 2000000ULL;
-unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL;
-
-/*
- * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
- */
-static unsigned int sched_nr_latency = 3;
+unsigned int sysctl_sched_slice = 2000000ULL;
+unsigned int normalized_sysctl_sched_slice = 2000000ULL;
 
 /*
  * After fork, child runs first. If set to 0 (default) then
@@ -272,6 +254,40 @@ find_matching_se(struct sched_entity **s
  * Scheduling class tree data structure manipulation methods:
  */
 
+static inline struct sched_entity *se_of(struct rb_node *node)
+{
+	return rb_entry(node, struct sched_entity, run_node);
+}
+
+static inline s64 deadline_key(struct cfs_rq *cfs_rq, u64 deadline)
+{
+	return (s64)(deadline - cfs_rq->min_vruntime);
+}
+
+#define deadline_gt(cfs_rq, field, lse, rse)			\
+({ deadline_key(cfs_rq, lse->field) >				\
+   deadline_key(cfs_rq, rse->field); })
+
+static void update_min_deadline(struct cfs_rq *cfs_rq,
+				struct sched_entity *se, struct rb_node *node)
+{
+	if (node && deadline_gt(cfs_rq, min_deadline, se, se_of(node)))
+		se->min_deadline = se_of(node)->min_deadline;
+}
+
+/*
+ * se->min_deadline = min(se->deadline, left->min_deadline, right->min_deadline)
+ */
+static void update_node(struct rb_node *node, void *data)
+{
+	struct cfs_rq *cfs_rq = data;
+	struct sched_entity *se = se_of(node);
+
+	se->min_deadline = se->deadline;
+	update_min_deadline(cfs_rq, se, node->rb_right);
+	update_min_deadline(cfs_rq, se, node->rb_left);
+}
+
 static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
 {
 	s64 delta = (s64)(vruntime - min_vruntime);
@@ -375,6 +391,34 @@ static u64 avg_vruntime(struct cfs_rq *c
 	return cfs_rq->min_vruntime + lag;
 }
 
+/*
+ * Entity is eligible once it received less service than it ought to have,
+ * eg. lag >= 0.
+ *
+ * lag_i = S_i - s_i = w_i*(V - w_i)
+ *
+ * lag_i >=0 -> V >= v_i
+ *
+ *     \Sum (v_i - v)*w_i
+ * V = ------------------ + v
+ *          \Sum w_i
+ *
+ * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i)
+ */
+static int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	struct sched_entity *curr = cfs_rq->curr;
+	s64 avg_vruntime = cfs_rq->avg_vruntime;
+	long avg_load = cfs_rq->avg_load;
+
+	if (curr) {
+		avg_vruntime += entity_key(cfs_rq, curr) * curr->load.weight;
+		avg_load += curr->load.weight;
+	}
+
+	return avg_vruntime >= entity_key(cfs_rq, se) * avg_load;
+}
+
 static void __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
 {
 	/*
@@ -405,7 +449,7 @@ static void __enqueue_entity(struct cfs_
 	 */
 	while (*link) {
 		parent = *link;
-		entry = rb_entry(parent, struct sched_entity, run_node);
+		entry = se_of(parent);
 		/*
 		 * We dont care about collisions. Nodes with
 		 * the same key stay together.
@@ -427,10 +471,14 @@ static void __enqueue_entity(struct cfs_
 
 	rb_link_node(&se->run_node, parent, link);
 	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
+
+	rb_augment_insert(&se->run_node, update_node, cfs_rq);
 }
 
 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
+	struct rb_node *node = rb_augment_erase_begin(&se->run_node);
+
 	if (cfs_rq->rb_leftmost == &se->run_node) {
 		struct rb_node *next_node;
 
@@ -439,9 +487,58 @@ static void __dequeue_entity(struct cfs_
 	}
 
 	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
+	rb_augment_erase_end(node, update_node, cfs_rq);
 	avg_vruntime_sub(cfs_rq, se);
 }
 
+#ifdef CONFIG_SCHED_DEBUG
+int rb_node_pid(struct rb_node *node)
+{
+	struct sched_entity *se = se_of(node);
+	if (!entity_is_task(se))
+		return -1;
+
+	return task_of(se)->pid;
+}
+
+static void rb_node_print(struct cfs_rq *cfs_rq, struct rb_node *node, struct rb_node *curr, int level)
+{
+	int i;
+
+	printk(KERN_ERR);
+	for (i = 0; i < level; i++)
+		printk("  ");
+
+	if (!node) {
+		printk("<NULL>\n");
+		return;
+	}
+
+	printk("%d v: %Ld md: %Ld d: %Ld %s %s\n",
+			rb_node_pid(node),
+			se_of(node)->vruntime - cfs_rq->min_vruntime,
+			se_of(node)->min_deadline - cfs_rq->min_vruntime,
+			se_of(node)->deadline - cfs_rq->min_vruntime,
+			entity_eligible(cfs_rq, se_of(node)) ? "E" : " ",
+			(node == curr) ? "<===" : ""
+			);
+
+	rb_node_print(cfs_rq, node->rb_left, curr, level+1);
+	rb_node_print(cfs_rq, node->rb_right, curr, level+1);
+}
+
+static void rb_tree_print(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	dump_stack();
+	printk(KERN_ERR "V: %Ld\n", avg_vruntime(cfs_rq) - cfs_rq->min_vruntime);
+	rb_node_print(cfs_rq, cfs_rq->tasks_timeline.rb_node, &se->run_node, 1);
+}
+#else
+static void rb_tree_print(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+#endif
+
 static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
 {
 	struct rb_node *left = cfs_rq->rb_leftmost;
@@ -449,7 +546,7 @@ static struct sched_entity *__pick_first
 	if (!left)
 		return NULL;
 
-	return rb_entry(left, struct sched_entity, run_node);
+	return se_of(left);
 }
 
 static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
@@ -459,7 +556,86 @@ static struct sched_entity *__pick_last_
 	if (!last)
 		return NULL;
 
-	return rb_entry(last, struct sched_entity, run_node);
+	return se_of(last);
+}
+
+/*
+ * Earliest Eligible Virtual Deadline First
+ *
+ * In order to provide latency guarantees for different request sizes
+ * EEVDF selects the best runnable task from two criteria:
+ *
+ *  1) the task must be eligible (must be owed service)
+ *
+ *  2) from those tasks that meet 1), we select the one
+ *     with the earliest virtual deadline.
+ *
+ * We can do this in O(log n) time due to an augmented RB-tree. The
+ * tree keeps the entries sorted on service, but also functions as a
+ * heap based on the deadline by keeping:
+ *
+ *  se->min_deadline = min(se->deadline, se->{left,right}->min_deadline)
+ *
+ * Which allows an EDF like search on (sub)trees.
+ */
+static struct sched_entity *__pick_next_eevdf(struct cfs_rq *cfs_rq)
+{
+	struct rb_node *node = cfs_rq->tasks_timeline.rb_node;
+	struct sched_entity *best = NULL;
+
+	while (node) {
+		struct sched_entity *se = se_of(node);
+
+		/*
+		 * If this entity is not eligible, try the left subtree.
+		 *
+		 * XXX: would it be worth it to do the single division for
+		 *      avg_vruntime() once, instead of the multiplication
+		 *      in entity_eligible() O(log n) times?
+		 */
+		if (!entity_eligible(cfs_rq, se)) {
+			node = node->rb_left;
+			continue;
+		}
+
+		/*
+		 * If this entity has an earlier deadline than the previous
+		 * best, take this one. If it also has the earliest deadline
+		 * of its subtree, we're done.
+		 */
+		if (!best || deadline_gt(cfs_rq, deadline, best, se)) {
+			best = se;
+			if (best->deadline == best->min_deadline)
+				break;
+		}
+
+		/*
+		 * If the earlest deadline in this subtree is in the fully
+		 * eligible left half of our space, go there.
+		 */
+		if (node->rb_left &&
+		    se_of(node->rb_left)->min_deadline == se->min_deadline) {
+			node = node->rb_left;
+			continue;
+		}
+
+		node = node->rb_right;
+	}
+
+	if (unlikely(!best && cfs_rq->nr_running)) {
+		rb_tree_print(cfs_rq, NULL);
+		return __pick_first_entity(cfs_rq);
+	}
+
+	return best;
+}
+
+static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
+{
+	if (sched_feat(EEVDF))
+		return __pick_next_eevdf(cfs_rq);
+
+	return __pick_first_entity(cfs_rq);
 }
 
 /**************************************************************
@@ -477,13 +653,9 @@ int sched_proc_update_handler(struct ctl
 	if (ret || !write)
 		return ret;
 
-	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
-					sysctl_sched_min_granularity);
-
 #define WRT_SYSCTL(name) \
 	(normalized_sysctl_##name = sysctl_##name / (factor))
-	WRT_SYSCTL(sched_min_granularity);
-	WRT_SYSCTL(sched_latency);
+	WRT_SYSCTL(sched_slice);
 	WRT_SYSCTL(sched_wakeup_granularity);
 	WRT_SYSCTL(sched_shares_ratelimit);
 #undef WRT_SYSCTL
@@ -504,55 +676,6 @@ calc_delta_fair(unsigned long delta, str
 	return delta;
 }
 
-/*
- * The idea is to set a period in which each task runs once.
- *
- * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch
- * this period because otherwise the slices get too small.
- *
- * p = (nr <= nl) ? l : l*nr/nl
- */
-static u64 __sched_period(unsigned long nr_running)
-{
-	u64 period = sysctl_sched_latency;
-	unsigned long nr_latency = sched_nr_latency;
-
-	if (unlikely(nr_running > nr_latency)) {
-		period = sysctl_sched_min_granularity;
-		period *= nr_running;
-	}
-
-	return period;
-}
-
-/*
- * We calculate the wall-time slice from the period by taking a part
- * proportional to the weight.
- *
- * s = p*P[w/rw]
- */
-static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
-
-	for_each_sched_entity(se) {
-		struct load_weight *load;
-		struct load_weight lw;
-
-		cfs_rq = cfs_rq_of(se);
-		load = &cfs_rq->load;
-
-		if (unlikely(!se->on_rq)) {
-			lw = cfs_rq->load;
-
-			update_load_add(&lw, se->load.weight);
-			load = &lw;
-		}
-		slice = calc_delta_mine(slice, se->load.weight, load);
-	}
-	return slice;
-}
-
 static void update_min_vruntime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
 {
 	struct sched_entity *left = __pick_first_entity(cfs_rq);
@@ -707,6 +830,53 @@ add_cfs_task_weight(struct cfs_rq *cfs_r
 }
 #endif
 
+static void __set_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	if (sched_feat(FAIR_DEADLINE)) {
+		/*
+		 * If v_i < V, set the deadline relative to V instead,
+		 * so that it will not constrain already running tasks.
+		 */
+		se->deadline = max_vruntime(avg_vruntime(cfs_rq), se->vruntime);
+	} else {
+		se->deadline = se->vruntime;
+	}
+
+	se->deadline += calc_delta_fair(se->slice, se);
+}
+
+static void new_slice(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+{
+	if (!sched_feat(EEVDF))
+		goto fixed;
+
+	if (flags & ENQUEUE_LATENCY) {
+		se->slice = calc_delta_mine(sysctl_sched_latency,
+					    se->load.weight, &cfs_rq->load);
+		se->interactive = DIV_ROUND_UP(sysctl_sched_slice, se->slice);
+	} else if (!(flags & ENQUEUE_IO)) {
+fixed:
+		se->interactive = 1;
+		se->slice = sysctl_sched_slice;
+	}
+
+	__set_slice(cfs_rq, se);
+}
+
+static void next_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	if (sched_feat(EEVDF) && se->interactive) {
+		se->slice = calc_delta_mine(sysctl_sched_latency,
+					    se->load.weight, &cfs_rq->load);
+		se->interactive--;
+	} else {
+		se->slice = sysctl_sched_slice;
+		se->interactive = 0;
+	}
+
+	__set_slice(cfs_rq, se);
+}
+
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
@@ -814,9 +984,28 @@ place_entity(struct cfs_rq *cfs_rq, stru
 {
 	u64 vruntime = cfs_rq->min_vruntime;
 
-	/* sleeps up to a single latency don't count. */
+	/*
+	 * EEVDF strategy 1, preserve lag across leave/join.
+	 */
+	if (sched_feat(PRESERVE_LAG)) {
+		se->vruntime = vruntime + se->lag;
+		return;
+	}
+
+	/*
+	 * EEVDF strategy 2, always start a join with 0 lag.
+	 */
+	if (sched_feat(ZERO_LAG)) {
+		se->vruntime = vruntime;
+		return;
+	}
+
+	/*
+	 * CFS policy, let sleeps up to two default slices be considered
+	 * as competing instead of sleeping.
+	 */
 	if (sched_feat(FAIR_SLEEPERS) && !initial) {
-		unsigned long thresh = sysctl_sched_latency;
+		unsigned long thresh = 2*sysctl_sched_slice;
 
 		/*
 		 * Halve their sleep time's effect, to allow
@@ -851,6 +1040,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
 	if (flags & ENQUEUE_WAKEUP) {
 		place_entity(cfs_rq, se, 0);
 		enqueue_sleeper(cfs_rq, se);
+		new_slice(cfs_rq, se, flags);
 	}
 
 	update_stats_enqueue(cfs_rq, se);
@@ -884,6 +1074,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
 
 	update_stats_dequeue(cfs_rq, se);
 	if (flags & DEQUEUE_SLEEP) {
+		if (sched_feat(PRESERVE_LAG))
+			se->lag = se->vruntime - avg_vruntime(cfs_rq);
 #ifdef CONFIG_SCHEDSTATS
 		if (entity_is_task(se)) {
 			struct task_struct *tsk = task_of(se);
@@ -918,7 +1110,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
 static void
 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 {
-	unsigned long slice = sched_slice(cfs_rq, curr);
+	unsigned long slice = curr->slice;
 
 	if (curr->sum_exec_runtime - curr->prev_sum_exec_runtime < slice)
 		return;
@@ -955,7 +1147,7 @@ wakeup_preempt_entity(struct sched_entit
 
 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
 {
-	struct sched_entity *se = __pick_first_entity(cfs_rq);
+	struct sched_entity *se = __pick_next_entity(cfs_rq);
 	struct sched_entity *left = se;
 
 	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
@@ -974,10 +1166,12 @@ static struct sched_entity *pick_next_en
 
 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 {
-	unsigned long slice = sched_slice(cfs_rq, prev);
+	unsigned long slice = prev->slice;
 
-	if (prev->sum_exec_runtime - prev->prev_sum_exec_runtime >= slice)
+	if (prev->sum_exec_runtime - prev->prev_sum_exec_runtime >= slice) {
 		prev->prev_sum_exec_runtime += slice;
+		next_slice(cfs_rq, prev);
+	}
 
 	/*
 	 * If still on the runqueue then deactivate_task()
@@ -1037,9 +1231,8 @@ static void hrtick_start_fair(struct rq 
 	WARN_ON(task_rq(p) != rq);
 
 	if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) {
-		u64 slice = sched_slice(cfs_rq, se);
 		u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
-		s64 delta = slice - ran;
+		s64 delta = se->slice - ran;
 
 		if (delta < 0) {
 			if (rq->curr == p)
@@ -1070,7 +1263,7 @@ static void hrtick_update(struct rq *rq)
 	if (curr->sched_class != &fair_sched_class)
 		return;
 
-	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
+	if (cfs_rq_of(&curr->se)->nr_running > 1)
 		hrtick_start_fair(rq, curr);
 }
 #else /* !CONFIG_SCHED_HRTICK */
@@ -1095,6 +1288,9 @@ enqueue_task_fair(struct rq *rq, struct 
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &p->se;
 
+	if (p->sched_in_iowait)
+		flags |= ENQUEUE_IO;
+
 	for_each_sched_entity(se) {
 		if (se->on_rq)
 			break;
@@ -1717,7 +1913,7 @@ static void check_preempt_wakeup(struct 
 	if (unlikely(se == pse))
 		return;
 
-	if (!(wake_flags & WF_FORK) && p->se.interactive) {
+	if (!sched_feat(EEVDF) && !(wake_flags & WF_FORK) && p->se.interactive) {
 		clear_buddies(cfs_rq, NULL);
 		set_next_buddy(pse);
 		preempt = 1;
@@ -1748,6 +1944,14 @@ static void check_preempt_wakeup(struct 
 	update_curr(cfs_rq);
 	find_matching_se(&se, &pse);
 	BUG_ON(!pse);
+
+	if (sched_feat(EEVDF)) {
+		if (entity_eligible(cfs_rq, pse) &&
+		    deadline_gt(cfs_rq, deadline, se, pse))
+			goto preempt;
+		return;
+	}
+
 	if (preempt || wakeup_preempt_entity(se, pse) == 1)
 		goto preempt;
 
@@ -3813,8 +4017,10 @@ static void task_fork_fair(struct task_s
 
 	update_curr(cfs_rq);
 
-	if (curr)
+	if (curr) {
 		se->vruntime = curr->vruntime;
+		se->slice = curr->slice;
+	}
 	place_entity(cfs_rq, se, 1);
 
 	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
@@ -3901,7 +4107,7 @@ static unsigned int get_rr_interval_fair
 	 * idle runqueue:
 	 */
 	if (rq->cfs.load.weight)
-		rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
+		rr_interval = NS_TO_JIFFIES(se->slice);
 
 	return rr_interval;
 }
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -1028,9 +1028,11 @@ struct sched_domain;
 #define WF_FORK		0x02		/* child wakeup after fork */
 #define WF_INTERACTIVE	0x04
 
-#define ENQUEUE_WAKEUP		0x1
-#define ENQUEUE_WAKING		0x2
-#define ENQUEUE_HEAD		0x4
+#define ENQUEUE_WAKEUP		0x01
+#define ENQUEUE_WAKING		0x02
+#define ENQUEUE_HEAD		0x04
+#define ENQUEUE_IO		0x08
+#define ENQUEUE_LATENCY		0x10
 
 #define DEQUEUE_SLEEP		0x1
 
@@ -1125,13 +1127,18 @@ struct sched_entity {
 	struct rb_node		run_node;
 	struct list_head	group_node;
 	unsigned int		on_rq       : 1,
-				interactive : 1;
+				interactive : 8;
 
 	u64			exec_start;
 	u64			sum_exec_runtime;
 	u64			vruntime;
 	u64			prev_sum_exec_runtime;
 
+	u64			deadline;
+	u64			min_deadline;
+	u64			lag;
+	u64			slice;
+
 	u64			nr_migrations;
 
 #ifdef CONFIG_SCHEDSTATS
@@ -1871,7 +1878,7 @@ static inline void wake_up_idle_cpu(int 
 #endif
 
 extern unsigned int sysctl_sched_latency;
-extern unsigned int sysctl_sched_min_granularity;
+extern unsigned int sysctl_sched_slice;
 extern unsigned int sysctl_sched_wakeup_granularity;
 extern unsigned int sysctl_sched_shares_ratelimit;
 extern unsigned int sysctl_sched_shares_thresh;
Index: linux-2.6/kernel/sysctl.c
===================================================================
--- linux-2.6.orig/kernel/sysctl.c
+++ linux-2.6/kernel/sysctl.c
@@ -282,8 +282,8 @@ static struct ctl_table kern_table[] = {
 	},
 #ifdef CONFIG_SCHED_DEBUG
 	{
-		.procname	= "sched_min_granularity_ns",
-		.data		= &sysctl_sched_min_granularity,
+		.procname	= "sched_slice_ns",
+		.data		= &sysctl_sched_slice,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= sched_proc_update_handler,
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -2365,7 +2365,7 @@ static int try_to_wake_up(struct task_st
 		if (current->sched_wake_interactive ||
 				wake_flags & WF_INTERACTIVE ||
 				current->se.interactive)
-			p->se.interactive = 1;
+			en_flags |= ENQUEUE_LATENCY;
 	}
 
 	this_cpu = get_cpu();
@@ -2442,6 +2442,8 @@ out_activate:
 		      cpu == this_cpu, en_flags);
 	success = 1;
 out_running:
+	trace_printk("sched_wakeup: wake_flags: %d enqueue_flags: %d\n",
+			wake_flags, en_flags);
 	ttwu_post_activation(p, rq, wake_flags, success);
 out:
 	task_rq_unlock(rq, &flags);
@@ -2515,6 +2517,7 @@ static void __sched_fork(struct task_str
 	p->se.sum_exec_runtime		= 0;
 	p->se.prev_sum_exec_runtime	= 0;
 	p->se.nr_migrations		= 0;
+	p->se.lag			= 0;
 
 #ifdef CONFIG_SCHEDSTATS
 	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
@@ -5410,8 +5413,7 @@ static void update_sysctl(void)
 
 #define SET_SYSCTL(name) \
 	(sysctl_##name = (factor) * normalized_sysctl_##name)
-	SET_SYSCTL(sched_min_granularity);
-	SET_SYSCTL(sched_latency);
+	SET_SYSCTL(sched_slice);
 	SET_SYSCTL(sched_wakeup_granularity);
 	SET_SYSCTL(sched_shares_ratelimit);
 #undef SET_SYSCTL
Index: linux-2.6/kernel/sched_features.h
===================================================================
--- linux-2.6.orig/kernel/sched_features.h
+++ linux-2.6/kernel/sched_features.h
@@ -52,3 +52,8 @@ SCHED_FEAT(INTERACTIVE, 0)
  * release the lock. Decreases scheduling overhead.
  */
 SCHED_FEAT(OWNER_SPIN, 1)
+
+SCHED_FEAT(EEVDF, 1)
+SCHED_FEAT(FAIR_DEADLINE, 1)
+SCHED_FEAT(ZERO_LAG, 0)
+SCHED_FEAT(PRESERVE_LAG, 0)

next prev parent reply	other threads:[~2010-09-13  9:56 UTC|newest]

Thread overview: 76+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-09-11 17:37 [RFC patch 0/2] sched: dynamically adapt granularity with nr_running Mathieu Desnoyers
2010-09-11 17:37 ` [RFC patch 1/2] " Mathieu Desnoyers
2010-09-11 18:57   ` Peter Zijlstra
2010-09-11 19:21     ` Linus Torvalds
2010-09-11 20:36       ` Peter Zijlstra
2010-09-11 20:45         ` Peter Zijlstra
2010-09-11 20:52           ` Linus Torvalds
2010-09-12  9:07             ` Peter Zijlstra
2010-09-11 20:48         ` Linus Torvalds
2010-09-12  9:06           ` Peter Zijlstra
2010-09-12  9:14             ` Peter Zijlstra
2010-09-12 20:39               ` Mathieu Desnoyers
2010-09-13 12:54                 ` Peter Zijlstra
2010-09-12 20:34             ` Mathieu Desnoyers
2010-09-13 12:53               ` Peter Zijlstra
2010-09-13  4:35             ` Mike Galbraith
2010-09-13  8:41               ` Peter Zijlstra
2010-09-13 11:22                 ` Ingo Molnar
2010-09-13 13:52                 ` Steven Rostedt
2010-09-13 13:54                   ` Peter Zijlstra
2010-09-13 14:02                     ` Peter Zijlstra
2010-09-13 14:21                       ` Ingo Molnar
2010-09-11 20:52         ` Mathieu Desnoyers
2010-09-11 19:57     ` Mathieu Desnoyers
2010-09-12 10:41       ` Peter Zijlstra
2010-09-12 20:37         ` Mathieu Desnoyers
2010-09-13 12:53           ` Peter Zijlstra
2010-09-13 13:15             ` Peter Zijlstra
2010-09-13 13:56               ` Mathieu Desnoyers
2010-09-13 14:16                 ` Peter Zijlstra
2010-09-13 14:43                   ` Steven Rostedt
2010-09-13 15:25                     ` Mathieu Desnoyers
2010-09-13 15:39                       ` Steven Rostedt
2010-09-13 16:16                   ` [RFC PATCH] check_preempt_tick should not compare vruntime with wall time Mathieu Desnoyers
2010-09-13 16:36                     ` Linus Torvalds
2010-09-13 17:45                       ` Mathieu Desnoyers
2010-09-13 17:51                         ` Linus Torvalds
2010-09-13 18:01                           ` Mathieu Desnoyers
2010-09-13 18:10                           ` Steven Rostedt
2010-09-13 18:03                         ` Ingo Molnar
2010-09-13 18:19                           ` Mathieu Desnoyers
2010-09-13 18:23                             ` [PATCH] sched: Improve latencies under load by decreasing minimum scheduling granularity Ingo Molnar
2010-09-13 18:28                               ` Joe Perches
2010-09-13 19:44                               ` Linus Torvalds
2010-09-13 20:00                                 ` Ingo Molnar
2010-09-13 18:19                         ` [RFC PATCH] check_preempt_tick should not compare vruntime with wall time Ingo Molnar
2010-09-13 17:36                     ` Ingo Molnar
2010-09-13 17:56                       ` Mathieu Desnoyers
2010-09-14  2:10                     ` Mike Galbraith
2010-09-13 14:44                 ` [RFC patch 1/2] sched: dynamically adapt granularity with nr_running Mike Galbraith
     [not found]               ` <1284386179.10436.6.camel@marge.simson.net>
2010-09-13 15:53                 ` Peter Zijlstra
2010-09-13 18:04                   ` [RFC][PATCH] sched: Improve tick preemption Peter Zijlstra
2010-09-14  2:27                   ` [RFC patch 1/2] sched: dynamically adapt granularity with nr_running Mike Galbraith
2010-09-12  6:14   ` Ingo Molnar
2010-09-12  7:21     ` Mike Galbraith
2010-09-12 18:16       ` Mathieu Desnoyers
2010-09-13  4:13         ` Mike Galbraith
2010-09-13  6:41           ` Ingo Molnar
2010-09-13  7:08             ` Mike Galbraith
2010-09-13  7:35               ` Mike Galbraith
2010-09-13  8:35               ` Peter Zijlstra
2010-09-13  9:16                 ` Mike Galbraith
2010-09-13  9:37                   ` Peter Zijlstra
2010-09-13  9:50                     ` Mike Galbraith
2010-09-13  9:55                       ` Peter Zijlstra [this message]
2010-09-13 10:06                         ` Mike Galbraith
2010-09-13 10:45                         ` Peter Zijlstra
2010-09-13 11:43                           ` Peter Zijlstra
2010-09-13 11:49                             ` Peter Zijlstra
2010-09-13 12:32                             ` Mike Galbraith
2010-09-13 20:19             ` Mathieu Desnoyers
2010-09-13 20:56               ` Mathieu Desnoyers
2010-09-12 18:13     ` Mathieu Desnoyers
2010-09-12 23:44       ` Mathieu Desnoyers
2010-09-11 17:37 ` [RFC patch 2/2] sched: sleepers coarse granularity on wakeup Mathieu Desnoyers
2010-09-12 12:44 ` [RFC patch 0/2] sched: dynamically adapt granularity with nr_running Peter Zijlstra

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1284371756.2275.108.camel@laptop \
    --to=peterz@infradead.org \
    --cc=akpm@linux-foundation.org \
    --cc=efault@gmx.de \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mathieu.desnoyers@efficios.com \
    --cc=mingo@elte.hu \
    --cc=rostedt@goodmis.org \
    --cc=tglx@linutronix.de \
    --cc=tony@atomide.com \
    --cc=torvalds@linux-foundation.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.