Re: [patch] CFS scheduler, -v14

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Ingo Molnar <mingo@elte.hu>
To: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: linux-kernel@vger.kernel.org,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Andrew Morton <akpm@linux-foundation.org>,
	Mike Galbraith <efault@gmx.de>,
	Arjan van de Ven <arjan@infradead.org>,
	Thomas Gleixner <tglx@linutronix.de>,
	pranith-kumar_d@mentorg.com, Andi Kleen <andi@firstfloor.org>
Subject: Re: [patch] CFS scheduler, -v14
Date: Tue, 5 Jun 2007 09:57:06 +0200	[thread overview]
Message-ID: <20070605075706.GA2496@elte.hu> (raw)
In-Reply-To: <20070529102356.GB12620@linux.vnet.ibm.com>


* Balbir Singh <balbir@linux.vnet.ibm.com> wrote:

> +		/*
> +		 * Split up sched_exec_time according to the utime and
> +		 * stime ratio. At this point utime contains the summed
> +		 * sched_exec_runtime and stime is zero
> +		 */
> +		if (sum_us_time) {
> +			utime = ((tu_time * total_time) / sum_us_time);
> +			stime = ((ts_time * total_time) / sum_us_time);
> +		}
> +	}

Hm, Dmitry Adamushko found out that this will cause rounding problems
and might confuse 'top': total_time is a value with 10 msec
granularity, so under the above calculation the sum 'utime+stime' can
shrink a bit as time goes forward. The symptom is that top will
sporadically display a '99.9%' entry for tasks.

I've attached below my current delta (on top of -v15) which does the
stime/utime split-up correctly and which includes some more enhancements
from Dmitry - could you please take a look at this and add any deltas
you might have on top of it?

	Ingo

---
 Makefile                  |    2 +-
 fs/proc/array.c           |   33 ++++++++++++++++++++++++---------
 include/linux/sched.h     |    3 +--
 kernel/posix-cpu-timers.c |    2 +-
 kernel/sched.c            |   17 ++++++++++-------
 kernel/sched_debug.c      |   16 +++++++++++++++-
 kernel/sched_fair.c       |    2 +-
 kernel/sched_rt.c         |   12 ++++++++----
 8 files changed, 61 insertions(+), 26 deletions(-)

Index: linux/Makefile
===================================================================
--- linux.orig/Makefile
+++ linux/Makefile
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 6
 SUBLEVEL = 21
-EXTRAVERSION = .3-cfs-v15
+EXTRAVERSION = .3-cfs-v16
 NAME = Nocturnal Monster Puppy
 
 # *DOCUMENTATION*
Index: linux/fs/proc/array.c
===================================================================
--- linux.orig/fs/proc/array.c
+++ linux/fs/proc/array.c
@@ -172,8 +172,8 @@ static inline char * task_state(struct t
 		"Uid:\t%d\t%d\t%d\t%d\n"
 		"Gid:\t%d\t%d\t%d\t%d\n",
 		get_task_state(p),
-		p->tgid, p->pid,
-		pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0,
+	       	p->tgid, p->pid,
+	       	pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0,
 		pid_alive(p) && p->ptrace ? rcu_dereference(p->parent)->pid : 0,
 		p->uid, p->euid, p->suid, p->fsuid,
 		p->gid, p->egid, p->sgid, p->fsgid);
@@ -312,24 +312,39 @@ int proc_pid_status(struct task_struct *
 
 static clock_t task_utime(struct task_struct *p)
 {
+	clock_t utime = cputime_to_clock_t(p->utime),
+		total = utime + cputime_to_clock_t(p->stime);
+
 	/*
 	 * Use CFS's precise accounting, if available:
 	 */
-	if (!has_rt_policy(p) && !(sysctl_sched_load_smoothing & 128))
-		return nsec_to_clock_t(p->sum_exec_runtime);
+	if (!(sysctl_sched_load_smoothing & 128)) {
+		u64 temp = (u64)nsec_to_clock_t(p->sum_exec_runtime);
+
+		if (total) {
+			temp *= utime;
+			do_div(temp, total);
+		}
+		utime = (clock_t)temp;
+	}
 
-	return cputime_to_clock_t(p->utime);
+	return utime;
 }
 
 static clock_t task_stime(struct task_struct *p)
 {
+	clock_t stime = cputime_to_clock_t(p->stime),
+		total = stime + cputime_to_clock_t(p->utime);
+
 	/*
-	 * Use CFS's precise accounting, if available:
+	 * Use CFS's precise accounting, if available (we subtract
+	 * utime from the total, to make sure the total observed
+	 * by userspace grows monotonically - apps rely on that):
 	 */
-	if (!has_rt_policy(p) && !(sysctl_sched_load_smoothing & 128))
-		return 0;
+	if (!(sysctl_sched_load_smoothing & 128))
+		stime = nsec_to_clock_t(p->sum_exec_runtime) - task_utime(p);
 
-	return cputime_to_clock_t(p->stime);
+	return stime;
 }
 
 
Index: linux/include/linux/sched.h
===================================================================
--- linux.orig/include/linux/sched.h
+++ linux/include/linux/sched.h
@@ -852,7 +852,6 @@ struct task_struct {
 	u64 block_max;
 	u64 exec_max;
 	u64 wait_max;
-	u64 last_ran;
 
 	s64 wait_runtime;
 	u64 sum_exec_runtime;
@@ -1235,7 +1234,7 @@ static inline int set_cpus_allowed(struc
 extern unsigned long long sched_clock(void);
 extern void sched_clock_unstable_event(void);
 extern unsigned long long
-current_sched_runtime(const struct task_struct *current_task);
+task_sched_runtime(struct task_struct *task);
 
 /* sched_exec is called by processes performing an exec */
 #ifdef CONFIG_SMP
Index: linux/kernel/posix-cpu-timers.c
===================================================================
--- linux.orig/kernel/posix-cpu-timers.c
+++ linux/kernel/posix-cpu-timers.c
@@ -161,7 +161,7 @@ static inline cputime_t virt_ticks(struc
 }
 static inline unsigned long long sched_ns(struct task_struct *p)
 {
-	return (p == current) ? current_sched_runtime(p) : p->sum_exec_runtime;
+	return task_sched_runtime(p);
 }
 
 int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
Index: linux/kernel/sched.c
===================================================================
--- linux.orig/kernel/sched.c
+++ linux/kernel/sched.c
@@ -1227,7 +1227,7 @@ static void task_running_tick(struct rq 
  */
 static void __sched_fork(struct task_struct *p)
 {
-	p->wait_start_fair = p->wait_start = p->exec_start = p->last_ran = 0;
+	p->wait_start_fair = p->wait_start = p->exec_start = 0;
 	p->sum_exec_runtime = 0;
 
 	p->wait_runtime = 0;
@@ -2592,17 +2592,20 @@ DEFINE_PER_CPU(struct kernel_stat, kstat
 EXPORT_PER_CPU_SYMBOL(kstat);
 
 /*
- * Return current->sum_exec_runtime plus any more ns on the sched_clock
- * that have not yet been banked.
+ * Return p->sum_exec_runtime plus any more ns on the sched_clock
+ * that have not yet been banked in case the task is currently running.
  */
-unsigned long long current_sched_runtime(const struct task_struct *p)
+unsigned long long task_sched_runtime(struct task_struct *p)
 {
 	unsigned long long ns;
 	unsigned long flags;
+	struct rq *rq;
 
-	local_irq_save(flags);
-	ns = p->sum_exec_runtime + sched_clock() - p->last_ran;
-	local_irq_restore(flags);
+	rq = task_rq_lock(p, &flags);
+	ns = p->sum_exec_runtime;
+	if (rq->curr == p)
+		ns += rq_clock(rq) - p->exec_start;
+	task_rq_unlock(rq, &flags);
 
 	return ns;
 }
Index: linux/kernel/sched_debug.c
===================================================================
--- linux.orig/kernel/sched_debug.c
+++ linux/kernel/sched_debug.c
@@ -188,6 +188,18 @@ __initcall(init_sched_debug_procfs);
 
 void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 {
+	unsigned long flags;
+	int num_threads = 1;
+
+	rcu_read_lock();
+	if (lock_task_sighand(p, &flags)) {
+		num_threads = atomic_read(&p->signal->count);
+		unlock_task_sighand(p, &flags);
+	}
+	rcu_read_unlock();
+
+	SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads);
+	SEQ_printf(m, "----------------------------------------------\n");
 #define P(F) \
 	SEQ_printf(m, "%-25s:%20Ld\n", #F, (long long)p->F)
 
@@ -201,11 +213,13 @@ void proc_sched_show_task(struct task_st
 	P(block_max);
 	P(exec_max);
 	P(wait_max);
-	P(last_ran);
 	P(wait_runtime);
 	P(wait_runtime_overruns);
 	P(wait_runtime_underruns);
 	P(sum_exec_runtime);
+	P(load_weight);
+	P(policy);
+	P(prio);
 #undef P
 
 	{
Index: linux/kernel/sched_fair.c
===================================================================
--- linux.orig/kernel/sched_fair.c
+++ linux/kernel/sched_fair.c
@@ -200,7 +200,7 @@ static inline void update_curr(struct rq
 	 * since the last time we changed raw_weighted_load:
 	 */
 	delta_exec = now - curr->exec_start;
-	if (unlikely(delta_exec < 0))
+	if (unlikely((s64)delta_exec < 0))
 		delta_exec = 0;
 	if (unlikely(delta_exec > curr->exec_max))
 		curr->exec_max = delta_exec;
Index: linux/kernel/sched_rt.c
===================================================================
--- linux.orig/kernel/sched_rt.c
+++ linux/kernel/sched_rt.c
@@ -54,6 +54,7 @@ static void check_preempt_curr_rt(struct
 static struct task_struct * pick_next_task_rt(struct rq *rq, u64 now)
 {
 	struct prio_array *array = &rq->active;
+	struct task_struct *next;
 	struct list_head *queue;
 	int idx;
 
@@ -62,14 +63,17 @@ static struct task_struct * pick_next_ta
 		return NULL;
 
 	queue = array->queue + idx;
-	return list_entry(queue->next, struct task_struct, run_list);
+	next = list_entry(queue->next, struct task_struct, run_list);
+
+	next->exec_start = now;
+
+	return next;
 }
 
-/*
- * No accounting done when RT tasks are descheduled:
- */
 static void put_prev_task_rt(struct rq *rq, struct task_struct *p, u64 now)
 {
+	p->sum_exec_runtime += now - p->exec_start;
+	p->exec_start = 0;
 }
 
 /*

next prev parent reply	other threads:[~2007-06-05  7:57 UTC|newest]

Thread overview: 36+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-05-23 12:06 [patch] CFS scheduler, -v14 Ingo Molnar
2007-05-23 19:39 ` Nicolas Mailhot
2007-05-23 19:57   ` Ingo Molnar
2007-05-23 20:02     ` Nicolas Mailhot
2007-05-24  6:42 ` Balbir Singh
2007-05-24  8:09   ` Ingo Molnar
2007-05-24  9:19     ` Balbir Singh
2007-05-24 17:25     ` Jeremy Fitzhardinge
2007-05-24 20:59       ` Ingo Molnar
2007-05-24 22:43         ` Jeremy Fitzhardinge
2007-05-25 12:46     ` Ingo Molnar
2007-05-25 16:45       ` Balbir Singh
2007-05-28 11:07         ` Ingo Molnar
2007-05-29 10:23           ` Balbir Singh
2007-06-05  7:57             ` Ingo Molnar [this message]
2007-05-29 10:19       ` Balbir Singh
2007-05-26 14:58 ` S.Çağlar Onur
2007-05-26 15:08   ` S.Çağlar Onur
2007-06-01 13:35   ` S.Çağlar Onur
2007-06-01 15:31     ` Linus Torvalds
2007-06-07 22:29       ` S.Çağlar Onur
2007-06-01 15:37     ` [OT] " Andreas Mohr
2007-05-27  2:49 ` Li Yu
2007-05-29  6:15   ` Ingo Molnar
2007-05-29  8:07     ` Ingo Molnar
2007-05-31  9:45       ` Li Yu
2007-05-31  9:53         ` Ingo Molnar
2007-06-01  7:16           ` Li Yu
2007-06-01 19:21             ` Ingo Molnar
2007-06-05  2:33               ` Li Yu
2007-06-05  8:01                 ` Ingo Molnar
2007-06-05  8:54                   ` Li Yu
2007-06-06  7:41                   ` Li Yu
2007-06-05  3:35               ` Li Yu
2007-05-28  1:17 ` Li Yu
2007-05-29  0:49   ` Li Yu

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20070605075706.GA2496@elte.hu \
    --to=mingo@elte.hu \
    --cc=akpm@linux-foundation.org \
    --cc=andi@firstfloor.org \
    --cc=arjan@infradead.org \
    --cc=balbir@linux.vnet.ibm.com \
    --cc=efault@gmx.de \
    --cc=linux-kernel@vger.kernel.org \
    --cc=pranith-kumar_d@mentorg.com \
    --cc=tglx@linutronix.de \
    --cc=torvalds@linux-foundation.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.