All of lore.kernel.org
 help / color / mirror / Atom feed
From: Ingo Molnar <mingo@elte.hu>
To: Antoine Martin <antoine@nagafix.co.uk>
Cc: Linux Kernel Development <linux-kernel@vger.kernel.org>
Subject: Re: CFS: some bad numbers with Java/database threading
Date: Thu, 13 Sep 2007 13:24:27 +0200	[thread overview]
Message-ID: <20070913112427.GA20686@elte.hu> (raw)
In-Reply-To: <46E871FE.9010908@nagafix.co.uk>


* Antoine Martin <antoine@nagafix.co.uk> wrote:

> Basically, all the previous kernels are pretty close (2.6.16 through 
> to 2.6.20 performed almost identically to 2.6.22 and are not shown 
> here to avoid cluttering the graphs)
> 
> All the 2.6.23-rc kernels performed poorly (except -rc3!): much more 
> erratically and with a sharp performance drop above 800 threads. The 
> load starts to go up and the performance takes a nosedive.

hm, could you try the patch below on top of 2.6.23-rc6 and do:

 echo 1 > /proc/sys/kernel/sched_yield_bug_workaround

does this improve the numbers?

	Ingo

-------------->
Subject: sched: yield debugging
From: Ingo Molnar <mingo@elte.hu>

introduce various sched_yield implementations:

 # default one:
 echo 0 > /proc/sys/kernel/sched_yield_bug_workaround

 # always queues the current task right behind the next task (second place in the tree):
 echo 1 > /proc/sys/kernel/sched_yield_bug_workaround

 # NOP:
 echo 2 > /proc/sys/kernel/sched_yield_bug_workaround

tunability depends on CONFIG_SCHED_DEBUG=y.

Not-yet-signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |    2 +
 kernel/sched_fair.c   |   73 +++++++++++++++++++++++++++++++++++++++++++++-----
 kernel/sysctl.c       |   19 +++++++++++++
 3 files changed, 88 insertions(+), 6 deletions(-)

Index: linux/include/linux/sched.h
===================================================================
--- linux.orig/include/linux/sched.h
+++ linux/include/linux/sched.h
@@ -1392,10 +1392,12 @@ extern void sched_idle_next(void);
 #ifdef CONFIG_SCHED_DEBUG
 extern unsigned int sysctl_sched_latency;
 extern unsigned int sysctl_sched_min_granularity;
+extern unsigned int sysctl_sched_yield_granularity;
 extern unsigned int sysctl_sched_wakeup_granularity;
 extern unsigned int sysctl_sched_batch_wakeup_granularity;
 extern unsigned int sysctl_sched_stat_granularity;
 extern unsigned int sysctl_sched_runtime_limit;
+extern unsigned int sysctl_sched_yield_bug_workaround;
 extern unsigned int sysctl_sched_child_runs_first;
 extern unsigned int sysctl_sched_features;
 #endif
Index: linux/kernel/sched_fair.c
===================================================================
--- linux.orig/kernel/sched_fair.c
+++ linux/kernel/sched_fair.c
@@ -42,6 +42,16 @@ const_debug unsigned int sysctl_sched_la
  */
 const_debug unsigned int sysctl_sched_child_runs_first = 1;
 
+const_debug unsigned int sysctl_sched_yield_granularity = 10000000ULL;
+
+/*
+ * sys_sched_yield workaround switch.
+ *
+ * This option switches the yield implementation of the
+ * old scheduler back on.
+ */
+const_debug unsigned int sysctl_sched_yield_bug_workaround;
+
 /*
  * Minimal preemption granularity for CPU-bound tasks:
  * (default: 2 msec, units: nanoseconds)
@@ -675,15 +685,66 @@ static void dequeue_task_fair(struct rq 
  */
 static void yield_task_fair(struct rq *rq, struct task_struct *p)
 {
-	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+	if (!sysctl_sched_yield_bug_workaround) {
+		struct cfs_rq *cfs_rq = task_cfs_rq(p);
+		__update_rq_clock(rq);
+
+		/*
+		 * Dequeue and enqueue the task to update its
+		 * position within the tree:
+		 */
+		dequeue_entity(cfs_rq, &p->se, 0);
+		enqueue_entity(cfs_rq, &p->se, 0);
+		return;
+	}
+
+	if (sysctl_sched_yield_bug_workaround == 1) {
+		struct cfs_rq *cfs_rq = task_cfs_rq(p);
+		struct rb_node *curr, *next, *first;
+		struct task_struct *p_next;
+		s64 yield_key;
+
+		__update_rq_clock(rq);
+		curr = &p->se.run_node;
+		first = first_fair(cfs_rq);
+		/*
+		 * Move this task to the second place in the tree:
+		 */
+		if (unlikely(curr != first)) {
+			next = first;
+		} else {
+			next = rb_next(curr);
+			/*
+			 * We were the last one already - nothing to do, return
+			 * and reschedule:
+			 */
+			if (unlikely(!next))
+				return;
+		}
+
+		p_next = rb_entry(next, struct task_struct, se.run_node);
+		/*
+		 * Minimally necessary key value to be the second in the tree:
+		 */
+		yield_key = p_next->se.fair_key + (int)sysctl_sched_yield_granularity;
+
+		dequeue_entity(cfs_rq, &p->se, 0);
+
+		/*
+		 * Only update the key if we need to move more backwards
+		 * than the minimally necessary position to be the second:
+		 */
+		if (p->se.fair_key < yield_key)
+			p->se.fair_key = yield_key;
+
+		__enqueue_entity(cfs_rq, &p->se);
+		return;
+	}
 
-	__update_rq_clock(rq);
 	/*
-	 * Dequeue and enqueue the task to update its
-	 * position within the tree:
+	 * Just reschedule, do nothing else:
 	 */
-	dequeue_entity(cfs_rq, &p->se, 0);
-	enqueue_entity(cfs_rq, &p->se, 0);
+	resched_task(p);
 }
 
 /*
Index: linux/kernel/sysctl.c
===================================================================
--- linux.orig/kernel/sysctl.c
+++ linux/kernel/sysctl.c
@@ -244,6 +244,17 @@ static ctl_table kern_table[] = {
 	},
 	{
 		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_yield_granularity_ns",
+		.data		= &sysctl_sched_yield_granularity,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &min_sched_granularity_ns,
+		.extra2		= &max_sched_granularity_ns,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "sched_wakeup_granularity_ns",
 		.data		= &sysctl_sched_wakeup_granularity,
 		.maxlen		= sizeof(unsigned int),
@@ -266,6 +277,14 @@ static ctl_table kern_table[] = {
 	},
 	{
 		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_yield_bug_workaround",
+		.data		= &sysctl_sched_yield_bug_workaround,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "sched_child_runs_first",
 		.data		= &sysctl_sched_child_runs_first,
 		.maxlen		= sizeof(unsigned int),

  parent reply	other threads:[~2007-09-13 11:24 UTC|newest]

Thread overview: 32+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-09-12 23:10 CFS: some bad numbers with Java/database threading Antoine Martin
2007-09-13  7:18 ` David Schwartz
2007-09-12 23:33   ` Nick Piggin
2007-09-13 19:02     ` Antoine Martin
2007-09-13 21:47       ` David Schwartz
2007-09-13 11:24 ` Ingo Molnar [this message]
2007-09-14  8:32   ` CFS: " Ingo Molnar
2007-09-14 10:06     ` Satyam Sharma
2007-09-14 15:25       ` CFS: some bad numbers with Java/database threading [FIXED] Antoine Martin
2007-09-14 15:32         ` Ingo Molnar
2007-09-18 17:00           ` Chuck Ebbert
2007-09-18 22:46             ` Ingo Molnar
2007-09-18 23:02               ` Chuck Ebbert
2007-09-19 18:45                 ` David Schwartz
2007-09-19 19:48                   ` Chris Friesen
2007-09-19 22:56                     ` David Schwartz
2007-09-19 23:05                       ` David Schwartz
2007-09-19 23:52                         ` David Schwartz
2007-09-19 19:18                 ` Ingo Molnar
2007-09-19 19:39                   ` Linus Torvalds
2007-09-19 19:56                     ` Ingo Molnar
2007-09-19 20:26                       ` Ingo Molnar
2007-09-19 20:28                       ` Linus Torvalds
2007-09-19 21:41                         ` Ingo Molnar
2007-09-19 21:49                           ` Ingo Molnar
2007-09-19 21:58                           ` Peter Zijlstra
2007-09-26  1:46                           ` CFS: new java yield graphs Antoine Martin
2007-09-27  8:35                             ` Ingo Molnar
2007-09-19 20:00                   ` CFS: some bad numbers with Java/database threading [FIXED] Chris Friesen
2007-09-14 16:01       ` CFS: some bad numbers with Java/database threading Satyam Sharma
2007-09-14 16:08         ` Satyam Sharma
2007-09-17 12:17         ` Antoine Martin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20070913112427.GA20686@elte.hu \
    --to=mingo@elte.hu \
    --cc=antoine@nagafix.co.uk \
    --cc=linux-kernel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.