From: Peter Zijlstra <a.p.zijlstra@chello.nl>
To: Chris Mason <chris.mason@oracle.com>,
Frank Rowand <frank.rowand@am.sony.com>,
Ingo Molnar <mingo@elte.hu>, Thomas Gleixner <tglx@linutronix.de>,
Mike Galbraith <efault@gmx.de>, Oleg Nesterov <oleg@redhat.com>,
Paul Turner <pjt@google.com>, Jens Axboe <axboe@kernel.dk>,
Yong Zhang <yong.zhang0@gmail.com>
Cc: linux-kernel@vger.kernel.org, Peter Zijlstra <a.p.zijlstra@chello.nl>
Subject: [RFC][PATCH 16/17] sched: Move the second half of ttwu() to the remote cpu
Date: Fri, 24 Dec 2010 13:23:54 +0100 [thread overview]
Message-ID: <20101224123743.303699501@chello.nl> (raw)
In-Reply-To: 20101224122338.172750730@chello.nl
[-- Attachment #1: sched-ttwu-queue-remote.patch --]
[-- Type: text/plain, Size: 6527 bytes --]
Now that we've removed the rq->lock requirement from the first part of
ttwu() and can compute placement without holding any rq->lock, ensure
we execute the second half of ttwu() on the actual cpu we want the
task to run on.
This avoids having to take rq->lock and doing the task enqueue
remotely, saving lots on cacheline transfers.
As measured using: http://oss.oracle.com/~mason/sembench.c
$ echo 4096 32000 64 128 > /proc/sys/kernel/sem
$ ./sembench -t 2048 -w 1900 -o 0
unpatched: run time 30 seconds 537953 worker burns per second
patched: run time 30 seconds 657336 worker burns per second
Still need to sort out all the races marked XXX (non-trivial), and its
x86 only for the moment.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
arch/x86/kernel/smp.c | 1
include/linux/sched.h | 2
kernel/sched.c | 143 ++++++++++++++++++++++++++++++++++++------------
kernel/sched_features.h | 2
4 files changed, 114 insertions(+), 34 deletions(-)
Index: linux-2.6/arch/x86/kernel/smp.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/smp.c
+++ linux-2.6/arch/x86/kernel/smp.c
@@ -205,6 +205,7 @@ void smp_reschedule_interrupt(struct pt_
/*
* KVM uses this interrupt to force a cpu out of guest mode
*/
+ sched_ttwu_pending();
}
void smp_call_function_interrupt(struct pt_regs *regs)
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -1020,6 +1020,7 @@ partition_sched_domains(int ndoms_new, c
}
#endif /* !CONFIG_SMP */
+void sched_ttwu_pending(void);
struct io_context; /* See blkdev.h */
@@ -1201,6 +1202,7 @@ struct task_struct {
int lock_depth; /* BKL lock depth */
#ifdef CONFIG_SMP
+ struct task_struct *wake_entry;
int on_cpu;
#endif
int on_rq;
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -559,6 +559,10 @@ struct rq {
/* BKL stats */
unsigned int bkl_count;
#endif
+
+#ifdef CONFIG_SMP
+ struct task_struct *wake_list;
+#endif
};
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -2429,6 +2433,100 @@ ttwu_do_wakeup(struct rq *rq, struct tas
wq_worker_waking_up(p, cpu_of(rq));
}
+static void
+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
+{
+#ifdef CONFIG_SMP
+ if (task_cpu(p) != cpu_of(rq))
+ set_task_cpu(p, cpu_of(rq));
+#endif
+
+ if (p->sched_contributes_to_load)
+ rq->nr_uninterruptible--;
+
+ activate_task(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
+ ttwu_do_wakeup(rq, p, wake_flags);
+}
+
+/*
+ * Called in case the task @p isn't fully descheduled from its runqueue,
+ * in this case we must do a remote wakeup. Its a 'light' wakeup though,
+ * since all we need to do is flip p->state to TASK_RUNNING, since
+ * the task is still ->on_rq.
+ */
+static int ttwu_remote(struct task_struct *p, int wake_flags)
+{
+ struct rq *rq;
+ int ret = 0;
+
+ rq = __task_rq_lock(p);
+ if (p->on_rq) {
+ ttwu_do_wakeup(rq, p, wake_flags);
+ ttwu_stat(p, task_cpu(p), wake_flags);
+ ret = 1;
+ }
+ __task_rq_unlock(rq);
+
+ return ret;
+}
+
+void sched_ttwu_pending(void)
+{
+#ifdef CONFIG_SMP
+ struct rq *rq = this_rq();
+ struct task_struct *list = xchg(&rq->wake_list, NULL);
+
+ if (!list)
+ return;
+
+ raw_spin_lock(&rq->lock);
+
+ while (list) {
+ struct task_struct *p = list;
+ list = list->wake_entry;
+ ttwu_do_activate(rq, p, 0);
+ }
+
+ raw_spin_unlock(&rq->lock);
+#endif
+}
+
+#ifdef CONFIG_SMP
+static void ttwu_queue_remote(struct task_struct *p, int cpu)
+{
+ struct task_struct *next = NULL;
+ struct rq *rq = cpu_rq(cpu);
+
+ for (;;) {
+ struct task_struct *old = next;
+
+ p->wake_entry = next;
+ next = cmpxchg(&rq->wake_list, old, p);
+ if (next == old)
+ break;
+ }
+
+ if (!next)
+ smp_send_reschedule(cpu);
+}
+#endif
+
+static void ttwu_queue(struct task_struct *p, int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+#ifdef CONFIG_SMP
+ if (!sched_feat(TTWU_FORCE_REMOTE) && cpu != smp_processor_id()) {
+ ttwu_queue_remote(p, cpu);
+ return;
+ }
+#endif
+
+ raw_spin_lock(&rq->lock);
+ ttwu_do_activate(rq, p, 0);
+ raw_spin_unlock(&rq->lock);
+}
+
/**
* try_to_wake_up - wake up a thread
* @p: the thread to be awakened
@@ -2447,29 +2545,18 @@ ttwu_do_wakeup(struct rq *rq, struct tas
static int
try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
- int cpu, this_cpu, success = 0;
unsigned long flags;
- struct rq *rq;
-
- this_cpu = get_cpu();
+ int cpu, success = 0;
smp_wmb();
raw_spin_lock_irqsave(&p->pi_lock, flags);
if (!(p->state & state))
goto out;
- cpu = task_cpu(p);
-
- if (p->on_rq) {
- rq = __task_rq_lock(p);
- if (p->on_rq)
- goto out_running;
- __task_rq_unlock(rq);
- }
+ success = 1; /* we're going to change ->state */
-#ifdef CONFIG_SMP
- while (p->on_cpu)
- cpu_relax();
+ if (p->on_rq && ttwu_remote(p, wake_flags))
+ goto out;
p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING;
@@ -2482,7 +2569,12 @@ try_to_wake_up(struct task_struct *p, un
* their feet.
*/
smp_mb();
- raw_spin_unlock_wait(&task_rq(p)->lock);
+ cpu = task_cpu(p);
+ raw_spin_unlock_wait(&cpu_rq(cpu)->lock);
+
+#ifdef CONFIG_SMP
+ while (p->on_cpu)
+ cpu_relax();
if (p->sched_class->task_waking)
p->sched_class->task_waking(p);
@@ -2490,27 +2582,10 @@ try_to_wake_up(struct task_struct *p, un
cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
#endif /* CONFIG_SMP */
- rq = cpu_rq(cpu);
- raw_spin_lock(&rq->lock);
-
-#ifdef CONFIG_SMP
- if (cpu != task_cpu(p))
- set_task_cpu(p, cpu);
-
- if (p->sched_contributes_to_load)
- rq->nr_uninterruptible--;
-#endif
-
- activate_task(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
-out_running:
- ttwu_do_wakeup(rq, p, wake_flags);
- success = 1;
- __task_rq_unlock(rq);
-
+ ttwu_queue(p, cpu);
ttwu_stat(p, cpu, wake_flags);
out:
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
- put_cpu();
return success;
}
Index: linux-2.6/kernel/sched_features.h
===================================================================
--- linux-2.6.orig/kernel/sched_features.h
+++ linux-2.6/kernel/sched_features.h
@@ -64,3 +64,5 @@ SCHED_FEAT(OWNER_SPIN, 1)
* Decrement CPU power based on irq activity
*/
SCHED_FEAT(NONIRQ_POWER, 1)
+
+SCHED_FEAT(TTWU_FORCE_REMOTE, 0)
next prev parent reply other threads:[~2010-12-24 12:44 UTC|newest]
Thread overview: 59+ messages / expand[flat|nested] mbox.gz Atom feed top
2010-12-24 12:23 [RFC][PATCH 00/17] sched: Reduce runqueue lock contention -v3 Peter Zijlstra
2010-12-24 12:23 ` [RFC][PATCH 01/17] sched: Always provide p->on_cpu Peter Zijlstra
2010-12-24 12:23 ` [RFC][PATCH 02/17] mutex: Use p->on_cpu for the adaptive spin Peter Zijlstra
2010-12-24 12:23 ` [RFC][PATCH 03/17] sched: Change the ttwu success details Peter Zijlstra
2010-12-24 12:23 ` [RFC][PATCH 04/17] sched: Clean up ttwu stats Peter Zijlstra
2010-12-24 12:23 ` [RFC][PATCH 05/17] x86: Optimize arch_spin_unlock_wait() Peter Zijlstra
2010-12-24 18:26 ` Linus Torvalds
2011-01-03 11:32 ` Peter Zijlstra
2011-01-04 6:45 ` Nick Piggin
2011-01-05 19:14 ` [RFC][PATCH] spinlock: Kill spin_unlock_wait() Peter Zijlstra
2011-01-05 19:26 ` Oleg Nesterov
2011-01-05 19:43 ` Linus Torvalds
2011-01-06 9:32 ` Peter Zijlstra
2011-01-06 10:38 ` Nick Piggin
2011-01-06 18:26 ` Peter Zijlstra
2011-01-07 21:01 ` Tejun Heo
2011-01-07 21:13 ` Jeff Garzik
2011-01-07 21:33 ` Tejun Heo
2010-12-24 12:23 ` [RFC][PATCH 06/17] sched: Provide p->on_rq Peter Zijlstra
2010-12-29 14:14 ` Yong Zhang
2010-12-24 12:23 ` [RFC][PATCH 07/17] sched: Serialize p->cpus_allowed and ttwu() using p->pi_lock Peter Zijlstra
2010-12-29 14:20 ` Yong Zhang
2011-01-03 11:12 ` Peter Zijlstra
2010-12-24 12:23 ` [RFC][PATCH 08/17] sched: Drop the rq argument to sched_class::select_task_rq() Peter Zijlstra
2010-12-29 14:31 ` Yong Zhang
2011-01-03 11:16 ` Peter Zijlstra
2011-01-03 14:59 ` Oleg Nesterov
2011-01-03 15:21 ` Peter Zijlstra
2011-01-03 15:49 ` Oleg Nesterov
2011-01-03 16:35 ` Peter Zijlstra
2011-01-03 16:41 ` Peter Zijlstra
2011-01-04 7:27 ` Yong Zhang
2011-01-04 12:34 ` Peter Zijlstra
2011-01-04 5:59 ` Yong Zhang
2011-01-04 13:00 ` Peter Zijlstra
2011-01-03 18:05 ` Oleg Nesterov
2011-01-04 13:01 ` Peter Zijlstra
2010-12-24 12:23 ` [RFC][PATCH 09/17] sched: Remove rq argument to sched_class::task_waking() Peter Zijlstra
2010-12-24 12:23 ` [RFC][PATCH 10/17] sched: Add TASK_WAKING to task_rq_lock Peter Zijlstra
2010-12-24 12:23 ` [RFC][PATCH 11/17] sched: Delay task_contributes_to_load() Peter Zijlstra
2010-12-24 12:23 ` [RFC][PATCH 12/17] sched: Also serialize ttwu_local() with p->pi_lock Peter Zijlstra
2011-01-03 17:32 ` Oleg Nesterov
2011-01-09 23:11 ` Tejun Heo
2010-12-24 12:23 ` [RFC][PATCH 13/17] sched: Remove rq->lock from the first half of ttwu() Peter Zijlstra
2010-12-24 12:23 ` [RFC][PATCH 14/17] sched: Remove rq argument to ttwu_stat() Peter Zijlstra
2010-12-29 14:40 ` Yong Zhang
2011-01-03 11:20 ` Peter Zijlstra
2010-12-24 12:23 ` [RFC][PATCH 15/17] sched: Rename ttwu_post_activation Peter Zijlstra
2010-12-24 12:23 ` Peter Zijlstra [this message]
2011-01-03 14:36 ` [RFC][PATCH] sembench: add stddev to the burn stats Peter Zijlstra
2011-01-04 14:28 ` [RFC][PATCH 16/17] sched: Move the second half of ttwu() to the remote cpu Oleg Nesterov
2011-01-04 14:47 ` Peter Zijlstra
2011-01-04 15:18 ` Oleg Nesterov
2011-01-04 15:43 ` Peter Zijlstra
2011-01-04 16:06 ` Oleg Nesterov
2010-12-24 12:23 ` [RFC][PATCH 17/17] sched: Sort hotplug vs ttwu queueing Peter Zijlstra
2010-12-29 14:51 ` Yong Zhang
2011-01-03 11:21 ` Peter Zijlstra
2010-12-24 13:15 ` [RFC][PATCH 00/17] sched: Reduce runqueue lock contention -v3 Peter Zijlstra
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20101224123743.303699501@chello.nl \
--to=a.p.zijlstra@chello.nl \
--cc=axboe@kernel.dk \
--cc=chris.mason@oracle.com \
--cc=efault@gmx.de \
--cc=frank.rowand@am.sony.com \
--cc=linux-kernel@vger.kernel.org \
--cc=mingo@elte.hu \
--cc=oleg@redhat.com \
--cc=pjt@google.com \
--cc=tglx@linutronix.de \
--cc=yong.zhang0@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox