All of lore.kernel.org
 help / color / mirror / Atom feed
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
To: Chris Mason <chris.mason@oracle.com>,
	Frank Rowand <frank.rowand@am.sony.com>,
	Ingo Molnar <mingo@elte.hu>, Thomas Gleixner <tglx@linutronix.de>,
	Mike Galbraith <efault@gmx.de>, Oleg Nesterov <oleg@redhat.com>,
	Paul Turner <pjt@google.com>, Jens Axboe <axboe@kernel.dk>,
	Yong Zhang <yong.zhang0@gmail.com>
Cc: linux-kernel@vger.kernel.org, Peter Zijlstra <a.p.zijlstra@chello.nl>
Subject: [RFC][PATCH 11/18] sched: Add p->pi_lock to task_rq_lock()
Date: Tue, 04 Jan 2011 15:59:40 +0100	[thread overview]
Message-ID: <20110104150102.862431889@chello.nl> (raw)
In-Reply-To: 20110104145929.772813816@chello.nl

[-- Attachment #1: sched-ttwu-task_rq_lock.patch --]
[-- Type: text/plain, Size: 8014 bytes --]

In order to be able to call set_task_cpu() while either holding
p->pi_lock or task_rq(p)->lock we need to hold both locks in order to
stabilize task_rq().

This makes task_rq_lock() acquire both locks, and have
__task_rq_lock() validate that p->pi_lock is held. This increases the
locking overhead for most scheduler syscalls but allows reduction of
rq->lock contention for some scheduler hot paths (ttwu).

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/sched.c |   81 ++++++++++++++++++++++++++-------------------------------
 1 file changed, 37 insertions(+), 44 deletions(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -602,7 +602,7 @@ static inline int cpu_of(struct rq *rq)
  * Return the group to which this tasks belongs.
  *
  * We use task_subsys_state_check() and extend the RCU verification
- * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach()
+ * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach()
  * holds that lock for each task it moves into the cgroup. Therefore
  * by holding that lock, we pin the task to the current cgroup.
  */
@@ -612,7 +612,7 @@ static inline struct task_group *task_gr
 	struct cgroup_subsys_state *css;
 
 	css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
-			lockdep_is_held(&task_rq(p)->lock));
+			lockdep_is_held(&p->pi_lock));
 	tg = container_of(css, struct task_group, css);
 
 	return autogroup_task_group(p, tg);
@@ -928,23 +928,15 @@ static inline void finish_lock_switch(st
 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
 
 /*
- * Check whether the task is waking, we use this to synchronize ->cpus_allowed
- * against ttwu().
- */
-static inline int task_is_waking(struct task_struct *p)
-{
-	return unlikely(p->state == TASK_WAKING);
-}
-
-/*
- * __task_rq_lock - lock the runqueue a given task resides on.
- * Must be called interrupts disabled.
+ * __task_rq_lock - lock the rq @p resides on.
  */
 static inline struct rq *__task_rq_lock(struct task_struct *p)
 	__acquires(rq->lock)
 {
 	struct rq *rq;
 
+	lockdep_assert_held(&p->pi_lock);
+
 	for (;;) {
 		rq = task_rq(p);
 		raw_spin_lock(&rq->lock);
@@ -955,22 +947,22 @@ static inline struct rq *__task_rq_lock(
 }
 
 /*
- * task_rq_lock - lock the runqueue a given task resides on and disable
- * interrupts. Note the ordering: we can safely lookup the task_rq without
- * explicitly disabling preemption.
+ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
  */
 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
+	__acquires(p->pi_lock)
 	__acquires(rq->lock)
 {
 	struct rq *rq;
 
 	for (;;) {
-		local_irq_save(*flags);
+		raw_spin_lock_irqsave(&p->pi_lock, *flags);
 		rq = task_rq(p);
 		raw_spin_lock(&rq->lock);
 		if (likely(rq == task_rq(p)))
 			return rq;
-		raw_spin_unlock_irqrestore(&rq->lock, *flags);
+		raw_spin_unlock(&rq->lock);
+		raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
 	}
 }
 
@@ -980,10 +972,13 @@ static void __task_rq_unlock(struct rq *
 	raw_spin_unlock(&rq->lock);
 }
 
-static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
+static inline void
+task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
 	__releases(rq->lock)
+	__releases(p->pi_lock)
 {
-	raw_spin_unlock_irqrestore(&rq->lock, *flags);
+	raw_spin_unlock(&rq->lock);
+	raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
 }
 
 /*
@@ -2115,6 +2110,11 @@ void set_task_cpu(struct task_struct *p,
 	 */
 	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
 			!(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
+
+#ifdef CONFIG_LOCKDEP
+	WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
+				      lockdep_is_held(&task_rq(p)->lock)));
+#endif
 #endif
 
 	trace_sched_migrate_task(p, new_cpu);
@@ -2210,7 +2210,7 @@ unsigned long wait_task_inactive(struct 
 		ncsw = 0;
 		if (!match_state || p->state == match_state)
 			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
-		task_rq_unlock(rq, &flags);
+		task_rq_unlock(rq, p, &flags);
 
 		/*
 		 * If it changed from the expected state, bail out now.
@@ -2596,6 +2596,7 @@ static void __sched_fork(struct task_str
  */
 void sched_fork(struct task_struct *p, int clone_flags)
 {
+	unsigned long flags;
 	int cpu = get_cpu();
 
 	__sched_fork(p);
@@ -2646,9 +2647,9 @@ void sched_fork(struct task_struct *p, i
 	 *
 	 * Silence PROVE_RCU.
 	 */
-	rcu_read_lock();
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
 	set_task_cpu(p, cpu);
-	rcu_read_unlock();
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 	if (likely(sched_info_on()))
@@ -3472,7 +3473,7 @@ unsigned long long task_delta_exec(struc
 
 	rq = task_rq_lock(p, &flags);
 	ns = do_task_delta_exec(p, rq);
-	task_rq_unlock(rq, &flags);
+	task_rq_unlock(rq, p, &flags);
 
 	return ns;
 }
@@ -3490,7 +3491,7 @@ unsigned long long task_sched_runtime(st
 
 	rq = task_rq_lock(p, &flags);
 	ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
-	task_rq_unlock(rq, &flags);
+	task_rq_unlock(rq, p, &flags);
 
 	return ns;
 }
@@ -3514,7 +3515,7 @@ unsigned long long thread_group_sched_ru
 	rq = task_rq_lock(p, &flags);
 	thread_group_cputime(p, &totals);
 	ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
-	task_rq_unlock(rq, &flags);
+	task_rq_unlock(rq, p, &flags);
 
 	return ns;
 }
@@ -4538,16 +4539,13 @@ EXPORT_SYMBOL(sleep_on_timeout);
  */
 void rt_mutex_setprio(struct task_struct *p, int prio)
 {
-	unsigned long flags;
 	int oldprio, on_rq, running;
 	struct rq *rq;
 	const struct sched_class *prev_class;
 
 	BUG_ON(prio < 0 || prio > MAX_PRIO);
 
-	lockdep_assert_held(&p->pi_lock);
-
-	rq = task_rq_lock(p, &flags);
+	rq = __task_rq_lock(p);
 
 	trace_sched_pi_setprio(p, prio);
 	oldprio = p->prio;
@@ -4573,7 +4571,7 @@ void rt_mutex_setprio(struct task_struct
 
 		check_class_changed(rq, p, prev_class, oldprio, running);
 	}
-	task_rq_unlock(rq, &flags);
+	__task_rq_unlock(rq);
 }
 
 #endif
@@ -4621,7 +4619,7 @@ void set_user_nice(struct task_struct *p
 			resched_task(rq->curr);
 	}
 out_unlock:
-	task_rq_unlock(rq, &flags);
+	task_rq_unlock(rq, p, &flags);
 }
 EXPORT_SYMBOL(set_user_nice);
 
@@ -4843,13 +4841,11 @@ static int __sched_setscheduler(struct t
 	/*
 	 * make sure no PI-waiters arrive (or leave) while we are
 	 * changing the priority of the task:
-	 */
-	raw_spin_lock_irqsave(&p->pi_lock, flags);
-	/*
+	 *
 	 * To be able to change p->policy safely, the apropriate
 	 * runqueue lock must be held.
 	 */
-	rq = __task_rq_lock(p);
+	rq = task_rq_lock(p, &flags);
 
 	/*
 	 * Changing the policy of the stop threads its a very bad idea
@@ -4902,8 +4898,7 @@ static int __sched_setscheduler(struct t
 
 		check_class_changed(rq, p, prev_class, oldprio, running);
 	}
-	__task_rq_unlock(rq);
-	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+	task_rq_unlock(rq, p, &flags);
 
 	rt_mutex_adjust_pi(p);
 
@@ -5432,7 +5427,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, p
 
 	rq = task_rq_lock(p, &flags);
 	time_slice = p->sched_class->get_rr_interval(rq, p);
-	task_rq_unlock(rq, &flags);
+	task_rq_unlock(rq, p, &flags);
 
 	rcu_read_unlock();
 	jiffies_to_timespec(time_slice, &t);
@@ -5655,8 +5650,7 @@ int set_cpus_allowed_ptr(struct task_str
 	unsigned int dest_cpu;
 	int ret = 0;
 
-	raw_spin_lock_irqsave(&p->pi_lock, flags);
-	rq = __task_rq_lock(p);
+	rq = task_rq_lock(p, &flags);
 
 	if (!cpumask_intersects(new_mask, cpu_active_mask)) {
 		ret = -EINVAL;
@@ -5691,8 +5685,7 @@ int set_cpus_allowed_ptr(struct task_str
 		return 0;
 	}
 out:
-	__task_rq_unlock(rq);
-	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+	task_rq_unlock(rq, p, &flags);
 
 	return ret;
 }
@@ -8463,7 +8456,7 @@ void sched_move_task(struct task_struct 
 	if (on_rq)
 		enqueue_task(rq, tsk, 0);
 
-	task_rq_unlock(rq, &flags);
+	task_rq_unlock(rq, tsk, &flags);
 }
 #endif /* CONFIG_CGROUP_SCHED */
 



  parent reply	other threads:[~2011-01-04 15:12 UTC|newest]

Thread overview: 44+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2011-01-04 14:59 [RFC][PATCH 00/18] sched: Reduce runqueue lock contention -v4 Peter Zijlstra
2011-01-04 14:59 ` [RFC][PATCH 01/18] sched: Always provide p->on_cpu Peter Zijlstra
2011-01-04 14:59 ` [RFC][PATCH 02/18] mutex: Use p->on_cpu for the adaptive spin Peter Zijlstra
2011-01-04 14:59 ` [RFC][PATCH 03/18] sched: Change the ttwu success details Peter Zijlstra
2011-01-04 14:59 ` [RFC][PATCH 04/18] sched: Clean up ttwu stats Peter Zijlstra
2011-01-04 14:59 ` [RFC][PATCH 05/18] sched: Provide p->on_rq Peter Zijlstra
2011-01-05  8:13   ` Yong Zhang
2011-01-05  9:53     ` Peter Zijlstra
2011-01-29  0:10   ` Frank Rowand
2011-01-04 14:59 ` [RFC][PATCH 06/18] sched: Serialize p->cpus_allowed and ttwu() using p->pi_lock Peter Zijlstra
2011-01-04 14:59 ` [RFC][PATCH 07/18] sched: Drop the rq argument to sched_class::select_task_rq() Peter Zijlstra
2011-01-06 13:57   ` Peter Zijlstra
2011-01-06 14:23     ` Peter Zijlstra
2011-01-04 14:59 ` [RFC][PATCH 08/18] sched: Remove rq argument to sched_class::task_waking() Peter Zijlstra
2011-01-04 14:59 ` [RFC][PATCH 09/18] sched: Delay task_contributes_to_load() Peter Zijlstra
2011-01-04 14:59 ` [RFC][PATCH 10/18] sched: Also serialize ttwu_local() with p->pi_lock Peter Zijlstra
2011-01-04 14:59 ` Peter Zijlstra [this message]
2011-01-05 18:46   ` [RFC][PATCH 11/18] sched: Add p->pi_lock to task_rq_lock() Oleg Nesterov
2011-01-05 19:33     ` Peter Zijlstra
2011-01-29  0:21   ` Frank Rowand
2011-02-03 17:16     ` Peter Zijlstra
2011-02-03 17:49       ` Frank Rowand
2011-01-04 14:59 ` [RFC][PATCH 12/18] sched: Drop rq->lock from first part of wake_up_new_task() Peter Zijlstra
2011-01-04 14:59 ` [RFC][PATCH 13/18] sched: Drop rq->lock from sched_exec() Peter Zijlstra
2011-01-04 14:59 ` [RFC][PATCH 14/18] sched: Remove rq->lock from the first half of ttwu() Peter Zijlstra
2011-01-06 16:29   ` Peter Zijlstra
2011-01-29  1:05   ` Frank Rowand
2011-02-03 17:16     ` Peter Zijlstra
2011-01-04 14:59 ` [RFC][PATCH 15/18] sched: Remove rq argument from ttwu_stat() Peter Zijlstra
2011-01-04 14:59 ` [RFC][PATCH 16/18] sched: Rename ttwu_post_activation Peter Zijlstra
2011-01-29  1:08   ` Frank Rowand
2011-01-04 14:59 ` [RFC][PATCH 17/18] sched: Move the second half of ttwu() to the remote cpu Peter Zijlstra
2011-01-05 21:07   ` Oleg Nesterov
2011-01-06 15:09     ` Peter Zijlstra
2011-01-07 15:22       ` Oleg Nesterov
2011-01-18 16:38         ` Peter Zijlstra
2011-01-19 19:37           ` Oleg Nesterov
2011-01-29  0:04           ` Frank Rowand
2011-02-03 17:16             ` Peter Zijlstra
2011-01-04 14:59 ` [RFC][PATCH 18/18] sched: Sort hotplug vs ttwu queueing Peter Zijlstra
2011-01-05 20:47   ` Oleg Nesterov
2011-01-06 10:56     ` Peter Zijlstra
2011-01-04 15:16 ` [RFC][PATCH 00/18] sched: Reduce runqueue lock contention -v4 Ingo Molnar
2011-01-29  1:20 ` Frank Rowand

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20110104150102.862431889@chello.nl \
    --to=a.p.zijlstra@chello.nl \
    --cc=axboe@kernel.dk \
    --cc=chris.mason@oracle.com \
    --cc=efault@gmx.de \
    --cc=frank.rowand@am.sony.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=oleg@redhat.com \
    --cc=pjt@google.com \
    --cc=tglx@linutronix.de \
    --cc=yong.zhang0@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.