public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
To: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Simon Kirby <sim@hostway.ca>,
	Linux Kernel Mailing List <linux-kernel@vger.kernel.org>,
	Dave Jones <davej@redhat.com>,
	Thomas Gleixner <tglx@linutronix.de>,
	Martin Schwidefsky <schwidefsky@de.ibm.com>,
	Ingo Molnar <mingo@elte.hu>
Subject: Re: Linux 3.1-rc9
Date: Mon, 17 Oct 2011 12:34:18 +0200	[thread overview]
Message-ID: <1318847658.6594.40.camel@twins> (raw)
In-Reply-To: <CA+55aFxxfsFMgqA7kYKukvkCX6Z9HrKOtP-22w5Mc21iQj4c=Q@mail.gmail.com>

On Sun, 2011-10-16 at 18:39 -0700, Linus Torvalds wrote:

> Quite frankly, I personally consider it to be broken - why are we
> introducing this new lock for this very special thing? A spinlock to
> protect a *single* word of counter seems broken.

Well, I thought atomic64_t would be more expensive on 32bit archs; i386
uses the horridly expensive cmpxchg8b thing to implement it.

That said, I'm more than glad to use it.

> However, I don't see why that spinlock is needed at all. Why aren't
> those fields just atomics (or at least just "sum_exec_runtime")? 

Done.

> And
> why does "cputime_add()" exist at all? It seems to always be just a
> plain add, and nothing else would seem to ever make sense *anyway*?

Martin and I were discussing the merit of that only a few weeks ago ;-)

BTW what would we all think about a coccinelle generated patch that
fixes atomic*_add()'s argument order?

---
Subject: cputimer: Cure lock inversion
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon Oct 17 11:50:30 CEST 2011

There's a lock inversion between the cputimer->lock and rq->lock; notably
the two callchains involved are:

 update_rlimit_cpu()
   sighand->siglock
   set_process_cpu_timer()
     cpu_timer_sample_group()
       thread_group_cputimer()
         cputimer->lock
         thread_group_cputime()
           task_sched_runtime()
             ->pi_lock
             rq->lock

 scheduler_tick()
   rq->lock
   task_tick_fair()
     update_curr()
       account_group_exec_runtime()
         cputimer->lock

Where the first one is enabling a CLOCK_PROCESS_CPUTIME_ID timer, and the
second one is keeping that timer's runtime sum up-to-date.

Note that e8abccb7193 ("posix-cpu-timers: Cure SMP accounting oddities") didn't
introduce this problem, but merely made it much more likely to happen; see how
cpu_timer_sample_group() for the CPUCLOCK_SCHED case also takes rq->lock.

Cure this inversion by removing the need to acquire cputimer->lock in the
update path by converting task_cputime::sum_exec_runtime to an atomic64_t.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/sched.h     |    4 ++--
 kernel/fork.c             |    2 +-
 kernel/posix-cpu-timers.c |   41 ++++++++++++++++++++++++-----------------
 kernel/sched.c            |    2 +-
 kernel/sched_rt.c         |    6 ++++--
 kernel/sched_stats.h      |    4 +---
 6 files changed, 33 insertions(+), 26 deletions(-)
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -474,7 +474,7 @@ struct cpu_itimer {
 struct task_cputime {
 	cputime_t utime;
 	cputime_t stime;
-	unsigned long long sum_exec_runtime;
+	atomic64_t sum_exec_runtime;
 };
 /* Alternate field names when used to cache expirations. */
 #define prof_exp	stime
@@ -485,7 +485,7 @@ struct task_cputime {
 	(struct task_cputime) {					\
 		.utime = cputime_zero,				\
 		.stime = cputime_zero,				\
-		.sum_exec_runtime = 0,				\
+		.sum_exec_runtime = ATOMIC64_INIT(0),		\
 	}
 
 /*
Index: linux-2.6/kernel/fork.c
===================================================================
--- linux-2.6.orig/kernel/fork.c
+++ linux-2.6/kernel/fork.c
@@ -1033,7 +1033,7 @@ static void posix_cpu_timers_init(struct
 {
 	tsk->cputime_expires.prof_exp = cputime_zero;
 	tsk->cputime_expires.virt_exp = cputime_zero;
-	tsk->cputime_expires.sched_exp = 0;
+	atomic64_set(&tsk->cputime_expires.sched_exp, 0);
 	INIT_LIST_HEAD(&tsk->cpu_timers[0]);
 	INIT_LIST_HEAD(&tsk->cpu_timers[1]);
 	INIT_LIST_HEAD(&tsk->cpu_timers[2]);
Index: linux-2.6/kernel/posix-cpu-timers.c
===================================================================
--- linux-2.6.orig/kernel/posix-cpu-timers.c
+++ linux-2.6/kernel/posix-cpu-timers.c
@@ -239,7 +239,7 @@ void thread_group_cputime(struct task_st
 
 	times->utime = sig->utime;
 	times->stime = sig->stime;
-	times->sum_exec_runtime = sig->sum_sched_runtime;
+	atomic64_set(&times->sum_exec_runtime, sig->sum_sched_runtime);
 
 	rcu_read_lock();
 	/* make sure we can trust tsk->thread_group list */
@@ -250,7 +250,7 @@ void thread_group_cputime(struct task_st
 	do {
 		times->utime = cputime_add(times->utime, t->utime);
 		times->stime = cputime_add(times->stime, t->stime);
-		times->sum_exec_runtime += task_sched_runtime(t);
+		atomic64_add(task_sched_runtime(t), &times->sum_exec_runtime);
 	} while_each_thread(tsk, t);
 out:
 	rcu_read_unlock();
@@ -264,8 +264,11 @@ static void update_gt_cputime(struct tas
 	if (cputime_gt(b->stime, a->stime))
 		a->stime = b->stime;
 
-	if (b->sum_exec_runtime > a->sum_exec_runtime)
-		a->sum_exec_runtime = b->sum_exec_runtime;
+	if (atomic64_read(&b->sum_exec_runtime) >
+			atomic64_read(&a->sum_exec_runtime)) {
+		atomic64_set(&a->sum_exec_runtime,
+				atomic64_read(&b->sum_exec_runtime));
+	}
 }
 
 void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
@@ -287,6 +290,8 @@ void thread_group_cputimer(struct task_s
 		update_gt_cputime(&cputimer->cputime, &sum);
 	}
 	*times = cputimer->cputime;
+	atomic64_set(&times->sum_exec_runtime,
+			atomic64_read(&cputimer->cputime.sum_exec_runtime));
 	spin_unlock_irqrestore(&cputimer->lock, flags);
 }
 
@@ -313,7 +318,7 @@ static int cpu_clock_sample_group(const 
 		break;
 	case CPUCLOCK_SCHED:
 		thread_group_cputime(p, &cputime);
-		cpu->sched = cputime.sum_exec_runtime;
+		cpu->sched = atomic64_read(&cputime.sum_exec_runtime);
 		break;
 	}
 	return 0;
@@ -593,9 +598,9 @@ static void arm_timer(struct k_itimer *t
 				cputime_expires->virt_exp = exp->cpu;
 			break;
 		case CPUCLOCK_SCHED:
-			if (cputime_expires->sched_exp == 0 ||
-			    cputime_expires->sched_exp > exp->sched)
-				cputime_expires->sched_exp = exp->sched;
+			if (atomic64_read(&cputime_expires->sched_exp) == 0 ||
+			    atomic64_read(&cputime_expires->sched_exp) > exp->sched)
+				atomic64_set(&cputime_expires->sched_exp, exp->sched);
 			break;
 		}
 	}
@@ -656,7 +661,7 @@ static int cpu_timer_sample_group(const 
 		cpu->cpu = cputime.utime;
 		break;
 	case CPUCLOCK_SCHED:
-		cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
+		cpu->sched = atomic64_read(&cputime.sum_exec_runtime) + task_delta_exec(p);
 		break;
 	}
 	return 0;
@@ -947,13 +952,14 @@ static void check_thread_timers(struct t
 
 	++timers;
 	maxfire = 20;
-	tsk->cputime_expires.sched_exp = 0;
+	atomic64_set(&tsk->cputime_expires.sched_exp, 0);
 	while (!list_empty(timers)) {
 		struct cpu_timer_list *t = list_first_entry(timers,
 						      struct cpu_timer_list,
 						      entry);
 		if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) {
-			tsk->cputime_expires.sched_exp = t->expires.sched;
+			atomic64_set(&tsk->cputime_expires.sched_exp,
+				     t->expires.sched);
 			break;
 		}
 		t->firing = 1;
@@ -1049,7 +1055,7 @@ static inline int task_cputime_zero(cons
 {
 	if (cputime_eq(cputime->utime, cputime_zero) &&
 	    cputime_eq(cputime->stime, cputime_zero) &&
-	    cputime->sum_exec_runtime == 0)
+	    atomic64_read(&cputime->sum_exec_runtime) == 0)
 		return 1;
 	return 0;
 }
@@ -1076,7 +1082,7 @@ static void check_process_timers(struct 
 	thread_group_cputimer(tsk, &cputime);
 	utime = cputime.utime;
 	ptime = cputime_add(utime, cputime.stime);
-	sum_sched_runtime = cputime.sum_exec_runtime;
+	sum_sched_runtime = atomic64_read(&cputime.sum_exec_runtime);
 	maxfire = 20;
 	prof_expires = cputime_zero;
 	while (!list_empty(timers)) {
@@ -1161,7 +1167,7 @@ static void check_process_timers(struct 
 
 	sig->cputime_expires.prof_exp = prof_expires;
 	sig->cputime_expires.virt_exp = virt_expires;
-	sig->cputime_expires.sched_exp = sched_expires;
+	atomic64_set(&sig->cputime_expires.sched_exp, sched_expires);
 	if (task_cputime_zero(&sig->cputime_expires))
 		stop_process_timers(sig);
 }
@@ -1255,8 +1261,9 @@ static inline int task_cputime_expired(c
 	    cputime_ge(cputime_add(sample->utime, sample->stime),
 		       expires->stime))
 		return 1;
-	if (expires->sum_exec_runtime != 0 &&
-	    sample->sum_exec_runtime >= expires->sum_exec_runtime)
+	if (atomic64_read(&expires->sum_exec_runtime) != 0 &&
+	    atomic64_read(&sample->sum_exec_runtime) >=
+			atomic64_read(&expires->sum_exec_runtime))
 		return 1;
 	return 0;
 }
@@ -1279,7 +1286,7 @@ static inline int fastpath_timer_check(s
 		struct task_cputime task_sample = {
 			.utime = tsk->utime,
 			.stime = tsk->stime,
-			.sum_exec_runtime = tsk->se.sum_exec_runtime
+			.sum_exec_runtime = ATOMIC64_INIT(tsk->se.sum_exec_runtime),
 		};
 
 		if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -4075,7 +4075,7 @@ void thread_group_times(struct task_stru
 	thread_group_cputime(p, &cputime);
 
 	total = cputime_add(cputime.utime, cputime.stime);
-	rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
+	rtime = nsecs_to_cputime(atomic64_read(&cputime.sum_exec_runtime));
 
 	if (total) {
 		u64 temp = rtime;
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -1763,8 +1763,10 @@ static void watchdog(struct rq *rq, stru
 
 		p->rt.timeout++;
 		next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
-		if (p->rt.timeout > next)
-			p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
+		if (p->rt.timeout > next) {
+			atomic64_set(&p->cputime_expires.sched_exp,
+					p->se.sum_exec_runtime);
+		}
 	}
 }
 
Index: linux-2.6/kernel/sched_stats.h
===================================================================
--- linux-2.6.orig/kernel/sched_stats.h
+++ linux-2.6/kernel/sched_stats.h
@@ -330,7 +330,5 @@ static inline void account_group_exec_ru
 	if (!cputimer->running)
 		return;
 
-	spin_lock(&cputimer->lock);
-	cputimer->cputime.sum_exec_runtime += ns;
-	spin_unlock(&cputimer->lock);
+	atomic64_add(ns, &cputimer->cputime.sum_exec_runtime);
 }


  parent reply	other threads:[~2011-10-17 12:17 UTC|newest]

Thread overview: 98+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2011-10-05  1:40 Linux 3.1-rc9 Linus Torvalds
2011-10-07  7:08 ` Simon Kirby
2011-10-07 17:48   ` Simon Kirby
2011-10-07 18:01     ` Peter Zijlstra
2011-10-08  0:33       ` Simon Kirby
2011-10-08  0:50       ` Simon Kirby
2011-10-08  7:55         ` Peter Zijlstra
2011-10-12 21:35           ` Simon Kirby
2011-10-13 23:25             ` Simon Kirby
2011-10-17  1:39               ` Linus Torvalds
2011-10-17  4:58                 ` Ingo Molnar
2011-10-17  9:03                   ` Thomas Gleixner
2011-10-17 10:40                     ` Peter Zijlstra
2011-10-17 11:40                       ` Alan Cox
2011-10-17 18:49                     ` Ingo Molnar
2011-10-17 20:35                       ` H. Peter Anvin
2011-10-17 21:19                         ` Ingo Molnar
2011-10-17 21:22                           ` H. Peter Anvin
2011-10-17 21:39                             ` Ingo Molnar
2011-10-17 22:03                               ` Ingo Molnar
2011-10-17 22:04                                 ` Ingo Molnar
2011-10-17 22:08                               ` H. Peter Anvin
2011-10-18  6:01                                 ` Ingo Molnar
2011-10-18  7:12                                 ` Geert Uytterhoeven
2011-10-18 18:50                                   ` H. Peter Anvin
2011-10-17 21:31                           ` Ingo Molnar
2011-10-17  7:55                 ` Martin Schwidefsky
2011-10-17  9:12                   ` Peter Zijlstra
2011-10-17  9:18                     ` Martin Schwidefsky
2011-10-17 20:48                   ` H. Peter Anvin
2011-10-18  7:20                     ` Martin Schwidefsky
2011-10-17 10:34                 ` Peter Zijlstra [this message]
2011-10-17 14:07                   ` Martin Schwidefsky
2011-10-17 14:57                   ` Linus Torvalds
2011-10-17 17:54                     ` Peter Zijlstra
2011-10-17 18:31                       ` Linus Torvalds
2011-10-17 19:23                         ` Peter Zijlstra
2011-10-17 21:00                           ` Thomas Gleixner
2011-10-18  8:39                             ` Thomas Gleixner
2011-10-18  9:05                               ` Peter Zijlstra
2011-10-18 14:59                                 ` Linus Torvalds
2011-10-18 15:26                                   ` Thomas Gleixner
2011-10-18 18:07                                   ` Ingo Molnar
2011-10-18 18:14                                   ` [GIT PULL] timer fix Ingo Molnar
2011-10-18 16:13                                 ` Linux 3.1-rc9 Dave Jones
2011-10-18 18:20                                 ` Simon Kirby
2011-10-18 19:48                                   ` Thomas Gleixner
2011-10-18 20:12                                     ` Linus Torvalds
2011-10-25 15:26                                       ` Simon Kirby
2011-10-26  1:47                                         ` Yong Zhang
2011-10-24 19:02                                     ` Simon Kirby
2011-10-25  7:13                                       ` Linus Torvalds
2011-10-25  9:01                                         ` David Miller
2011-10-25 12:30                                           ` Thomas Gleixner
2011-10-25 23:18                                             ` David Miller
2011-10-25 20:20                                       ` Simon Kirby
2011-10-31 17:32                                         ` Simon Kirby
2011-11-02 16:40                                           ` Thomas Gleixner
2011-11-02 17:27                                             ` Eric Dumazet
2011-11-02 17:46                                               ` Linus Torvalds
2011-11-02 17:53                                                 ` Eric Dumazet
2011-11-02 18:00                                                   ` Linus Torvalds
2011-11-02 18:05                                                     ` Eric Dumazet
2011-11-02 18:10                                                       ` Linus Torvalds
2011-11-02 17:49                                               ` Eric Dumazet
2011-11-02 17:58                                                 ` Eric Dumazet
2011-11-02 19:16                                                   ` Simon Kirby
2011-11-02 22:42                                                     ` Eric Dumazet
2011-11-03  0:24                                                       ` Thomas Gleixner
2011-11-03  0:52                                                       ` Simon Kirby
2011-11-03 22:07                                                         ` David Miller
2011-11-03  6:06                                                       ` Jörg-Volker Peetz
2011-11-03  6:26                                                         ` Eric Dumazet
2011-11-03  6:43                                                           ` David Miller
2011-11-02 17:54                                               ` Thomas Gleixner
2011-11-02 18:04                                                 ` Eric Dumazet
2011-11-02 18:28                                             ` Simon Kirby
2011-11-02 18:30                                               ` Thomas Gleixner
2011-11-02 22:10                                           ` Steven Rostedt
2011-11-02 23:00                                             ` Steven Rostedt
2011-11-03  0:09                                               ` Simon Kirby
2011-11-03  0:15                                                 ` Steven Rostedt
2011-11-03  0:17                                                   ` Simon Kirby
2011-11-18 23:11                                         ` [tip:perf/core] lockdep: Show subclass in pretty print of lockdep output tip-bot for Steven Rostedt
2011-10-20 14:36                 ` Linux 3.1-rc9 Martin Schwidefsky
2011-10-23 11:34                   ` Ingo Molnar
2011-10-24  7:48                     ` Martin Schwidefsky
2011-10-24  7:51                       ` Linus Torvalds
2011-10-24  8:08                         ` Martin Schwidefsky
2011-10-18  5:40             ` Simon Kirby
2011-10-09 20:51 ` Arkadiusz Miśkiewicz
2011-10-10  2:29   ` [tpmdd-devel] " Stefan Berger
2011-10-10 16:23     ` Rajiv Andrade
2011-10-10 17:05       ` Arkadiusz Miśkiewicz
2011-10-10 17:22         ` Stefan Berger
2011-10-10 17:57           ` Arkadiusz Miśkiewicz
2011-10-10 21:08             ` Arkadiusz Miśkiewicz
2011-10-11  7:09             ` [tpmdd-devel] " Peter.Huewe

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1318847658.6594.40.camel@twins \
    --to=a.p.zijlstra@chello.nl \
    --cc=davej@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=schwidefsky@de.ibm.com \
    --cc=sim@hostway.ca \
    --cc=tglx@linutronix.de \
    --cc=torvalds@linux-foundation.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox