From: Peter Zijlstra <a.p.zijlstra@chello.nl>
To: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Simon Kirby <sim@hostway.ca>,
Linux Kernel Mailing List <linux-kernel@vger.kernel.org>,
Dave Jones <davej@redhat.com>,
Thomas Gleixner <tglx@linutronix.de>,
Martin Schwidefsky <schwidefsky@de.ibm.com>,
Ingo Molnar <mingo@elte.hu>
Subject: Re: Linux 3.1-rc9
Date: Mon, 17 Oct 2011 12:34:18 +0200 [thread overview]
Message-ID: <1318847658.6594.40.camel@twins> (raw)
In-Reply-To: <CA+55aFxxfsFMgqA7kYKukvkCX6Z9HrKOtP-22w5Mc21iQj4c=Q@mail.gmail.com>
On Sun, 2011-10-16 at 18:39 -0700, Linus Torvalds wrote:
> Quite frankly, I personally consider it to be broken - why are we
> introducing this new lock for this very special thing? A spinlock to
> protect a *single* word of counter seems broken.
Well, I thought atomic64_t would be more expensive on 32bit archs, i386
uses the horridly expensive cmpxchg8b thing to implement it.
That said, I'm more than glad to use it.
> However, I don't see why that spinlock is needed at all. Why aren't
> those fields just atomics (or at least just "sum_exec_runtime")?
Done.
> And
> why does "cputime_add()" exist at all? It seems to always be just a
> plain add, and nothing else would seem to ever make sense *anyway*?
Martin and I were discussing the merit of that only a few weeks ago ;-)
BTW what would we all think about a coccinelle generated patch that
fixes atomic*_add()'s argument order?
---
Subject: cputimer: Cure lock inversion
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon Oct 17 11:50:30 CEST 2011
There's a lock inversion between the cputimer->lock and rq->lock; notably
the two callchains involved are:
  update_rlimit_cpu()
    sighand->siglock
    set_process_cpu_timer()
      cpu_timer_sample_group()
        thread_group_cputimer()
          cputimer->lock
          thread_group_cputime()
            task_sched_runtime()
              ->pi_lock
              rq->lock

  scheduler_tick()
    rq->lock
    task_tick_fair()
      update_curr()
        account_group_exec()
          cputimer->lock
Where the first one is enabling a CLOCK_PROCESS_CPUTIME_ID timer, and the
second one is keeping up-to-date.
Note that e8abccb7193 ("posix-cpu-timers: Cure SMP accounting oddities") didn't
introduce this problem, but merely made it much more likely to happen, see how
cpu_timer_sample_group() for the CPUCLOCK_SCHED case also takes rq->lock.
Cure this inversion by removing the need to acquire cputimer->lock in the
update path by converting task_cputime::sum_exec_runtime to an atomic64_t.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
include/linux/sched.h | 4 ++--
kernel/fork.c | 2 +-
kernel/posix-cpu-timers.c | 41 ++++++++++++++++++++++++-----------------
kernel/sched.c | 2 +-
kernel/sched_rt.c | 6 ++++--
kernel/sched_stats.h | 4 +---
6 files changed, 33 insertions(+), 26 deletions(-)
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -474,7 +474,7 @@ struct cpu_itimer {
struct task_cputime {
cputime_t utime;
cputime_t stime;
- unsigned long long sum_exec_runtime;
+ atomic64_t sum_exec_runtime;
};
/* Alternate field names when used to cache expirations. */
#define prof_exp stime
@@ -485,7 +485,7 @@ struct task_cputime {
(struct task_cputime) { \
.utime = cputime_zero, \
.stime = cputime_zero, \
- .sum_exec_runtime = 0, \
+ .sum_exec_runtime = ATOMIC64_INIT(0), \
}
/*
Index: linux-2.6/kernel/fork.c
===================================================================
--- linux-2.6.orig/kernel/fork.c
+++ linux-2.6/kernel/fork.c
@@ -1033,7 +1033,7 @@ static void posix_cpu_timers_init(struct
{
tsk->cputime_expires.prof_exp = cputime_zero;
tsk->cputime_expires.virt_exp = cputime_zero;
- tsk->cputime_expires.sched_exp = 0;
+ atomic64_set(&tsk->cputime_expires.sched_exp, 0);
INIT_LIST_HEAD(&tsk->cpu_timers[0]);
INIT_LIST_HEAD(&tsk->cpu_timers[1]);
INIT_LIST_HEAD(&tsk->cpu_timers[2]);
Index: linux-2.6/kernel/posix-cpu-timers.c
===================================================================
--- linux-2.6.orig/kernel/posix-cpu-timers.c
+++ linux-2.6/kernel/posix-cpu-timers.c
@@ -239,7 +239,7 @@ void thread_group_cputime(struct task_st
times->utime = sig->utime;
times->stime = sig->stime;
- times->sum_exec_runtime = sig->sum_sched_runtime;
+ atomic64_set(&times->sum_exec_runtime, sig->sum_sched_runtime);
rcu_read_lock();
/* make sure we can trust tsk->thread_group list */
@@ -250,7 +250,7 @@ void thread_group_cputime(struct task_st
do {
times->utime = cputime_add(times->utime, t->utime);
times->stime = cputime_add(times->stime, t->stime);
- times->sum_exec_runtime += task_sched_runtime(t);
+ atomic64_add(task_sched_runtime(t), &times->sum_exec_runtime);
} while_each_thread(tsk, t);
out:
rcu_read_unlock();
@@ -264,8 +264,11 @@ static void update_gt_cputime(struct tas
if (cputime_gt(b->stime, a->stime))
a->stime = b->stime;
- if (b->sum_exec_runtime > a->sum_exec_runtime)
- a->sum_exec_runtime = b->sum_exec_runtime;
+ if (atomic64_read(&b->sum_exec_runtime) >
+ atomic64_read(&a->sum_exec_runtime)) {
+ atomic64_set(&a->sum_exec_runtime,
+ atomic64_read(&b->sum_exec_runtime));
+ }
}
void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
@@ -287,6 +290,8 @@ void thread_group_cputimer(struct task_s
update_gt_cputime(&cputimer->cputime, &sum);
}
*times = cputimer->cputime;
+ atomic64_set(&times->sum_exec_runtime,
+ atomic64_read(&cputimer->cputime.sum_exec_runtime));
spin_unlock_irqrestore(&cputimer->lock, flags);
}
@@ -313,7 +318,7 @@ static int cpu_clock_sample_group(const
break;
case CPUCLOCK_SCHED:
thread_group_cputime(p, &cputime);
- cpu->sched = cputime.sum_exec_runtime;
+ cpu->sched = atomic64_read(&cputime.sum_exec_runtime);
break;
}
return 0;
@@ -593,9 +598,9 @@ static void arm_timer(struct k_itimer *t
cputime_expires->virt_exp = exp->cpu;
break;
case CPUCLOCK_SCHED:
- if (cputime_expires->sched_exp == 0 ||
- cputime_expires->sched_exp > exp->sched)
- cputime_expires->sched_exp = exp->sched;
+ if (atomic64_read(&cputime_expires->sched_exp) == 0 ||
+ atomic64_read(&cputime_expires->sched_exp) > exp->sched)
+ atomic64_set(&cputime_expires->sched_exp, exp->sched);
break;
}
}
@@ -656,7 +661,7 @@ static int cpu_timer_sample_group(const
cpu->cpu = cputime.utime;
break;
case CPUCLOCK_SCHED:
- cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
+ cpu->sched = atomic64_read(&cputime.sum_exec_runtime) + task_delta_exec(p);
break;
}
return 0;
@@ -947,13 +952,14 @@ static void check_thread_timers(struct t
++timers;
maxfire = 20;
- tsk->cputime_expires.sched_exp = 0;
+ atomic64_set(&tsk->cputime_expires.sched_exp, 0);
while (!list_empty(timers)) {
struct cpu_timer_list *t = list_first_entry(timers,
struct cpu_timer_list,
entry);
if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) {
- tsk->cputime_expires.sched_exp = t->expires.sched;
+ atomic64_set(&tsk->cputime_expires.sched_exp,
+ t->expires.sched);
break;
}
t->firing = 1;
@@ -1049,7 +1055,7 @@ static inline int task_cputime_zero(cons
{
if (cputime_eq(cputime->utime, cputime_zero) &&
cputime_eq(cputime->stime, cputime_zero) &&
- cputime->sum_exec_runtime == 0)
+ atomic64_read(&cputime->sum_exec_runtime) == 0)
return 1;
return 0;
}
@@ -1076,7 +1082,7 @@ static void check_process_timers(struct
thread_group_cputimer(tsk, &cputime);
utime = cputime.utime;
ptime = cputime_add(utime, cputime.stime);
- sum_sched_runtime = cputime.sum_exec_runtime;
+ sum_sched_runtime = atomic64_read(&cputime.sum_exec_runtime);
maxfire = 20;
prof_expires = cputime_zero;
while (!list_empty(timers)) {
@@ -1161,7 +1167,7 @@ static void check_process_timers(struct
sig->cputime_expires.prof_exp = prof_expires;
sig->cputime_expires.virt_exp = virt_expires;
- sig->cputime_expires.sched_exp = sched_expires;
+ atomic64_set(&sig->cputime_expires.sched_exp, sched_expires);
if (task_cputime_zero(&sig->cputime_expires))
stop_process_timers(sig);
}
@@ -1255,8 +1261,9 @@ static inline int task_cputime_expired(c
cputime_ge(cputime_add(sample->utime, sample->stime),
expires->stime))
return 1;
- if (expires->sum_exec_runtime != 0 &&
- sample->sum_exec_runtime >= expires->sum_exec_runtime)
+ if (atomic64_read(&expires->sum_exec_runtime) != 0 &&
+ atomic64_read(&sample->sum_exec_runtime) >=
+ atomic64_read(&expires->sum_exec_runtime))
return 1;
return 0;
}
@@ -1279,7 +1286,7 @@ static inline int fastpath_timer_check(s
struct task_cputime task_sample = {
.utime = tsk->utime,
.stime = tsk->stime,
- .sum_exec_runtime = tsk->se.sum_exec_runtime
+ .sum_exec_runtime = ATOMIC64_INIT(tsk->se.sum_exec_runtime),
};
if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -4075,7 +4075,7 @@ void thread_group_times(struct task_stru
thread_group_cputime(p, &cputime);
total = cputime_add(cputime.utime, cputime.stime);
- rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
+ rtime = nsecs_to_cputime(atomic64_read(&cputime.sum_exec_runtime));
if (total) {
u64 temp = rtime;
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -1763,8 +1763,10 @@ static void watchdog(struct rq *rq, stru
p->rt.timeout++;
next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
- if (p->rt.timeout > next)
- p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
+ if (p->rt.timeout > next) {
+ atomic64_set(&p->cputime_expires.sched_exp,
+ p->se.sum_exec_runtime);
+ }
}
}
Index: linux-2.6/kernel/sched_stats.h
===================================================================
--- linux-2.6.orig/kernel/sched_stats.h
+++ linux-2.6/kernel/sched_stats.h
@@ -330,7 +330,5 @@ static inline void account_group_exec_ru
if (!cputimer->running)
return;
- spin_lock(&cputimer->lock);
- cputimer->cputime.sum_exec_runtime += ns;
- spin_unlock(&cputimer->lock);
+ atomic64_add(ns, &cputimer->cputime.sum_exec_runtime);
}
next prev parent reply other threads:[~2011-10-17 12:17 UTC|newest]
Thread overview: 98+ messages / expand[flat|nested] mbox.gz Atom feed top
2011-10-05 1:40 Linux 3.1-rc9 Linus Torvalds
2011-10-07 7:08 ` Simon Kirby
2011-10-07 17:48 ` Simon Kirby
2011-10-07 18:01 ` Peter Zijlstra
2011-10-08 0:33 ` Simon Kirby
2011-10-08 0:50 ` Simon Kirby
2011-10-08 7:55 ` Peter Zijlstra
2011-10-12 21:35 ` Simon Kirby
2011-10-13 23:25 ` Simon Kirby
2011-10-17 1:39 ` Linus Torvalds
2011-10-17 4:58 ` Ingo Molnar
2011-10-17 9:03 ` Thomas Gleixner
2011-10-17 10:40 ` Peter Zijlstra
2011-10-17 11:40 ` Alan Cox
2011-10-17 18:49 ` Ingo Molnar
2011-10-17 20:35 ` H. Peter Anvin
2011-10-17 21:19 ` Ingo Molnar
2011-10-17 21:22 ` H. Peter Anvin
2011-10-17 21:39 ` Ingo Molnar
2011-10-17 22:03 ` Ingo Molnar
2011-10-17 22:04 ` Ingo Molnar
2011-10-17 22:08 ` H. Peter Anvin
2011-10-18 6:01 ` Ingo Molnar
2011-10-18 7:12 ` Geert Uytterhoeven
2011-10-18 18:50 ` H. Peter Anvin
2011-10-17 21:31 ` Ingo Molnar
2011-10-17 7:55 ` Martin Schwidefsky
2011-10-17 9:12 ` Peter Zijlstra
2011-10-17 9:18 ` Martin Schwidefsky
2011-10-17 20:48 ` H. Peter Anvin
2011-10-18 7:20 ` Martin Schwidefsky
2011-10-17 10:34 ` Peter Zijlstra [this message]
2011-10-17 14:07 ` Martin Schwidefsky
2011-10-17 14:57 ` Linus Torvalds
2011-10-17 17:54 ` Peter Zijlstra
2011-10-17 18:31 ` Linus Torvalds
2011-10-17 19:23 ` Peter Zijlstra
2011-10-17 21:00 ` Thomas Gleixner
2011-10-18 8:39 ` Thomas Gleixner
2011-10-18 9:05 ` Peter Zijlstra
2011-10-18 14:59 ` Linus Torvalds
2011-10-18 15:26 ` Thomas Gleixner
2011-10-18 18:07 ` Ingo Molnar
2011-10-18 18:14 ` [GIT PULL] timer fix Ingo Molnar
2011-10-18 16:13 ` Linux 3.1-rc9 Dave Jones
2011-10-18 18:20 ` Simon Kirby
2011-10-18 19:48 ` Thomas Gleixner
2011-10-18 20:12 ` Linus Torvalds
2011-10-25 15:26 ` Simon Kirby
2011-10-26 1:47 ` Yong Zhang
2011-10-24 19:02 ` Simon Kirby
2011-10-25 7:13 ` Linus Torvalds
2011-10-25 9:01 ` David Miller
2011-10-25 12:30 ` Thomas Gleixner
2011-10-25 23:18 ` David Miller
2011-10-25 20:20 ` Simon Kirby
2011-10-31 17:32 ` Simon Kirby
2011-11-02 16:40 ` Thomas Gleixner
2011-11-02 17:27 ` Eric Dumazet
2011-11-02 17:46 ` Linus Torvalds
2011-11-02 17:53 ` Eric Dumazet
2011-11-02 18:00 ` Linus Torvalds
2011-11-02 18:05 ` Eric Dumazet
2011-11-02 18:10 ` Linus Torvalds
2011-11-02 17:49 ` Eric Dumazet
2011-11-02 17:58 ` Eric Dumazet
2011-11-02 19:16 ` Simon Kirby
2011-11-02 22:42 ` Eric Dumazet
2011-11-03 0:24 ` Thomas Gleixner
2011-11-03 0:52 ` Simon Kirby
2011-11-03 22:07 ` David Miller
2011-11-03 6:06 ` Jörg-Volker Peetz
2011-11-03 6:26 ` Eric Dumazet
2011-11-03 6:43 ` David Miller
2011-11-02 17:54 ` Thomas Gleixner
2011-11-02 18:04 ` Eric Dumazet
2011-11-02 18:28 ` Simon Kirby
2011-11-02 18:30 ` Thomas Gleixner
2011-11-02 22:10 ` Steven Rostedt
2011-11-02 23:00 ` Steven Rostedt
2011-11-03 0:09 ` Simon Kirby
2011-11-03 0:15 ` Steven Rostedt
2011-11-03 0:17 ` Simon Kirby
2011-11-18 23:11 ` [tip:perf/core] lockdep: Show subclass in pretty print of lockdep output tip-bot for Steven Rostedt
2011-10-20 14:36 ` Linux 3.1-rc9 Martin Schwidefsky
2011-10-23 11:34 ` Ingo Molnar
2011-10-24 7:48 ` Martin Schwidefsky
2011-10-24 7:51 ` Linus Torvalds
2011-10-24 8:08 ` Martin Schwidefsky
2011-10-18 5:40 ` Simon Kirby
2011-10-09 20:51 ` Arkadiusz Miśkiewicz
2011-10-10 2:29 ` [tpmdd-devel] " Stefan Berger
2011-10-10 16:23 ` Rajiv Andrade
2011-10-10 17:05 ` Arkadiusz Miśkiewicz
2011-10-10 17:22 ` Stefan Berger
2011-10-10 17:57 ` Arkadiusz Miśkiewicz
2011-10-10 21:08 ` Arkadiusz Miśkiewicz
2011-10-11 7:09 ` [tpmdd-devel] " Peter.Huewe
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1318847658.6594.40.camel@twins \
--to=a.p.zijlstra@chello.nl \
--cc=davej@redhat.com \
--cc=linux-kernel@vger.kernel.org \
--cc=mingo@elte.hu \
--cc=schwidefsky@de.ibm.com \
--cc=sim@hostway.ca \
--cc=tglx@linutronix.de \
--cc=torvalds@linux-foundation.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox