linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] sched: fix divide by zero at {thread_group,task}_times
@ 2012-08-08  9:27 Stanislaw Gruszka
  2012-08-08 19:50 ` Mike Galbraith
  2012-08-13 16:50 ` [tip:sched/urgent] sched: fix divide by zero at {thread_group, task}_times tip-bot for Stanislaw Gruszka
  0 siblings, 2 replies; 6+ messages in thread
From: Stanislaw Gruszka @ 2012-08-08  9:27 UTC (permalink / raw)
  To: linux-kernel; +Cc: Peter Zijlstra, Ingo Molnar

On architectures where cputime_t is 64 bit type, is possible to trigger
divide by zero on do_div(temp, (__force u32) total) line, if total is a
non zero number but has lower 32 bit's zeroed. Removing casting is not
a good solution since some do_div() implementations do cast to u32
internally.

This problem can be triggered in practice on very long lived processes:

  PID: 2331   TASK: ffff880472814b00  CPU: 2   COMMAND: "oraagent.bin"
   #0 [ffff880472a51b70] machine_kexec at ffffffff8103214b
   #1 [ffff880472a51bd0] crash_kexec at ffffffff810b91c2
   #2 [ffff880472a51ca0] oops_end at ffffffff814f0b00
   #3 [ffff880472a51cd0] die at ffffffff8100f26b
   #4 [ffff880472a51d00] do_trap at ffffffff814f03f4
   #5 [ffff880472a51d60] do_divide_error at ffffffff8100cfff
   #6 [ffff880472a51e00] divide_error at ffffffff8100be7b
      [exception RIP: thread_group_times+0x56]
      RIP: ffffffff81056a16  RSP: ffff880472a51eb8  RFLAGS: 00010046
      RAX: bc3572c9fe12d194  RBX: ffff880874150800  RCX: 0000000110266fad
      RDX: 0000000000000000  RSI: ffff880472a51eb8  RDI: 001038ae7d9633dc
      RBP: ffff880472a51ef8   R8: 00000000b10a3a64   R9: ffff880874150800
      R10: 00007fcba27ab680  R11: 0000000000000202  R12: ffff880472a51f08
      R13: ffff880472a51f10  R14: 0000000000000000  R15: 0000000000000007
      ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
   #7 [ffff880472a51f00] do_sys_times at ffffffff8108845d
   #8 [ffff880472a51f40] sys_times at ffffffff81088524
   #9 [ffff880472a51f80] system_call_fastpath at ffffffff8100b0f2
      RIP: 0000003808caac3a  RSP: 00007fcba27ab6d8  RFLAGS: 00000202
      RAX: 0000000000000064  RBX: ffffffff8100b0f2  RCX: 0000000000000000
      RDX: 00007fcba27ab6e0  RSI: 000000000076d58e  RDI: 00007fcba27ab6e0
      RBP: 00007fcba27ab700   R8: 0000000000000020   R9: 000000000000091b
      R10: 00007fcba27ab680  R11: 0000000000000202  R12: 00007fff9ca41940
      R13: 0000000000000000  R14: 00007fcba27ac9c0  R15: 00007fff9ca41940
      ORIG_RAX: 0000000000000064  CS: 0033  SS: 002b

Cc: stable@vger.kernel.org
Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
---
 kernel/sched/core.c |   34 ++++++++++++++++++++--------------
 1 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 468bdd4..1d441b0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3142,6 +3142,20 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 # define nsecs_to_cputime(__nsecs)	nsecs_to_jiffies(__nsecs)
 #endif
 
+static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
+{
+	u64 temp = (__force u64) rtime;
+
+	temp *= (__force u64) utime;
+
+	if (sizeof(cputime_t) == 4)
+		do_div(temp, (__force u32) total);
+	else
+		temp = div64_u64(temp, (__force u64) total);
+
+	return (__force cputime_t) temp;
+}
+
 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
 	cputime_t rtime, utime = p->utime, total = utime + p->stime;
@@ -3151,13 +3165,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 	 */
 	rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
 
-	if (total) {
-		u64 temp = (__force u64) rtime;
-
-		temp *= (__force u64) utime;
-		do_div(temp, (__force u32) total);
-		utime = (__force cputime_t) temp;
-	} else
+	if (total)
+		utime = scale_utime(utime, rtime, total);
+	else
 		utime = rtime;
 
 	/*
@@ -3184,13 +3194,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 	total = cputime.utime + cputime.stime;
 	rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
 
-	if (total) {
-		u64 temp = (__force u64) rtime;
-
-		temp *= (__force u64) cputime.utime;
-		do_div(temp, (__force u32) total);
-		utime = (__force cputime_t) temp;
-	} else
+	if (total)
+		utime = scale_utime(cputime.utime, rtime, total);
+	else
 		utime = rtime;
 
 	sig->prev_utime = max(sig->prev_utime, utime);
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH] sched: fix divide by zero at {thread_group,task}_times
  2012-08-08  9:27 [PATCH] sched: fix divide by zero at {thread_group,task}_times Stanislaw Gruszka
@ 2012-08-08 19:50 ` Mike Galbraith
  2012-08-08 20:08   ` Peter Zijlstra
  2012-08-13 16:50 ` [tip:sched/urgent] sched: fix divide by zero at {thread_group, task}_times tip-bot for Stanislaw Gruszka
  1 sibling, 1 reply; 6+ messages in thread
From: Mike Galbraith @ 2012-08-08 19:50 UTC (permalink / raw)
  To: Stanislaw Gruszka; +Cc: linux-kernel, Peter Zijlstra, Ingo Molnar

On Wed, 2012-08-08 at 11:27 +0200, Stanislaw Gruszka wrote: 
> On architectures where cputime_t is 64 bit type, is possible to trigger
> divide by zero on do_div(temp, (__force u32) total) line, if total is a
> non zero number but has lower 32 bit's zeroed. Removing casting is not
> a good solution since some do_div() implementations do cast to u32
> internally.

I wonder if do_div() vs div64_u64() makes a big difference.  I swiped
all the kernel bits, and took them to userland.  32bit built do_div()
and div64_u64() both sucked equally compared to 64bit.. on Q6600.  Too
bad P4 box kicked the bucket...

boring numbers..

marge:~ # gcc -O2 -m32 -DDO_DIV -g -o xx xx.c
marge:~ # taskset -c 0 perf stat -- taskset -c 3 ./xx

 Performance counter stats for 'taskset -c 3 ./xx':

      13067.570787 task-clock:HG             #    0.999 CPUs utilized          
              1117 context-switches:HG       #    0.085 K/sec                  
                11 CPU-migrations:HG         #    0.001 K/sec                  
               225 page-faults:HG            #    0.017 K/sec                  
       31211154646 cycles:HG                 #    2.388 GHz                     [50.00%]
   <not supported> stalled-cycles-frontend:HG
   <not supported> stalled-cycles-backend:HG
       69004479084 instructions:HG           #    2.21  insns per cycle         [75.00%]
       16004049251 branches:HG               # 1224.715 M/sec                   [75.00%]
            106179 branch-misses:HG          #    0.00% of all branches         [75.00%]

      13.074907234 seconds time elapsed

marge:~ # gcc -O2 -m32 -g -o xx xx.c
marge:~ # taskset -c 0 perf stat -- taskset -c 3 ./xx

 Performance counter stats for 'taskset -c 3 ./xx':

      13065.936608 task-clock:HG             #    0.999 CPUs utilized          
              1117 context-switches:HG       #    0.085 K/sec                  
                 3 CPU-migrations:HG         #    0.000 K/sec                  
               225 page-faults:HG            #    0.017 K/sec                  
       31202249014 cycles:HG                 #    2.388 GHz                     [50.01%]
   <not supported> stalled-cycles-frontend:HG
   <not supported> stalled-cycles-backend:HG
       69013127982 instructions:HG           #    2.21  insns per cycle         [75.00%]
       16003545136 branches:HG               # 1224.830 M/sec                   [75.00%]
            106156 branch-misses:HG          #    0.00% of all branches         [75.00%]

      13.073238280 seconds time elapsed

64bit is loads faster.

marge:~ # gcc -O2 -m64 -DDO_DIV -g -o xx xx.c
marge:~ # taskset -c 0 perf stat -- taskset -c 3 ./xx

 Performance counter stats for 'taskset -c 3 ./xx':

       5575.994225 task-clock:HG             #    0.999 CPUs utilized          
               473 context-switches:HG       #    0.085 K/sec                  
                 5 CPU-migrations:HG         #    0.001 K/sec                  
               251 page-faults:HG            #    0.045 K/sec                  
       13311760132 cycles:HG                 #    2.387 GHz                     [50.00%]
   <not supported> stalled-cycles-frontend:HG
   <not supported> stalled-cycles-backend:HG
        7003731700 instructions:HG           #    0.53  insns per cycle         [75.04%]
        1001242479 branches:HG               #  179.563 M/sec                   [75.05%]
             37489 branch-misses:HG          #    0.00% of all branches         [74.98%]

       5.579075361 seconds time elapsed

marge:~ # gcc -O2 -m64 -g -o xx xx.c
marge:~ # taskset -c 0 perf stat -- taskset -c 3 ./xx

 Performance counter stats for 'taskset -c 3 ./xx':

       5576.756281 task-clock:HG             #    0.999 CPUs utilized          
               482 context-switches:HG       #    0.086 K/sec                  
                 5 CPU-migrations:HG         #    0.001 K/sec                  
               251 page-faults:HG            #    0.045 K/sec                  
       13316232700 cycles:HG                 #    2.388 GHz                     [49.98%]
   <not supported> stalled-cycles-frontend:HG
   <not supported> stalled-cycles-backend:HG
        7006965347 instructions:HG           #    0.53  insns per cycle         [74.99%]
        1001095727 branches:HG               #  179.512 M/sec                   [75.04%]
             39450 branch-misses:HG          #    0.00% of all branches         [75.01%]

       5.580954618 seconds time elapsed


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] sched: fix divide by zero at {thread_group,task}_times
  2012-08-08 19:50 ` Mike Galbraith
@ 2012-08-08 20:08   ` Peter Zijlstra
  2012-08-13 11:48     ` Stanislaw Gruszka
  0 siblings, 1 reply; 6+ messages in thread
From: Peter Zijlstra @ 2012-08-08 20:08 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Stanislaw Gruszka, linux-kernel, Ingo Molnar

On Wed, 2012-08-08 at 21:50 +0200, Mike Galbraith wrote:
> 32bit built do_div()
> and div64_u64() both sucked equally compared to 64bit 

/me peeks at div64_u64 fallback implementation and sees why, it still
does a single div, it does some neat fls tricks.

Ok, no point in avoiding this then.. 

I did the below little edit, no point in mixing the old and new
primitives.. those __force things annoy me, but I guess otherwise we'll
upset sparse.

---
Index: linux-2.6/kernel/sched/core.c
===================================================================
--- linux-2.6.orig/kernel/sched/core.c
+++ linux-2.6/kernel/sched/core.c
@@ -3149,7 +3149,7 @@ static cputime_t scale_utime(cputime_t u
 	temp *= (__force u64) utime;
 
 	if (sizeof(cputime_t) == 4)
-		do_div(temp, (__force u32) total);
+		temp = div_u64(temp, (__force u32) total);
 	else
 		temp = div64_u64(temp, (__force u64) total);
 



^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] sched: fix divide by zero at {thread_group,task}_times
  2012-08-08 20:08   ` Peter Zijlstra
@ 2012-08-13 11:48     ` Stanislaw Gruszka
  2012-08-13 13:59       ` Peter Zijlstra
  0 siblings, 1 reply; 6+ messages in thread
From: Stanislaw Gruszka @ 2012-08-13 11:48 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Mike Galbraith, linux-kernel, Ingo Molnar, Martin Schwidefsky

On Wed, Aug 08, 2012 at 10:08:20PM +0200, Peter Zijlstra wrote:
> On Wed, 2012-08-08 at 21:50 +0200, Mike Galbraith wrote:
> > 32bit built do_div()
> > and div64_u64() both sucked equally compared to 64bit 
> 
> /me peeks at div64_u64 fallback implementation and sees why, it still
> does a single div, it does some neat fls tricks.
> 
> Ok, no point in avoiding this then.. 
> 
> I did the below little edit, no point in mixing the old and new
> primitives.. those __force things annoy me, but I guess otherwise we'll
> upset sparse.

Yeah, __force is needed for sparse, since we marked cputime_t with 
__nocast (by commit 648616343cdbe904c585a6c12e323d3b3c72e46f, which btw
looks like very nice cleanup at whole).

> Index: linux-2.6/kernel/sched/core.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched/core.c
> +++ linux-2.6/kernel/sched/core.c
> @@ -3149,7 +3149,7 @@ static cputime_t scale_utime(cputime_t u
>  	temp *= (__force u64) utime;
>  
>  	if (sizeof(cputime_t) == 4)
> -		do_div(temp, (__force u32) total);
> +		temp = div_u64(temp, (__force u32) total);
>  	else
>  		temp = div64_u64(temp, (__force u64) total);

Is this or will be queued (I do not see it queued anywhere)? Or should
I repost with above change?

Thanks
Stanislaw

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] sched: fix divide by zero at {thread_group,task}_times
  2012-08-13 11:48     ` Stanislaw Gruszka
@ 2012-08-13 13:59       ` Peter Zijlstra
  0 siblings, 0 replies; 6+ messages in thread
From: Peter Zijlstra @ 2012-08-13 13:59 UTC (permalink / raw)
  To: Stanislaw Gruszka
  Cc: Mike Galbraith, linux-kernel, Ingo Molnar, Martin Schwidefsky

On Mon, 2012-08-13 at 13:48 +0200, Stanislaw Gruszka wrote:
> 
> Is this or will be queued (I do not see it queued anywhere)? Or should
> I repost with above change? 

I've got it.. my queue lives at:

 http://programming.kicks-ass.net/sekrit/patches.tar.bz2

until Ingo gets around to stuffing them into -tip. That's usually only a
few days, but he's currently traveling so latency is somewhat increased.



^ permalink raw reply	[flat|nested] 6+ messages in thread

* [tip:sched/urgent] sched: fix divide by zero at {thread_group, task}_times
  2012-08-08  9:27 [PATCH] sched: fix divide by zero at {thread_group,task}_times Stanislaw Gruszka
  2012-08-08 19:50 ` Mike Galbraith
@ 2012-08-13 16:50 ` tip-bot for Stanislaw Gruszka
  1 sibling, 0 replies; 6+ messages in thread
From: tip-bot for Stanislaw Gruszka @ 2012-08-13 16:50 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: linux-kernel, hpa, mingo, a.p.zijlstra, tglx, sgruszka

Commit-ID:  bea6832cc8c4a0a9a65dd17da6aaa657fe27bc3e
Gitweb:     http://git.kernel.org/tip/bea6832cc8c4a0a9a65dd17da6aaa657fe27bc3e
Author:     Stanislaw Gruszka <sgruszka@redhat.com>
AuthorDate: Wed, 8 Aug 2012 11:27:15 +0200
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Mon, 13 Aug 2012 18:41:54 +0200

sched: fix divide by zero at {thread_group,task}_times

On architectures where cputime_t is 64 bit type, is possible to trigger
divide by zero on do_div(temp, (__force u32) total) line, if total is a
non zero number but has lower 32 bit's zeroed. Removing casting is not
a good solution since some do_div() implementations do cast to u32
internally.

This problem can be triggered in practice on very long lived processes:

  PID: 2331   TASK: ffff880472814b00  CPU: 2   COMMAND: "oraagent.bin"
   #0 [ffff880472a51b70] machine_kexec at ffffffff8103214b
   #1 [ffff880472a51bd0] crash_kexec at ffffffff810b91c2
   #2 [ffff880472a51ca0] oops_end at ffffffff814f0b00
   #3 [ffff880472a51cd0] die at ffffffff8100f26b
   #4 [ffff880472a51d00] do_trap at ffffffff814f03f4
   #5 [ffff880472a51d60] do_divide_error at ffffffff8100cfff
   #6 [ffff880472a51e00] divide_error at ffffffff8100be7b
      [exception RIP: thread_group_times+0x56]
      RIP: ffffffff81056a16  RSP: ffff880472a51eb8  RFLAGS: 00010046
      RAX: bc3572c9fe12d194  RBX: ffff880874150800  RCX: 0000000110266fad
      RDX: 0000000000000000  RSI: ffff880472a51eb8  RDI: 001038ae7d9633dc
      RBP: ffff880472a51ef8   R8: 00000000b10a3a64   R9: ffff880874150800
      R10: 00007fcba27ab680  R11: 0000000000000202  R12: ffff880472a51f08
      R13: ffff880472a51f10  R14: 0000000000000000  R15: 0000000000000007
      ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
   #7 [ffff880472a51f00] do_sys_times at ffffffff8108845d
   #8 [ffff880472a51f40] sys_times at ffffffff81088524
   #9 [ffff880472a51f80] system_call_fastpath at ffffffff8100b0f2
      RIP: 0000003808caac3a  RSP: 00007fcba27ab6d8  RFLAGS: 00000202
      RAX: 0000000000000064  RBX: ffffffff8100b0f2  RCX: 0000000000000000
      RDX: 00007fcba27ab6e0  RSI: 000000000076d58e  RDI: 00007fcba27ab6e0
      RBP: 00007fcba27ab700   R8: 0000000000000020   R9: 000000000000091b
      R10: 00007fcba27ab680  R11: 0000000000000202  R12: 00007fff9ca41940
      R13: 0000000000000000  R14: 00007fcba27ac9c0  R15: 00007fff9ca41940
      ORIG_RAX: 0000000000000064  CS: 0033  SS: 002b

Cc: stable@vger.kernel.org
Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20120808092714.GA3580@redhat.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched/core.c |   34 ++++++++++++++++++++--------------
 1 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2cb4e77..e2c5841 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3142,6 +3142,20 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 # define nsecs_to_cputime(__nsecs)	nsecs_to_jiffies(__nsecs)
 #endif
 
+static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
+{
+	u64 temp = (__force u64) rtime;
+
+	temp *= (__force u64) utime;
+
+	if (sizeof(cputime_t) == 4)
+		temp = div_u64(temp, (__force u32) total);
+	else
+		temp = div64_u64(temp, (__force u64) total);
+
+	return (__force cputime_t) temp;
+}
+
 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
 	cputime_t rtime, utime = p->utime, total = utime + p->stime;
@@ -3151,13 +3165,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 	 */
 	rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
 
-	if (total) {
-		u64 temp = (__force u64) rtime;
-
-		temp *= (__force u64) utime;
-		do_div(temp, (__force u32) total);
-		utime = (__force cputime_t) temp;
-	} else
+	if (total)
+		utime = scale_utime(utime, rtime, total);
+	else
 		utime = rtime;
 
 	/*
@@ -3184,13 +3194,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 	total = cputime.utime + cputime.stime;
 	rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
 
-	if (total) {
-		u64 temp = (__force u64) rtime;
-
-		temp *= (__force u64) cputime.utime;
-		do_div(temp, (__force u32) total);
-		utime = (__force cputime_t) temp;
-	} else
+	if (total)
+		utime = scale_utime(cputime.utime, rtime, total);
+	else
 		utime = rtime;
 
 	sig->prev_utime = max(sig->prev_utime, utime);

^ permalink raw reply related	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2012-08-13 16:50 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2012-08-08  9:27 [PATCH] sched: fix divide by zero at {thread_group,task}_times Stanislaw Gruszka
2012-08-08 19:50 ` Mike Galbraith
2012-08-08 20:08   ` Peter Zijlstra
2012-08-13 11:48     ` Stanislaw Gruszka
2012-08-13 13:59       ` Peter Zijlstra
2012-08-13 16:50 ` [tip:sched/urgent] sched: fix divide by zero at {thread_group, task}_times tip-bot for Stanislaw Gruszka

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).