From: Oleg Nesterov <oleg@redhat.com>
To: Ingo Molnar <mingo@redhat.com>,
Peter Zijlstra <peterz@infradead.org>,
Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Fox <afox@redhat.com>,
Stephen Johnston <sjohnsto@redhat.com>,
linux-kernel@vger.kernel.org,
Stanislaw Gruszka <sgruszka@redhat.com>
Subject: Re: [PATCH] sched/cputime: make scale_stime() more precise
Date: Fri, 24 Jan 2020 16:42:15 +0100 [thread overview]
Message-ID: <20200124154215.GA14714@redhat.com> (raw)
In-Reply-To: <20200122164612.GA19818@redhat.com>
On 01/22, Oleg Nesterov wrote:
>
> As a reminder, scale_stime(stime, rtime, total) is not precise, to say
> the least. For example:
>
> stime = -1ul/33333; total = stime*3; rtime = total*5555555555;
>
> scale_stime() returns 9067034312525142184 while the correct result is
> 6148914688753325707.
>
> OK, these random numbers are not realistic, usually the relative error
> is small enough.
>
> However, even if the relative error is small, the absolute error can be
> huge. And this means that if you watch /proc/$pid/status incrementally
> to see how stime/utime grow, you can get completely wrong numbers.
>
> Say, utime (or stime) can be frozen for an unpredictably long time, as
> if the monitored application "hangs" in kernel mode, while the real
> split is 50/50.
See another test-case below. Arguments:
start_time start_utime_percent inc_time inc_utime_percent
For example,
$ ./test 8640000 50 600 50 | head
simulates a process which runs for 100 days split 50/50 between user and
kernel mode, then starts checking utime/stime every 600 seconds and prints
the difference (utime:stime, in nanoseconds) for the old and new code.
The output:
old new
0:600000000000 300000000000:300000000000
0:600000000000 300000000000:300000000000
0:600000000000 300000000000:300000000000
600000000000:0 300000000000:300000000000
499469920248:100530079752 300000000000:300000000000
0:600000000000 300000000000:300000000000
0:600000000000 300000000000:300000000000
600000000000:0 300000000000:300000000000
499490181588:100509818412 300000000000:300000000000
With the old code (left column) it looks as if this process can spend
20 minutes entirely in kernel mode.
Oleg.
-------------------------------------------------------------------------------
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
/* Attribute asking the compiler never to inline the function it marks. */
#define noinline __attribute__((__noinline__))
/* Kernel-style short names for the unsigned integer types used below. */
typedef unsigned long long u64;
typedef unsigned int u32;
/* 128-bit unsigned integer; __int128 is a compiler extension type. */
typedef unsigned __int128 u128;
/*
 * div_u64_rem - divide a u64 by a u32, also reporting the remainder.
 *
 * Returns dividend / divisor and stores dividend % divisor in *remainder.
 * The divisor must be non-zero.
 */
static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
{
	const u64 quotient = dividend / divisor;

	/* The remainder is always smaller than the 32-bit divisor. */
	*remainder = (u32)(dividend - quotient * divisor);
	return quotient;
}
/*
 * div64_u64_rem - divide a u64 by a u64, also reporting the remainder.
 *
 * Returns dividend / divisor and stores dividend % divisor in *remainder.
 * The divisor must be non-zero.
 */
static inline u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder)
{
	const u64 quotient = dividend / divisor;

	*remainder = dividend - quotient * divisor;
	return quotient;
}
/*
 * div64_u64 - plain unsigned 64-by-64 bit division.
 *
 * Returns dividend / divisor (truncating). The divisor must be non-zero.
 */
static inline u64 div64_u64(u64 dividend, u64 divisor)
{
	const u64 quotient = dividend / divisor;

	return quotient;
}
/*
 * div_u64 - divide a u64 by a u32, discarding the remainder.
 *
 * Returns dividend / divisor (truncating). The divisor must be non-zero.
 */
static inline u64 div_u64(u64 dividend, u32 divisor)
{
	/* Same quotient div_u64_rem() produces; the remainder is not needed. */
	return dividend / (u64)divisor;
}
/*
 * fls64 - find the last (most significant) set bit in a 64-bit word.
 *
 * Returns the 1-based position of the highest set bit (1..64), or 0 when
 * no bit is set.
 *
 * Implemented with the compiler's count-leading-zeros builtin rather than
 * x86-64 "bsrq" inline assembly: the asm version only builds on x86-64 and
 * depended on the destination register being left untouched for x == 0,
 * which Intel documents as undefined.
 */
static inline int fls64(u64 x)
{
	/* __builtin_clzll(0) is undefined, so handle the empty word first. */
	if (!x)
		return 0;

	return 64 - __builtin_clzll(x);
}
/*
 * ilog2 - integer base-2 logarithm, rounded down.
 *
 * Returns the 0-based index of the highest set bit of n, i.e. one less
 * than what fls64() reports; for n == 0 that yields -1.
 */
static inline int ilog2(u64 n)
{
	const int msb_position = fls64(n);

	return msb_position - 1;
}
/*
 * swap - exchange the values of two lvalues of the same type.
 *
 * Both arguments are evaluated more than once, so they must be free of
 * side effects. __typeof__ is used instead of typeof so the macro also
 * builds in strict ISO modes (-std=c99/-std=c11), and the temporary avoids
 * the reserved "__" identifier prefix.
 */
#define swap(a, b) \
	do { __typeof__(a) swap_tmp_ = (a); (a) = (b); (b) = swap_tmp_; } while (0)
/*
 * scale_stime - old scaling routine: approximate stime * rtime / total.
 *
 * The operands are reduced until the computation can be done as a
 * 32x32->64 multiply followed by a 64/32 divide. Reduction drops low bits
 * of rtime and total, so the result can be noticeably imprecise for large
 * inputs. total must stay non-zero after the reduction.
 */
u64 scale_stime(u64 stime, u64 rtime, u64 total)
{
	while (1) {
		/* Keep the larger of the two factors in "rtime". */
		if (stime > rtime) {
			const u64 larger = stime;

			stime = rtime;
			rtime = larger;
		}

		if (!(total >> 32)) {
			/* total fits in 32 bits; done once rtime (and thus stime) does too. */
			if (!(rtime >> 32))
				break;

			/*
			 * If stime still has headroom, rebalance instead of
			 * losing precision: grow stime, shrink rtime.
			 */
			if (!(stime >> 31)) {
				stime <<= 1;
				rtime >>= 1;
				continue;
			}
		}

		/* Drop precision: rtime has more bits than stime, so take them there. */
		rtime >>= 1;
		total >>= 1;
	}

	/*
	 * The casts tell the compiler this is a 32x32->64 multiply followed
	 * by a 64/32->64 divide.
	 */
	return div_u64((u64)(u32)stime * (u64)(u32)rtime, (u32)total);
}
/*
 * new_scale_stime - proposed scaling routine: approximate
 * stime * rtime / total with less precision loss than scale_stime().
 *
 * When stime * rtime might not fit in 64 bits (the ilog2 sum exceeds 62),
 * rtime is split as div * total + rem, so that
 *
 *	stime * rtime / total == stime * div + stime * rem / total
 *
 * The first term is computed exactly (modulo 64-bit wrap-around); only the
 * second term, with rem < total, may need low bits dropped from rem and
 * total to keep the product within 64 bits.
 *
 * total must be non-zero on entry.
 */
u64 new_scale_stime(u64 stime, u64 rtime, u64 total)
{
	u64 res = 0, div, rem;
	int shift;

	if (ilog2(stime) + ilog2(rtime) > 62) {
		div = div64_u64_rem(rtime, total, &rem);
		res = div * stime;
		rtime = rem;

		shift = ilog2(stime) + ilog2(rtime) - 62;
		if (shift > 0) {
			/*
			 * shift reaches 64 when both stime and the remainder
			 * have bit 63 set. Shifting a 64-bit value by 64 is
			 * undefined; every bit of total would be dropped, so
			 * take the same exit as the !total case below.
			 */
			if (shift >= 64)
				return res;

			rtime >>= shift;
			total >>= shift;
			/* total vanished: the remaining term is not representable. */
			if (!total)
				return res;
		}
	}

	return res + div64_u64(stime * rtime, total);
}
/*
 * Raw, unadjusted time samples for the simulated task. parse_cputime()
 * fills utime/stime from seconds multiplied by SEC, i.e. in nanoseconds.
 */
struct task_cputime {
/* time accounted to system (kernel) mode */
u64 stime;
/* time accounted to user mode */
u64 utime;
/* total runtime; main() always sets it to utime + stime */
unsigned long long sum_exec_runtime;
};
/*
 * Values most recently reported by cputime_adjust(); it clamps new
 * results against these so that neither value ever decreases.
 */
struct prev_cputime {
/* last reported user time */
u64 utime;
/* last reported system time */
u64 stime;
};
/*
 * cputime_adjust - split the task's total runtime into user/system parts.
 *
 * @new:  non-zero selects new_scale_stime(), zero selects scale_stime()
 * @curr: raw samples (stime, utime, sum_exec_runtime)
 * @prev: previously reported values; updated when runtime has advanced
 * @ut:   receives the reported user time
 * @st:   receives the reported system time
 *
 * The system share is sum_exec_runtime scaled by stime / (stime + utime);
 * the user share is the rest. Both are clamped so they never fall below
 * the values stored in @prev.
 */
void cputime_adjust(int new, struct task_cputime *curr, struct prev_cputime *prev,
		    u64 *ut, u64 *st)
{
	const u64 runtime = curr->sum_exec_runtime;

	/* Recompute only if runtime moved past what was already reported. */
	if (prev->stime + prev->utime < runtime) {
		u64 sys = curr->stime;
		u64 usr = curr->utime;

		if (sys != 0) {
			if (usr == 0) {
				/* No user time sampled: everything is system time. */
				sys = runtime;
			} else if (new) {
				sys = new_scale_stime(sys, runtime, sys + usr);
			} else {
				sys = scale_stime(sys, runtime, sys + usr);
			}
		}
		/* else: no system time sampled, sys stays 0 and usr gets the rest. */

		/* Never report less system time than before... */
		if (sys < prev->stime)
			sys = prev->stime;
		usr = runtime - sys;

		/* ...nor less user time; give the difference back to system time. */
		if (usr < prev->utime) {
			usr = prev->utime;
			sys = runtime - usr;
		}

		prev->stime = sys;
		prev->utime = usr;
	}

	*ut = prev->utime;
	*st = prev->stime;
}
/*
 * prdiff - run cputime_adjust() once and print how far the reported
 * user and system times advanced, as "utime_delta:stime_delta".
 *
 * @new selects the scaling routine (see cputime_adjust()); @prev is
 * updated by the call.
 */
void prdiff(int new, struct task_cputime *curr, struct prev_cputime *prev)
{
	/* Remember what was reported before this adjustment. */
	const u64 utime_before = prev->utime;
	const u64 stime_before = prev->stime;
	u64 utime_now, stime_now;

	cputime_adjust(new, curr, prev, &utime_now, &stime_now);

	printf("%16llu:%-16llu", utime_now - utime_before, stime_now - stime_before);
}
/* Multiplier parse_cputime() applies to a time given in seconds (10^9, i.e. nanoseconds). */
#define SEC 1000000000ULL
/*
 * parse_cputime - fill t->utime and t->stime from two string arguments.
 *
 * argv[0] is a duration in seconds, argv[1] the percentage of it spent in
 * user mode. The duration is scaled by SEC; utime gets the given share and
 * stime the remainder. sum_exec_runtime is left untouched. The strings are
 * not validated.
 */
void parse_cputime(struct task_cputime *t, char **argv)
{
	const double total_ns = strtod(argv[0], NULL) * SEC;
	const double user_ns = strtod(argv[1], NULL) / 100 * total_ns;

	t->utime = user_ns;
	t->stime = total_ns - user_ns;
}
/*
 * Usage: prog start_time start_utime_percent inc_time inc_utime_percent
 *
 * Seeds a task with start_time seconds of runtime, then endlessly adds
 * inc_time seconds per step and prints, for each step, how far the
 * reported utime:stime advanced with the old scale_stime() and with
 * new_scale_stime(). The loop only ends when writing to stdout fails or
 * the process is killed, so pipe the output through something like "head".
 *
 * Returns EXIT_FAILURE on a usage error or when stdout reports an error.
 */
int main(int argc, char **argv)
{
	struct prev_cputime old_prev = {0};
	struct prev_cputime new_prev = {0};
	struct task_cputime curr, diff;
	u64 tmp;

	if (argc != 5) {
		/* Diagnostics belong on stderr, with a failing exit status. */
		fprintf(stderr, "usage: %s start_time utime_percent inc_time utime_percent\n",
			argc > 0 ? argv[0] : "test");
		return EXIT_FAILURE;
	}

	parse_cputime(&curr, argv+1);
	parse_cputime(&diff, argv+3);

	/* Prime both "prev" states with the starting point; results are discarded. */
	curr.sum_exec_runtime = curr.utime + curr.stime;
	cputime_adjust(0, &curr, &old_prev, &tmp, &tmp);
	cputime_adjust(1, &curr, &new_prev, &tmp, &tmp);

	printf("%18s%15s\t%18s\n", "old", "", "new");

	for (;;) {
		curr.utime += diff.utime;
		curr.stime += diff.stime;
		curr.sum_exec_runtime = curr.utime + curr.stime;

		prdiff(0, &curr, &old_prev);
		printf("\t");
		prdiff(1, &curr, &new_prev);
		printf("\n");

		/* Stop instead of spinning forever once stdout is unusable. */
		if (ferror(stdout))
			return EXIT_FAILURE;
	}

	return 0;	/* not reached */
}
next prev parent reply other threads:[~2020-01-24 15:42 UTC|newest]
Thread overview: 30+ messages / expand[flat|nested] mbox.gz Atom feed top
2019-07-18 13:18 [PATCH] sched/cputime: make scale_stime() more precise Oleg Nesterov
2019-07-18 13:21 ` Oleg Nesterov
2019-07-18 14:55 ` Oleg Nesterov
2019-07-19 11:03 ` Peter Zijlstra
2019-07-19 13:47 ` Peter Zijlstra
2019-07-19 14:37 ` Oleg Nesterov
2019-07-22 19:56 ` Peter Zijlstra
2019-07-23 14:00 ` Oleg Nesterov
2019-07-23 14:29 ` Oleg Nesterov
2019-07-19 14:03 ` Oleg Nesterov
2019-07-22 19:45 ` Peter Zijlstra
2019-07-22 10:52 ` Stanislaw Gruszka
2019-07-22 20:00 ` Peter Zijlstra
2019-07-23 9:37 ` Stanislaw Gruszka
2020-01-22 16:46 ` Oleg Nesterov
2020-01-23 13:05 ` Oleg Nesterov
2020-01-24 15:42 ` Oleg Nesterov [this message]
2020-01-27 12:28 ` [PATCH v2] " Oleg Nesterov
2020-05-15 17:24 ` Oleg Nesterov
2020-05-19 17:25 ` Peter Zijlstra
2020-05-19 18:33 ` Linus Torvalds
2020-05-19 18:42 ` Peter Zijlstra
2020-05-19 19:11 ` Peter Zijlstra
2020-05-19 19:51 ` Linus Torvalds
2020-05-20 15:24 ` Oleg Nesterov
2020-05-20 15:36 ` Peter Zijlstra
2020-05-20 20:10 ` Peter Zijlstra
2020-05-21 13:26 ` Oleg Nesterov
2020-06-16 12:21 ` [tip: sched/core] sched/cputime: Improve cputime_adjust() tip-bot2 for Oleg Nesterov
-- strict thread matches above, loose matches on Subject: below --
2019-07-18 13:15 [PATCH] sched/cputime: make scale_stime() more precise Oleg Nesterov
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20200124154215.GA14714@redhat.com \
--to=oleg@redhat.com \
--cc=afox@redhat.com \
--cc=linux-kernel@vger.kernel.org \
--cc=mingo@redhat.com \
--cc=peterz@infradead.org \
--cc=sgruszka@redhat.com \
--cc=sjohnsto@redhat.com \
--cc=tglx@linutronix.de \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.