All of lore.kernel.org
 help / color / mirror / Atom feed
From: Oleg Nesterov <oleg@redhat.com>
To: Ingo Molnar <mingo@redhat.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Fox <afox@redhat.com>,
	Stephen Johnston <sjohnsto@redhat.com>,
	linux-kernel@vger.kernel.org,
	Stanislaw Gruszka <sgruszka@redhat.com>
Subject: Re: [PATCH] sched/cputime: make scale_stime() more precise
Date: Fri, 24 Jan 2020 16:42:15 +0100	[thread overview]
Message-ID: <20200124154215.GA14714@redhat.com> (raw)
In-Reply-To: <20200122164612.GA19818@redhat.com>

On 01/22, Oleg Nesterov wrote:
>
> To remind, scale_stime(stime, rtime, total) is not precise, to say at
> least. For example:
>
> 	stime = -1ul/33333; total = stime*3; rtime = total*5555555555;
>
> scale_stime() returns 9067034312525142184 while the correct result is
> 6148914688753325707.
>
> OK, these random numbers are not realistic, usually the relative error
> is small enough.
>
> However, even if the relative error is small, the absolute error can be
> huge. And this means that if you watch /proc/$pid/status incrementally
> to see how stime/utime grow, you can get the completely wrong numbers.
>
> Say, utime (or stime) can be frozen for unpredictably long time, as if
> the monitored application "hangs" in kernel mode, while the real split
> is 50/50.

See another test-case below. Arguments:

	start_time start_utime_percent inc_time inc_utime_percent

For example,

	$ ./test 8640000 50 600 50 | head

simulates process which runs 100 days 50/50 in user/kernel mode, then it
starts to check utime/stime every 600 seconds and print the difference.

The output:

               old               	               new
               0:600000000000    	    300000000000:300000000000
               0:600000000000    	    300000000000:300000000000
               0:600000000000    	    300000000000:300000000000
    600000000000:0               	    300000000000:300000000000
    499469920248:100530079752    	    300000000000:300000000000
               0:600000000000    	    300000000000:300000000000
               0:600000000000    	    300000000000:300000000000
    600000000000:0               	    300000000000:300000000000
    499490181588:100509818412    	    300000000000:300000000000


it looks as if this process can spend 20 minutes entirely in kernel mode.

Oleg.

-------------------------------------------------------------------------------
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>

#define   noinline                      __attribute__((__noinline__))

typedef unsigned long long u64;
typedef unsigned int u32;
typedef unsigned __int128 u128;

static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
{
	*remainder = dividend % divisor;
	return dividend / divisor;
}
static inline u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder)
{
	*remainder = dividend % divisor;
	return dividend / divisor;
}
static inline u64 div64_u64(u64 dividend, u64 divisor)
{
	return dividend / divisor;
}
static inline u64 div_u64(u64 dividend, u32 divisor)
{
	u32 remainder;
	return div_u64_rem(dividend, divisor, &remainder);
}

static inline int fls64(u64 x)
{
	int bitpos = -1;
	/*
	 * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the
	 * dest reg is undefined if x==0, but their CPU architect says its
	 * value is written to set it to the same as before.
	 */
	asm("bsrq %1,%q0"
	    : "+r" (bitpos)
	    : "rm" (x));
	return bitpos + 1;
}

static inline int ilog2(u64 n)
{
	return fls64(n) - 1;
}

#define swap(a, b) \
	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)

u64 scale_stime(u64 stime, u64 rtime, u64 total)
{
	u64 scaled;

	for (;;) {
		/* Make sure "rtime" is the bigger of stime/rtime */
		if (stime > rtime)
			swap(rtime, stime);

		/* Make sure 'total' fits in 32 bits */
		if (total >> 32)
			goto drop_precision;

		/* Does rtime (and thus stime) fit in 32 bits? */
		if (!(rtime >> 32))
			break;

		/* Can we just balance rtime/stime rather than dropping bits? */
		if (stime >> 31)
			goto drop_precision;

		/* We can grow stime and shrink rtime and try to make them both fit */
		stime <<= 1;
		rtime >>= 1;
		continue;

drop_precision:
		/* We drop from rtime, it has more bits than stime */
		rtime >>= 1;
		total >>= 1;
	}

	/*
	 * Make sure gcc understands that this is a 32x32->64 multiply,
	 * followed by a 64/32->64 divide.
	 */
	scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
	return scaled;
}

u64 new_scale_stime(u64 stime, u64 rtime, u64 total)
{
	u64 res = 0, div, rem;

	if (ilog2(stime) + ilog2(rtime) > 62) {
		div = div64_u64_rem(rtime, total, &rem);
		res = div * stime;
		rtime = rem;

		int shift = ilog2(stime) + ilog2(rtime) - 62;
		if (shift > 0) {
			rtime >>= shift;
			total >>= shift;
			if (!total)
				return res;
		}
	}

	return res + div64_u64(stime * rtime, total);
}

struct task_cputime {
	u64				stime;
	u64				utime;
	unsigned long long		sum_exec_runtime;
};
struct prev_cputime {
	u64				utime;
	u64				stime;
};

void cputime_adjust(int new, struct task_cputime *curr, struct prev_cputime *prev,
		    u64 *ut, u64 *st)
{
	u64 rtime, stime, utime;

	rtime = curr->sum_exec_runtime;

	if (prev->stime + prev->utime >= rtime)
		goto out;

	stime = curr->stime;
	utime = curr->utime;

	if (stime == 0) {
		utime = rtime;
		goto update;
	}

	if (utime == 0) {
		stime = rtime;
		goto update;
	}

	stime = (new ? new_scale_stime : scale_stime)(stime, rtime, stime + utime);

update:
	if (stime < prev->stime)
		stime = prev->stime;
	utime = rtime - stime;

	if (utime < prev->utime) {
		utime = prev->utime;
		stime = rtime - utime;
	}

	prev->stime = stime;
	prev->utime = utime;
out:
	*ut = prev->utime;
	*st = prev->stime;
}

void prdiff(int new, struct task_cputime *curr, struct prev_cputime *prev)
{
	struct prev_cputime __prev = *prev;
	u64 ut, st, ud, sd;

	cputime_adjust(new, curr, prev, &ut, &st);
	ud = ut - __prev.utime;
	sd = st - __prev.stime;

	printf("%16llu:%-16llu", ud, sd);
}

#define SEC	1000000000ULL

void parse_cputime(struct task_cputime *t, char **argv)
{
	double total = strtod(argv[0], NULL) * SEC;
	double utime = strtod(argv[1], NULL) / 100;

	utime *= total;
	t->utime = utime;
	t->stime = total - utime;
}

int main(int argc, char **argv)
{
	struct prev_cputime old_prev = {};
	struct prev_cputime new_prev = {};
	struct task_cputime curr, diff;
	u64 tmp;

	if (argc != 5) {
		printf("usage: %s start_time utime_percent inc_time utime_percent\n", argv[0]);
		return 0;
	}

	parse_cputime(&curr, argv+1);
	parse_cputime(&diff, argv+3);

	curr.sum_exec_runtime = curr.utime + curr.stime;
	cputime_adjust(0, &curr, &old_prev, &tmp, &tmp);
	cputime_adjust(1, &curr, &new_prev, &tmp, &tmp);

	printf("%18s%15s\t%18s\n", "old", "", "new");
	for (;;) {
		curr.utime += diff.utime;
		curr.stime += diff.stime;
		curr.sum_exec_runtime = curr.utime + curr.stime;

		prdiff(0, &curr, &old_prev);
		printf("\t");
		prdiff(1, &curr, &new_prev);
		printf("\n");
	}

	return 0;
}


  parent reply	other threads:[~2020-01-24 15:42 UTC|newest]

Thread overview: 30+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-07-18 13:18 [PATCH] sched/cputime: make scale_stime() more precise Oleg Nesterov
2019-07-18 13:21 ` Oleg Nesterov
2019-07-18 14:55 ` Oleg Nesterov
2019-07-19 11:03 ` Peter Zijlstra
2019-07-19 13:47   ` Peter Zijlstra
2019-07-19 14:37     ` Oleg Nesterov
2019-07-22 19:56       ` Peter Zijlstra
2019-07-23 14:00         ` Oleg Nesterov
2019-07-23 14:29           ` Oleg Nesterov
2019-07-19 14:03   ` Oleg Nesterov
2019-07-22 19:45     ` Peter Zijlstra
2019-07-22 10:52   ` Stanislaw Gruszka
2019-07-22 20:00     ` Peter Zijlstra
2019-07-23  9:37       ` Stanislaw Gruszka
2020-01-22 16:46 ` Oleg Nesterov
2020-01-23 13:05   ` Oleg Nesterov
2020-01-24 15:42   ` Oleg Nesterov [this message]
2020-01-27 12:28 ` [PATCH v2] " Oleg Nesterov
2020-05-15 17:24   ` Oleg Nesterov
2020-05-19 17:25   ` Peter Zijlstra
2020-05-19 18:33     ` Linus Torvalds
2020-05-19 18:42       ` Peter Zijlstra
2020-05-19 19:11       ` Peter Zijlstra
2020-05-19 19:51         ` Linus Torvalds
2020-05-20 15:24     ` Oleg Nesterov
2020-05-20 15:36       ` Peter Zijlstra
2020-05-20 20:10         ` Peter Zijlstra
2020-05-21 13:26           ` Oleg Nesterov
2020-06-16 12:21     ` [tip: sched/core] sched/cputime: Improve cputime_adjust() tip-bot2 for Oleg Nesterov
  -- strict thread matches above, loose matches on Subject: below --
2019-07-18 13:15 [PATCH] sched/cputime: make scale_stime() more precise Oleg Nesterov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200124154215.GA14714@redhat.com \
    --to=oleg@redhat.com \
    --cc=afox@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@redhat.com \
    --cc=peterz@infradead.org \
    --cc=sgruszka@redhat.com \
    --cc=sjohnsto@redhat.com \
    --cc=tglx@linutronix.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.