All of lore.kernel.org
 help / color / mirror / Atom feed
From: Oleg Nesterov <oleg@redhat.com>
To: Ingo Molnar <mingo@redhat.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Fox <afox@redhat.com>,
	Stephen Johnston <sjohnsto@redhat.com>,
	linux-kernel@vger.kernel.org,
	Stanislaw Gruszka <sgruszka@redhat.com>
Subject: Re: [PATCH] sched/cputime: make scale_stime() more precise
Date: Thu, 23 Jan 2020 14:05:41 +0100	[thread overview]
Message-ID: <20200123130541.GA30620@redhat.com> (raw)
In-Reply-To: <20200122164612.GA19818@redhat.com>

On 01/22, Oleg Nesterov wrote:
>
> But there is another reason why I think it makes more sense. It is also
> faster on x86-64, much faster when the numbers are big. See the naive
> test code below. For example,
>
> 	$ ./test 553407856289849 18446744066259977121 1660223568869547
> 	553407856289849 * 18446744066259977121 / 1660223568869547 =
> 		(128) 6148914688753325707
> 		(asm) 6148914688753325707
> 		(new) 6148914691236512239
> 		(old) 9067034312525142184
>
> 	ticks:
> 		asm: 7183908591
> 		new: 4891383871
> 		old: 23585547775

Just for completeness, see the updated code which can be compiled with -m32.
As expected, my version is slower on 32-bit when the numbers are small,

	$ ./test 1 3 2
	1 * 3 / 2 =
		(new) 1
		(old) 1

	ticks:
		new: 3624344961
		old: 2514403456

But still faster when rtime is big enough:

	$ ./test 1 68719476736 2
	1 * 68719476736 / 2 =
		(new) 34359738368
		(old) 34359738368

	ticks:
		new: 5044284834
		old: 5347969883

	$ ./test 553407856289849 18446744066259977121 1660223568869547
	553407856289849 * 18446744066259977121 / 1660223568869547 =
		(new) 6148914691236512239
		(old) 9067034312525142184

	ticks:
		new: 11496181242
		old: 33622910386

Oleg.

------------------------------------------------------------------------------
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>

#define   noinline                      __attribute__((__noinline__))

typedef unsigned long long u64;
typedef unsigned int u32;

#ifdef __x86_64__
typedef unsigned __int128 u128;

u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
{
	u64 q;
	asm ("mulq %2; divq %3" : "=a" (q) : "a" (a), "rm" (b), "rm" (c) : "rdx");
	return q;
}

static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
{
	*remainder = dividend % divisor;
	return dividend / divisor;
}
static inline u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder)
{
	*remainder = dividend % divisor;
	return dividend / divisor;
}
static inline u64 div64_u64(u64 dividend, u64 divisor)
{
	return dividend / divisor;
}
static inline u64 div_u64(u64 dividend, u32 divisor)
{
	u32 remainder;
	return div_u64_rem(dividend, divisor, &remainder);
}

static inline int fls64(u64 x)
{
	int bitpos = -1;
	/*
	 * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the
	 * dest reg is undefined if x==0, but their CPU architect says its
	 * value is written to set it to the same as before.
	 */
	asm("bsrq %1,%q0"
	    : "+r" (bitpos)
	    : "rm" (x));
	return bitpos + 1;
}
#else // 32-bit
static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
{
	union {
		u64 v64;
		u32 v32[2];
	} d = { dividend };
	u32 upper;

	upper = d.v32[1];
	d.v32[1] = 0;
	if (upper >= divisor) {
		d.v32[1] = upper / divisor;
		upper %= divisor;
	}
	asm ("divl %2" : "=a" (d.v32[0]), "=d" (*remainder) :
		"rm" (divisor), "0" (d.v32[0]), "1" (upper));
	return d.v64;
}

static inline u64 div_u64(u64 dividend, u32 divisor)
{
	u32 remainder;
	return div_u64_rem(dividend, divisor, &remainder);
}

static inline int fls(unsigned int x)
{
	int r;

	asm("bsrl %1,%0\n\t"
	    "cmovzl %2,%0"
	    : "=&r" (r) : "rm" (x), "rm" (-1));

	return r + 1;
}
static inline int fls64(u64 x)
{
	u32 h = x >> 32;
	if (h)
		return fls(h) + 32;
	return fls(x);
}

u64 noinline div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder)
{
	u32 high = divisor >> 32;
	u64 quot;

	if (high == 0) {
		u32 rem32;
		quot = div_u64_rem(dividend, divisor, &rem32);
		*remainder = rem32;
	} else {
		int n = fls(high);
		quot = div_u64(dividend >> n, divisor >> n);

		if (quot != 0)
			quot--;

		*remainder = dividend - quot * divisor;
		if (*remainder >= divisor) {
			quot++;
			*remainder -= divisor;
		}
	}

	return quot;
}
u64 noinline div64_u64(u64 dividend, u64 divisor)
{
	u32 high = divisor >> 32;
	u64 quot;

	if (high == 0) {
		quot = div_u64(dividend, divisor);
	} else {
		int n = fls(high);
		quot = div_u64(dividend >> n, divisor >> n);

		if (quot != 0)
			quot--;
		if ((dividend - quot * divisor) >= divisor)
			quot++;
	}

	return quot;
}
#endif

static inline int ilog2(u64 n)
{
	return fls64(n) - 1;
}

#define swap(a, b) \
	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)

u64 scale_stime(u64 stime, u64 rtime, u64 total)
{
	u64 scaled;

	for (;;) {
		/* Make sure "rtime" is the bigger of stime/rtime */
		if (stime > rtime)
			swap(rtime, stime);

		/* Make sure 'total' fits in 32 bits */
		if (total >> 32)
			goto drop_precision;

		/* Does rtime (and thus stime) fit in 32 bits? */
		if (!(rtime >> 32))
			break;

		/* Can we just balance rtime/stime rather than dropping bits? */
		if (stime >> 31)
			goto drop_precision;

		/* We can grow stime and shrink rtime and try to make them both fit */
		stime <<= 1;
		rtime >>= 1;
		continue;

drop_precision:
		/* We drop from rtime, it has more bits than stime */
		rtime >>= 1;
		total >>= 1;
	}

	/*
	 * Make sure gcc understands that this is a 32x32->64 multiply,
	 * followed by a 64/32->64 divide.
	 */
	scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
	return scaled;
}

u64 new_scale_stime(u64 stime, u64 rtime, u64 total)
{
	u64 res = 0, div, rem;

	if (ilog2(stime) + ilog2(rtime) > 62) {
		div = div64_u64_rem(rtime, total, &rem);
		res = div * stime;
		rtime = rem;

		int shift = ilog2(stime) + ilog2(rtime) - 62;
		if (shift > 0) {
			rtime >>= shift;
			total >>= shift;
			if (!total)
				return res;
		}
	}

	return res + div64_u64(stime * rtime, total);
}

static inline u64 rdtsc(void)
{
	unsigned low, high;
	asm volatile("rdtsc" : "=a" (low), "=d" (high));
	return ((low) | ((u64)(high) << 32));
}

u64 S, R, T;

u64 noinline profile(u64 (*f)(u64,u64,u64))
{
//	u64 s = S, r = R, t = T;
	u64 tsc1, tsc2;
	int i;

	tsc1 = rdtsc();

	for (i = 0; i < 100*1000*1000; ++i)
//		f(s++, r++, t++);
		f(S,R,T);

	tsc2 = rdtsc();

	return tsc2 - tsc1;
}


int main(int argc, char **argv)
{
	if (argc != 4) {
		printf("usage: %s stime rtime total\n", argv[0]);
		return 1;
	}

	S = strtoull(argv[1], NULL, 0);
	R = strtoull(argv[2], NULL, 0);
	T = strtoull(argv[3], NULL, 0);
	assert(S < T);
	assert(T < R);

	if (1) {
		printf("%llu * %llu / %llu =\n", S,R,T);
#ifdef __x86_64__
		printf("\t(128) %lld\n", (u64)( ((u128)S)*((u128)R)/((u128)T) ));
		printf("\t(asm) %lld\n", mul_u64_u64_div_u64(S,R,T));
#endif
		printf("\t(new) %lld\n", new_scale_stime(S,R,T));
		printf("\t(old) %lld\n", scale_stime(S,R,T));
		printf("\n");
	}

	printf("ticks:\n");
#ifdef __x86_64__
	printf("\tasm: %lld\n", profile(mul_u64_u64_div_u64));
#endif
	printf("\tnew: %lld\n", profile(new_scale_stime));
	printf("\told: %lld\n", profile(scale_stime));

	return 0;
}


  reply	other threads:[~2020-01-23 13:05 UTC|newest]

Thread overview: 30+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-07-18 13:18 [PATCH] sched/cputime: make scale_stime() more precise Oleg Nesterov
2019-07-18 13:21 ` Oleg Nesterov
2019-07-18 14:55 ` Oleg Nesterov
2019-07-19 11:03 ` Peter Zijlstra
2019-07-19 13:47   ` Peter Zijlstra
2019-07-19 14:37     ` Oleg Nesterov
2019-07-22 19:56       ` Peter Zijlstra
2019-07-23 14:00         ` Oleg Nesterov
2019-07-23 14:29           ` Oleg Nesterov
2019-07-19 14:03   ` Oleg Nesterov
2019-07-22 19:45     ` Peter Zijlstra
2019-07-22 10:52   ` Stanislaw Gruszka
2019-07-22 20:00     ` Peter Zijlstra
2019-07-23  9:37       ` Stanislaw Gruszka
2020-01-22 16:46 ` Oleg Nesterov
2020-01-23 13:05   ` Oleg Nesterov [this message]
2020-01-24 15:42   ` Oleg Nesterov
2020-01-27 12:28 ` [PATCH v2] " Oleg Nesterov
2020-05-15 17:24   ` Oleg Nesterov
2020-05-19 17:25   ` Peter Zijlstra
2020-05-19 18:33     ` Linus Torvalds
2020-05-19 18:42       ` Peter Zijlstra
2020-05-19 19:11       ` Peter Zijlstra
2020-05-19 19:51         ` Linus Torvalds
2020-05-20 15:24     ` Oleg Nesterov
2020-05-20 15:36       ` Peter Zijlstra
2020-05-20 20:10         ` Peter Zijlstra
2020-05-21 13:26           ` Oleg Nesterov
2020-06-16 12:21     ` [tip: sched/core] sched/cputime: Improve cputime_adjust() tip-bot2 for Oleg Nesterov
  -- strict thread matches above, loose matches on Subject: below --
2019-07-18 13:15 [PATCH] sched/cputime: make scale_stime() more precise Oleg Nesterov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200123130541.GA30620@redhat.com \
    --to=oleg@redhat.com \
    --cc=afox@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@redhat.com \
    --cc=peterz@infradead.org \
    --cc=sgruszka@redhat.com \
    --cc=sjohnsto@redhat.com \
    --cc=tglx@linutronix.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.