* [PATCH V2] perf: x86: Improve accuracy of perf/sched clock
@ 2015-07-28 21:14 Adrian Hunter
2015-08-17 7:34 ` Adrian Hunter
2015-08-20 19:31 ` Thomas Gleixner
0 siblings, 2 replies; 9+ messages in thread
From: Adrian Hunter @ 2015-07-28 21:14 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar
Cc: Arnaldo Carvalho de Melo, Andy Lutomirski, Thomas Gleixner,
linux-kernel, Stephane Eranian, Andi Kleen
When TSC is stable perf/sched clock is based on it.
However the conversion from cycles to nanoseconds
is not as accurate as it could be. Because
CYC2NS_SCALE_FACTOR is 10, the accuracy is +/- 1/2048
The change is to calculate the maximum shift that
results in a multiplier that is still a 32-bit number.
For example all frequencies over 1 GHz will have
a shift of 32, making the accuracy of the conversion
+/- 1/(2^33)
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
---
arch/x86/kernel/tsc.c | 33 ++++++++++++++++++++-------------
1 file changed, 20 insertions(+), 13 deletions(-)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 7437b41f6a47..e7085bcfb06b 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -167,21 +167,21 @@ static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data)
* ns = cycles * cyc2ns_scale / SC
*
* And since SC is a constant power of two, we can convert the div
- * into a shift.
+ * into a shift. The larger SC is, the more accurate the conversion, but
+ * cyc2ns_scale needs to be a 32-bit value so that 32-bit multiplication
+ * (64-bit result) can be used. So start by trying SC = 2^32, reducing
+ * until the criteria are met.
*
- * We can use khz divisor instead of mhz to keep a better precision, since
- * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
+ * We can use khz divisor instead of mhz to keep a better precision.
* (mathieu.desnoyers@polymtl.ca)
*
* -johnstul@us.ibm.com "math is hard, lets go shopping!"
*/
-#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
-
static void cyc2ns_data_init(struct cyc2ns_data *data)
{
data->cyc2ns_mul = 0;
- data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
+ data->cyc2ns_shift = 0;
data->cyc2ns_offset = 0;
data->__count = 0;
}
@@ -215,14 +215,14 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc)
if (likely(data == tail)) {
ns = data->cyc2ns_offset;
- ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+ ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
} else {
data->__count++;
barrier();
ns = data->cyc2ns_offset;
- ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+ ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
barrier();
@@ -239,6 +239,8 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
unsigned long long tsc_now, ns_now;
struct cyc2ns_data *data;
unsigned long flags;
+ u64 mult;
+ u32 shft = 32;
local_irq_save(flags);
sched_clock_idle_sleep_event();
@@ -256,12 +258,17 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
* time function is continuous; see the comment near struct
* cyc2ns_data.
*/
- data->cyc2ns_mul =
- DIV_ROUND_CLOSEST(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR,
- cpu_khz);
- data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
+ mult = (u64)NSEC_PER_MSEC << 32;
+ mult += cpu_khz / 2;
+ do_div(mult, cpu_khz);
+ while (mult > U32_MAX) {
+ mult >>= 1;
+ shft -= 1;
+ }
+ data->cyc2ns_mul = mult;
+ data->cyc2ns_shift = shft;
data->cyc2ns_offset = ns_now -
- mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+ mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, data->cyc2ns_shift);
cyc2ns_write_end(cpu, data);
--
1.9.1
^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [PATCH V2] perf: x86: Improve accuracy of perf/sched clock
2015-07-28 21:14 [PATCH V2] perf: x86: Improve accuracy of perf/sched clock Adrian Hunter
@ 2015-08-17 7:34 ` Adrian Hunter
2015-08-17 9:56 ` Peter Zijlstra
2015-08-20 19:31 ` Thomas Gleixner
1 sibling, 1 reply; 9+ messages in thread
From: Adrian Hunter @ 2015-08-17 7:34 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Adrian Hunter, Ingo Molnar, Arnaldo Carvalho de Melo,
Andy Lutomirski, Thomas Gleixner, linux-kernel, Stephane Eranian,
Andi Kleen
On 29/07/15 00:14, Adrian Hunter wrote:
> When TSC is stable perf/sched clock is based on it.
> However the conversion from cycles to nanoseconds
> is not as accurate as it could be. Because
> CYC2NS_SCALE_FACTOR is 10, the accuracy is +/- 1/2048
>
> The change is to calculate the maximum shift that
> results in a multiplier that is still a 32-bit number.
> For example all frequencies over 1 GHz will have
> a shift of 32, making the accuracy of the conversion
> +/- 1/(2^33)
>
> Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Is this OK?
> ---
> arch/x86/kernel/tsc.c | 33 ++++++++++++++++++++-------------
> 1 file changed, 20 insertions(+), 13 deletions(-)
>
> diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> index 7437b41f6a47..e7085bcfb06b 100644
> --- a/arch/x86/kernel/tsc.c
> +++ b/arch/x86/kernel/tsc.c
> @@ -167,21 +167,21 @@ static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data)
> * ns = cycles * cyc2ns_scale / SC
> *
> * And since SC is a constant power of two, we can convert the div
> - * into a shift.
> + * into a shift. The larger SC is, the more accurate the conversion, but
> + * cyc2ns_scale needs to be a 32-bit value so that 32-bit multiplication
> + * (64-bit result) can be used. So start by trying SC = 2^32, reducing
> + * until the criteria are met.
> *
> - * We can use khz divisor instead of mhz to keep a better precision, since
> - * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
> + * We can use khz divisor instead of mhz to keep a better precision.
> * (mathieu.desnoyers@polymtl.ca)
> *
> * -johnstul@us.ibm.com "math is hard, lets go shopping!"
> */
>
> -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
> -
> static void cyc2ns_data_init(struct cyc2ns_data *data)
> {
> data->cyc2ns_mul = 0;
> - data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
> + data->cyc2ns_shift = 0;
> data->cyc2ns_offset = 0;
> data->__count = 0;
> }
> @@ -215,14 +215,14 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc)
>
> if (likely(data == tail)) {
> ns = data->cyc2ns_offset;
> - ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
> + ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
> } else {
> data->__count++;
>
> barrier();
>
> ns = data->cyc2ns_offset;
> - ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
> + ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
>
> barrier();
>
> @@ -239,6 +239,8 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
> unsigned long long tsc_now, ns_now;
> struct cyc2ns_data *data;
> unsigned long flags;
> + u64 mult;
> + u32 shft = 32;
>
> local_irq_save(flags);
> sched_clock_idle_sleep_event();
> @@ -256,12 +258,17 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
> * time function is continuous; see the comment near struct
> * cyc2ns_data.
> */
> - data->cyc2ns_mul =
> - DIV_ROUND_CLOSEST(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR,
> - cpu_khz);
> - data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
> + mult = (u64)NSEC_PER_MSEC << 32;
> + mult += cpu_khz / 2;
> + do_div(mult, cpu_khz);
> + while (mult > U32_MAX) {
> + mult >>= 1;
> + shft -= 1;
> + }
> + data->cyc2ns_mul = mult;
> + data->cyc2ns_shift = shft;
> data->cyc2ns_offset = ns_now -
> - mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
> + mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, data->cyc2ns_shift);
>
> cyc2ns_write_end(cpu, data);
>
>
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH V2] perf: x86: Improve accuracy of perf/sched clock
2015-08-17 7:34 ` Adrian Hunter
@ 2015-08-17 9:56 ` Peter Zijlstra
0 siblings, 0 replies; 9+ messages in thread
From: Peter Zijlstra @ 2015-08-17 9:56 UTC (permalink / raw)
To: Adrian Hunter
Cc: Ingo Molnar, Arnaldo Carvalho de Melo, Andy Lutomirski,
Thomas Gleixner, linux-kernel, Stephane Eranian, Andi Kleen
On Mon, Aug 17, 2015 at 10:34:03AM +0300, Adrian Hunter wrote:
> On 29/07/15 00:14, Adrian Hunter wrote:
> > When TSC is stable perf/sched clock is based on it.
> > However the conversion from cycles to nanoseconds
> > is not as accurate as it could be. Because
> > CYC2NS_SCALE_FACTOR is 10, the accuracy is +/- 1/2048
> >
> > The change is to calculate the maximum shift that
> > results in a multiplier that is still a 32-bit number.
> > For example all frequencies over 1 GHz will have
> > a shift of 32, making the accuracy of the conversion
> > +/- 1/(2^33)
> >
> > Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
>
> Is this OK?
Yes I think so, sorry I got backlogged with preparation for Seattle.
Thomas, if you see this, could you merge with my ACK on, otherwise I'll
queue it the normal route once I'm back.
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH V2] perf: x86: Improve accuracy of perf/sched clock
2015-07-28 21:14 [PATCH V2] perf: x86: Improve accuracy of perf/sched clock Adrian Hunter
2015-08-17 7:34 ` Adrian Hunter
@ 2015-08-20 19:31 ` Thomas Gleixner
2015-08-21 6:46 ` Adrian Hunter
1 sibling, 1 reply; 9+ messages in thread
From: Thomas Gleixner @ 2015-08-20 19:31 UTC (permalink / raw)
To: Adrian Hunter
Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
Andy Lutomirski, linux-kernel, Stephane Eranian, Andi Kleen
On Wed, 29 Jul 2015, Adrian Hunter wrote:
> @@ -239,6 +239,8 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
> unsigned long long tsc_now, ns_now;
> struct cyc2ns_data *data;
> unsigned long flags;
> + u64 mult;
> + u32 shft = 32;
>
> local_irq_save(flags);
> sched_clock_idle_sleep_event();
> @@ -256,12 +258,17 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
> * time function is continuous; see the comment near struct
> * cyc2ns_data.
> */
> - data->cyc2ns_mul =
> - DIV_ROUND_CLOSEST(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR,
> - cpu_khz);
> - data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
> + mult = (u64)NSEC_PER_MSEC << 32;
> + mult += cpu_khz / 2;
> + do_div(mult, cpu_khz);
> + while (mult > U32_MAX) {
> + mult >>= 1;
> + shft -= 1;
> + }
This is an open coded variant of clocks_calc_mult_shift(). Can we
please use that one?
Thanks,
tglx
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH V2] perf: x86: Improve accuracy of perf/sched clock
2015-08-20 19:31 ` Thomas Gleixner
@ 2015-08-21 6:46 ` Adrian Hunter
2015-08-21 9:05 ` [PATCH V3] " Adrian Hunter
0 siblings, 1 reply; 9+ messages in thread
From: Adrian Hunter @ 2015-08-21 6:46 UTC (permalink / raw)
To: Thomas Gleixner
Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
Andy Lutomirski, linux-kernel, Stephane Eranian, Andi Kleen
On 20/08/15 22:31, Thomas Gleixner wrote:
> On Wed, 29 Jul 2015, Adrian Hunter wrote:
>> @@ -239,6 +239,8 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
>> unsigned long long tsc_now, ns_now;
>> struct cyc2ns_data *data;
>> unsigned long flags;
>> + u64 mult;
>> + u32 shft = 32;
>>
>> local_irq_save(flags);
>> sched_clock_idle_sleep_event();
>> @@ -256,12 +258,17 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
>> * time function is continuous; see the comment near struct
>> * cyc2ns_data.
>> */
>> - data->cyc2ns_mul =
>> - DIV_ROUND_CLOSEST(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR,
>> - cpu_khz);
>> - data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
>> + mult = (u64)NSEC_PER_MSEC << 32;
>> + mult += cpu_khz / 2;
>> + do_div(mult, cpu_khz);
>> + while (mult > U32_MAX) {
>> + mult >>= 1;
>> + shft -= 1;
>> + }
>
> This is an open coded variant of clocks_calc_mult_shift(). Can we
> please use that one?
Sure. clocks_calc_mult_shift() does a division on each shift which is a bit
slower (avoids a 1-bit rounding error but that will never be more than 1 in
2^32 in this case), and the 'maxsec' functionality is not needed because
mul_u64_u32_shr() is used to avoid 64-bit overflow.
I will send V3.
^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH V3] perf: x86: Improve accuracy of perf/sched clock
2015-08-21 6:46 ` Adrian Hunter
@ 2015-08-21 9:05 ` Adrian Hunter
2015-09-01 8:36 ` Adrian Hunter
2015-09-13 11:08 ` [tip:perf/core] perf/x86: " tip-bot for Adrian Hunter
0 siblings, 2 replies; 9+ messages in thread
From: Adrian Hunter @ 2015-08-21 9:05 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar
Cc: Arnaldo Carvalho de Melo, Andy Lutomirski, Thomas Gleixner,
linux-kernel, Stephane Eranian, Andi Kleen
When TSC is stable perf/sched clock is based on it.
However the conversion from cycles to nanoseconds
is not as accurate as it could be. Because
CYC2NS_SCALE_FACTOR is 10, the accuracy is +/- 1/2048
The change is to calculate the maximum shift that
results in a multiplier that is still a 32-bit number.
For example all frequencies over 1 GHz will have
a shift of 32, making the accuracy of the conversion
+/- 1/(2^33). That is achieved by using the
'clocks_calc_mult_shift()' function.
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
---
Changes in V3:
Use clocks_calc_mult_shift() instead of open-coded
calculation
Changes in V2:
Remove definition of CYC2NS_SCALE_FACTOR
Amend comments
arch/x86/kernel/tsc.c | 24 +++++++++++-------------
1 file changed, 11 insertions(+), 13 deletions(-)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index c8d52cb4cb6e..39ec3d07affd 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -167,21 +167,20 @@ static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data)
* ns = cycles * cyc2ns_scale / SC
*
* And since SC is a constant power of two, we can convert the div
- * into a shift.
+ * into a shift. The larger SC is, the more accurate the conversion, but
+ * cyc2ns_scale needs to be a 32-bit value so that 32-bit multiplication
+ * (64-bit result) can be used.
*
- * We can use khz divisor instead of mhz to keep a better precision, since
- * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
+ * We can use khz divisor instead of mhz to keep a better precision.
* (mathieu.desnoyers@polymtl.ca)
*
* -johnstul@us.ibm.com "math is hard, lets go shopping!"
*/
-#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
-
static void cyc2ns_data_init(struct cyc2ns_data *data)
{
data->cyc2ns_mul = 0;
- data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
+ data->cyc2ns_shift = 0;
data->cyc2ns_offset = 0;
data->__count = 0;
}
@@ -215,14 +214,14 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc)
if (likely(data == tail)) {
ns = data->cyc2ns_offset;
- ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+ ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
} else {
data->__count++;
barrier();
ns = data->cyc2ns_offset;
- ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+ ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
barrier();
@@ -256,12 +255,11 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
* time function is continuous; see the comment near struct
* cyc2ns_data.
*/
- data->cyc2ns_mul =
- DIV_ROUND_CLOSEST(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR,
- cpu_khz);
- data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
+ clocks_calc_mult_shift(&data->cyc2ns_mul, &data->cyc2ns_shift, cpu_khz,
+ NSEC_PER_MSEC, 0);
+
data->cyc2ns_offset = ns_now -
- mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+ mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, data->cyc2ns_shift);
cyc2ns_write_end(cpu, data);
--
1.9.1
^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [PATCH V3] perf: x86: Improve accuracy of perf/sched clock
2015-08-21 9:05 ` [PATCH V3] " Adrian Hunter
@ 2015-09-01 8:36 ` Adrian Hunter
2015-09-01 8:57 ` Peter Zijlstra
2015-09-13 11:08 ` [tip:perf/core] perf/x86: " tip-bot for Adrian Hunter
1 sibling, 1 reply; 9+ messages in thread
From: Adrian Hunter @ 2015-09-01 8:36 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Ingo Molnar, Arnaldo Carvalho de Melo, Andy Lutomirski,
Thomas Gleixner, linux-kernel, Stephane Eranian, Andi Kleen
On 21/08/15 12:05, Adrian Hunter wrote:
> When TSC is stable perf/sched clock is based on it.
> However the conversion from cycles to nanoseconds
> is not as accurate as it could be. Because
> CYC2NS_SCALE_FACTOR is 10, the accuracy is +/- 1/2048
>
> The change is to calculate the maximum shift that
> results in a multiplier that is still a 32-bit number.
> For example all frequencies over 1 GHz will have
> a shift of 32, making the accuracy of the conversion
> +/- 1/(2^33). That is achieved by using the
> 'clocks_calc_mult_shift()' function.
>
> Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Ping?
> ---
>
>
> Changes in V3:
>
> Use clocks_calc_mult_shift() instead of open-coded
> calculation
>
> Changes in V2:
>
> Remove definition of CYC2NS_SCALE_FACTOR
> Amend comments
>
>
> arch/x86/kernel/tsc.c | 24 +++++++++++-------------
> 1 file changed, 11 insertions(+), 13 deletions(-)
>
> diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> index c8d52cb4cb6e..39ec3d07affd 100644
> --- a/arch/x86/kernel/tsc.c
> +++ b/arch/x86/kernel/tsc.c
> @@ -167,21 +167,20 @@ static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data)
> * ns = cycles * cyc2ns_scale / SC
> *
> * And since SC is a constant power of two, we can convert the div
> - * into a shift.
> + * into a shift. The larger SC is, the more accurate the conversion, but
> + * cyc2ns_scale needs to be a 32-bit value so that 32-bit multiplication
> + * (64-bit result) can be used.
> *
> - * We can use khz divisor instead of mhz to keep a better precision, since
> - * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
> + * We can use khz divisor instead of mhz to keep a better precision.
> * (mathieu.desnoyers@polymtl.ca)
> *
> * -johnstul@us.ibm.com "math is hard, lets go shopping!"
> */
>
> -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
> -
> static void cyc2ns_data_init(struct cyc2ns_data *data)
> {
> data->cyc2ns_mul = 0;
> - data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
> + data->cyc2ns_shift = 0;
> data->cyc2ns_offset = 0;
> data->__count = 0;
> }
> @@ -215,14 +214,14 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc)
>
> if (likely(data == tail)) {
> ns = data->cyc2ns_offset;
> - ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
> + ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
> } else {
> data->__count++;
>
> barrier();
>
> ns = data->cyc2ns_offset;
> - ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
> + ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
>
> barrier();
>
> @@ -256,12 +255,11 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
> * time function is continuous; see the comment near struct
> * cyc2ns_data.
> */
> - data->cyc2ns_mul =
> - DIV_ROUND_CLOSEST(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR,
> - cpu_khz);
> - data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
> + clocks_calc_mult_shift(&data->cyc2ns_mul, &data->cyc2ns_shift, cpu_khz,
> + NSEC_PER_MSEC, 0);
> +
> data->cyc2ns_offset = ns_now -
> - mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
> + mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, data->cyc2ns_shift);
>
> cyc2ns_write_end(cpu, data);
>
>
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH V3] perf: x86: Improve accuracy of perf/sched clock
2015-09-01 8:36 ` Adrian Hunter
@ 2015-09-01 8:57 ` Peter Zijlstra
0 siblings, 0 replies; 9+ messages in thread
From: Peter Zijlstra @ 2015-09-01 8:57 UTC (permalink / raw)
To: Adrian Hunter
Cc: Ingo Molnar, Arnaldo Carvalho de Melo, Andy Lutomirski,
Thomas Gleixner, linux-kernel, Stephane Eranian, Andi Kleen
On Tue, Sep 01, 2015 at 11:36:38AM +0300, Adrian Hunter wrote:
> On 21/08/15 12:05, Adrian Hunter wrote:
> > When TSC is stable perf/sched clock is based on it.
> > However the conversion from cycles to nanoseconds
> > is not as accurate as it could be. Because
> > CYC2NS_SCALE_FACTOR is 10, the accuracy is +/- 1/2048
> >
> > The change is to calculate the maximum shift that
> > results in a multiplier that is still a 32-bit number.
> > For example all frequencies over 1 GHz will have
> > a shift of 32, making the accuracy of the conversion
> > +/- 1/(2^33). That is achieved by using the
> > 'clocks_calc_mult_shift()' function.
> >
> > Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
>
> Ping?
I've queued it. Thanks
^ permalink raw reply [flat|nested] 9+ messages in thread
* [tip:perf/core] perf/x86: Improve accuracy of perf/sched clock
2015-08-21 9:05 ` [PATCH V3] " Adrian Hunter
2015-09-01 8:36 ` Adrian Hunter
@ 2015-09-13 11:08 ` tip-bot for Adrian Hunter
1 sibling, 0 replies; 9+ messages in thread
From: tip-bot for Adrian Hunter @ 2015-09-13 11:08 UTC (permalink / raw)
To: linux-tip-commits
Cc: acme, peterz, torvalds, mingo, adrian.hunter, vincent.weaver,
acme, jolsa, tglx, hpa, linux-kernel, luto, eranian
Commit-ID: b20112edeadf0b8a1416de061caa4beb11539902
Gitweb: http://git.kernel.org/tip/b20112edeadf0b8a1416de061caa4beb11539902
Author: Adrian Hunter <adrian.hunter@intel.com>
AuthorDate: Fri, 21 Aug 2015 12:05:18 +0300
Committer: Ingo Molnar <mingo@kernel.org>
CommitDate: Sun, 13 Sep 2015 11:27:20 +0200
perf/x86: Improve accuracy of perf/sched clock
When TSC is stable perf/sched clock is based on it.
However the conversion from cycles to nanoseconds
is not as accurate as it could be. Because
CYC2NS_SCALE_FACTOR is 10, the accuracy is +/- 1/2048
The change is to calculate the maximum shift that
results in a multiplier that is still a 32-bit number.
For example all frequencies over 1 GHz will have
a shift of 32, making the accuracy of the conversion
+/- 1/(2^33). That is achieved by using the
'clocks_calc_mult_shift()' function.
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Link: http://lkml.kernel.org/r/1440147918-22250-1-git-send-email-adrian.hunter@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
arch/x86/kernel/tsc.c | 24 +++++++++++-------------
1 file changed, 11 insertions(+), 13 deletions(-)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index c8d52cb..39ec3d0 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -167,21 +167,20 @@ static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data)
* ns = cycles * cyc2ns_scale / SC
*
* And since SC is a constant power of two, we can convert the div
- * into a shift.
+ * into a shift. The larger SC is, the more accurate the conversion, but
+ * cyc2ns_scale needs to be a 32-bit value so that 32-bit multiplication
+ * (64-bit result) can be used.
*
- * We can use khz divisor instead of mhz to keep a better precision, since
- * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
+ * We can use khz divisor instead of mhz to keep a better precision.
* (mathieu.desnoyers@polymtl.ca)
*
* -johnstul@us.ibm.com "math is hard, lets go shopping!"
*/
-#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
-
static void cyc2ns_data_init(struct cyc2ns_data *data)
{
data->cyc2ns_mul = 0;
- data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
+ data->cyc2ns_shift = 0;
data->cyc2ns_offset = 0;
data->__count = 0;
}
@@ -215,14 +214,14 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc)
if (likely(data == tail)) {
ns = data->cyc2ns_offset;
- ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+ ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
} else {
data->__count++;
barrier();
ns = data->cyc2ns_offset;
- ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+ ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
barrier();
@@ -256,12 +255,11 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
* time function is continuous; see the comment near struct
* cyc2ns_data.
*/
- data->cyc2ns_mul =
- DIV_ROUND_CLOSEST(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR,
- cpu_khz);
- data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
+ clocks_calc_mult_shift(&data->cyc2ns_mul, &data->cyc2ns_shift, cpu_khz,
+ NSEC_PER_MSEC, 0);
+
data->cyc2ns_offset = ns_now -
- mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+ mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, data->cyc2ns_shift);
cyc2ns_write_end(cpu, data);
^ permalink raw reply related [flat|nested] 9+ messages in thread
end of thread, other threads:[~2015-09-13 11:08 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2015-07-28 21:14 [PATCH V2] perf: x86: Improve accuracy of perf/sched clock Adrian Hunter
2015-08-17 7:34 ` Adrian Hunter
2015-08-17 9:56 ` Peter Zijlstra
2015-08-20 19:31 ` Thomas Gleixner
2015-08-21 6:46 ` Adrian Hunter
2015-08-21 9:05 ` [PATCH V3] " Adrian Hunter
2015-09-01 8:36 ` Adrian Hunter
2015-09-01 8:57 ` Peter Zijlstra
2015-09-13 11:08 ` [tip:perf/core] perf/x86: " tip-bot for Adrian Hunter
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.