public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH V2] perf: x86: Improve accuracy of perf/sched clock
@ 2015-07-28 21:14 Adrian Hunter
  2015-08-17  7:34 ` Adrian Hunter
  2015-08-20 19:31 ` Thomas Gleixner
  0 siblings, 2 replies; 9+ messages in thread
From: Adrian Hunter @ 2015-07-28 21:14 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar
  Cc: Arnaldo Carvalho de Melo, Andy Lutomirski, Thomas Gleixner,
	linux-kernel, Stephane Eranian, Andi Kleen

When TSC is stable, the perf/sched clock is based on it.
However, the conversion from cycles to nanoseconds
is not as accurate as it could be.  Because
CYC2NS_SCALE_FACTOR is 10, the accuracy is +/- 1/2048.

The change is to calculate the maximum shift that
results in a multiplier that is still a 32-bit number.
For example all frequencies over 1 GHz will have
a shift of 32, making the accuracy of the conversion
+/- 1/(2^33)

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
---
 arch/x86/kernel/tsc.c | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 7437b41f6a47..e7085bcfb06b 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -167,21 +167,21 @@ static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data)
  *              ns = cycles * cyc2ns_scale / SC
  *
  *      And since SC is a constant power of two, we can convert the div
- *  into a shift.
+ *  into a shift. The larger SC is, the more accurate the conversion, but
+ *  cyc2ns_scale needs to be a 32-bit value so that 32-bit multiplication
+ *  (64-bit result) can be used. So start by trying SC = 2^32, reducing
+ *  until the criteria are met.
  *
- *  We can use khz divisor instead of mhz to keep a better precision, since
- *  cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
+ *  We can use khz divisor instead of mhz to keep a better precision.
  *  (mathieu.desnoyers@polymtl.ca)
  *
  *                      -johnstul@us.ibm.com "math is hard, lets go shopping!"
  */
 
-#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
-
 static void cyc2ns_data_init(struct cyc2ns_data *data)
 {
 	data->cyc2ns_mul = 0;
-	data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
+	data->cyc2ns_shift = 0;
 	data->cyc2ns_offset = 0;
 	data->__count = 0;
 }
@@ -215,14 +215,14 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc)
 
 	if (likely(data == tail)) {
 		ns = data->cyc2ns_offset;
-		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
 	} else {
 		data->__count++;
 
 		barrier();
 
 		ns = data->cyc2ns_offset;
-		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
 
 		barrier();
 
@@ -239,6 +239,8 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
 	unsigned long long tsc_now, ns_now;
 	struct cyc2ns_data *data;
 	unsigned long flags;
+	u64 mult;
+	u32 shft = 32;
 
 	local_irq_save(flags);
 	sched_clock_idle_sleep_event();
@@ -256,12 +258,17 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
 	 * time function is continuous; see the comment near struct
 	 * cyc2ns_data.
 	 */
-	data->cyc2ns_mul =
-		DIV_ROUND_CLOSEST(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR,
-				  cpu_khz);
-	data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
+	mult = (u64)NSEC_PER_MSEC << 32;
+	mult += cpu_khz / 2;
+	do_div(mult, cpu_khz);
+	while (mult > U32_MAX) {
+		mult >>= 1;
+		shft -= 1;
+	}
+	data->cyc2ns_mul = mult;
+	data->cyc2ns_shift = shft;
 	data->cyc2ns_offset = ns_now -
-		mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+		mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, data->cyc2ns_shift);
 
 	cyc2ns_write_end(cpu, data);
 
-- 
1.9.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH V2] perf: x86: Improve accuracy of perf/sched clock
  2015-07-28 21:14 [PATCH V2] perf: x86: Improve accuracy of perf/sched clock Adrian Hunter
@ 2015-08-17  7:34 ` Adrian Hunter
  2015-08-17  9:56   ` Peter Zijlstra
  2015-08-20 19:31 ` Thomas Gleixner
  1 sibling, 1 reply; 9+ messages in thread
From: Adrian Hunter @ 2015-08-17  7:34 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Adrian Hunter, Ingo Molnar, Arnaldo Carvalho de Melo,
	Andy Lutomirski, Thomas Gleixner, linux-kernel, Stephane Eranian,
	Andi Kleen

On 29/07/15 00:14, Adrian Hunter wrote:
> When TSC is stable perf/sched clock is based on it.
> However the conversion from cycles to nanoseconds
> is not as accurate as it could be.  Because
> CYC2NS_SCALE_FACTOR is 10, the accuracy is +/- 1/2048
> 
> The change is to calculate the maximum shift that
> results in a multiplier that is still a 32-bit number.
> For example all frequencies over 1 GHz will have
> a shift of 32, making the accuracy of the conversion
> +/- 1/(2^33)
> 
> Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>

Is this OK?

> ---
>  arch/x86/kernel/tsc.c | 33 ++++++++++++++++++++-------------
>  1 file changed, 20 insertions(+), 13 deletions(-)
> 
> diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> index 7437b41f6a47..e7085bcfb06b 100644
> --- a/arch/x86/kernel/tsc.c
> +++ b/arch/x86/kernel/tsc.c
> @@ -167,21 +167,21 @@ static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data)
>   *              ns = cycles * cyc2ns_scale / SC
>   *
>   *      And since SC is a constant power of two, we can convert the div
> - *  into a shift.
> + *  into a shift. The larger SC is, the more accurate the conversion, but
> + *  cyc2ns_scale needs to be a 32-bit value so that 32-bit multiplication
> + *  (64-bit result) can be used. So start by trying SC = 2^32, reducing
> + *  until the criteria are met.
>   *
> - *  We can use khz divisor instead of mhz to keep a better precision, since
> - *  cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
> + *  We can use khz divisor instead of mhz to keep a better precision.
>   *  (mathieu.desnoyers@polymtl.ca)
>   *
>   *                      -johnstul@us.ibm.com "math is hard, lets go shopping!"
>   */
>  
> -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
> -
>  static void cyc2ns_data_init(struct cyc2ns_data *data)
>  {
>  	data->cyc2ns_mul = 0;
> -	data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
> +	data->cyc2ns_shift = 0;
>  	data->cyc2ns_offset = 0;
>  	data->__count = 0;
>  }
> @@ -215,14 +215,14 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc)
>  
>  	if (likely(data == tail)) {
>  		ns = data->cyc2ns_offset;
> -		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
> +		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
>  	} else {
>  		data->__count++;
>  
>  		barrier();
>  
>  		ns = data->cyc2ns_offset;
> -		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
> +		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
>  
>  		barrier();
>  
> @@ -239,6 +239,8 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
>  	unsigned long long tsc_now, ns_now;
>  	struct cyc2ns_data *data;
>  	unsigned long flags;
> +	u64 mult;
> +	u32 shft = 32;
>  
>  	local_irq_save(flags);
>  	sched_clock_idle_sleep_event();
> @@ -256,12 +258,17 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
>  	 * time function is continuous; see the comment near struct
>  	 * cyc2ns_data.
>  	 */
> -	data->cyc2ns_mul =
> -		DIV_ROUND_CLOSEST(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR,
> -				  cpu_khz);
> -	data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
> +	mult = (u64)NSEC_PER_MSEC << 32;
> +	mult += cpu_khz / 2;
> +	do_div(mult, cpu_khz);
> +	while (mult > U32_MAX) {
> +		mult >>= 1;
> +		shft -= 1;
> +	}
> +	data->cyc2ns_mul = mult;
> +	data->cyc2ns_shift = shft;
>  	data->cyc2ns_offset = ns_now -
> -		mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
> +		mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, data->cyc2ns_shift);
>  
>  	cyc2ns_write_end(cpu, data);
>  
> 


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH V2] perf: x86: Improve accuracy of perf/sched clock
  2015-08-17  7:34 ` Adrian Hunter
@ 2015-08-17  9:56   ` Peter Zijlstra
  0 siblings, 0 replies; 9+ messages in thread
From: Peter Zijlstra @ 2015-08-17  9:56 UTC (permalink / raw)
  To: Adrian Hunter
  Cc: Ingo Molnar, Arnaldo Carvalho de Melo, Andy Lutomirski,
	Thomas Gleixner, linux-kernel, Stephane Eranian, Andi Kleen

On Mon, Aug 17, 2015 at 10:34:03AM +0300, Adrian Hunter wrote:
> On 29/07/15 00:14, Adrian Hunter wrote:
> > When TSC is stable perf/sched clock is based on it.
> > However the conversion from cycles to nanoseconds
> > is not as accurate as it could be.  Because
> > CYC2NS_SCALE_FACTOR is 10, the accuracy is +/- 1/2048
> > 
> > The change is to calculate the maximum shift that
> > results in a multiplier that is still a 32-bit number.
> > For example all frequencies over 1 GHz will have
> > a shift of 32, making the accuracy of the conversion
> > +/- 1/(2^33)
> > 
> > Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
> 
> Is this OK?

Yes I think so, sorry I got backlogged with preparation for Seattle.

Thomas, if you see this, could you merge with my ACK on, otherwise I'll
queue it the normal route once I'm back.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH V2] perf: x86: Improve accuracy of perf/sched clock
  2015-07-28 21:14 [PATCH V2] perf: x86: Improve accuracy of perf/sched clock Adrian Hunter
  2015-08-17  7:34 ` Adrian Hunter
@ 2015-08-20 19:31 ` Thomas Gleixner
  2015-08-21  6:46   ` Adrian Hunter
  1 sibling, 1 reply; 9+ messages in thread
From: Thomas Gleixner @ 2015-08-20 19:31 UTC (permalink / raw)
  To: Adrian Hunter
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Andy Lutomirski, linux-kernel, Stephane Eranian, Andi Kleen

On Wed, 29 Jul 2015, Adrian Hunter wrote:
> @@ -239,6 +239,8 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
>  	unsigned long long tsc_now, ns_now;
>  	struct cyc2ns_data *data;
>  	unsigned long flags;
> +	u64 mult;
> +	u32 shft = 32;
>  
>  	local_irq_save(flags);
>  	sched_clock_idle_sleep_event();
> @@ -256,12 +258,17 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
>  	 * time function is continuous; see the comment near struct
>  	 * cyc2ns_data.
>  	 */
> -	data->cyc2ns_mul =
> -		DIV_ROUND_CLOSEST(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR,
> -				  cpu_khz);
> -	data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
> +	mult = (u64)NSEC_PER_MSEC << 32;
> +	mult += cpu_khz / 2;
> +	do_div(mult, cpu_khz);
> +	while (mult > U32_MAX) {
> +		mult >>= 1;
> +		shft -= 1;
> +	}

This is an open coded variant of clocks_calc_mult_shift(). Can we
please use that one?

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH V2] perf: x86: Improve accuracy of perf/sched clock
  2015-08-20 19:31 ` Thomas Gleixner
@ 2015-08-21  6:46   ` Adrian Hunter
  2015-08-21  9:05     ` [PATCH V3] " Adrian Hunter
  0 siblings, 1 reply; 9+ messages in thread
From: Adrian Hunter @ 2015-08-21  6:46 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Andy Lutomirski, linux-kernel, Stephane Eranian, Andi Kleen

On 20/08/15 22:31, Thomas Gleixner wrote:
> On Wed, 29 Jul 2015, Adrian Hunter wrote:
>> @@ -239,6 +239,8 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
>>  	unsigned long long tsc_now, ns_now;
>>  	struct cyc2ns_data *data;
>>  	unsigned long flags;
>> +	u64 mult;
>> +	u32 shft = 32;
>>  
>>  	local_irq_save(flags);
>>  	sched_clock_idle_sleep_event();
>> @@ -256,12 +258,17 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
>>  	 * time function is continuous; see the comment near struct
>>  	 * cyc2ns_data.
>>  	 */
>> -	data->cyc2ns_mul =
>> -		DIV_ROUND_CLOSEST(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR,
>> -				  cpu_khz);
>> -	data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
>> +	mult = (u64)NSEC_PER_MSEC << 32;
>> +	mult += cpu_khz / 2;
>> +	do_div(mult, cpu_khz);
>> +	while (mult > U32_MAX) {
>> +		mult >>= 1;
>> +		shft -= 1;
>> +	}
> 
> This is an open coded variant of clocks_calc_mult_shift(). Can we
> please use that one?

Sure.  clocks_calc_mult_shift() does a division on each shift, which is a bit
slower (it avoids a 1-bit rounding error, but that will never be more than 1 in
2^32 in this case), and the 'maxsec' functionality is not needed because
mul_u64_u32_shr() is used to avoid 64-bit overflow.

I will send V3.


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH V3] perf: x86: Improve accuracy of perf/sched clock
  2015-08-21  6:46   ` Adrian Hunter
@ 2015-08-21  9:05     ` Adrian Hunter
  2015-09-01  8:36       ` Adrian Hunter
  2015-09-13 11:08       ` [tip:perf/core] perf/x86: " tip-bot for Adrian Hunter
  0 siblings, 2 replies; 9+ messages in thread
From: Adrian Hunter @ 2015-08-21  9:05 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar
  Cc: Arnaldo Carvalho de Melo, Andy Lutomirski, Thomas Gleixner,
	linux-kernel, Stephane Eranian, Andi Kleen

When TSC is stable perf/sched clock is based on it.
However the conversion from cycles to nanoseconds
is not as accurate as it could be.  Because
CYC2NS_SCALE_FACTOR is 10, the accuracy is +/- 1/2048

The change is to calculate the maximum shift that
results in a multiplier that is still a 32-bit number.
For example all frequencies over 1 GHz will have
a shift of 32, making the accuracy of the conversion
+/- 1/(2^33).  That is achieved by using the
'clocks_calc_mult_shift()' function.

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
---


Changes in V3:

	Use clocks_calc_mult_shift() instead of open-coded
	calculation

Changes in V2:

	Remove definition of CYC2NS_SCALE_FACTOR
	Amend comments


 arch/x86/kernel/tsc.c | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index c8d52cb4cb6e..39ec3d07affd 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -167,21 +167,20 @@ static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data)
  *              ns = cycles * cyc2ns_scale / SC
  *
  *      And since SC is a constant power of two, we can convert the div
- *  into a shift.
+ *  into a shift. The larger SC is, the more accurate the conversion, but
+ *  cyc2ns_scale needs to be a 32-bit value so that 32-bit multiplication
+ *  (64-bit result) can be used.
  *
- *  We can use khz divisor instead of mhz to keep a better precision, since
- *  cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
+ *  We can use khz divisor instead of mhz to keep a better precision.
  *  (mathieu.desnoyers@polymtl.ca)
  *
  *                      -johnstul@us.ibm.com "math is hard, lets go shopping!"
  */
 
-#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
-
 static void cyc2ns_data_init(struct cyc2ns_data *data)
 {
 	data->cyc2ns_mul = 0;
-	data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
+	data->cyc2ns_shift = 0;
 	data->cyc2ns_offset = 0;
 	data->__count = 0;
 }
@@ -215,14 +214,14 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc)
 
 	if (likely(data == tail)) {
 		ns = data->cyc2ns_offset;
-		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
 	} else {
 		data->__count++;
 
 		barrier();
 
 		ns = data->cyc2ns_offset;
-		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
 
 		barrier();
 
@@ -256,12 +255,11 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
 	 * time function is continuous; see the comment near struct
 	 * cyc2ns_data.
 	 */
-	data->cyc2ns_mul =
-		DIV_ROUND_CLOSEST(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR,
-				  cpu_khz);
-	data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
+	clocks_calc_mult_shift(&data->cyc2ns_mul, &data->cyc2ns_shift, cpu_khz,
+			       NSEC_PER_MSEC, 0);
+
 	data->cyc2ns_offset = ns_now -
-		mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+		mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, data->cyc2ns_shift);
 
 	cyc2ns_write_end(cpu, data);
 
-- 
1.9.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH V3] perf: x86: Improve accuracy of perf/sched clock
  2015-08-21  9:05     ` [PATCH V3] " Adrian Hunter
@ 2015-09-01  8:36       ` Adrian Hunter
  2015-09-01  8:57         ` Peter Zijlstra
  2015-09-13 11:08       ` [tip:perf/core] perf/x86: " tip-bot for Adrian Hunter
  1 sibling, 1 reply; 9+ messages in thread
From: Adrian Hunter @ 2015-09-01  8:36 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Arnaldo Carvalho de Melo, Andy Lutomirski,
	Thomas Gleixner, linux-kernel, Stephane Eranian, Andi Kleen

On 21/08/15 12:05, Adrian Hunter wrote:
> When TSC is stable perf/sched clock is based on it.
> However the conversion from cycles to nanoseconds
> is not as accurate as it could be.  Because
> CYC2NS_SCALE_FACTOR is 10, the accuracy is +/- 1/2048
> 
> The change is to calculate the maximum shift that
> results in a multiplier that is still a 32-bit number.
> For example all frequencies over 1 GHz will have
> a shift of 32, making the accuracy of the conversion
> +/- 1/(2^33).  That is achieved by using the
> 'clocks_calc_mult_shift()' function.
> 
> Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>

Ping?

> ---
> 
> 
> Changes in V3:
> 
> 	Use clocks_calc_mult_shift() instead of open-coded
> 	calculation
> 
> Changes in V2:
> 
> 	Remove definition of CYC2NS_SCALE_FACTOR
> 	Amend comments
> 
> 
>  arch/x86/kernel/tsc.c | 24 +++++++++++-------------
>  1 file changed, 11 insertions(+), 13 deletions(-)
> 
> diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> index c8d52cb4cb6e..39ec3d07affd 100644
> --- a/arch/x86/kernel/tsc.c
> +++ b/arch/x86/kernel/tsc.c
> @@ -167,21 +167,20 @@ static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data)
>   *              ns = cycles * cyc2ns_scale / SC
>   *
>   *      And since SC is a constant power of two, we can convert the div
> - *  into a shift.
> + *  into a shift. The larger SC is, the more accurate the conversion, but
> + *  cyc2ns_scale needs to be a 32-bit value so that 32-bit multiplication
> + *  (64-bit result) can be used.
>   *
> - *  We can use khz divisor instead of mhz to keep a better precision, since
> - *  cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
> + *  We can use khz divisor instead of mhz to keep a better precision.
>   *  (mathieu.desnoyers@polymtl.ca)
>   *
>   *                      -johnstul@us.ibm.com "math is hard, lets go shopping!"
>   */
>  
> -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
> -
>  static void cyc2ns_data_init(struct cyc2ns_data *data)
>  {
>  	data->cyc2ns_mul = 0;
> -	data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
> +	data->cyc2ns_shift = 0;
>  	data->cyc2ns_offset = 0;
>  	data->__count = 0;
>  }
> @@ -215,14 +214,14 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc)
>  
>  	if (likely(data == tail)) {
>  		ns = data->cyc2ns_offset;
> -		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
> +		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
>  	} else {
>  		data->__count++;
>  
>  		barrier();
>  
>  		ns = data->cyc2ns_offset;
> -		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
> +		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
>  
>  		barrier();
>  
> @@ -256,12 +255,11 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
>  	 * time function is continuous; see the comment near struct
>  	 * cyc2ns_data.
>  	 */
> -	data->cyc2ns_mul =
> -		DIV_ROUND_CLOSEST(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR,
> -				  cpu_khz);
> -	data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
> +	clocks_calc_mult_shift(&data->cyc2ns_mul, &data->cyc2ns_shift, cpu_khz,
> +			       NSEC_PER_MSEC, 0);
> +
>  	data->cyc2ns_offset = ns_now -
> -		mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
> +		mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, data->cyc2ns_shift);
>  
>  	cyc2ns_write_end(cpu, data);
>  
> 


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH V3] perf: x86: Improve accuracy of perf/sched clock
  2015-09-01  8:36       ` Adrian Hunter
@ 2015-09-01  8:57         ` Peter Zijlstra
  0 siblings, 0 replies; 9+ messages in thread
From: Peter Zijlstra @ 2015-09-01  8:57 UTC (permalink / raw)
  To: Adrian Hunter
  Cc: Ingo Molnar, Arnaldo Carvalho de Melo, Andy Lutomirski,
	Thomas Gleixner, linux-kernel, Stephane Eranian, Andi Kleen

On Tue, Sep 01, 2015 at 11:36:38AM +0300, Adrian Hunter wrote:
> On 21/08/15 12:05, Adrian Hunter wrote:
> > When TSC is stable perf/sched clock is based on it.
> > However the conversion from cycles to nanoseconds
> > is not as accurate as it could be.  Because
> > CYC2NS_SCALE_FACTOR is 10, the accuracy is +/- 1/2048
> > 
> > The change is to calculate the maximum shift that
> > results in a multiplier that is still a 32-bit number.
> > For example all frequencies over 1 GHz will have
> > a shift of 32, making the accuracy of the conversion
> > +/- 1/(2^33).  That is achieved by using the
> > 'clocks_calc_mult_shift()' function.
> > 
> > Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
> 
> Ping?

I've queued it. Thanks

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [tip:perf/core] perf/x86: Improve accuracy of perf/sched clock
  2015-08-21  9:05     ` [PATCH V3] " Adrian Hunter
  2015-09-01  8:36       ` Adrian Hunter
@ 2015-09-13 11:08       ` tip-bot for Adrian Hunter
  1 sibling, 0 replies; 9+ messages in thread
From: tip-bot for Adrian Hunter @ 2015-09-13 11:08 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: acme, peterz, torvalds, mingo, adrian.hunter, vincent.weaver,
	acme, jolsa, tglx, hpa, linux-kernel, luto, eranian

Commit-ID:  b20112edeadf0b8a1416de061caa4beb11539902
Gitweb:     http://git.kernel.org/tip/b20112edeadf0b8a1416de061caa4beb11539902
Author:     Adrian Hunter <adrian.hunter@intel.com>
AuthorDate: Fri, 21 Aug 2015 12:05:18 +0300
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Sun, 13 Sep 2015 11:27:20 +0200

perf/x86: Improve accuracy of perf/sched clock

When TSC is stable perf/sched clock is based on it.
However the conversion from cycles to nanoseconds
is not as accurate as it could be.  Because
CYC2NS_SCALE_FACTOR is 10, the accuracy is +/- 1/2048

The change is to calculate the maximum shift that
results in a multiplier that is still a 32-bit number.
For example all frequencies over 1 GHz will have
a shift of 32, making the accuracy of the conversion
+/- 1/(2^33).  That is achieved by using the
'clocks_calc_mult_shift()' function.

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Link: http://lkml.kernel.org/r/1440147918-22250-1-git-send-email-adrian.hunter@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/tsc.c | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index c8d52cb..39ec3d0 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -167,21 +167,20 @@ static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data)
  *              ns = cycles * cyc2ns_scale / SC
  *
  *      And since SC is a constant power of two, we can convert the div
- *  into a shift.
+ *  into a shift. The larger SC is, the more accurate the conversion, but
+ *  cyc2ns_scale needs to be a 32-bit value so that 32-bit multiplication
+ *  (64-bit result) can be used.
  *
- *  We can use khz divisor instead of mhz to keep a better precision, since
- *  cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
+ *  We can use khz divisor instead of mhz to keep a better precision.
  *  (mathieu.desnoyers@polymtl.ca)
  *
  *                      -johnstul@us.ibm.com "math is hard, lets go shopping!"
  */
 
-#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
-
 static void cyc2ns_data_init(struct cyc2ns_data *data)
 {
 	data->cyc2ns_mul = 0;
-	data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
+	data->cyc2ns_shift = 0;
 	data->cyc2ns_offset = 0;
 	data->__count = 0;
 }
@@ -215,14 +214,14 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc)
 
 	if (likely(data == tail)) {
 		ns = data->cyc2ns_offset;
-		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
 	} else {
 		data->__count++;
 
 		barrier();
 
 		ns = data->cyc2ns_offset;
-		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
 
 		barrier();
 
@@ -256,12 +255,11 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
 	 * time function is continuous; see the comment near struct
 	 * cyc2ns_data.
 	 */
-	data->cyc2ns_mul =
-		DIV_ROUND_CLOSEST(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR,
-				  cpu_khz);
-	data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
+	clocks_calc_mult_shift(&data->cyc2ns_mul, &data->cyc2ns_shift, cpu_khz,
+			       NSEC_PER_MSEC, 0);
+
 	data->cyc2ns_offset = ns_now -
-		mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+		mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, data->cyc2ns_shift);
 
 	cyc2ns_write_end(cpu, data);
 

^ permalink raw reply related	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2015-09-13 11:08 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2015-07-28 21:14 [PATCH V2] perf: x86: Improve accuracy of perf/sched clock Adrian Hunter
2015-08-17  7:34 ` Adrian Hunter
2015-08-17  9:56   ` Peter Zijlstra
2015-08-20 19:31 ` Thomas Gleixner
2015-08-21  6:46   ` Adrian Hunter
2015-08-21  9:05     ` [PATCH V3] " Adrian Hunter
2015-09-01  8:36       ` Adrian Hunter
2015-09-01  8:57         ` Peter Zijlstra
2015-09-13 11:08       ` [tip:perf/core] perf/x86: " tip-bot for Adrian Hunter

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox