All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH V2] perf: x86: Improve accuracy of perf/sched clock
@ 2015-07-28 21:14 Adrian Hunter
  2015-08-17  7:34 ` Adrian Hunter
  2015-08-20 19:31 ` Thomas Gleixner
  0 siblings, 2 replies; 9+ messages in thread
From: Adrian Hunter @ 2015-07-28 21:14 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar
  Cc: Arnaldo Carvalho de Melo, Andy Lutomirski, Thomas Gleixner,
	linux-kernel, Stephane Eranian, Andi Kleen

When TSC is stable perf/sched clock is based on it.
However the conversion from cycles to nanoseconds
is not as accurate as it could be.  Because
CYC2NS_SCALE_FACTOR is 10, the accuracy is +/- 1/2048

The change is to calculate the maximum shift that
results in a multiplier that is still a 32-bit number.
For example all frequencies over 1 GHz will have
a shift of 32, making the accuracy of the conversion
+/- 1/(2^33)

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
---
 arch/x86/kernel/tsc.c | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 7437b41f6a47..e7085bcfb06b 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -167,21 +167,21 @@ static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data)
  *              ns = cycles * cyc2ns_scale / SC
  *
  *      And since SC is a constant power of two, we can convert the div
- *  into a shift.
+ *  into a shift. The larger SC is, the more accurate the conversion, but
+ *  cyc2ns_scale needs to be a 32-bit value so that 32-bit multiplication
+ *  (64-bit result) can be used. So start by trying SC = 2^32, reducing
+ *  until the criteria are met.
  *
- *  We can use khz divisor instead of mhz to keep a better precision, since
- *  cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
+ *  We can use khz divisor instead of mhz to keep a better precision.
  *  (mathieu.desnoyers@polymtl.ca)
  *
  *                      -johnstul@us.ibm.com "math is hard, lets go shopping!"
  */
 
-#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
-
 static void cyc2ns_data_init(struct cyc2ns_data *data)
 {
 	data->cyc2ns_mul = 0;
-	data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
+	data->cyc2ns_shift = 0;
 	data->cyc2ns_offset = 0;
 	data->__count = 0;
 }
@@ -215,14 +215,14 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc)
 
 	if (likely(data == tail)) {
 		ns = data->cyc2ns_offset;
-		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
 	} else {
 		data->__count++;
 
 		barrier();
 
 		ns = data->cyc2ns_offset;
-		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
 
 		barrier();
 
@@ -239,6 +239,8 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
 	unsigned long long tsc_now, ns_now;
 	struct cyc2ns_data *data;
 	unsigned long flags;
+	u64 mult;
+	u32 shft = 32;
 
 	local_irq_save(flags);
 	sched_clock_idle_sleep_event();
@@ -256,12 +258,17 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
 	 * time function is continuous; see the comment near struct
 	 * cyc2ns_data.
 	 */
-	data->cyc2ns_mul =
-		DIV_ROUND_CLOSEST(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR,
-				  cpu_khz);
-	data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
+	mult = (u64)NSEC_PER_MSEC << 32;
+	mult += cpu_khz / 2;
+	do_div(mult, cpu_khz);
+	while (mult > U32_MAX) {
+		mult >>= 1;
+		shft -= 1;
+	}
+	data->cyc2ns_mul = mult;
+	data->cyc2ns_shift = shft;
 	data->cyc2ns_offset = ns_now -
-		mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+		mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, data->cyc2ns_shift);
 
 	cyc2ns_write_end(cpu, data);
 
-- 
1.9.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH V2] perf: x86: Improve accuracy of perf/sched clock
  2015-07-28 21:14 [PATCH V2] perf: x86: Improve accuracy of perf/sched clock Adrian Hunter
@ 2015-08-17  7:34 ` Adrian Hunter
  2015-08-17  9:56   ` Peter Zijlstra
  2015-08-20 19:31 ` Thomas Gleixner
  1 sibling, 1 reply; 9+ messages in thread
From: Adrian Hunter @ 2015-08-17  7:34 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Adrian Hunter, Ingo Molnar, Arnaldo Carvalho de Melo,
	Andy Lutomirski, Thomas Gleixner, linux-kernel, Stephane Eranian,
	Andi Kleen

On 29/07/15 00:14, Adrian Hunter wrote:
> When TSC is stable perf/sched clock is based on it.
> However the conversion from cycles to nanoseconds
> is not as accurate as it could be.  Because
> CYC2NS_SCALE_FACTOR is 10, the accuracy is +/- 1/2048
> 
> The change is to calculate the maximum shift that
> results in a multiplier that is still a 32-bit number.
> For example all frequencies over 1 GHz will have
> a shift of 32, making the accuracy of the conversion
> +/- 1/(2^33)
> 
> Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>

Is this OK?

> ---
>  arch/x86/kernel/tsc.c | 33 ++++++++++++++++++++-------------
>  1 file changed, 20 insertions(+), 13 deletions(-)
> 
> diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> index 7437b41f6a47..e7085bcfb06b 100644
> --- a/arch/x86/kernel/tsc.c
> +++ b/arch/x86/kernel/tsc.c
> @@ -167,21 +167,21 @@ static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data)
>   *              ns = cycles * cyc2ns_scale / SC
>   *
>   *      And since SC is a constant power of two, we can convert the div
> - *  into a shift.
> + *  into a shift. The larger SC is, the more accurate the conversion, but
> + *  cyc2ns_scale needs to be a 32-bit value so that 32-bit multiplication
> + *  (64-bit result) can be used. So start by trying SC = 2^32, reducing
> + *  until the criteria are met.
>   *
> - *  We can use khz divisor instead of mhz to keep a better precision, since
> - *  cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
> + *  We can use khz divisor instead of mhz to keep a better precision.
>   *  (mathieu.desnoyers@polymtl.ca)
>   *
>   *                      -johnstul@us.ibm.com "math is hard, lets go shopping!"
>   */
>  
> -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
> -
>  static void cyc2ns_data_init(struct cyc2ns_data *data)
>  {
>  	data->cyc2ns_mul = 0;
> -	data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
> +	data->cyc2ns_shift = 0;
>  	data->cyc2ns_offset = 0;
>  	data->__count = 0;
>  }
> @@ -215,14 +215,14 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc)
>  
>  	if (likely(data == tail)) {
>  		ns = data->cyc2ns_offset;
> -		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
> +		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
>  	} else {
>  		data->__count++;
>  
>  		barrier();
>  
>  		ns = data->cyc2ns_offset;
> -		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
> +		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
>  
>  		barrier();
>  
> @@ -239,6 +239,8 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
>  	unsigned long long tsc_now, ns_now;
>  	struct cyc2ns_data *data;
>  	unsigned long flags;
> +	u64 mult;
> +	u32 shft = 32;
>  
>  	local_irq_save(flags);
>  	sched_clock_idle_sleep_event();
> @@ -256,12 +258,17 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
>  	 * time function is continuous; see the comment near struct
>  	 * cyc2ns_data.
>  	 */
> -	data->cyc2ns_mul =
> -		DIV_ROUND_CLOSEST(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR,
> -				  cpu_khz);
> -	data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
> +	mult = (u64)NSEC_PER_MSEC << 32;
> +	mult += cpu_khz / 2;
> +	do_div(mult, cpu_khz);
> +	while (mult > U32_MAX) {
> +		mult >>= 1;
> +		shft -= 1;
> +	}
> +	data->cyc2ns_mul = mult;
> +	data->cyc2ns_shift = shft;
>  	data->cyc2ns_offset = ns_now -
> -		mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
> +		mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, data->cyc2ns_shift);
>  
>  	cyc2ns_write_end(cpu, data);
>  
> 


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH V2] perf: x86: Improve accuracy of perf/sched clock
  2015-08-17  7:34 ` Adrian Hunter
@ 2015-08-17  9:56   ` Peter Zijlstra
  0 siblings, 0 replies; 9+ messages in thread
From: Peter Zijlstra @ 2015-08-17  9:56 UTC (permalink / raw)
  To: Adrian Hunter
  Cc: Ingo Molnar, Arnaldo Carvalho de Melo, Andy Lutomirski,
	Thomas Gleixner, linux-kernel, Stephane Eranian, Andi Kleen

On Mon, Aug 17, 2015 at 10:34:03AM +0300, Adrian Hunter wrote:
> On 29/07/15 00:14, Adrian Hunter wrote:
> > When TSC is stable perf/sched clock is based on it.
> > However the conversion from cycles to nanoseconds
> > is not as accurate as it could be.  Because
> > CYC2NS_SCALE_FACTOR is 10, the accuracy is +/- 1/2048
> > 
> > The change is to calculate the maximum shift that
> > results in a multiplier that is still a 32-bit number.
> > For example all frequencies over 1 GHz will have
> > a shift of 32, making the accuracy of the conversion
> > +/- 1/(2^33)
> > 
> > Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
> 
> Is this OK?

Yes I think so, sorry I got backlogged with preparation for Seattle.

Thomas, if you see this, could you merge with my ACK on, otherwise I'll
queue it the normal route once I'm back.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH V2] perf: x86: Improve accuracy of perf/sched clock
  2015-07-28 21:14 [PATCH V2] perf: x86: Improve accuracy of perf/sched clock Adrian Hunter
  2015-08-17  7:34 ` Adrian Hunter
@ 2015-08-20 19:31 ` Thomas Gleixner
  2015-08-21  6:46   ` Adrian Hunter
  1 sibling, 1 reply; 9+ messages in thread
From: Thomas Gleixner @ 2015-08-20 19:31 UTC (permalink / raw)
  To: Adrian Hunter
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Andy Lutomirski, linux-kernel, Stephane Eranian, Andi Kleen

On Wed, 29 Jul 2015, Adrian Hunter wrote:
> @@ -239,6 +239,8 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
>  	unsigned long long tsc_now, ns_now;
>  	struct cyc2ns_data *data;
>  	unsigned long flags;
> +	u64 mult;
> +	u32 shft = 32;
>  
>  	local_irq_save(flags);
>  	sched_clock_idle_sleep_event();
> @@ -256,12 +258,17 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
>  	 * time function is continuous; see the comment near struct
>  	 * cyc2ns_data.
>  	 */
> -	data->cyc2ns_mul =
> -		DIV_ROUND_CLOSEST(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR,
> -				  cpu_khz);
> -	data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
> +	mult = (u64)NSEC_PER_MSEC << 32;
> +	mult += cpu_khz / 2;
> +	do_div(mult, cpu_khz);
> +	while (mult > U32_MAX) {
> +		mult >>= 1;
> +		shft -= 1;
> +	}

This is an open coded variant of clocks_calc_mult_shift(). Can we
please use that one?

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH V2] perf: x86: Improve accuracy of perf/sched clock
  2015-08-20 19:31 ` Thomas Gleixner
@ 2015-08-21  6:46   ` Adrian Hunter
  2015-08-21  9:05     ` [PATCH V3] " Adrian Hunter
  0 siblings, 1 reply; 9+ messages in thread
From: Adrian Hunter @ 2015-08-21  6:46 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Andy Lutomirski, linux-kernel, Stephane Eranian, Andi Kleen

On 20/08/15 22:31, Thomas Gleixner wrote:
> On Wed, 29 Jul 2015, Adrian Hunter wrote:
>> @@ -239,6 +239,8 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
>>  	unsigned long long tsc_now, ns_now;
>>  	struct cyc2ns_data *data;
>>  	unsigned long flags;
>> +	u64 mult;
>> +	u32 shft = 32;
>>  
>>  	local_irq_save(flags);
>>  	sched_clock_idle_sleep_event();
>> @@ -256,12 +258,17 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
>>  	 * time function is continuous; see the comment near struct
>>  	 * cyc2ns_data.
>>  	 */
>> -	data->cyc2ns_mul =
>> -		DIV_ROUND_CLOSEST(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR,
>> -				  cpu_khz);
>> -	data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
>> +	mult = (u64)NSEC_PER_MSEC << 32;
>> +	mult += cpu_khz / 2;
>> +	do_div(mult, cpu_khz);
>> +	while (mult > U32_MAX) {
>> +		mult >>= 1;
>> +		shft -= 1;
>> +	}
> 
> This is an open coded variant of clocks_calc_mult_shift(). Can we
> please use that one?

Sure.  clocks_calc_mult_shift() does a division on each shift which is a bit
slower (avoids a 1-bit rounding error but that will never be more than 1 in
2^32 in this case), and the 'maxsec' functionality is not needed because
mul_u64_u32_shr() is used to avoid 64-bit overflow.

I will send V3.


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH V3] perf: x86: Improve accuracy of perf/sched clock
  2015-08-21  6:46   ` Adrian Hunter
@ 2015-08-21  9:05     ` Adrian Hunter
  2015-09-01  8:36       ` Adrian Hunter
  2015-09-13 11:08       ` [tip:perf/core] perf/x86: " tip-bot for Adrian Hunter
  0 siblings, 2 replies; 9+ messages in thread
From: Adrian Hunter @ 2015-08-21  9:05 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar
  Cc: Arnaldo Carvalho de Melo, Andy Lutomirski, Thomas Gleixner,
	linux-kernel, Stephane Eranian, Andi Kleen

When TSC is stable perf/sched clock is based on it.
However the conversion from cycles to nanoseconds
is not as accurate as it could be.  Because
CYC2NS_SCALE_FACTOR is 10, the accuracy is +/- 1/2048

The change is to calculate the maximum shift that
results in a multiplier that is still a 32-bit number.
For example all frequencies over 1 GHz will have
a shift of 32, making the accuracy of the conversion
+/- 1/(2^33).  That is achieved by using the
'clocks_calc_mult_shift()' function.

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
---


Changes in V3:

	Use clocks_calc_mult_shift() instead of open-coded
	calculation

Changes in V2:

	Remove definition of CYC2NS_SCALE_FACTOR
	Amend comments


 arch/x86/kernel/tsc.c | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index c8d52cb4cb6e..39ec3d07affd 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -167,21 +167,20 @@ static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data)
  *              ns = cycles * cyc2ns_scale / SC
  *
  *      And since SC is a constant power of two, we can convert the div
- *  into a shift.
+ *  into a shift. The larger SC is, the more accurate the conversion, but
+ *  cyc2ns_scale needs to be a 32-bit value so that 32-bit multiplication
+ *  (64-bit result) can be used.
  *
- *  We can use khz divisor instead of mhz to keep a better precision, since
- *  cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
+ *  We can use khz divisor instead of mhz to keep a better precision.
  *  (mathieu.desnoyers@polymtl.ca)
  *
  *                      -johnstul@us.ibm.com "math is hard, lets go shopping!"
  */
 
-#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
-
 static void cyc2ns_data_init(struct cyc2ns_data *data)
 {
 	data->cyc2ns_mul = 0;
-	data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
+	data->cyc2ns_shift = 0;
 	data->cyc2ns_offset = 0;
 	data->__count = 0;
 }
@@ -215,14 +214,14 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc)
 
 	if (likely(data == tail)) {
 		ns = data->cyc2ns_offset;
-		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
 	} else {
 		data->__count++;
 
 		barrier();
 
 		ns = data->cyc2ns_offset;
-		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
 
 		barrier();
 
@@ -256,12 +255,11 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
 	 * time function is continuous; see the comment near struct
 	 * cyc2ns_data.
 	 */
-	data->cyc2ns_mul =
-		DIV_ROUND_CLOSEST(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR,
-				  cpu_khz);
-	data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
+	clocks_calc_mult_shift(&data->cyc2ns_mul, &data->cyc2ns_shift, cpu_khz,
+			       NSEC_PER_MSEC, 0);
+
 	data->cyc2ns_offset = ns_now -
-		mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+		mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, data->cyc2ns_shift);
 
 	cyc2ns_write_end(cpu, data);
 
-- 
1.9.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH V3] perf: x86: Improve accuracy of perf/sched clock
  2015-08-21  9:05     ` [PATCH V3] " Adrian Hunter
@ 2015-09-01  8:36       ` Adrian Hunter
  2015-09-01  8:57         ` Peter Zijlstra
  2015-09-13 11:08       ` [tip:perf/core] perf/x86: " tip-bot for Adrian Hunter
  1 sibling, 1 reply; 9+ messages in thread
From: Adrian Hunter @ 2015-09-01  8:36 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Arnaldo Carvalho de Melo, Andy Lutomirski,
	Thomas Gleixner, linux-kernel, Stephane Eranian, Andi Kleen

On 21/08/15 12:05, Adrian Hunter wrote:
> When TSC is stable perf/sched clock is based on it.
> However the conversion from cycles to nanoseconds
> is not as accurate as it could be.  Because
> CYC2NS_SCALE_FACTOR is 10, the accuracy is +/- 1/2048
> 
> The change is to calculate the maximum shift that
> results in a multiplier that is still a 32-bit number.
> For example all frequencies over 1 GHz will have
> a shift of 32, making the accuracy of the conversion
> +/- 1/(2^33).  That is achieved by using the
> 'clocks_calc_mult_shift()' function.
> 
> Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>

Ping?

> ---
> 
> 
> Changes in V3:
> 
> 	Use clocks_calc_mult_shift() instead of open-coded
> 	calculation
> 
> Changes in V2:
> 
> 	Remove definition of CYC2NS_SCALE_FACTOR
> 	Amend comments
> 
> 
>  arch/x86/kernel/tsc.c | 24 +++++++++++-------------
>  1 file changed, 11 insertions(+), 13 deletions(-)
> 
> diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> index c8d52cb4cb6e..39ec3d07affd 100644
> --- a/arch/x86/kernel/tsc.c
> +++ b/arch/x86/kernel/tsc.c
> @@ -167,21 +167,20 @@ static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data)
>   *              ns = cycles * cyc2ns_scale / SC
>   *
>   *      And since SC is a constant power of two, we can convert the div
> - *  into a shift.
> + *  into a shift. The larger SC is, the more accurate the conversion, but
> + *  cyc2ns_scale needs to be a 32-bit value so that 32-bit multiplication
> + *  (64-bit result) can be used.
>   *
> - *  We can use khz divisor instead of mhz to keep a better precision, since
> - *  cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
> + *  We can use khz divisor instead of mhz to keep a better precision.
>   *  (mathieu.desnoyers@polymtl.ca)
>   *
>   *                      -johnstul@us.ibm.com "math is hard, lets go shopping!"
>   */
>  
> -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
> -
>  static void cyc2ns_data_init(struct cyc2ns_data *data)
>  {
>  	data->cyc2ns_mul = 0;
> -	data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
> +	data->cyc2ns_shift = 0;
>  	data->cyc2ns_offset = 0;
>  	data->__count = 0;
>  }
> @@ -215,14 +214,14 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc)
>  
>  	if (likely(data == tail)) {
>  		ns = data->cyc2ns_offset;
> -		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
> +		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
>  	} else {
>  		data->__count++;
>  
>  		barrier();
>  
>  		ns = data->cyc2ns_offset;
> -		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
> +		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
>  
>  		barrier();
>  
> @@ -256,12 +255,11 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
>  	 * time function is continuous; see the comment near struct
>  	 * cyc2ns_data.
>  	 */
> -	data->cyc2ns_mul =
> -		DIV_ROUND_CLOSEST(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR,
> -				  cpu_khz);
> -	data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
> +	clocks_calc_mult_shift(&data->cyc2ns_mul, &data->cyc2ns_shift, cpu_khz,
> +			       NSEC_PER_MSEC, 0);
> +
>  	data->cyc2ns_offset = ns_now -
> -		mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
> +		mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, data->cyc2ns_shift);
>  
>  	cyc2ns_write_end(cpu, data);
>  
> 


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH V3] perf: x86: Improve accuracy of perf/sched clock
  2015-09-01  8:36       ` Adrian Hunter
@ 2015-09-01  8:57         ` Peter Zijlstra
  0 siblings, 0 replies; 9+ messages in thread
From: Peter Zijlstra @ 2015-09-01  8:57 UTC (permalink / raw)
  To: Adrian Hunter
  Cc: Ingo Molnar, Arnaldo Carvalho de Melo, Andy Lutomirski,
	Thomas Gleixner, linux-kernel, Stephane Eranian, Andi Kleen

On Tue, Sep 01, 2015 at 11:36:38AM +0300, Adrian Hunter wrote:
> On 21/08/15 12:05, Adrian Hunter wrote:
> > When TSC is stable perf/sched clock is based on it.
> > However the conversion from cycles to nanoseconds
> > is not as accurate as it could be.  Because
> > CYC2NS_SCALE_FACTOR is 10, the accuracy is +/- 1/2048
> > 
> > The change is to calculate the maximum shift that
> > results in a multiplier that is still a 32-bit number.
> > For example all frequencies over 1 GHz will have
> > a shift of 32, making the accuracy of the conversion
> > +/- 1/(2^33).  That is achieved by using the
> > 'clocks_calc_mult_shift()' function.
> > 
> > Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
> 
> Ping?

I've queued it. Thanks

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [tip:perf/core] perf/x86: Improve accuracy of perf/sched clock
  2015-08-21  9:05     ` [PATCH V3] " Adrian Hunter
  2015-09-01  8:36       ` Adrian Hunter
@ 2015-09-13 11:08       ` tip-bot for Adrian Hunter
  1 sibling, 0 replies; 9+ messages in thread
From: tip-bot for Adrian Hunter @ 2015-09-13 11:08 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: acme, peterz, torvalds, mingo, adrian.hunter, vincent.weaver,
	acme, jolsa, tglx, hpa, linux-kernel, luto, eranian

Commit-ID:  b20112edeadf0b8a1416de061caa4beb11539902
Gitweb:     http://git.kernel.org/tip/b20112edeadf0b8a1416de061caa4beb11539902
Author:     Adrian Hunter <adrian.hunter@intel.com>
AuthorDate: Fri, 21 Aug 2015 12:05:18 +0300
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Sun, 13 Sep 2015 11:27:20 +0200

perf/x86: Improve accuracy of perf/sched clock

When TSC is stable perf/sched clock is based on it.
However the conversion from cycles to nanoseconds
is not as accurate as it could be.  Because
CYC2NS_SCALE_FACTOR is 10, the accuracy is +/- 1/2048

The change is to calculate the maximum shift that
results in a multiplier that is still a 32-bit number.
For example all frequencies over 1 GHz will have
a shift of 32, making the accuracy of the conversion
+/- 1/(2^33).  That is achieved by using the
'clocks_calc_mult_shift()' function.

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Link: http://lkml.kernel.org/r/1440147918-22250-1-git-send-email-adrian.hunter@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/tsc.c | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index c8d52cb..39ec3d0 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -167,21 +167,20 @@ static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data)
  *              ns = cycles * cyc2ns_scale / SC
  *
  *      And since SC is a constant power of two, we can convert the div
- *  into a shift.
+ *  into a shift. The larger SC is, the more accurate the conversion, but
+ *  cyc2ns_scale needs to be a 32-bit value so that 32-bit multiplication
+ *  (64-bit result) can be used.
  *
- *  We can use khz divisor instead of mhz to keep a better precision, since
- *  cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
+ *  We can use khz divisor instead of mhz to keep a better precision.
  *  (mathieu.desnoyers@polymtl.ca)
  *
  *                      -johnstul@us.ibm.com "math is hard, lets go shopping!"
  */
 
-#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
-
 static void cyc2ns_data_init(struct cyc2ns_data *data)
 {
 	data->cyc2ns_mul = 0;
-	data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
+	data->cyc2ns_shift = 0;
 	data->cyc2ns_offset = 0;
 	data->__count = 0;
 }
@@ -215,14 +214,14 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc)
 
 	if (likely(data == tail)) {
 		ns = data->cyc2ns_offset;
-		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
 	} else {
 		data->__count++;
 
 		barrier();
 
 		ns = data->cyc2ns_offset;
-		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
 
 		barrier();
 
@@ -256,12 +255,11 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
 	 * time function is continuous; see the comment near struct
 	 * cyc2ns_data.
 	 */
-	data->cyc2ns_mul =
-		DIV_ROUND_CLOSEST(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR,
-				  cpu_khz);
-	data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
+	clocks_calc_mult_shift(&data->cyc2ns_mul, &data->cyc2ns_shift, cpu_khz,
+			       NSEC_PER_MSEC, 0);
+
 	data->cyc2ns_offset = ns_now -
-		mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+		mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, data->cyc2ns_shift);
 
 	cyc2ns_write_end(cpu, data);
 

^ permalink raw reply related	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2015-09-13 11:08 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2015-07-28 21:14 [PATCH V2] perf: x86: Improve accuracy of perf/sched clock Adrian Hunter
2015-08-17  7:34 ` Adrian Hunter
2015-08-17  9:56   ` Peter Zijlstra
2015-08-20 19:31 ` Thomas Gleixner
2015-08-21  6:46   ` Adrian Hunter
2015-08-21  9:05     ` [PATCH V3] " Adrian Hunter
2015-09-01  8:36       ` Adrian Hunter
2015-09-01  8:57         ` Peter Zijlstra
2015-09-13 11:08       ` [tip:perf/core] perf/x86: " tip-bot for Adrian Hunter

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.