[PATCH] cpufreq: cppc: Reduce cppc delivered perf sampling jitter

Linux Power Management development
 help / color / mirror / Atom feed

* [PATCH] cpufreq: cppc: Reduce cppc delivered perf sampling jitter
@ 2026-06-02 21:20 Jeremy Linton
  2026-06-03 10:54 ` Breno Leitao
  0 siblings, 1 reply; 3+ messages in thread
From: Jeremy Linton @ 2026-06-02 21:20 UTC (permalink / raw)
  To: linux-pm
  Cc: sumitg, pierre.gondois, zhenglifeng1, zhanjie9, viresh.kumar,
	leitao, rafael, linux-kernel, Jeremy Linton

CPPC uses a pair of registers cycling at different frequencies to
determine an accumulated performance level. For userspace reporting we
want to convert this to an instantaneous CPU frequency, but over short
time periods small errors caused by CPPC counter reads can cause
fairly significant reported frequency variations even when the core
CPU clock isn't changing.

Reduce this by keeping a start sample fixed and retrying the end
sample until the counter deltas are large enough to reduce short
window error, or until adjacent delivered performance estimates are
within the CPU's observed CPPC read noise floor.

To begin, resample the initial pair a small fixed number of times
looking for matching delivered performance deltas.  This reduces the
chance that a disturbed start sample anchors the rest of the
calculation.

Then look for an end sample while updating the noise floor from the
best error seen between samples.  The floor remains zero on systems
with stable feedback reads, but lets noisy systems stop early once
another retry is unlikely to improve the result.  The retry loop is
capped at 200 iterations, giving an ~20 usec explicit delay budget
derived from ndelay(100).

Signed-off-by: Jeremy Linton <jeremy.linton@arm.com>
---
 drivers/cpufreq/cppc_cpufreq.c | 68 ++++++++++++++++++++++++++++++----
 1 file changed, 61 insertions(+), 7 deletions(-)

diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c
index 7e7f9dfb7a24..362c08def420 100644
--- a/drivers/cpufreq/cppc_cpufreq.c
+++ b/drivers/cpufreq/cppc_cpufreq.c
@@ -50,7 +50,7 @@ struct cppc_freq_invariance {
 static DEFINE_PER_CPU(struct cppc_freq_invariance, cppc_freq_inv);
 static struct kthread_worker *kworker_fie;
 
-static int cppc_perf_from_fbctrs(u64 reference_perf,
+static u64 cppc_perf_from_fbctrs(u64 reference_perf,
 				 struct cppc_perf_fb_ctrs *fb_ctrs_t0,
 				 struct cppc_perf_fb_ctrs *fb_ctrs_t1);
 
@@ -750,7 +750,7 @@ static inline u64 get_delta(u64 t1, u64 t0)
 	return (u32)t1 - (u32)t0;
 }
 
-static int cppc_perf_from_fbctrs(u64 reference_perf,
+static u64 cppc_perf_from_fbctrs(u64 reference_perf,
 				 struct cppc_perf_fb_ctrs *fb_ctrs_t0,
 				 struct cppc_perf_fb_ctrs *fb_ctrs_t1)
 {
@@ -771,19 +771,71 @@ static int cppc_perf_from_fbctrs(u64 reference_perf,
 	return (reference_perf * delta_delivered) / delta_reference;
 }
 
-static int cppc_get_perf_ctrs_sample(int cpu,
+/* CPPC read noise floor for early retry exit. */
+static DEFINE_PER_CPU(u64, err_floor);
+
+#define CPPC_SAMPLE_MAX_RETRIES	200
+
+static int cppc_get_perf_ctrs_sample(int cpu, u64 ref,
 				     struct cppc_perf_fb_ctrs *fb_ctrs_t0,
 				     struct cppc_perf_fb_ctrs *fb_ctrs_t1)
 {
 	int ret;
+	s64 last_delivered = 0;
+	u64 smallest_error = 0;
+	int tries = 0;
+	u64 min_counts = ref * 2000;
+
+	/* Two subsequent reads with the same offset avoids one off large jitter values */
+	for (int x = 0; x < 10; x++) {
+		ret = cppc_get_perf_ctrs(cpu, fb_ctrs_t0);
+		if (ret)
+			return ret;
+
+		ret = cppc_get_perf_ctrs(cpu, fb_ctrs_t1);
+		if (ret)
+			return ret;
+
+		if (last_delivered == cppc_perf_from_fbctrs(ref, fb_ctrs_t0, fb_ctrs_t1))
+			break;
+
+		last_delivered = cppc_perf_from_fbctrs(ref, fb_ctrs_t0, fb_ctrs_t1);
+	}
+	last_delivered = 0;
+again:
+	ndelay(100);
 
-	ret = cppc_get_perf_ctrs(cpu, fb_ctrs_t0);
+	ret = cppc_get_perf_ctrs(cpu, fb_ctrs_t1);
 	if (ret)
 		return ret;
 
-	udelay(2); /* 2usec delay between sampling */
+	/*
+	 * We want at least two significant figures, if the counts are low, then there
+	 * can be rounding errors that show up as frequency that is swinging around a few hundred
+	 * Mhz. OTOH, if the delay gets too long the clock rate can be affected.
+	 * So we want it exactly long enough to have sufficient counter turn over, and
+	 * a repeatable low error value.
+	 */
+	if ((get_delta(fb_ctrs_t1->reference, fb_ctrs_t0->reference) < min_counts) ||
+	    (get_delta(fb_ctrs_t1->delivered, fb_ctrs_t0->delivered) < min_counts)) {
+		s64 delivered = cppc_perf_from_fbctrs(ref, fb_ctrs_t0, fb_ctrs_t1);
+		u64 error = abs(last_delivered - delivered);
+
+		if (smallest_error == 0 || smallest_error > error)
+			smallest_error = error;
+
+		if (error > per_cpu(err_floor, cpu)) {
+			last_delivered = delivered;
+			tries++;
+			if (tries < CPPC_SAMPLE_MAX_RETRIES)
+				goto again;
+		}
+	}
 
-	return cppc_get_perf_ctrs(cpu, fb_ctrs_t1);
+	/* compute a running error */
+	per_cpu(err_floor, cpu) = (per_cpu(err_floor, cpu) + smallest_error) / 2;
+
+	return ret;
 }
 
 static unsigned int cppc_cpufreq_get_rate(unsigned int cpu)
@@ -799,7 +851,9 @@ static unsigned int cppc_cpufreq_get_rate(unsigned int cpu)
 
 	cpu_data = policy->driver_data;
 
-	ret = cppc_get_perf_ctrs_sample(cpu, &fb_ctrs_t0, &fb_ctrs_t1);
+	ret = cppc_get_perf_ctrs_sample(cpu, cpu_data->perf_caps.reference_perf,
+					&fb_ctrs_t0, &fb_ctrs_t1);
+
 	if (ret) {
 		if (ret == -EFAULT)
 			/* Any of the associated CPPC regs is 0. */
-- 
2.54.0


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH] cpufreq: cppc: Reduce cppc delivered perf sampling jitter
  2026-06-02 21:20 [PATCH] cpufreq: cppc: Reduce cppc delivered perf sampling jitter Jeremy Linton
@ 2026-06-03 10:54 ` Breno Leitao
  2026-06-03 16:46   ` Jeremy Linton
  0 siblings, 1 reply; 3+ messages in thread
From: Breno Leitao @ 2026-06-03 10:54 UTC (permalink / raw)
  To: Jeremy Linton
  Cc: linux-pm, sumitg, pierre.gondois, zhenglifeng1, zhanjie9,
	viresh.kumar, rafael, linux-kernel

On Tue, Jun 02, 2026 at 04:20:52PM -0500, Jeremy Linton wrote:
> CPPC uses a pair of registers cycling at different frequencies to
> determine an accumulated performance level. For userspace reporting we
> want to convert this to an instantaneous CPU frequency, but over short
> time periods small errors caused by CPPC counter reads can cause
> fairly significant reported frequency variations even when the core
> CPU clock isn't changing.
> 
> Reduce this by keeping a start sample fixed and retrying the end
> sample until the counter deltas are large enough to reduce short
> window error, or until adjacent delivered performance estimates are
> within the CPU's observed CPPC read noise floor.
> 
> To begin, resample the initial pair a small fixed number of times
> looking for matching delivered performance deltas.  This reduces the
> chance that a disturbed start sample anchors the rest of the
> calculation.
> 
> Then look for an end sample while updating the noise floor from the
> best error seen between samples.  The floor remains zero on systems
> with stable feedback reads, but lets noisy systems stop early once
> another retry is unlikely to improve the result.  The retry loop is
> capped at 200 iterations, giving an ~20 usec explicit delay budget
> derived from ndelay(100).
> 
> Signed-off-by: Jeremy Linton <jeremy.linton@arm.com>
> ---
>  drivers/cpufreq/cppc_cpufreq.c | 68 ++++++++++++++++++++++++++++++----
>  1 file changed, 61 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c
> index 7e7f9dfb7a24..362c08def420 100644
> --- a/drivers/cpufreq/cppc_cpufreq.c
> +++ b/drivers/cpufreq/cppc_cpufreq.c
> @@ -50,7 +50,7 @@ struct cppc_freq_invariance {
>  static DEFINE_PER_CPU(struct cppc_freq_invariance, cppc_freq_inv);
>  static struct kthread_worker *kworker_fie;
>  
> -static int cppc_perf_from_fbctrs(u64 reference_perf,
> +static u64 cppc_perf_from_fbctrs(u64 reference_perf,
>  				 struct cppc_perf_fb_ctrs *fb_ctrs_t0,
>  				 struct cppc_perf_fb_ctrs *fb_ctrs_t1);
>  
> @@ -750,7 +750,7 @@ static inline u64 get_delta(u64 t1, u64 t0)
>  	return (u32)t1 - (u32)t0;
>  }
>  
> -static int cppc_perf_from_fbctrs(u64 reference_perf,
> +static u64 cppc_perf_from_fbctrs(u64 reference_perf,
>  				 struct cppc_perf_fb_ctrs *fb_ctrs_t0,
>  				 struct cppc_perf_fb_ctrs *fb_ctrs_t1)
>  {
> @@ -771,19 +771,71 @@ static int cppc_perf_from_fbctrs(u64 reference_perf,
>  	return (reference_perf * delta_delivered) / delta_reference;
>  }
>  
> -static int cppc_get_perf_ctrs_sample(int cpu,
> +/* CPPC read noise floor for early retry exit. */
> +static DEFINE_PER_CPU(u64, err_floor);
> +
> +#define CPPC_SAMPLE_MAX_RETRIES	200

Could the remaining tuning literals get the same treatment?
Specifically:

- the 10 initial-resample iteration count
- the 2000 multiplier in ref * 2000
- the 100 ns in ndelay(100)

Thanks
--breno

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] cpufreq: cppc: Reduce cppc delivered perf sampling jitter
  2026-06-03 10:54 ` Breno Leitao
@ 2026-06-03 16:46   ` Jeremy Linton
  0 siblings, 0 replies; 3+ messages in thread
From: Jeremy Linton @ 2026-06-03 16:46 UTC (permalink / raw)
  To: Breno Leitao
  Cc: linux-pm, sumitg, pierre.gondois, zhenglifeng1, zhanjie9,
	viresh.kumar, rafael, linux-kernel

Hi,

Thanks for looking at this.

On 6/3/26 5:54 AM, Breno Leitao wrote:
> On Tue, Jun 02, 2026 at 04:20:52PM -0500, Jeremy Linton wrote:
>> CPPC uses a pair of registers cycling at different frequencies to
>> determine an accumulated performance level. For userspace reporting we
>> want to convert this to an instantaneous CPU frequency, but over short
>> time periods small errors caused by CPPC counter reads can cause
>> fairly significant reported frequency variations even when the core
>> CPU clock isn't changing.
>>
>> Reduce this by keeping a start sample fixed and retrying the end
>> sample until the counter deltas are large enough to reduce short
>> window error, or until adjacent delivered performance estimates are
>> within the CPU's observed CPPC read noise floor.
>>
>> To begin, resample the initial pair a small fixed number of times
>> looking for matching delivered performance deltas.  This reduces the
>> chance that a disturbed start sample anchors the rest of the
>> calculation.
>>
>> Then look for an end sample while updating the noise floor from the
>> best error seen between samples.  The floor remains zero on systems
>> with stable feedback reads, but lets noisy systems stop early once
>> another retry is unlikely to improve the result.  The retry loop is
>> capped at 200 iterations, giving an ~20 usec explicit delay budget
>> derived from ndelay(100).
>>
>> Signed-off-by: Jeremy Linton <jeremy.linton@arm.com>
>> ---
>>   drivers/cpufreq/cppc_cpufreq.c | 68 ++++++++++++++++++++++++++++++----
>>   1 file changed, 61 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c
>> index 7e7f9dfb7a24..362c08def420 100644
>> --- a/drivers/cpufreq/cppc_cpufreq.c
>> +++ b/drivers/cpufreq/cppc_cpufreq.c
>> @@ -50,7 +50,7 @@ struct cppc_freq_invariance {
>>   static DEFINE_PER_CPU(struct cppc_freq_invariance, cppc_freq_inv);
>>   static struct kthread_worker *kworker_fie;
>>   
>> -static int cppc_perf_from_fbctrs(u64 reference_perf,
>> +static u64 cppc_perf_from_fbctrs(u64 reference_perf,
>>   				 struct cppc_perf_fb_ctrs *fb_ctrs_t0,
>>   				 struct cppc_perf_fb_ctrs *fb_ctrs_t1);
>>   
>> @@ -750,7 +750,7 @@ static inline u64 get_delta(u64 t1, u64 t0)
>>   	return (u32)t1 - (u32)t0;
>>   }
>>   
>> -static int cppc_perf_from_fbctrs(u64 reference_perf,
>> +static u64 cppc_perf_from_fbctrs(u64 reference_perf,
>>   				 struct cppc_perf_fb_ctrs *fb_ctrs_t0,
>>   				 struct cppc_perf_fb_ctrs *fb_ctrs_t1)
>>   {
>> @@ -771,19 +771,71 @@ static int cppc_perf_from_fbctrs(u64 reference_perf,
>>   	return (reference_perf * delta_delivered) / delta_reference;
>>   }
>>   
>> -static int cppc_get_perf_ctrs_sample(int cpu,
>> +/* CPPC read noise floor for early retry exit. */
>> +static DEFINE_PER_CPU(u64, err_floor);
>> +
>> +#define CPPC_SAMPLE_MAX_RETRIES	200
> 
> Could the remaining tuning literals get the same treatment?
> Specifically:
> 
> - the 10 initial-resample iteration count
> - the 2000 multiplier in ref * 2000
> - the 100 ns in ndelay(100)

Sure. A few of these were personal judgment calls based on the platforms I
tried it on. I had some instrumentation at the bottom which was printing loop
counts and error values, and largely I picked those values based on how
they were behaving, or from back-of-the-envelope estimates. For example, that
200 is afaik overkill; it usually settles down around 20 or less, which
makes this faster than the old method on at least one platform I tried
it on. And they are all intended to be "upper bound" values, i.e. exit the
loop because something isn't working right.


I'm interested in whether this patch stabilizes the frequency reporting 
in some of the cases I've heard people talking about.





^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2026-06-03 16:46 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-06-02 21:20 [PATCH] cpufreq: cppc: Reduce cppc delivered perf sampling jitter Jeremy Linton
2026-06-03 10:54 ` Breno Leitao
2026-06-03 16:46   ` Jeremy Linton

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox