All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Roger Pau Monné" <roger.pau@citrix.com>
To: Jan Beulich <jbeulich@suse.com>
Cc: "xen-devel@lists.xenproject.org" <xen-devel@lists.xenproject.org>,
	Andrew Cooper <andrew.cooper3@citrix.com>,
	Teddy Astie <teddy.astie@vates.tech>,
	Marek Marczykowski <marmarek@invisiblethingslab.com>
Subject: Re: [PATCH] x86/cpuidle: split the max_cstate variable
Date: Mon, 20 Apr 2026 18:14:20 +0200	[thread overview]
Message-ID: <aeZQ3FcNl_EsPTdE@macbook.local> (raw)
In-Reply-To: <4b89f640-046a-49c1-95f1-947d98135e5b@suse.com>

On Wed, Apr 08, 2026 at 01:34:43PM +0200, Jan Beulich wrote:
> The admin can control the upper bound wanted not only via command line
> option, but also via XEN_SYSCTL_pm_op_set_max_cstate. While decisions how
> to set up the system are okay this way as long as we deem the command line
> option a strict upper bound, what to do during S3 resume should not be
> based on that potentially varying value. Decisions there need to use
> solely the strict upper bound we may have enforced ourselves (or which was
> forced onto us via command line option).
> 
> Rather than altering pit_broadcast_is_available(), drop the function
> altogether. It's pretty odd for acpi/cpu_idle.c to call into time.c, just
> for that to call into acpi/cpu_idle.c again.
> 
> Fixes: 8d24303023ec ("x86: don't write_tsc() non-zero values on CPUs updating only the lower 32 bits")
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> cpuidle_disable_deep_cstate(), when called from handle_rtc_once(), is
> still somewhat of a problem: Boot time and resume time runs of
> _disable_pit_irq() may still behave differently because of that.
> 
> If we wanted the sysctl to possibly move the maximum C-state beyond what
> was given on the command line, things would get yet more complicated, as
> we'd then need to re-configure the driver that's in use.
> 
> I wonder how useful the ACPI_STATE_C<n> #define-s really are. Plain 1 is
> used in e.g. probe_c3_errata(), and the plain 7 doesn't even have a
> respective constant (nor would that be suitable, as that's not really an
> ACPI state).
> 
> --- a/xen/arch/x86/acpi/cpu_idle.c
> +++ b/xen/arch/x86/acpi/cpu_idle.c
> @@ -119,7 +119,7 @@ bool lapic_timer_init(void)
>          lapic_timer_off = hpet_broadcast_enter;
>          lapic_timer_on = hpet_broadcast_exit;
>      }
> -    else if ( pit_broadcast_is_available() )
> +    else if ( cpuidle_usable_deep_cstate() )
>      {
>          lapic_timer_off = pit_broadcast_enter;
>          lapic_timer_on = pit_broadcast_exit;
> @@ -131,12 +131,15 @@ bool lapic_timer_init(void)
>  }
>  
>  void (*__read_mostly pm_idle_save)(void);
> -unsigned int max_cstate __read_mostly = UINT_MAX;
> +
> +unsigned int max_usable_cstate __read_mostly = UINT_MAX;
> +unsigned int max_allowed_cstate __read_mostly = UINT_MAX;
>  unsigned int max_csubstate __read_mostly = UINT_MAX;
>  
>  static int __init cf_check parse_cstate(const char *s)
>  {
> -    max_cstate = simple_strtoul(s, &s, 0);
> +    max_allowed_cstate = simple_strtoul(s, &s, 0);
> +    max_usable_cstate = max_allowed_cstate;
>      if ( *s == ',' )
>          max_csubstate = simple_strtoul(s + 1, NULL, 0);
>      return 0;
> @@ -413,10 +416,11 @@ static void cf_check dump_cx(unsigned ch
>      unsigned int cpu;
>  
>      printk("'%c' pressed -> printing ACPI Cx structures\n", key);
> -    if ( max_cstate < UINT_MAX )
> +    if ( max_cstate() < UINT_MAX )
>      {
> -        printk("max state: C%u\n", max_cstate);
> -        if ( max_csubstate < UINT_MAX )
> +        printk("max state: C%u\n", max_cstate());
> +        if ( max_allowed_cstate <= max_usable_cstate &&
> +             max_csubstate < UINT_MAX )
>              printk("max sub-state: %u\n", max_csubstate);
>          else
>              printk("max sub-state: unlimited\n");
> @@ -690,18 +694,18 @@ static void cf_check acpi_processor_idle
>      u32 exp = 0, pred = 0;
>      u32 irq_traced[4] = { 0 };
>  
> -    if ( max_cstate > 0 && power &&
> +    if ( max_cstate() > 0 && power &&
>           (next_state = cpuidle_current_governor->select(power)) > 0 )
>      {
>          unsigned int max_state = sched_has_urgent_vcpu() ? ACPI_STATE_C1
> -                                                         : max_cstate;
> +                                                         : max_cstate();
>  
>          do {
>              cx = &power->states[next_state];
>          } while ( (cx->type > max_state ||
>                     cx->entry_method == ACPI_CSTATE_EM_NONE ||
>                     (cx->entry_method == ACPI_CSTATE_EM_FFH &&
> -                    cx->type == max_cstate &&
> +                    cx->type == max_allowed_cstate &&

I'm afraid I'm missing why this uses max_allowed_cstate instead of
max_state.

>                      (cx->address & MWAIT_SUBSTATE_MASK) > max_csubstate)) &&
>                    --next_state );
>          if ( next_state )
> @@ -1448,7 +1452,7 @@ static void amd_cpuidle_init(struct acpi
>  
>      for ( i = 0; i < nr; ++i )
>      {
> -        if ( cx[i].type > max_cstate )
> +        if ( cx[i].type > max_cstate() )
>              break;
>          power->states[i + 1] = cx[i];
>          power->states[i + 1].idx = i + 1;
> @@ -1611,21 +1615,22 @@ int pmstat_reset_cx_stat(unsigned int cp
>  
>  void cpuidle_disable_deep_cstate(void)
>  {
> -    if ( max_cstate > ACPI_STATE_C1 )
> +    if ( max_usable_cstate > ACPI_STATE_C1 )
>      {
>          if ( local_apic_timer_c2_ok )
> -            max_cstate = ACPI_STATE_C2;
> +            max_usable_cstate = ACPI_STATE_C2;
>          else
> -            max_cstate = ACPI_STATE_C1;
> +            max_usable_cstate = ACPI_STATE_C1;
>      }
>  
>      hpet_disable_legacy_broadcast();
>  }
>  
> -bool cpuidle_using_deep_cstate(void)
> +bool cpuidle_usable_deep_cstate(void)
>  {
> -    return xen_cpuidle && max_cstate > (local_apic_timer_c2_ok ? ACPI_STATE_C2
> -                                                               : ACPI_STATE_C1);
> +    return xen_cpuidle &&
> +           max_usable_cstate > (local_apic_timer_c2_ok ? ACPI_STATE_C2
> +                                                       : ACPI_STATE_C1);
>  }
>  
>  static int cf_check cpu_callback(
> --- a/xen/arch/x86/cpu/intel.c
> +++ b/xen/arch/x86/cpu/intel.c
> @@ -384,12 +384,12 @@ static void probe_c3_errata(const struct
>      };
>  
>      /* Serialized by the AP bringup code. */
> -    if ( max_cstate > 1 && (c->apicid & (c->x86_num_siblings - 1)) &&
> +    if ( max_usable_cstate > 1 && (c->apicid & (c->x86_num_siblings - 1)) &&
>           x86_match_cpu(models) )
>      {
>          printk(XENLOG_WARNING
>  	       "Disabling C-states C3 and C6 due to CPU errata\n");
> -        max_cstate = 1;
> +        max_usable_cstate = 1;
>      }
>  }
>  
> --- a/xen/arch/x86/cpu/mwait-idle.c
> +++ b/xen/arch/x86/cpu/mwait-idle.c
> @@ -1045,15 +1045,16 @@ static void cf_check mwait_idle(void)
>  	u64 before, after;
>  	u32 exp = 0, pred = 0, irq_traced[4] = { 0 };
>  
> -	if (max_cstate > 0 && power &&
> +	if (max_cstate() > 0 && power &&
>  	    (next_state = cpuidle_current_governor->select(power)) > 0) {
>  		unsigned int max_state = sched_has_urgent_vcpu() ? ACPI_STATE_C1
> -								 : max_cstate;
> +								 : max_cstate();
>  
>  		do {
>  			cx = &power->states[next_state];
> -		} while ((cx->type > max_state || (cx->type == max_cstate &&
> -			  MWAIT_HINT2SUBSTATE(cx->address) > max_csubstate)) &&
> +		} while ((cx->type > max_state ||
> +                          (cx->type == max_allowed_cstate &&

Indentation is weird for the above line IMO, you should use hard 3
tabs plus spaces afterwards, like the surrounding indentation?

> +			   MWAIT_HINT2SUBSTATE(cx->address) > max_csubstate)) &&
>  			 --next_state);
>  		if (!next_state)
>  			cx = NULL;

Seeing max_cstate() is used in multiple places here, you might want to
introduce a local max_cstate variable?

> @@ -1458,7 +1459,7 @@ static void __init sklh_idle_state_table
>  	u64 msr;
>  
>  	/* if PC10 disabled via cmdline max_cstate=7 or shallower */
> -	if (max_cstate <= 7)
> +	if (max_cstate() <= 7)
>  		return;
>  
>  	/* if PC10 not present in CPUID.MWAIT.EDX */
> @@ -1623,7 +1624,7 @@ static int __init mwait_idle_probe(void)
>  	    !mwait_substates)
>  		return -ENODEV;
>  
> -	if (!max_cstate || !opt_mwait_idle) {
> +	if (!max_cstate() || !opt_mwait_idle) {
>  		pr_debug(PREFIX "disabled\n");
>  		return -EPERM;
>  	}
> @@ -1714,8 +1715,8 @@ static int cf_check mwait_idle_cpu_init(
>  		hint = flg2MWAIT(cpuidle_state_table[cstate].flags);
>  		state = MWAIT_HINT2CSTATE(hint) + 1;
>  
> -		if (state > max_cstate) {
> -			printk(PREFIX "max C-state %u reached\n", max_cstate);
> +		if (state > max_cstate()) {
> +			printk(PREFIX "max C-state %u reached\n", max_cstate());
>  			break;
>  		}
>  
> --- a/xen/arch/x86/include/asm/time.h
> +++ b/xen/arch/x86/include/asm/time.h
> @@ -31,7 +31,6 @@ int cpu_frequency_change(u64 freq);
>  
>  void cf_check pit_broadcast_enter(void);
>  void cf_check pit_broadcast_exit(void);
> -int pit_broadcast_is_available(void);
>  
>  uint64_t cf_check acpi_pm_tick_to_ns(uint64_t ticks);
>  
> --- a/xen/arch/x86/time.c
> +++ b/xen/arch/x86/time.c
> @@ -517,7 +517,7 @@ static int64_t __init cf_check init_hpet
>      bool disable_hpet = false;
>  
>      if ( hpet_address && strcmp(opt_clocksource, pts->id) &&
> -         cpuidle_using_deep_cstate() )
> +         cpuidle_usable_deep_cstate() )
>      {
>          if ( pci_conf_read16(PCI_SBDF(0, 0, 0x1f, 0),
>                               PCI_VENDOR_ID) == PCI_VENDOR_ID_INTEL )
> @@ -2655,7 +2655,7 @@ static int _disable_pit_irq(bool init)
>       * XXX dom0 may rely on RTC interrupt delivery, so only enable
>       * hpet_broadcast if FSB mode available or if force_hpet_broadcast.
>       */
> -    if ( cpuidle_using_deep_cstate() && !boot_cpu_has(X86_FEATURE_XEN_ARAT) )
> +    if ( cpuidle_usable_deep_cstate() && !boot_cpu_has(X86_FEATURE_XEN_ARAT) )
>      {
>          init ? hpet_broadcast_init() : hpet_broadcast_resume();
>          if ( !hpet_broadcast_is_available() )
> @@ -2707,11 +2707,6 @@ void cf_check pit_broadcast_exit(void)
>          reprogram_timer(this_cpu(timer_deadline));
>  }
>  
> -int pit_broadcast_is_available(void)
> -{
> -    return cpuidle_using_deep_cstate();
> -}
> -
>  void send_timer_event(struct vcpu *v)
>  {
>      send_guest_vcpu_virq(v, VIRQ_TIMER);
> @@ -2999,7 +2994,7 @@ static void cf_check dump_softtsc(unsign
>      else if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC ) )
>      {
>          printk("TSC has constant rate, ");
> -        if ( max_cstate <= ACPI_STATE_C2 && tsc_max_warp == 0 )
> +        if ( max_usable_cstate <= ACPI_STATE_C2 && tsc_max_warp == 0 )
>              printk("no deep Cstates, passed warp test, deemed reliable, ");
>          else
>              printk("deep Cstates possible, so not reliable, ");
> --- a/xen/include/xen/acpi.h
> +++ b/xen/include/xen/acpi.h
> @@ -142,30 +142,33 @@ int acpi_gsi_to_irq (u32 gsi, unsigned i
>  
>  #ifdef	CONFIG_ACPI_CSTATE
>  /*
> - * max_cstate sets the highest legal C-state.
> - * max_cstate = 0: C0 okay, but not C1
> - * max_cstate = 1: C1 okay, but not C2
> - * max_cstate = 2: C2 okay, but not C3 etc.
> -
> - * max_csubstate sets the highest legal C-state sub-state. Only applies to the
> - * highest legal C-state.
> - * max_cstate = 1, max_csubstate = 0 ==> C0, C1 okay, but not C1E
> - * max_cstate = 1, max_csubstate = 1 ==> C0, C1 and C1E okay, but not C2
> - * max_cstate = 2, max_csubstate = 0 ==> C0, C1, C1E, C2 okay, but not C3
> - * max_cstate = 2, max_csubstate = 1 ==> C0, C1, C1E, C2 okay, but not C3
> + * max_{allowed,usable}_cstate sets the highest allowed / usable C-state, where
> + * "allowed" is command line / sysctl based.

Hm, this is a bit misleading, because max_usable_cstate is also
command line based (plus system errata).  What about:

"max_{allowed,usable}_cstate sets the highest allowed / usable C-state.
max_usable_cstate can only be set from the command line, while
max_allowed_cstate can be set from both command line and systcl."

> + * max_*_cstate = 0: C0 okay, but not C1
> + * max_*_cstate = 1: C1 okay, but not C2
> + * max_*_cstate = 2: C2 okay, but not C3 etc.
> + *
> + * max_csubstate sets the highest allowed C-state sub-state. Only applies to
> + * the highest allowed C-state.
> + * max_allowed_cstate = 1, max_csubstate = 0 ==> C0, C1 okay, but not C1E
> + * max_allowed_cstate = 1, max_csubstate = 1 ==> C0, C1 and C1E okay, but not C2
> + * max_allowed_cstate = 2, max_csubstate = 0 ==> C0, C1, C1E, C2 okay, but not C3
> + * max_allowed_cstate = 2, max_csubstate = 1 ==> C0, C1, C1E, C2 okay, but not C3
>   */
>  
> -extern unsigned int max_cstate;
> +extern unsigned int max_usable_cstate;
> +extern unsigned int max_allowed_cstate;
>  extern unsigned int max_csubstate;
>  
> +#define max_cstate() min(max_usable_cstate, max_allowed_cstate)

I would be tempted to drop the ending parenthesis so that you don't
need to adjust callers, but that's likely misleading, as then it would
need to be uppercase MAX_CSTATE.

> +
>  static inline unsigned int acpi_get_cstate_limit(void)
>  {
> -	return max_cstate;
> +	return max_allowed_cstate;
>  }
>  static inline void acpi_set_cstate_limit(unsigned int new_limit)
>  {
> -	max_cstate = new_limit;
> -	return;
> +	max_allowed_cstate = new_limit;

Do we want to check the new limit doesn't exceed max_usable_cstate and
return -ERANGE or similar on failure?

After this change it's a bit weird to silently ignore invalid values
IMO.

>  }
>  
>  static inline unsigned int acpi_get_csubstate_limit(void)
> --- a/xen/include/xen/cpuidle.h
> +++ b/xen/include/xen/cpuidle.h
> @@ -89,7 +89,7 @@ struct cpuidle_governor
>  extern int8_t xen_cpuidle;
>  extern struct cpuidle_governor *cpuidle_current_governor;
>  
> -bool cpuidle_using_deep_cstate(void);
> +bool cpuidle_usable_deep_cstate(void);
>  void cpuidle_disable_deep_cstate(void);
>  
>  #define CPUIDLE_DRIVER_STATE_START  1


  parent reply	other threads:[~2026-04-20 16:14 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-08 11:34 [PATCH] x86/cpuidle: split the max_cstate variable Jan Beulich
2026-04-08 21:11 ` Marek Marczykowski
2026-04-09  5:54   ` Jan Beulich
2026-04-20 16:14 ` Roger Pau Monné [this message]
2026-04-21  7:34   ` Jan Beulich
2026-04-24  9:33     ` Roger Pau Monné

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=aeZQ3FcNl_EsPTdE@macbook.local \
    --to=roger.pau@citrix.com \
    --cc=andrew.cooper3@citrix.com \
    --cc=jbeulich@suse.com \
    --cc=marmarek@invisiblethingslab.com \
    --cc=teddy.astie@vates.tech \
    --cc=xen-devel@lists.xenproject.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.