* [PATCH] cpuidle: Allow menu governor to enter deeper sleep states after some time
[not found] <a181bf42-9462-476c-6dcd-39fc7151957f@tu-dresden.de>
@ 2017-07-27 12:50 ` Thomas Ilsche
2017-10-19 7:46 ` Len Brown
2017-10-20 0:17 ` Doug Smythies
1 sibling, 1 reply; 25+ messages in thread
From: Thomas Ilsche @ 2017-07-27 12:50 UTC (permalink / raw)
To: Rafael J. Wysocki, Alex Shi, Ingo Molnar, Rik van Riel,
Daniel Lezcano, Nicholas Piggin, linux-pm
Cc: Marcus Hähnel, Daniel Hackenberg, Robert Schöne,
mario.bielert@tu-dresden.de
On several contemporary Intel systems, we have observed that the idle
power consumption is sometimes significantly too high, e.g. 105 W instead
of 74 W for several seconds.
We tracked this issue down to patterns that confuse the heuristic of the
menu idle governor. If the governor observes several consecutive short
sleeps, it expects another short sleep even though no timer is imminent,
sending the CPU into a shallow sleep state. On a dyntick-idle
kernel, there are no means for the core to enter a more efficient sleep
state if the prediction was wrong. Ironically this is particularly
problematic if some cores of the system have very infrequent activity, i.e.
many cores and optimized configuration for low idle power.
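For illustration, a minimal userspace trigger along these lines exhibits
the pattern (the counts and durations here are only illustrative; the
precise sequences are described in the paper [1]):

#include <unistd.h>

/*
 * Sketch of a trigger: a burst of short sleeps trains the governor to
 * predict another short sleep; the long sleep that follows is then
 * mispredicted and the CPU lingers in a shallow state.
 */
int main(void)
{
	for (;;) {
		int i;

		for (i = 0; i < 8; i++)
			usleep(50);	/* short sleeps fill the interval buffer */
		sleep(2);		/* long idle, predicted to be short */
	}
}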
The effect, cause, triggers, mitigation technique and verification thereof
are described in detail in a paper that is pending publication [1].
Fixing the heuristic to make a better prediction does not seem to be
generally feasible. The following patch addresses this issue by setting a
timer such that when an expected immediate wake-up fails to appear, the
CPU is woken up to go into a deeper sleep state. If the heuristic was
right, the timer is simply canceled.
Please consider the patch a draft for discussion. We need to address:
* Avoid programming the fallback timer when the deepest sleep state is
already chosen.
* Determine good default values for the introduced configurables. This
is difficult due to the large variety of system configurations affected
by the menu governor. Nevertheless we believe this can be done such
that many systems benefit without significant degradation in any case.
* Document (or remove) the sysfs entries.
But first, I would like to invite comments on whether this is going in the
right direction, or whether this should be addressed in a different way.
[1] https://tu-dresden.de/zih/forschung/ressourcen/dateien/projekte/haec/powernightmares.pdf
Signed-off-by: Thomas Ilsche <thomas.ilsche@tu-dresden.de>
Signed-off-by: Marcus Hähnel <mhaehnel@os.inf.tu-dresden.de>
---
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 61b64c2b2cb8..45bbeb362809 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -22,6 +22,7 @@
#include <linux/sched/stat.h>
#include <linux/math64.h>
#include <linux/cpu.h>
+#include <linux/smp.h>
/*
* Please note when changing the tuning values:
@@ -130,6 +131,10 @@ struct menu_device {
unsigned int correction_factor[BUCKETS];
unsigned int intervals[INTERVALS];
int interval_ptr;
+
+ struct hrtimer fallback_timer;
+ int have_timer;
+ unsigned int disregard_past;
};
@@ -190,6 +195,85 @@ static inline int performance_multiplier(unsigned long nr_iowaiters, unsigned lo
static DEFINE_PER_CPU(struct menu_device, menu_devices);
+static unsigned int fallback_timer_disregard_past = 1;
+static unsigned int diff_threshold_bits = 8;
+static unsigned int fallback_timer_enabled;
+static unsigned int fallback_timer_interval_us = 10000;
+
+#define MENU_ATTR_RW(name, var, range_min, range_max, wfun) \
+ static ssize_t show_##name(struct device *dev, \
+ struct device_attribute *attr, char *buf) \
+ { \
+ return snprintf(buf, 12, "%i\n", var); \
+ } \
+ static ssize_t store_##name(struct device *dev, \
+ struct device_attribute *attr, \
+ const char *buf, size_t count) \
+ { \
+ unsigned int tmp; \
+ int ret = kstrtouint(buf, 10, &tmp); \
+ if (ret != 1) \
+ return -EINVAL; \
+ if (tmp > range_max || tmp < range_min) { \
+ pr_warn("value outside of valid range [%u, %u]\n", \
+ range_min, range_max); \
+ return -EINVAL; \
+ } \
+ var = tmp; \
+ wfun \
+ return count; \
+ } \
+ static DEVICE_ATTR(fallback_timer_##name, 0644, \
+ show_##name, store_##name)
+
+MENU_ATTR_RW(threshold_bits, diff_threshold_bits, 0, 31, {});
+
+MENU_ATTR_RW(enable, fallback_timer_enabled, 0, 1, {
+ int i;
+
+ for_each_possible_cpu(i) {
+ struct menu_device *data = per_cpu_ptr(&menu_devices, i);
+
+ if (!fallback_timer_enabled) {
+ data->have_timer = 0;
+ hrtimer_cancel(&(data->fallback_timer));
+ }
+ } });
+
+MENU_ATTR_RW(interval_us, fallback_timer_interval_us, 1, 1000000, {});
+
+MENU_ATTR_RW(disregard_past, fallback_timer_disregard_past, 0, 1, {
+ int i;
+
+ for_each_possible_cpu(i) {
+ struct menu_device *data = per_cpu_ptr(&menu_devices, i);
+
+ data->disregard_past = 0;
+ } });
+
+static struct attribute *menu_attrs[] = {
+ &dev_attr_fallback_timer_threshold_bits.attr,
+ &dev_attr_fallback_timer_enable.attr,
+ &dev_attr_fallback_timer_interval_us.attr,
+ &dev_attr_fallback_timer_disregard_past.attr,
+ NULL
+};
+
+static struct attribute_group menu_attr_group = {
+ .attrs = menu_attrs,
+ .name = "cpuidle_menu",
+};
+
+int menu_add_interface(struct device *dev)
+{
+ return sysfs_create_group(&dev->kobj, &menu_attr_group);
+}
+
+void menu_remove_interface(struct device *dev)
+{
+ sysfs_remove_group(&dev->kobj, &menu_attr_group);
+}
+
static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev);
/*
@@ -275,6 +359,14 @@ static unsigned int get_typical_interval(struct menu_device *data)
goto again;
}
+static enum hrtimer_restart fallback_timer_fun(struct hrtimer *tmr)
+{
+ struct menu_device *mdata = this_cpu_ptr(&menu_devices);
+
+ mdata->disregard_past = 1;
+ return HRTIMER_NORESTART;
+}
+
/**
* menu_select - selects the next idle state to enter
* @drv: cpuidle driver containing state data
@@ -293,6 +385,11 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
unsigned long nr_iowaiters, cpu_load;
int resume_latency = dev_pm_qos_raw_read_value(device);
+ if (fallback_timer_enabled && data->have_timer) {
+ data->have_timer = 0;
+ hrtimer_cancel(&(data->fallback_timer));
+ }
+
if (data->needs_update) {
menu_update(drv, dev);
data->needs_update = 0;
@@ -322,7 +419,32 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
RESOLUTION * DECAY);
expected_interval = get_typical_interval(data);
- expected_interval = min(expected_interval, data->next_timer_us);
+
+ if (fallback_timer_enabled && fallback_timer_disregard_past
+ && data->disregard_past) {
+ expected_interval = data->next_timer_us;
+ // Only disregard the past once! Then try again
+ data->disregard_past = 0;
+ } else {
+ if (fallback_timer_enabled
+ && expected_interval < (data->next_timer_us >> diff_threshold_bits)
+ && data->next_timer_us > fallback_timer_interval_us * 2) {
+ /*
+ * Program the fallback timer if the gap between the
+ * expected interval by heuristic and the next regular
+ * timer are too far apart.
+ * However, only do this when we didn't just wakup from
+ * a timer and are told to disregard the heuristic
+ */
+ ktime_t interval =
+ ktime_set(0, fallback_timer_interval_us * 1000);
+
+ hrtimer_start(&(data->fallback_timer), interval,
+ HRTIMER_MODE_REL_PINNED);
+ data->have_timer = 1;
+ }
+ expected_interval = min(expected_interval, data->next_timer_us);
+ }
if (CPUIDLE_DRIVER_STATE_START > 0) {
struct cpuidle_state *s = &drv->states[CPUIDLE_DRIVER_STATE_START];
@@ -398,6 +520,11 @@ static void menu_reflect(struct cpuidle_device *dev, int index)
{
struct menu_device *data = this_cpu_ptr(&menu_devices);
+ if (fallback_timer_enabled && data->have_timer) {
+ data->have_timer = 0;
+ hrtimer_cancel(&data->fallback_timer);
+ }
+
data->last_state_idx = index;
data->needs_update = 1;
}
@@ -486,6 +613,10 @@ static int menu_enable_device(struct cpuidle_driver *drv,
memset(data, 0, sizeof(struct menu_device));
+ hrtimer_init(&(data->fallback_timer),
+ CLOCK_REALTIME, HRTIMER_MODE_REL_PINNED);
+ data->fallback_timer.function = fallback_timer_fun;
+
/*
* if the correction factor is 0 (eg first time init or cpu hotplug
* etc), we actually want to start out with a unity factor.
@@ -509,6 +640,10 @@ static struct cpuidle_governor menu_governor = {
*/
static int __init init_menu(void)
{
+ int ret = menu_add_interface(cpu_subsys.dev_root);
+
+ if (ret)
+ return ret;
return cpuidle_register_governor(&menu_governor);
}
* Re: [PATCH] cpuidle: Allow menu governor to enter deeper sleep states after some time
2017-07-27 12:50 ` [PATCH] cpuidle: Allow menu governor to enter deeper sleep states after some time Thomas Ilsche
@ 2017-10-19 7:46 ` Len Brown
2017-10-20 16:31 ` Thomas Ilsche
2017-10-21 14:27 ` Doug Smythies
0 siblings, 2 replies; 25+ messages in thread
From: Len Brown @ 2017-10-19 7:46 UTC (permalink / raw)
To: Thomas Ilsche
Cc: Rafael J. Wysocki, Alex Shi, Ingo Molnar, Rik van Riel,
Daniel Lezcano, Nicholas Piggin, Linux PM list,
Marcus Hähnel, Daniel Hackenberg, Robert Schöne,
mario.bielert@tu-dresden.de
Thomas,
Thanks for doing this investigation and sending this note. It was
very helpful that you boiled down the stimulus to a simple example
sequence. I created an "idle-virus" using that sequence, and I can
reproduce your results on my hardware.
Indeed, it is actually possible to observe this issue by comparing a
big idle system in single user mode vs multi-user mode. Simply run
turbostat in both scenarios and watch the C-states requested and
Pkg%pc6 achieved, and the RAPL DC power consumed. The "background OS
noise" in multi-user mode is enough to trigger the issue you describe.
C1E is requested and Pkg%pc6 sometimes goes to zero, even when Busy% is
0.
To prove that this is the result of OS C-state demotion, I have
implemented option "d)" in your paper. I added a module param,
currently called "cpuidle.powersave", that causes cpuidle to bypass
the governor altogether and select the deepest state that is allowed
by PM_QOS. (maybe I should call it cpuidle.bypass, or
cpuidle.deepest...) When I set this parameter, multi-user C-state
residency and power equal that seen in single-user mode, with no
spikes.
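In rough outline, it amounts to something like this (sketch only; the
function name is made up, and it is similar in spirit to the existing
find_deepest_state() in drivers/cpuidle/cpuidle.c):

/*
 * Sketch: pick the deepest enabled state that honors the PM_QOS
 * latency limit, bypassing the governor heuristics entirely.
 */
static int cpuidle_select_deepest(struct cpuidle_driver *drv,
				  struct cpuidle_device *dev)
{
	int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);
	int i, ret = 0;

	for (i = 0; i < drv->state_count; i++) {
		struct cpuidle_state *s = &drv->states[i];

		if (s->disabled || dev->states_usage[i].disable)
			continue;
		if (s->exit_latency > latency_req)
			continue;
		ret = i;	/* deepest state so far that satisfies QoS */
	}
	return ret;
}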
Note that the paper is not correct when it states that the OS cannot
enforce latency constraints when relying on HW C-state selection. The
OS PM_QOS constraint is still effective in this path because it can
still limit the OS request, and the HW will never "promote" to a
deeper state than that requested by the OS -- the hardware can only
demote and then un-demote -- but never deeper than the OS request.
Indeed, on systems where PM_QOS is used properly (and it is working
correctly, a different story), and where the hardware is smart about
demoting C-states, one can make a case that the menu governor is not
needed at all.
Regarding your proposed patch. I agree with the concept, but not the
implementation. Rather than create a new timer, we should simply
decline to disable the OS tick when we predict short idle on the
tickless-idle-kernel. That existing timer is a sufficient "safety
net" to assure that the shallow C-state does not run too long.
Indeed, Aubrey Li has discovered that it is also a latency issue for
the OS to be disabling and then re-enabling the timer tick for short
idles, so this is a situation where if we do less work, we can help
both performance and power! :-)
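A rough sketch of that idea (the stop_tick out-parameter is purely
hypothetical; the current governor interface has no such mechanism, and
the threshold is illustrative):

/*
 * Hypothetical hook in menu_select(): when a short idle is predicted
 * but the next real timer is far away, keep the periodic tick armed
 * so it bounds the time spent in a shallow state.
 */
static void menu_decide_tick(unsigned int expected_interval_us,
			     unsigned int next_timer_us, bool *stop_tick)
{
	if (expected_interval_us < TICK_USEC && next_timer_us > TICK_USEC)
		*stop_tick = false;	/* the tick doubles as the fallback timer */
	else
		*stop_tick = true;
}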
Also, I think we should re-examine menu governor's heuristic that you
pointed out where it throws out some results -- maybe that isn't such
a great heuristic...
thanks,
-Len
ps. I strongly encourage you to universally use the words "shallow"
and "deep" to describe C-states, and never use the words of "high" and
"low". The former are never ambiguous. The later can get very
confusing because a higher latency C-state have lower power, and the
numbering of C-states has lower power C-states with higher numbers...
* RE: [PATCH] cpuidle: Allow menu governor to enter deeper sleep states after some time
[not found] <a181bf42-9462-476c-6dcd-39fc7151957f@tu-dresden.de>
2017-07-27 12:50 ` [PATCH] cpuidle: Allow menu governor to enter deeper sleep states after some time Thomas Ilsche
@ 2017-10-20 0:17 ` Doug Smythies
2017-10-20 17:13 ` Thomas Ilsche
2017-10-21 14:28 ` Doug Smythies
1 sibling, 2 replies; 25+ messages in thread
From: Doug Smythies @ 2017-10-20 0:17 UTC (permalink / raw)
To: 'Thomas Ilsche'
Cc: 'Marcus Hähnel', 'Daniel Hackenberg',
'Robert Schöne', mario.bielert,
'Rafael J. Wysocki', 'Alex Shi',
'Ingo Molnar', 'Rik van Riel',
'Daniel Lezcano', 'Nicholas Piggin', linux-pm,
'Len Brown'
Hi Thomas,
Note 1: I think it is just a coincidence that Len Brown also replied to
this old e-mail earlier today.
Note 2: Rafael referred me to this e-mail about a week ago, after I was
complaining about excessive energy consumption due to unused CPUs
not going idle when they should. My example use case was a 100%
load on one CPU.
Note 3: The patch does not work for me as it was sent. See notes in-line
further below.
On 2017.07.27 05:50 Thomas Ilsche wrote:
> On several contemporary Intel systems, we have observed that the idle
> power consumption is sometimes significantly too high, e.g. 105 W instead
> of 74 W for several seconds.
On my test system, I do not observe this magnitude of excessive idle energy.
While I do observe occurrences of excessive time spent in idle state 0, they
do not add up to significant energy waste.
> We tracked this issue down to patterns that confuse the heuristic of the
> menu idle governor. If the governor observes several consecutive short
> sleeps, it expects another short sleep even though no timer is imminent,
> sending the CPU into a shallow sleep state. On a dyntick-idle
> kernel, there are no means for the core to enter a more efficient sleep
> state if the prediction was wrong. Ironically this is particularly
> problematic if some cores of the system have very infrequent activity, i.e.
> many cores and optimized configuration for low idle power.
If I use your "powernightmare" method (or idle-virus as Len called it)
I get a drastic package power consumption difference between
fallback_timer_enabled being enabled or disabled. It seems mainly due
to changes in time spent in idle state 2. There was also a contribution
from idle state 0 on one CPU. This was a surprise to me as my thinking was
that wasted energy was dominated by extra time in idle state 0.
Example data (kernel 4.14-rc5 + this patch):
Average package power with fallback timer disabled: ~8.06 watts
Average package power with fallback timer enabled: ~3.95 watts
> The effect, cause, triggers, mitigation technique and verification thereof
> are described in detail in a paper that is pending publication [1].
The paper is very interesting. Thank you.
> Fixing the heuristic to make a better prediction does not seem to be
> generally feasible. The following patch addresses this issue by setting a
> timer such that when an expected immediate wake-up fails to appear, the
> CPU is woken up to go into a deeper sleep state. If the heuristic was
> right, the timer is simply canceled.
>
> Please consider the patch a draft for discussion. We need to address:
>
> * Avoid programming the fallback timer when the deepest sleep state is
> already chosen.
> * Determine good default values for the introduced configurables. This
> is difficult due to the large variety of system configurations affected
> by the menu governor. Nevertheless we believe this can be done such
> that many systems benefit without significant degradation in any case.
> * Document (or remove) the sysfs entries.
>
> But first, I would like to invite comments on whether this is going in the
> right direction, or whether this should be addressed in a different way.
The problem I am struggling with is the patch makes no difference
for my example use case of 100% load on one CPU, others idle.
The wasted power is entirely from idle state 0, and idle state 0
times remain about the same with or without the fallback timer.
If I merely disable idle state 0, things are great. However,
just disabling idle state 0 does not help when "powernightmares"
are present.
Example data (kernel 4.14-rc5 + this patch):
Average package power with fallback timer disabled: ~27.2 watts
Average package power with fallback timer enabled: ~28 watts
Average package power with state 0 disabled, fallback timer disabled: ~23.9 watts
Average package power with state 0 disabled, fallback timer enabled: ~23.9 watts
Question: Could this be because I made a mistake re-basing the patch to kernel 4.14-rc5?
Answer: Perhaps. I am unfamiliar with this area of the code.
>
> [1] https://tu-dresden.de/zih/forschung/ressourcen/dateien/projekte/haec/powernightmares.pdf
>
> Signed-off-by: Thomas Ilsche <thomas.ilsche@tu-dresden.de>
> Signed-off-by: Marcus Hähnel <mhaehnel@os.inf.tu-dresden.de>
> ---
>
> diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
> index 61b64c2b2cb8..45bbeb362809 100644
> --- a/drivers/cpuidle/governors/menu.c
> +++ b/drivers/cpuidle/governors/menu.c
> @@ -22,6 +22,7 @@
> #include <linux/sched/stat.h>
> #include <linux/math64.h>
> #include <linux/cpu.h>
> +#include <linux/smp.h>
>
> /*
> * Please note when changing the tuning values:
> @@ -130,6 +131,10 @@ struct menu_device {
> unsigned int correction_factor[BUCKETS];
> unsigned int intervals[INTERVALS];
> int interval_ptr;
> +
> + struct hrtimer fallback_timer;
> + int have_timer;
> + unsigned int disregard_past;
> };
>
>
> @@ -190,6 +195,85 @@ static inline int performance_multiplier(unsigned long nr_iowaiters, unsigned lo
>
> static DEFINE_PER_CPU(struct menu_device, menu_devices);
>
> +static unsigned int fallback_timer_disregard_past = 1;
> +static unsigned int diff_threshold_bits = 8;
> +static unsigned int fallback_timer_enabled;
Shouldn't fallback_timer_enabled be initialized?
On my system it defaults to 0, or disabled, and due to another problem,
see further down, this patch wasn't working.
> +static unsigned int fallback_timer_interval_us = 10000;
> +
> +#define MENU_ATTR_RW(name, var, range_min, range_max, wfun) \
> + static ssize_t show_##name(struct device *dev, \
> + struct device_attribute *attr, char *buf) \
> + { \
> + return snprintf(buf, 12, "%i\n", var); \
> + } \
> + static ssize_t store_##name(struct device *dev, \
> + struct device_attribute *attr, \
> + const char *buf, size_t count) \
> + { \
> + unsigned int tmp; \
> + int ret = kstrtouint(buf, 10, &tmp); \
> + if (ret != 1) \
Shouldn't this be:
+ if (ret != 0) \
(or maybe just " if (ret) \". I did (and therefore tested) the previous.
?
Anyway, it doesn't work unless I make this change.
> + return -EINVAL; \
> + if (tmp > range_max || tmp < range_min) { \
> + pr_warn("value outside of valid range [%u, %u]\n", \
> + range_min, range_max); \
> + return -EINVAL; \
> + } \
> + var = tmp; \
> + wfun \
> + return count; \
> + } \
> + static DEVICE_ATTR(fallback_timer_##name, 0644, \
> + show_##name, store_##name)
> +
> +MENU_ATTR_RW(threshold_bits, diff_threshold_bits, 0, 31, {});
> +
> +MENU_ATTR_RW(enable, fallback_timer_enabled, 0, 1, {
> + int i;
> +
> + for_each_possible_cpu(i) {
> + struct menu_device *data = per_cpu_ptr(&menu_devices, i);
> +
> + if (!fallback_timer_enabled) {
> + data->have_timer = 0;
> + hrtimer_cancel(&(data->fallback_timer));
> + }
> + } });
> +
> +MENU_ATTR_RW(interval_us, fallback_timer_interval_us, 1, 1000000, {});
> +
> +MENU_ATTR_RW(disregard_past, fallback_timer_disregard_past, 0, 1, {
> + int i;
> +
> + for_each_possible_cpu(i) {
> + struct menu_device *data = per_cpu_ptr(&menu_devices, i);
> +
> + data->disregard_past = 0;
> + } });
> +
> +static struct attribute *menu_attrs[] = {
> + &dev_attr_fallback_timer_threshold_bits.attr,
> + &dev_attr_fallback_timer_enable.attr,
> + &dev_attr_fallback_timer_interval_us.attr,
> + &dev_attr_fallback_timer_disregard_past.attr,
> + NULL
> +};
> +
> +static struct attribute_group menu_attr_group = {
> + .attrs = menu_attrs,
> + .name = "cpuidle_menu",
> + };
> +
> +int menu_add_interface(struct device *dev)
> +{
> + return sysfs_create_group(&dev->kobj, &menu_attr_group);
> +}
> +
> +void menu_remove_interface(struct device *dev)
> +{
> + sysfs_remove_group(&dev->kobj, &menu_attr_group);
> +}
> +
> static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev);
>
> /*
> @@ -275,6 +359,14 @@ static unsigned int get_typical_interval(struct menu_device *data)
> goto again;
> }
>
> +static enum hrtimer_restart fallback_timer_fun(struct hrtimer *tmr)
> +{
> + struct menu_device *mdata = this_cpu_ptr(&menu_devices);
> +
> + mdata->disregard_past = 1;
> + return HRTIMER_NORESTART;
> +}
> +
> /**
> * menu_select - selects the next idle state to enter
> * @drv: cpuidle driver containing state data
> @@ -293,6 +385,11 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
> unsigned long nr_iowaiters, cpu_load;
> int resume_latency = dev_pm_qos_raw_read_value(device);
>
> + if (fallback_timer_enabled && data->have_timer) {
> + data->have_timer = 0;
> + hrtimer_cancel(&(data->fallback_timer));
> + }
> +
> if (data->needs_update) {
> menu_update(drv, dev);
> data->needs_update = 0;
> @@ -322,7 +419,32 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
> RESOLUTION * DECAY);
>
> expected_interval = get_typical_interval(data);
> - expected_interval = min(expected_interval, data->next_timer_us);
> +
> + if (fallback_timer_enabled && fallback_timer_disregard_past
> + && data->disregard_past) {
> + expected_interval = data->next_timer_us;
> + // Only disregard the past once! Then try again
> + data->disregard_past = 0;
> + } else {
> + if (fallback_timer_enabled
> + && expected_interval < (data->next_timer_us >> diff_threshold_bits)
> + && data->next_timer_us > fallback_timer_interval_us * 2) {
> + /*
> + * Program the fallback timer if the gap between the
> + * expected interval by heuristic and the next regular
> + * timer are too far apart.
> + * However, only do this when we didn't just wakup from
> + * a timer and are told to disregard the heuristic
> + */
> + ktime_t interval =
> + ktime_set(0, fallback_timer_interval_us * 1000);
> +
> + hrtimer_start(&(data->fallback_timer), interval,
> + HRTIMER_MODE_REL_PINNED);
> + data->have_timer = 1;
> + }
> + expected_interval = min(expected_interval, data->next_timer_us);
> + }
>
> if (CPUIDLE_DRIVER_STATE_START > 0) {
> struct cpuidle_state *s = &drv->states[CPUIDLE_DRIVER_STATE_START];
> @@ -398,6 +520,11 @@ static void menu_reflect(struct cpuidle_device *dev, int index)
> {
> struct menu_device *data = this_cpu_ptr(&menu_devices);
>
> + if (fallback_timer_enabled && data->have_timer) {
> + data->have_timer = 0;
> + hrtimer_cancel(&data->fallback_timer);
> + }
> +
> data->last_state_idx = index;
> data->needs_update = 1;
> }
> @@ -486,6 +613,10 @@ static int menu_enable_device(struct cpuidle_driver *drv,
>
> memset(data, 0, sizeof(struct menu_device));
>
> + hrtimer_init(&(data->fallback_timer),
> + CLOCK_REALTIME, HRTIMER_MODE_REL_PINNED);
> + data->fallback_timer.function = fallback_timer_fun;
> +
> /*
> * if the correction factor is 0 (eg first time init or cpu hotplug
> * etc), we actually want to start out with a unity factor.
> @@ -509,6 +640,10 @@ static struct cpuidle_governor menu_governor = {
> */
> static int __init init_menu(void)
> {
> + int ret = menu_add_interface(cpu_subsys.dev_root);
> +
> + if (ret)
> + return ret;
> return cpuidle_register_governor(&menu_governor);
> }
* Re: [PATCH] cpuidle: Allow menu governor to enter deeper sleep states after some time
2017-10-19 7:46 ` Len Brown
@ 2017-10-20 16:31 ` Thomas Ilsche
2017-10-21 14:27 ` Doug Smythies
1 sibling, 0 replies; 25+ messages in thread
From: Thomas Ilsche @ 2017-10-20 16:31 UTC (permalink / raw)
To: Len Brown
Cc: Rafael J. Wysocki, Alex Shi, Ingo Molnar, Rik van Riel,
Daniel Lezcano, Nicholas Piggin, Linux PM list,
Marcus Hähnel, Daniel Hackenberg, Robert Schöne,
mario.bielert@tu-dresden.de
Len,
thank you for the detailed response. Answers below.
> To prove that this is the result of OS C-state demotion, I have
> implemented option "d)" in your paper. I added a module param,
> currently called "cpuidle.powersave", that causes cpuidle to bypass
> the governor altogether and select the deepest state that is allowed
> by PM_QOS. (maybe I should call it cpuidle.bypass, or
> cpuidle.deepest...) When i set this parameter, multi-user C-state
> residency and power equal that seen in single-user mode, with no
> spikes.
Can you please share the patch? I'd like to use it for some experiments.
> Note that the paper is not correct when it states that the OS cannot
> enforce latency constraints when relying on HW C-state selection. The
> OS PM_QOS constraint is still effective in this path because it can
> still limit the OS request, and the HW will never "promote" to a
> deeper state than that requested by the OS -- the hardware can only
> demote and then un-demote -- but never deeper than the OS request.
Agreed. Since there is only C1E HW promotion, which is AFAIK disabled
on Linux, in practice one would only use HW (un)demotion, with which
the OS can still enforce constraints.
> Indeed, on systems where PM_QOS is used properly (and it is working
> correctly, a different story), and where the hardware is smart about
> demoting C-states, one can make a case that the menu governor is not
> needed at all.
We haven't looked at auto-demotion in practice yet, but could do that.
Would you consider this a serious option for Linux, given that it means
handing considerable control to opaque hardware?
There also might be some concern about different architectures.
> Regarding your proposed patch. I agree with the concept, but not the
> implementation. Rather than create a new timer, we should simply
> decline to disable the OS tick when we predict short idle on the
> tickless-idle-kernel. That existing timer is a sufficient "safety
> net" to assure that the shallow C-state does not run too long.
> Indeed, Aubrey Li has discovered that it is also a latency issue for
> the OS to be disabling and then re-enabling the timer tick for short
> idles, so this is a situation where if we do less work, we can help
> both performance and power! :-)
That sounds like a viable alternative. I would like to try that, but I
am unsure how to implement it. If I read the code correctly, the tick
timer is disabled in the outer idle loop in do_idle, whereas the state
decision is made later in the code.
Deferring the tick_nohz_idle_enter would create some redundancy and I'm
not sure it is free of side effects. Also it might complicate the calls
from idle to governor.
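For reference, the 4.14 idle loop has roughly this shape (heavily
condensed from kernel/sched/idle.c), which shows the ordering problem:

static void do_idle(void)
{
	__current_set_polling();
	tick_nohz_idle_enter();		/* tick stopped here, in the outer loop */

	while (!need_resched()) {
		local_irq_disable();
		arch_cpu_idle_enter();
		cpuidle_idle_call();	/* the governor picks a state only here */
		arch_cpu_idle_exit();
	}

	tick_nohz_idle_exit();
	__current_clr_polling();
	/* scheduling follows */
}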
> Also, I think we should re-examine menu governor's heuristic that you
> pointed out where it throws out some results -- maybe that isn't such
> a great heuristic...
I agree, but
1) we need the mitigation anyway because the heuristic will never be
perfect.
2) I find it hard to optimize the heuristic with an incomplete
understanding of workload patterns and the latency optimization
target. I assume there are reasons for some of the odd aspects of
the heuristic, but I do not know them.
As a secondary quest, the outlier detection could, for instance, be
limited to below-median cases.
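For reference, the rejection step in question, condensed from 4.14
get_typical_interval():

	/*
	 * When the variance of the recorded intervals is too high, the
	 * largest remaining sample is dropped and the computation
	 * restarts, so samples are only ever trimmed from above.
	 */
	if ((divisor * 4) <= INTERVALS * 3)
		return UINT_MAX;

	thresh = max - 1;
	goto again;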
Best,
Thomas
--
Dipl. Inf. Thomas Ilsche
Computer Scientist
Highly Adaptive Energy-Efficient Computing
CRC 912 HAEC: http://tu-dresden.de/sfb912
Technische Universität Dresden
Center for Information Services and High Performance Computing (ZIH)
01062 Dresden, Germany
Phone: +49 351 463-42168
Fax: +49 351 463-37773
E-Mail: thomas.ilsche@tu-dresden.de
* Re: [PATCH] cpuidle: Allow menu governor to enter deeper sleep states after some time
2017-10-20 0:17 ` Doug Smythies
@ 2017-10-20 17:13 ` Thomas Ilsche
2017-10-21 14:28 ` Doug Smythies
1 sibling, 0 replies; 25+ messages in thread
From: Thomas Ilsche @ 2017-10-20 17:13 UTC (permalink / raw)
To: Doug Smythies
Cc: 'Marcus Hähnel', 'Daniel Hackenberg',
'Robert Schöne', mario.bielert,
'Rafael J. Wysocki', 'Alex Shi',
'Ingo Molnar', 'Rik van Riel',
'Daniel Lezcano', 'Nicholas Piggin', linux-pm,
'Len Brown'
Hi Doug,
thanks for reproducing; I'll come right to your issue.
>> On several contemporary Intel systems, we have observed that the idle
>> power consumption is sometimes significantly too high, e.g. 105 W instead
>> of 74 W for several seconds.
>
> On my test system, I do not observe this magnitude of excessive idle energy.
> While I do observe occurrences of excessive time spent in idle state 0, they
> do not add up to significant energy waste.
>
> [...]
>
> The problem I am struggling with is the patch makes no difference
> for my example use case of 100% load on one CPU, others idle.
> The wasted power is entirely from idle state 0, and idle state 0
> times remain about the same with or without the fallback timer.
> If I merely disable idle state 0, things are great. However,
> just disabling idle state 0 does not help when "powernightmares"
> are present.
>
> Example data (kernel 4.14-rc5 + this patch):
>
> Average package power with fallback timer disabled: ~27.2 watts
> Average package power with fallback timer enabled: ~28 watts
> Average package power with state 0 disabled, fallback timer disabled: ~23.9 watts
> Average package power with state 0 disabled, fallback timer enabled: ~23.9 watts
That's strange; we haven't seen issues with state0 residency. The
menu idle governor explicitly tries to avoid polling, and on our
system C1 has a latency of 1 us. I'm not even sure whether poll_idle
can run indefinitely the way the mwait-based states can.
Unfortunately, I don't have a complete picture of your issue.
1) What processor do you use and what are the following
/sys/devices/system/cpu/cpu*/cpuidle/state*/{name,latency,residency}
2) To get a better picture you can use perf via the following
events in addition to power and residency:
sched:sched_switch -> which tasks are scheduled when.
power:cpu_idle (state) -> which state was used when going to sleep
Record a timeline with these events using perf record -a -e <event>.
You could also use -C to select only the core that should be
idle but uses C0.
You can analyze it via perf script (or perf timechart, if you have a
scalable and fast SVG viewer, which I'm not sure exists).
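For example (output file name and duration are placeholders):

$ sudo perf record -a -e sched:sched_switch -e power:cpu_idle -o idle.data -- sleep 120
$ sudo perf script -i idle.data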
If you see state0 selected before a long idle phase, it's what we
call a Powernightmare. The intervals of the 8 previous idle periods
show what the heuristic based its prediction on.
Some things to consider to avoid other influences on power
- uniformity of the busy core's workload
- fixed userspace p-state
- pinned threads
If you can share a reproducible use-case I can also try with
our tool chain (recording toolchain is FOSS, visualizer is not)
>> @@ -190,6 +195,85 @@ static inline int performance_multiplier(unsigned long nr_iowaiters, unsigned lo
>>
>> static DEFINE_PER_CPU(struct menu_device, menu_devices);
>>
>> +static unsigned int fallback_timer_disregard_past = 1;
>> +static unsigned int diff_threshold_bits = 8;
>> +static unsigned int fallback_timer_enabled;
>
> Shouldn't fallback_timer_enabled be initialized?
> On my system it defaults to 0, or disabled, and due to another problem,
> see further down, this patch wasn't working.
It was meant to be 0 for now, which is what a static variable is
initialized to anyway, but I prefer explicit initialization.
>> +static unsigned int fallback_timer_interval_us = 10000;
>> +
>> +#define MENU_ATTR_RW(name, var, range_min, range_max, wfun) \
>> + static ssize_t show_##name(struct device *dev, \
>> + struct device_attribute *attr, char *buf) \
>> + { \
>> + return snprintf(buf, 12, "%i\n", var); \
>> + } \
>> + static ssize_t store_##name(struct device *dev, \
>> + struct device_attribute *attr, \
>> + const char *buf, size_t count) \
>> + { \
>> + unsigned int tmp; \
>> + int ret = kstrtouint(buf, 10, &tmp); \
>> + if (ret != 1) \
>
> Shouldn't this be:
>
> + if (ret != 0) \
>
> (or maybe just " if (ret) \". I did (and therefore tested) the previous.
>
> ?
> Anyway, it doesn't work unless I make this change.
Yes. An unfortunate oversight during the last style-checker changes.
For sscanf the return code had to be 1. Sorry for the trouble.
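For clarity, the corrected check (as you tested it) would read:

	int ret = kstrtouint(buf, 10, &tmp);

	/*
	 * kstrtouint() returns 0 on success and a negative errno on
	 * failure, unlike sscanf(), which returns the number of items
	 * converted.
	 */
	if (ret != 0)
		return -EINVAL;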
Best,
Thomas
--
Dipl. Inf. Thomas Ilsche
Computer Scientist
Highly Adaptive Energy-Efficient Computing
CRC 912 HAEC: http://tu-dresden.de/sfb912
Technische Universität Dresden
Center for Information Services and High Performance Computing (ZIH)
01062 Dresden, Germany
Phone: +49 351 463-42168
Fax: +49 351 463-37773
E-Mail: thomas.ilsche@tu-dresden.de
* RE: [PATCH] cpuidle: Allow menu governor to enter deeper sleep states after some time
2017-10-19 7:46 ` Len Brown
2017-10-20 16:31 ` Thomas Ilsche
@ 2017-10-21 14:27 ` Doug Smythies
1 sibling, 0 replies; 25+ messages in thread
From: Doug Smythies @ 2017-10-21 14:27 UTC (permalink / raw)
To: 'Thomas Ilsche', 'Len Brown'
Cc: 'Rafael J. Wysocki', 'Alex Shi',
'Ingo Molnar', 'Rik van Riel',
'Daniel Lezcano', 'Nicholas Piggin',
'Linux PM list', 'Marcus Hähnel',
'Daniel Hackenberg', 'Robert Schöne',
mario.bielert
On 2017.10.20 09:32 Thomas Ilsche wrote:
>> To prove that this is the result of OS C-state demotion, I have
>> implemented option "d)" in your paper. I added a module param,
>> currently called "cpuidle.powersave", that causes cpuidle to bypass
>> the governor altogether and select the deepest state that is allowed
>> by PM_QOS. (maybe I should call it cpuidle.bypass, or
>> cpuidle.deepest...) When i set this parameter, multi-user C-state
>> residency and power equal that seen in single-user mode, with no
>> spikes.
>
> Can you please share the patch, I'd like to use it for some experiments.
Can you not achieve the same, or at least similar and good enough for
testing, conditions by disabling all the other idle states?
On my system, the max idle state is 4 (C6), and I just disable states
0-3 to force always state 4.
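For example, something along these lines (adjust the state numbers to
your system):

$ for i in 0 1 2 3; do echo 1 | sudo tee /sys/devices/system/cpu/cpu*/cpuidle/state$i/disable; done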
... Doug
* RE: [PATCH] cpuidle: Allow menu governor to enter deeper sleep states after some time
2017-10-20 0:17 ` Doug Smythies
2017-10-20 17:13 ` Thomas Ilsche
@ 2017-10-21 14:28 ` Doug Smythies
2017-11-07 23:04 ` Thomas Ilsche
` (3 more replies)
1 sibling, 4 replies; 25+ messages in thread
From: Doug Smythies @ 2017-10-21 14:28 UTC (permalink / raw)
To: 'Thomas Ilsche'
Cc: 'Marcus Hähnel', 'Daniel Hackenberg',
'Robert Schöne', mario.bielert,
'Rafael J. Wysocki', 'Alex Shi',
'Ingo Molnar', 'Rik van Riel',
'Daniel Lezcano', 'Nicholas Piggin', linux-pm,
'Len Brown'
Hi Thomas,
Thanks for your quick reply.
On 2017.10.20 Thomas Ilsche wrote:
> Unfortunately, I don't have a complete picture of your issue.
>
> 1) What processor do you use and what are the following
> /sys/devices/system/cpu/cpu*/cpuidle/state*/{name,latency,residency}
Processor: Intel(R) Core(TM) i7-2600K CPU @ 3.40GHz
4 cores, 8 CPUs. I do not disable hyperthreading, which
your paper mentions you do.
All CPU have the same numbers, so CPU 0 used:
$ cat /sys/devices/system/cpu/cpu0/cpuidle/state*/name
POLL
C1
C1E
C3
C6
$ cat /sys/devices/system/cpu/cpu0/cpuidle/state*/latency
0
2
10
80
104
$ cat /sys/devices/system/cpu/cpu0/cpuidle/state*/residency
0
2
20
211
345
> 2) To get a better picture you can use perf via the following
> events in addition to power and residency:
>
> sched:sched_switch -> which tasks are scheduled when.
> power:cpu_idle (state) -> which state was used when going to sleep
>
> Record a timeline with these events using perf record -a -e <event>.
> You could also use -C to select only the core that should be
> idle but uses C0.
> You can analyze it via perf script (or perf timechart, if you have a
> scalable and fast SVG viewer, which I'm not sure exists).
>
> If you see state0 selected before a long idle phase, it's what we
> call a Powernightmare. The intervals of the 8 previous idle periods
> show what the heuristic based its prediction on.
Very interesting. Running "perf record" with your events
has a drastic effect on the rate of occurrence of the issue.
Actually, it pretty much eliminates it. However, I was still
able to capture a few. (data further below.)
> Some things to consider to avoid other influences on power
> - uniformity of the busy core's workload
> - fixed userspace p-state
> - pinned threads
Agreed.
> If you can share a reproducible use-case I can also try with
> our tool chain (recording toolchain is FOSS, visualizer is not)
My test computer is a server, with no GUI. To make "idle" even
more "idle", I do this:
$ cat set_cpu_turn_off_services
#! /bin/bash
# Turn off some services to try to get "idle" to be more "idle"
sudo systemctl stop mysql.service
sudo systemctl stop apache2.service
sudo systemctl stop nmbd.service
sudo systemctl stop smbd.service
sudo systemctl stop cron.service
sudo systemctl stop winbind.service
sudo systemctl stop apt-daily.timer
sudo systemctl stop libvirtd.service
My original "Reported-and-tested-by" work for 0c313cb
only looked at my "idle" test system, and still is good
enough.
However, now I observe significant excessive power sometimes
under load, the simplest being a single threaded CPU intensive
job.
I merely load one CPU 100% (i.e. loadave = 1), and watch the others,
using turbostat (at >= 2 minutes per sample) and/or monitoring
idle state times (at >= 2 minutes, but typically more,
between samples) and/or the intel_pstate_tracer.py tool.
Some example trace data (edited). Look for "<<<<<<<<".
The previous 8 idle entry/exits are included in each sample:
CPU 1 sample 1:
doug@s15:~/idle/perf$ cat t02_1_1.txt
kworker/u16:1 5273 [001] 137660.900599: sched:sched_switch: kworker/u16:1:5273 [120] t ==> swapper/1:0 [120]
swapper 0 [001] 137660.900601: power:cpu_idle: state=4 cpu_id=1
swapper 0 [001] 137660.900645: power:cpu_idle: state=4294967295 cpu_id=1 time: 44
swapper 0 [001] 137660.900649: sched:sched_switch: swapper/1:0 [120] R ==> kworker/u16:1:5273 [120]
kworker/u16:1 5273 [001] 137660.900655: sched:sched_switch: kworker/u16:1:5273 [120] t ==> swapper/1:0 [120]
swapper 0 [001] 137660.900657: sched:sched_switch: swapper/1:0 [120] R ==> kworker/u16:1:5273 [120]
kworker/u16:1 5273 [001] 137660.900659: sched:sched_switch: kworker/u16:1:5273 [120] t ==> swapper/1:0 [120]
swapper 0 [001] 137660.900661: power:cpu_idle: state=4 cpu_id=1
swapper 0 [001] 137660.900704: power:cpu_idle: state=4294967295 cpu_id=1 time: 43
swapper 0 [001] 137660.900708: sched:sched_switch: swapper/1:0 [120] R ==> kworker/u16:1:5273 [120]
kworker/u16:1 5273 [001] 137660.900713: sched:sched_switch: kworker/u16:1:5273 [120] t ==> swapper/1:0 [120]
swapper 0 [001] 137660.900717: sched:sched_switch: swapper/1:0 [120] R ==> kworker/u16:1:5273 [120]
kworker/u16:1 5273 [001] 137660.900718: sched:sched_switch: kworker/u16:1:5273 [120] t ==> swapper/1:0 [120]
swapper 0 [001] 137660.900720: power:cpu_idle: state=4 cpu_id=1
swapper 0 [001] 137660.900763: power:cpu_idle: state=4294967295 cpu_id=1 time: 43
swapper 0 [001] 137660.900766: sched:sched_switch: swapper/1:0 [120] R ==> kworker/u16:1:5273 [120]
kworker/u16:1 5273 [001] 137660.900772: sched:sched_switch: kworker/u16:1:5273 [120] t ==> swapper/1:0 [120]
swapper 0 [001] 137660.900775: power:cpu_idle: state=4 cpu_id=1
swapper 0 [001] 137660.900778: power:cpu_idle: state=4294967295 cpu_id=1 time: 3
swapper 0 [001] 137660.900779: sched:sched_switch: swapper/1:0 [120] R ==> kworker/u16:1:5273 [120]
kworker/u16:1 5273 [001] 137660.900781: sched:sched_switch: kworker/u16:1:5273 [120] t ==> swapper/1:0 [120]
swapper 0 [001] 137660.900783: power:cpu_idle: state=4 cpu_id=1
swapper 0 [001] 137660.900818: power:cpu_idle: state=4294967295 cpu_id=1 time: 35
swapper 0 [001] 137660.900821: sched:sched_switch: swapper/1:0 [120] R ==> kworker/u16:1:5273 [120]
kworker/u16:1 5273 [001] 137660.900827: sched:sched_switch: kworker/u16:1:5273 [120] t ==> swapper/1:0 [120]
swapper 0 [001] 137660.900830: sched:sched_switch: swapper/1:0 [120] R ==> kworker/u16:1:5273 [120]
kworker/u16:1 5273 [001] 137660.900831: sched:sched_switch: kworker/u16:1:5273 [120] t ==> swapper/1:0 [120]
swapper 0 [001] 137660.900833: power:cpu_idle: state=4 cpu_id=1
swapper 0 [001] 137660.900879: power:cpu_idle: state=4294967295 cpu_id=1 time: 46
swapper 0 [001] 137660.900882: sched:sched_switch: swapper/1:0 [120] R ==> kworker/u16:1:5273 [120]
kworker/u16:1 5273 [001] 137660.900891: sched:sched_switch: kworker/u16:1:5273 [120] t ==> swapper/1:0 [120]
swapper 0 [001] 137660.900894: power:cpu_idle: state=1 cpu_id=1
swapper 0 [001] 137661.679273: power:cpu_idle: state=4294967295 cpu_id=1 time: 778378
swapper 0 [001] 137661.679280: power:cpu_idle: state=1 cpu_id=1
swapper 0 [001] 137662.191265: power:cpu_idle: state=4294967295 cpu_id=1 time: 511984
swapper 0 [001] 137662.191274: power:cpu_idle: state=0 cpu_id=1 <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
swapper 0 [001] 137662.843297: power:cpu_idle: state=4294967295 cpu_id=1 time: 652023 uS
swapper 0 [001] 137662.843301: sched:sched_switch: swapper/1:0 [120] R ==> watchdog/1:14 [0]
watchdog/1 14 [001] 137662.843307: sched:sched_switch: watchdog/1:14 [0] S ==> swapper/1:0 [120]
CPU 3 sample 1:
doug@s15:~/idle/perf$ cat t02_3_1.txt
swapper 0 [003] 136958.987314: power:cpu_idle: state=4 cpu_id=3
swapper 0 [003] 136959.167299: power:cpu_idle: state=4294967295 cpu_id=3
swapper 0 [003] 136959.167316: power:cpu_idle: state=4 cpu_id=3
swapper 0 [003] 136960.479303: power:cpu_idle: state=4294967295 cpu_id=3 time: 1.312 Sec
swapper 0 [003] 136960.479311: power:cpu_idle: state=1 cpu_id=3
swapper 0 [003] 136960.479328: power:cpu_idle: state=4294967295 cpu_id=3 time: 17 uSec
swapper 0 [003] 136960.479340: power:cpu_idle: state=4 cpu_id=3
swapper 0 [003] 136962.495332: power:cpu_idle: state=4294967295 cpu_id=3 time: 2.016 Sec
swapper 0 [003] 136962.495340: power:cpu_idle: state=2 cpu_id=3
swapper 0 [003] 136962.495372: power:cpu_idle: state=4294967295 cpu_id=3 time: 32 uSec
swapper 0 [003] 136962.495385: power:cpu_idle: state=4 cpu_id=3
swapper 0 [003] 136962.655390: power:cpu_idle: state=4294967295 cpu_id=3 time: 0.16 Sec
swapper 0 [003] 136962.655399: power:cpu_idle: state=4 cpu_id=3
swapper 0 [003] 136962.987380: power:cpu_idle: state=4294967295 cpu_id=3 time: 0.332 Sec
swapper 0 [003] 136962.987393: sched:sched_switch: swapper/3:0 [120] R ==> watchdog/3:26 [0]
watchdog/3 26 [003] 136962.987397: sched:sched_switch: watchdog/3:26 [0] S ==> swapper/3:0 [120]
swapper 0 [003] 136962.987402: power:cpu_idle: state=4 cpu_id=3
swapper 0 [003] 136963.167387: power:cpu_idle: state=4294967295 cpu_id=3 time: 0.18 Seconds
swapper 0 [003] 136963.167404: power:cpu_idle: state=4 cpu_id=3
swapper 0 [003] 136963.679391: power:cpu_idle: state=4294967295 cpu_id=3 time: 0.512 Seconds
swapper 0 [003] 136963.679399: power:cpu_idle: state=0 cpu_id=3 <<<<<<<<<<<<<<<<<<<<<<<
swapper 0 [003] 136966.655478: power:cpu_idle: state=4294967295 cpu_id=3 time: 2.875 Seconds
CPU 3 sample 2:
doug@s15:~/idle/perf$ cat t02_3_2.txt
swapper 0 [003] 137563.001020: power:cpu_idle: state=4 cpu_id=3
swapper 0 [003] 137563.181005: power:cpu_idle: state=4294967295 cpu_id=3 time: 0.18 Sec
swapper 0 [003] 137563.181023: power:cpu_idle: state=4 cpu_id=3
swapper 0 [003] 137564.493009: power:cpu_idle: state=4294967295 cpu_id=3 time: 9 uSec
swapper 0 [003] 137564.493018: power:cpu_idle: state=1 cpu_id=3
swapper 0 [003] 137564.493036: power:cpu_idle: state=4294967295 cpu_id=3 time: 18 uSec
swapper 0 [003] 137564.493048: power:cpu_idle: state=4 cpu_id=3
swapper 0 [003] 137566.509039: power:cpu_idle: state=4294967295 cpu_id=3 time: 2.016 Sec
swapper 0 [003] 137566.509048: power:cpu_idle: state=2 cpu_id=3
swapper 0 [003] 137566.509082: power:cpu_idle: state=4294967295 cpu_id=3 time: 34 uSec
swapper 0 [003] 137566.509094: power:cpu_idle: state=4 cpu_id=3
swapper 0 [003] 137566.669096: power:cpu_idle: state=4294967295 cpu_id=3 time: 7 uSec
swapper 0 [003] 137566.669103: power:cpu_idle: state=4 cpu_id=3
swapper 0 [003] 137567.001089: power:cpu_idle: state=4294967295 cpu_id=3 time: 0.332 Sec
swapper 0 [003] 137567.001103: sched:sched_switch: swapper/3:0 [120] R ==> watchdog/3:26 [0]
watchdog/3 26 [003] 137567.001107: sched:sched_switch: watchdog/3:26 [0] S ==> swapper/3:0 [120]
swapper 0 [003] 137567.001112: power:cpu_idle: state=4 cpu_id=3
swapper 0 [003] 137567.181094: power:cpu_idle: state=4294967295 cpu_id=3 time: 0.18 Sec
swapper 0 [003] 137567.181098: power:cpu_idle: state=0 cpu_id=3 <<<<<<<<<<<<<<<<<<
swapper 0 [003] 137570.669187: power:cpu_idle: state=4294967295 cpu_id=3 time: 3.488 Sec
Idle state times (seconds) over 2 minutes, CPU 7 = 100% load, while "perf record" running:
CPU 0:
State 0: 1.436071
State 1: 0.000317
State 2: 0.000000
State 3: 0.000000
State 4: 118.554395
CPU 1:
State 0: 0.000053
State 1: 0.000007
State 2: 0.000000
State 3: 0.000000
State 4: 120.000261
CPU 2:
State 0: 0.000000
State 1: 0.000000
State 2: 0.032397
State 3: 0.030218
State 4: 119.902810
CPU 3:
State 0: 2.176088
State 1: 0.000114
State 2: 0.000887
State 3: 0.000000
State 4: 117.820492
CPU 4:
State 0: 0.000000
State 1: 0.000086
State 2: 0.001627
State 3: 0.000000
State 4: 121.360904
CPU 5:
State 0: 0.000000
State 1: 0.000014
State 2: 0.001412
State 3: 0.000000
State 4: 120.000058
CPU 6:
State 0: 0.000000
State 1: 0.000015
State 2: 0.000427
State 3: 0.001088
State 4: 119.984763
CPU 7:
State 0: 0.000000
State 1: 0.000000
State 2: 0.000000
State 3: 0.000000
State 4: 0.000000
Idle state times (seconds) over 2 minutes, CPU 7 = 100% load, no "perf record":
CPU 0:
State 0: 0.000223
State 1: 0.000679
State 2: 0.000000
State 3: 0.000000
State 4: 119.991636
CPU 1:
State 0: 3.912153
State 1: 0.000130
State 2: 0.000000
State 3: 0.000000
State 4: 116.088366
CPU 2:
State 0: 0.000000
State 1: 0.000000
State 2: 0.026982
State 3: 0.026250
State 4: 119.920878
CPU 3:
State 0: 0.000039
State 1: 0.000325
State 2: 0.000981
State 3: 0.000422
State 4: 119.996761
CPU 4:
State 0: 23.944607
State 1: 0.000092
State 2: 0.001095
State 3: 0.000000
State 4: 99.258784
CPU 5:
State 0: 29.880724
State 1: 0.000065
State 2: 0.000934
State 3: 0.000000
State 4: 90.120384
CPU 6:
State 0: 3.740422
State 1: 0.000017
State 2: 0.000451
State 3: 0.000000
State 4: 116.253516
CPU 7:
State 0: 0.000000
State 1: 0.000000
State 2: 0.000000
State 3: 0.000000
State 4: 0.000000
... Doug
* Re: [PATCH] cpuidle: Allow menu governor to enter deeper sleep states after some time
2017-10-21 14:28 ` Doug Smythies
@ 2017-11-07 23:04 ` Thomas Ilsche
2017-11-08 4:53 ` Len Brown
2017-11-08 16:26 ` Doug Smythies
` (2 subsequent siblings)
3 siblings, 1 reply; 25+ messages in thread
From: Thomas Ilsche @ 2017-11-07 23:04 UTC (permalink / raw)
To: Doug Smythies
Cc: 'Marcus Hähnel', 'Daniel Hackenberg',
'Robert Schöne', mario.bielert,
'Rafael J. Wysocki', 'Alex Shi',
'Ingo Molnar', 'Rik van Riel',
'Daniel Lezcano', 'Nicholas Piggin', linux-pm,
'Len Brown'
Hi Doug,
thanks to your detailed description I was able to reproduce and track down the
issue on one of our systems with a similar processor. The effect is similar: the
core stays in a shallow sleep state for too long. This is also amplified on a
system with little background noise, where a core can stay in C0 for seconds.
But the cause / trigger is different. By my observation with many perf probes,
an immediately pending timer is preventing a deep sleep, also overriding the
anti-polling mechanism.
This immediate (usually 1-3 us) timer can be either tick_sched_timer or
watchdog_timer_fn. The timers do actually fire and run; however, poll_idle
directly resumes after the interrupt because there is no need_resched(). The menu
governor assumes that a timer will trigger another menu_select, but it does not.
Neither does our fallback timer, so the mitigation does not help here.
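For reference, poll_idle() in 4.14 looks roughly like this; note that an
interrupt alone does not end the loop, only need_resched() does:

static int poll_idle(struct cpuidle_device *dev,
		     struct cpuidle_driver *drv, int index)
{
	local_irq_enable();
	if (!current_set_polling_and_test()) {
		while (!need_resched())
			cpu_relax();
	}
	current_clr_polling();

	return index;
}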
I made a hack [1] to stop poll_idle after timer_expires, which prevents the issue
in my tests. The idea is to make sure that poll_idle really exits whenever a
timer fires. I was trying to use existing information, but I'm not entirely
sure that tick_sched.timer_expires actually has the right semantics. At the very
least it would have to be exposed properly. Another way could be to set some
flag within the timer handlers that is checked in poll_idle, but that invites
race conditions. There is also the danger that the resulting shorter, more
frequent stays in C0 further confuse the menu heuristic.
This issue is important to consider for the mitigation idea of keeping the tick
timer running when entering a shallow sleep state. We must ensure that the tick
timer actually ends all sleep states, including poll_idle.
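To illustrate the idea, a sketch (not the actual commit from [1]):
POLL_IDLE_LIMIT_NS is a made-up constant here, whereas [1] derives the
bound from the tick_sched state:

static int poll_idle_bounded(struct cpuidle_device *dev,
			     struct cpuidle_driver *drv, int index)
{
	u64 limit = local_clock() + POLL_IDLE_LIMIT_NS;

	local_irq_enable();
	if (!current_set_polling_and_test()) {
		while (!need_resched()) {
			cpu_relax();
			/* leave the poll loop once a timer must have fired */
			if (local_clock() >= limit)
				break;
		}
	}
	current_clr_polling();

	return index;
}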
I did some experiments on a Core i7-2600 at the nominal frequency (fixed
via the userspace governor) with HyperThreading enabled, running one
thread doing while(1){}, over 20 min for each configuration.
The default configuration consumes on average 53.5 W AC power. The
configurations with C0 disabled, all but C6 disabled, and my
timer_expires hack consume between 48.6 and 49.0 W on average. In the
plots [2] of the 50 ms external power samples you can nicely see the
clusters where 1/2/3 cores are in C0, adding about 9 W each.
I was able to reproduce it on a number of additional CPUs, including larger
multi-socket server systems. There are some systems where C0 is never used in
the scenario, but that could be due to other factors. I think this is mostly
independent of the actual CPU architecture.
[1]: https://github.com/tud-zih-energy/linux/commit/7529b167dc7c2afaacd4551fe01ec576df5097e3
[2]: https://wwwpub.zih.tu-dresden.de/~tilsche/powernightmares_poll.png
>> Can you please share the patch? I'd like to use it for some experiments.
>
> Can you not achieve the same, or at least similar and good enough for
> testing, conditions by disabling all the other idle states?
>
> On my system, the max idle state is 4 (C6), and I just disable states
> 0-3 to force always state 4.
Absolutely! Previously I was using a version prior to 3ed09c9 where disabling state 0 was allowed.
Best,
Thomas
On 2017-10-21 16:28, Doug Smythies wrote:
> Hi Thomas,
>
> Thanks for your quick reply.
>
> On 2017.10.20 Thomas Ilsche wrote:
>
>> Unfortunately, I don't have a complete picture of your issue.
>>
>> 1) What processor do you use and what are the following
>> /sys/devices/system/cpu/cpu*/cpuidle/state*/{name,latency,residency}
>
> Processor: Intel(R) Core(TM) i7-2600K CPU @ 3.40GHz
> 4 cores, 8 CPUs. I do not disable hyperthreading, which
> your paper mentions you do.
>
> All CPU have the same numbers, so CPU 0 used:
>
> $ cat /sys/devices/system/cpu/cpu0/cpuidle/state*/name
> POLL
> C1
> C1E
> C3
> C6
>
> $ cat /sys/devices/system/cpu/cpu0/cpuidle/state*/latency
> 0
> 2
> 10
> 80
> 104
>
> $ cat /sys/devices/system/cpu/cpu0/cpuidle/state*/residency
> 0
> 2
> 20
> 211
> 345
>
>> 2) To get a better picture you can use perf via the following
>> events in addition to power and residency:
>>
>> sched:sched_switch -> which tasks are scheduled when.
>> power:cpu_idle (state) -> which state was used when going to sleep
>>
>> Record a timeline with these events using perf record -a -e <event>.
>> You could also use -C to select only the core that should be
>> idle but uses C0.
>> You can analyze it via perf script (or perf timechart, if you have a
>> scalable and fast SVG viewer, which I'm not sure exists).
>>
>> If you see state0 selected before a long idle phase, it's what we
>> call a Powernightmare. The intervals of the 8 previous idle periods
>> show what the heuristic based its prediction on.
>
> Very interesting. Running "perf record" with your events
> has a drastic effect on the rate of occurrence of the issue.
> Actually, it pretty much eliminates it. However, I was still
> able to capture a few. (data further below.)
>
>> Some things to consider to avoid other influences on power
>> - uniformity of the busy core's workload
>> - fixed userspace p-state
>> - pinned threads
>
> Agreed.
>
>> If you can share a reproducible use-case I can also try with
>> our tool chain (recording toolchain is FOSS, visualizer is not)
>
> My test computer is a server, with no GUI. To make "idle" even
> more "idle", I do this:
>
> $ cat set_cpu_turn_off_services
> #! /bin/bash
> # Turn off some services to try to get "idle" to be more "idle"
> sudo systemctl stop mysql.service
> sudo systemctl stop apache2.service
> sudo systemctl stop nmbd.service
> sudo systemctl stop smbd.service
> sudo systemctl stop cron.service
> sudo systemctl stop winbind.service
> sudo systemctl stop apt-daily.timer
> sudo systemctl stop libvirtd.service
>
> My original "Reported-and-tested-by" work for 0c313cb
> only looked at my "idle" test system, and it is still good
> enough there.
>
> However, now I observe significant excessive power sometimes
> under load, the simplest being a single threaded CPU intensive
> job.
>
> I merely load one CPU 100% (i.e. loadave = 1), and watch the others,
> using turbostat (at >= 2 minutes per sample) and/or monitoring
> idle state times (at >= 2 minutes, but typically more,
> between samples) and/or the intel_pstate_tracer.py tool.
>
> Some example trace data (edited). Look for "<<<<<<<<".
> The previous 8 idle entry/exits are included in each sample:
>
> CPU 1 sample 1:
>
> doug@s15:~/idle/perf$ cat t02_1_1.txt
> kworker/u16:1 5273 [001] 137660.900599: sched:sched_switch: kworker/u16:1:5273 [120] t ==> swapper/1:0 [120]
> swapper 0 [001] 137660.900601: power:cpu_idle: state=4 cpu_id=1
> swapper 0 [001] 137660.900645: power:cpu_idle: state=4294967295 cpu_id=1 time: 44
> swapper 0 [001] 137660.900649: sched:sched_switch: swapper/1:0 [120] R ==> kworker/u16:1:5273 [120]
> kworker/u16:1 5273 [001] 137660.900655: sched:sched_switch: kworker/u16:1:5273 [120] t ==> swapper/1:0 [120]
> swapper 0 [001] 137660.900657: sched:sched_switch: swapper/1:0 [120] R ==> kworker/u16:1:5273 [120]
> kworker/u16:1 5273 [001] 137660.900659: sched:sched_switch: kworker/u16:1:5273 [120] t ==> swapper/1:0 [120]
> swapper 0 [001] 137660.900661: power:cpu_idle: state=4 cpu_id=1
> swapper 0 [001] 137660.900704: power:cpu_idle: state=4294967295 cpu_id=1 time: 43
> swapper 0 [001] 137660.900708: sched:sched_switch: swapper/1:0 [120] R ==> kworker/u16:1:5273 [120]
> kworker/u16:1 5273 [001] 137660.900713: sched:sched_switch: kworker/u16:1:5273 [120] t ==> swapper/1:0 [120]
> swapper 0 [001] 137660.900717: sched:sched_switch: swapper/1:0 [120] R ==> kworker/u16:1:5273 [120]
> kworker/u16:1 5273 [001] 137660.900718: sched:sched_switch: kworker/u16:1:5273 [120] t ==> swapper/1:0 [120]
> swapper 0 [001] 137660.900720: power:cpu_idle: state=4 cpu_id=1
> swapper 0 [001] 137660.900763: power:cpu_idle: state=4294967295 cpu_id=1 time: 43
> swapper 0 [001] 137660.900766: sched:sched_switch: swapper/1:0 [120] R ==> kworker/u16:1:5273 [120]
> kworker/u16:1 5273 [001] 137660.900772: sched:sched_switch: kworker/u16:1:5273 [120] t ==> swapper/1:0 [120]
> swapper 0 [001] 137660.900775: power:cpu_idle: state=4 cpu_id=1
> swapper 0 [001] 137660.900778: power:cpu_idle: state=4294967295 cpu_id=1 time: 3
> swapper 0 [001] 137660.900779: sched:sched_switch: swapper/1:0 [120] R ==> kworker/u16:1:5273 [120]
> kworker/u16:1 5273 [001] 137660.900781: sched:sched_switch: kworker/u16:1:5273 [120] t ==> swapper/1:0 [120]
> swapper 0 [001] 137660.900783: power:cpu_idle: state=4 cpu_id=1
> swapper 0 [001] 137660.900818: power:cpu_idle: state=4294967295 cpu_id=1 time: 35
> swapper 0 [001] 137660.900821: sched:sched_switch: swapper/1:0 [120] R ==> kworker/u16:1:5273 [120]
> kworker/u16:1 5273 [001] 137660.900827: sched:sched_switch: kworker/u16:1:5273 [120] t ==> swapper/1:0 [120]
> swapper 0 [001] 137660.900830: sched:sched_switch: swapper/1:0 [120] R ==> kworker/u16:1:5273 [120]
> kworker/u16:1 5273 [001] 137660.900831: sched:sched_switch: kworker/u16:1:5273 [120] t ==> swapper/1:0 [120]
> swapper 0 [001] 137660.900833: power:cpu_idle: state=4 cpu_id=1
> swapper 0 [001] 137660.900879: power:cpu_idle: state=4294967295 cpu_id=1 time: 46
> swapper 0 [001] 137660.900882: sched:sched_switch: swapper/1:0 [120] R ==> kworker/u16:1:5273 [120]
> kworker/u16:1 5273 [001] 137660.900891: sched:sched_switch: kworker/u16:1:5273 [120] t ==> swapper/1:0 [120]
> swapper 0 [001] 137660.900894: power:cpu_idle: state=1 cpu_id=1
> swapper 0 [001] 137661.679273: power:cpu_idle: state=4294967295 cpu_id=1 time: 778378
> swapper 0 [001] 137661.679280: power:cpu_idle: state=1 cpu_id=1
> swapper 0 [001] 137662.191265: power:cpu_idle: state=4294967295 cpu_id=1 time: 511984
> swapper 0 [001] 137662.191274: power:cpu_idle: state=0 cpu_id=1 <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
> swapper 0 [001] 137662.843297: power:cpu_idle: state=4294967295 cpu_id=1 time: 652023 uS
> swapper 0 [001] 137662.843301: sched:sched_switch: swapper/1:0 [120] R ==> watchdog/1:14 [0]
> watchdog/1 14 [001] 137662.843307: sched:sched_switch: watchdog/1:14 [0] S ==> swapper/1:0 [120]
>
> CPU 3 sample 1:
> doug@s15:~/idle/perf$ cat t02_3_1.txt
> swapper 0 [003] 136958.987314: power:cpu_idle: state=4 cpu_id=3
> swapper 0 [003] 136959.167299: power:cpu_idle: state=4294967295 cpu_id=3
> swapper 0 [003] 136959.167316: power:cpu_idle: state=4 cpu_id=3
> swapper 0 [003] 136960.479303: power:cpu_idle: state=4294967295 cpu_id=3 time: 1.312 Sec
> swapper 0 [003] 136960.479311: power:cpu_idle: state=1 cpu_id=3
> swapper 0 [003] 136960.479328: power:cpu_idle: state=4294967295 cpu_id=3 time: 17 uSec
> swapper 0 [003] 136960.479340: power:cpu_idle: state=4 cpu_id=3
> swapper 0 [003] 136962.495332: power:cpu_idle: state=4294967295 cpu_id=3 time: 2.016 Sec
> swapper 0 [003] 136962.495340: power:cpu_idle: state=2 cpu_id=3
> swapper 0 [003] 136962.495372: power:cpu_idle: state=4294967295 cpu_id=3 time: 32 uSec
> swapper 0 [003] 136962.495385: power:cpu_idle: state=4 cpu_id=3
> swapper 0 [003] 136962.655390: power:cpu_idle: state=4294967295 cpu_id=3 time: 0.16 Sec
> swapper 0 [003] 136962.655399: power:cpu_idle: state=4 cpu_id=3
> swapper 0 [003] 136962.987380: power:cpu_idle: state=4294967295 cpu_id=3 time: 0.332 Sec
> swapper 0 [003] 136962.987393: sched:sched_switch: swapper/3:0 [120] R ==> watchdog/3:26 [0]
> watchdog/3 26 [003] 136962.987397: sched:sched_switch: watchdog/3:26 [0] S ==> swapper/3:0 [120]
> swapper 0 [003] 136962.987402: power:cpu_idle: state=4 cpu_id=3
> swapper 0 [003] 136963.167387: power:cpu_idle: state=4294967295 cpu_id=3 time: 0.18 Seconds
> swapper 0 [003] 136963.167404: power:cpu_idle: state=4 cpu_id=3
> swapper 0 [003] 136963.679391: power:cpu_idle: state=4294967295 cpu_id=3 time: 0.512 Seconds
> swapper 0 [003] 136963.679399: power:cpu_idle: state=0 cpu_id=3 <<<<<<<<<<<<<<<<<<<<<<<
> swapper 0 [003] 136966.655478: power:cpu_idle: state=4294967295 cpu_id=3 time: 2.875 Seconds
>
> CPU 3 sample 2:
>
> doug@s15:~/idle/perf$ cat t02_3_2.txt
> swapper 0 [003] 137563.001020: power:cpu_idle: state=4 cpu_id=3
> swapper 0 [003] 137563.181005: power:cpu_idle: state=4294967295 cpu_id=3 time: 0.18 Sec
> swapper 0 [003] 137563.181023: power:cpu_idle: state=4 cpu_id=3
> swapper 0 [003] 137564.493009: power:cpu_idle: state=4294967295 cpu_id=3 time: 9 uSec
> swapper 0 [003] 137564.493018: power:cpu_idle: state=1 cpu_id=3
> swapper 0 [003] 137564.493036: power:cpu_idle: state=4294967295 cpu_id=3 time: 18 uSec
> swapper 0 [003] 137564.493048: power:cpu_idle: state=4 cpu_id=3
> swapper 0 [003] 137566.509039: power:cpu_idle: state=4294967295 cpu_id=3 time: 2.016 Sec
> swapper 0 [003] 137566.509048: power:cpu_idle: state=2 cpu_id=3
> swapper 0 [003] 137566.509082: power:cpu_idle: state=4294967295 cpu_id=3 time: 34 uSec
> swapper 0 [003] 137566.509094: power:cpu_idle: state=4 cpu_id=3
> swapper 0 [003] 137566.669096: power:cpu_idle: state=4294967295 cpu_id=3 time: 7 uSec
> swapper 0 [003] 137566.669103: power:cpu_idle: state=4 cpu_id=3
> swapper 0 [003] 137567.001089: power:cpu_idle: state=4294967295 cpu_id=3 time: 0.332 Sec
> swapper 0 [003] 137567.001103: sched:sched_switch: swapper/3:0 [120] R ==> watchdog/3:26 [0]
> watchdog/3 26 [003] 137567.001107: sched:sched_switch: watchdog/3:26 [0] S ==> swapper/3:0 [120]
> swapper 0 [003] 137567.001112: power:cpu_idle: state=4 cpu_id=3
> swapper 0 [003] 137567.181094: power:cpu_idle: state=4294967295 cpu_id=3 time: 0.18 Sec
> swapper 0 [003] 137567.181098: power:cpu_idle: state=0 cpu_id=3 <<<<<<<<<<<<<<<<<<
> swapper 0 [003] 137570.669187: power:cpu_idle: state=4294967295 cpu_id=3 time: 3.488 Sec
>
> Idle state times (seconds) over 2 minutes, CPU 7 = 100% load, while "perf record" running:
> CPU 0:
> State 0: 1.436071
> State 1: 0.000317
> State 2: 0.000000
> State 3: 0.000000
> State 4: 118.554395
> CPU 1:
> State 0: 0.000053
> State 1: 0.000007
> State 2: 0.000000
> State 3: 0.000000
> State 4: 120.000261
> CPU 2:
> State 0: 0.000000
> State 1: 0.000000
> State 2: 0.032397
> State 3: 0.030218
> State 4: 119.902810
> CPU 3:
> State 0: 2.176088
> State 1: 0.000114
> State 2: 0.000887
> State 3: 0.000000
> State 4: 117.820492
> CPU 4:
> State 0: 0.000000
> State 1: 0.000086
> State 2: 0.001627
> State 3: 0.000000
> State 4: 121.360904
> CPU 5:
> State 0: 0.000000
> State 1: 0.000014
> State 2: 0.001412
> State 3: 0.000000
> State 4: 120.000058
> CPU 6:
> State 0: 0.000000
> State 1: 0.000015
> State 2: 0.000427
> State 3: 0.001088
> State 4: 119.984763
> CPU 7:
> State 0: 0.000000
> State 1: 0.000000
> State 2: 0.000000
> State 3: 0.000000
> State 4: 0.000000
>
> Idle state times (seconds) over 2 minutes, CPU 7 = 100% load, no "perf record":
> CPU 0:
> State 0: 0.000223
> State 1: 0.000679
> State 2: 0.000000
> State 3: 0.000000
> State 4: 119.991636
> CPU 1:
> State 0: 3.912153
> State 1: 0.000130
> State 2: 0.000000
> State 3: 0.000000
> State 4: 116.088366
> CPU 2:
> State 0: 0.000000
> State 1: 0.000000
> State 2: 0.026982
> State 3: 0.026250
> State 4: 119.920878
> CPU 3:
> State 0: 0.000039
> State 1: 0.000325
> State 2: 0.000981
> State 3: 0.000422
> State 4: 119.996761
> CPU 4:
> State 0: 23.944607
> State 1: 0.000092
> State 2: 0.001095
> State 3: 0.000000
> State 4: 99.258784
> CPU 5:
> State 0: 29.880724
> State 1: 0.000065
> State 2: 0.000934
> State 3: 0.000000
> State 4: 90.120384
> CPU 6:
> State 0: 3.740422
> State 1: 0.000017
> State 2: 0.000451
> State 3: 0.000000
> State 4: 116.253516
> CPU 7:
> State 0: 0.000000
> State 1: 0.000000
> State 2: 0.000000
> State 3: 0.000000
> State 4: 0.000000
>
> ... Doug
>
>
--
Dipl. Inf. Thomas Ilsche
Computer Scientist
Highly Adaptive Energy-Efficient Computing
CRC 912 HAEC: http://tu-dresden.de/sfb912
Technische Universität Dresden
Center for Information Services and High Performance Computing (ZIH)
01062 Dresden, Germany
Phone: +49 351 463-42168
Fax: +49 351 463-37773
E-Mail: thomas.ilsche@tu-dresden.de
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH] cpuidle: Allow menu governor to enter deeper sleep states after some time
2017-11-07 23:04 ` Thomas Ilsche
@ 2017-11-08 4:53 ` Len Brown
2017-11-08 6:01 ` Yu Chen
2017-11-08 16:26 ` Doug Smythies
0 siblings, 2 replies; 25+ messages in thread
From: Len Brown @ 2017-11-08 4:53 UTC (permalink / raw)
To: Thomas Ilsche, Chen Yu
Cc: Doug Smythies, Marcus Hähnel, Daniel Hackenberg,
Robert Schöne, mario.bielert, Rafael J. Wysocki, Alex Shi,
Ingo Molnar, Rik van Riel, Daniel Lezcano, Nicholas Piggin,
Linux PM list
Hi Thomas,
> The timers actually do happen and run, however poll_idle
> directly resumes after the interrupt - there is no need_resched(). The menu
> governor assumes that a timer will trigger another menu_select, but it does not.
FYI,
Chen-Yu recently ran into this issue also. In his case, the workload
is a FUTEX test case in LKP, where half of the logical CPUs are busy,
and the other half have nothing to run, but seem to be spending time
in C0 rather than in idle, because they are getting fooled into
poll_idle, and staying there.
cheers,
-Len
On Tue, Nov 7, 2017 at 6:04 PM, Thomas Ilsche
<thomas.ilsche@tu-dresden.de> wrote:
> Hi Doug,
>
> thanks to your detailed description I was able to reproduce and track down
> the issue on one of our systems with a similar processor. The effect is
> similar: the core stays in a shallow sleep state for too long. This is also
> amplified on a system with little background noise, where a core can stay
> in C0 for seconds. But the cause / trigger is different. By my observation
> with many perf probes, the next timer is preventing a deep sleep, also
> overriding the anti-poll mechanism.
>
> This immediate (usually 1-3 us) timer can be both tick_sched_timer and
> watchdog_timer_fn. The timers actually do happen and run, however poll_idle
> directly resumes after the interrupt - there is no need_resched(). The menu
> governor assumes that a timer will trigger another menu_select, but it does
> not. Neither does our fallback timer, so the mitigation does not help here.
>
> I made a hack[1] to stop poll_idle after timer_expires, which prevents the
> issue in my tests. The idea is to make sure that poll_idle is really exited
> whenever a timer happens. I was trying to use existing information, but I'm
> not entirely sure if tick_sched.timer_expires actually has the right
> semantics. At the very least it would have to be exposed properly. Another
> way could be to set some flag within the timer handlers that is checked in
> poll_idle, but that is inviting race conditions. There is also the danger
> that the resulting shorter, more frequent times spent in C0 further confuse
> the menu heuristic.
>
> This issue is important to consider for the mitigation idea of using a
> continuing tick timer when entering a shallow sleep state. We must ensure
> that the tick timer actually ends all sleep states, including poll_idle.
>
> I did some experiments on a Core i7-2600 at nominal userspace frequency
> with HyperThreading enabled, with one thread doing while(1){}, over 20 min
> each. The default configuration consumes on average 53.5 W AC power. The
> configurations with C0 disabled, all but C6 disabled, and my timer_expires
> hack consume between 48.6 and 49.0 W on average. In the plots [2] of the
> 50 ms external power samples you can nicely see the clusters where 1/2/3
> cores are in C0, adding about 9 W each.
>
> I was able to reproduce it on a number of additional CPUs, including larger
> multi-socket server systems. There are some systems where C0 is never used
> in the scenario, but that could be due to other factors. I think this is
> mostly independent of the actual CPU architecture.
>
> [1]:
> https://github.com/tud-zih-energy/linux/commit/7529b167dc7c2afaacd4551fe01ec576df5097e3
> [2]: https://wwwpub.zih.tu-dresden.de/~tilsche/powernightmares_poll.png
>
>
>>> Can you please share the patch, I'd like to use it for some experiments.
>>
>>
>> Can you not achieve the same, or at least similar and good enough for
>> testing, conditions by disabling all the other idle states?
>>
>> On my system, the max idle state is 4 (C6), and I just disable states
>> 0-3 to force always state 4.
>
>
> Absolutely! Previously I was using a version prior to 3ed09c9 where
> disabling state 0 was allowed.
>
> Best,
> Thomas
>
>
> On 2017-10-21 16:28, Doug Smythies wrote:
>>
>> Hi Thomas,
>>
>> Thanks for your quick reply.
>>
>> On 2017.10.20 Thomas Ilsche wrote:
>>
>>>
>>> Unfortunately, I don't have a complete picture of your issue.
>>>
>>> 1) What processor do you use and what are the following
>>> /sys/devices/system/cpu/cpu*/cpuidle/state*/{name,latency,residency}
>>
>>
>> Processor: Intel(R) Core(TM) i7-2600K CPU @ 3.40GHz
>> 4 cores, 8 CPUs. I do not disable hyperthreading, which
>> your paper mentions you do.
>>
>> All CPU have the same numbers, so CPU 0 used:
>>
>> $ cat /sys/devices/system/cpu/cpu0/cpuidle/state*/name
>> POLL
>> C1
>> C1E
>> C3
>> C6
>>
>> $ cat /sys/devices/system/cpu/cpu0/cpuidle/state*/latency
>> 0
>> 2
>> 10
>> 80
>> 104
>>
>> $ cat /sys/devices/system/cpu/cpu0/cpuidle/state*/residency
>> 0
>> 2
>> 20
>> 211
>> 345
>>
>>> 2) To get a better picture you can use perf via the following
>>> events in addition to power and residency:
>>>
>>> sched:sched_switch -> which tasks are scheduled when.
>>> power:cpu_idle (state) -> which state was used when going to sleep
>>>
>>> Record a timeline with these events by using perf record -a -e
>>> You could also use -C to select only the core that should be
>>> idle but uses C0
>>> You can analyse it via perf script (or perf timeline if you have a
>>> scalable and fast SVG viewer, which I'm not sure exists)
>>>
>>> If you see state0 selected before a long idle phase, it's what we
>>> call a Powernightmare. The intervals of 8 previous idles can tell
>>> what the heuristic tries.
>>
>>
>> Very interesting. Running "perf record" with your events
>> has a drastic effect on the rate of occurrence of the issue.
>> Actually, it pretty much eliminates it. However, I was still
>> able to capture a few. (data further below.)
>>
>>> Some things to consider to avoid other influences on power
>>> - uniformity of the busy core's workload
>>> - fixed userspace p-state
>>> - pinned threads
>>
>>
>> Agreed.
>>
>>> If you can share a reproducible use-case I can also try with
>>> our tool chain (recording toolchain is FOSS, visualizer is not)
>>
>>
>> My test computer is a server, with no GUI. To make "idle" even
>> more "idle", I do this:
>>
>> $ cat set_cpu_turn_off_services
>> #! /bin/bash
>> # Turn off some services to try to get "idle" to be more "idle"
>> sudo systemctl stop mysql.service
>> sudo systemctl stop apache2.service
>> sudo systemctl stop nmbd.service
>> sudo systemctl stop smbd.service
>> sudo systemctl stop cron.service
>> sudo systemctl stop winbind.service
>> sudo systemctl stop apt-daily.timer
>> sudo systemctl stop libvirtd.service
>>
>> My original "Reported-and-tested-by" work for 0c313cb
>> only looked at my "idle" test system, and still is good
>> enough.
>>
>> However, now I observe significant excessive power sometimes
>> under load, the simplest being a single threaded CPU intensive
>> job.
>>
>> ...[snip]...
>>
>> ... Doug
>>
>>
>
> --
> Dipl. Inf. Thomas Ilsche
> Computer Scientist
> Highly Adaptive Energy-Efficient Computing
> CRC 912 HAEC: http://tu-dresden.de/sfb912
> Technische Universität Dresden
> Center for Information Services and High Performance Computing (ZIH)
> 01062 Dresden, Germany
>
> Phone: +49 351 463-42168
> Fax: +49 351 463-37773
> E-Mail: thomas.ilsche@tu-dresden.de
>
>
--
Len Brown, Intel Open Source Technology Center
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH] cpuidle: Allow menu governor to enter deeper sleep states after some time
2017-11-08 4:53 ` Len Brown
@ 2017-11-08 6:01 ` Yu Chen
2017-11-08 16:26 ` Doug Smythies
1 sibling, 0 replies; 25+ messages in thread
From: Yu Chen @ 2017-11-08 6:01 UTC (permalink / raw)
To: Len Brown
Cc: Thomas Ilsche, Doug Smythies, Marcus Hähnel,
Daniel Hackenberg, Robert Schöne, mario.bielert,
Rafael J. Wysocki, Alex Shi, Ingo Molnar, Rik van Riel,
Daniel Lezcano, Nicholas Piggin, Ying Huang, Linux PM list
On Tue, Nov 07, 2017 at 11:53:59PM -0500, Len Brown wrote:
> Hi Thomas,
>
> > The timers actually do happen and run, however poll_idle
> > directly resumes after the interrupt - there is no need_resched(). The menu
> > governor assumes that a timer will trigger another menu_select, but it does not.
>
> FYI,
>
> Chen-Yu recently ran into this issue also. In his case, the workload
> is a FUTEX test case in LKP, where half of the logical CPUs are busy,
> and the other half have nothing to run, but seem to be spending time
> in C0 rather than in idle, because they are getting fooled into
> poll_idle, and staying there.
>
Yes, the long poll_idle residence issue is also found on two similar
104-cpu servers. Actually it can be reproduced without futex running on
them, with only a perf profiling:
perf record -q --freq=800 -e cycles:pp -- sleep 0.01
According to the ftrace, there are mainly two problems here:
1. A local apic timer seems to always arrive earlier than it should,
which cuts the deep cstate short; as a result the menu governor
gets a short 'next timer' event, which brings the cpu into poll_idle:
kworker/u657:9-1082 [001] d..3 530.907924: sched_switch: prev_comm=kworker/u657:9 prev_pid=1082 prev_prio=120 prev_state=t ==> next_comm=swapper/1 next_pid=0 next_prio=120
<idle>-0 [001] d..2 530.907924: cpu_idle: state=2 cpu_id=1
<idle>-0 [001] d..2 530.908349: cpu_idle: state=4294967295 cpu_id=1
/* Deep cstate is interrupted by the local_timer_entry */
<idle>-0 [001] d.h2 530.908350: local_timer_entry: vector=239
/* The menu governor would check the next expiring timer, and
* since it is very near, poll_idle will be chosen.
* We can also see that there is no timer callback invoked
* in this local timer interrupt, which means this interrupt
* might have been triggered ahead of time. BTW, the ipi broadcast
* interrupt would only be triggered if this were a broadcast
* interrupt, and no ipi was recorded during the test.
*/
<idle>-0 [001] d..2 530.908351: cpu_idle: state=0 cpu_id=1
/* Here comes the 'real' interrupt; the perf handler and
* the tick sched timer handler run.
*/
<idle>-0 [001] d.h2 530.908360: local_timer_entry: vector=239
<idle>-0 [001] d.h2 530.908363: hrtimer_expire_entry: hrtimer=ffff88085d8610f8 function=perf_mux_hrtimer_handler now=528438011190
<idle>-0 [001] d.h2 530.908367: hrtimer_expire_entry: hrtimer=ffff88085d854c20 function=tick_sched_timer now=528438011190
<idle>-0 [001] d.h2 530.909349: local_timer_entry: vector=239
<idle>-0 [001] d.h2 530.909360: local_timer_entry: vector=239
<idle>-0 [001] d.h2 530.909360: hrtimer_expire_entry: hrtimer=ffff88085d8610f8 function=perf_mux_hrtimer_handler now=528439008247
<idle>-0 [001] d.h2 530.909364: hrtimer_expire_entry: hrtimer=ffff88085d854c20 function=tick_sched_timer now=52843900824
------------------cut----------------------------------
/* A ready task is switched in, and poll_idle quits. */
<idle>-0 [001] d..3 530.963985: sched_switch: prev_comm=swapper/1 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=kworker/u657:9 next_pid=1082 next_prio=120
2. Once the cpu enters poll_idle, it does not have a chance to choose
a deep cstate until the next context switch happens, even if the cpu
stays idle for quite a long time.
Question 1 is the reason why the cpu chooses poll_idle, and question 2
is the reason why the cpu stays in poll_idle so long. I think
question 1 needs further investigation into whether we program the
clockevent sooner than we should, and question 2 is what this
patch is trying to address.
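For question 1, one way to check could be to remember the expiry that
clockevents_program_event() is given and compare it with the clock at
the next local timer entry. A hypothetical, untested sketch (the per-CPU
variable and the exact hook points are made up):

#include <linux/ktime.h>
#include <linux/percpu.h>

/* file scope, e.g. in kernel/time/clockevents.c */
static DEFINE_PER_CPU(ktime_t, ce_programmed_expiry);

/* in clockevents_program_event(), before programming the device: */
	__this_cpu_write(ce_programmed_expiry, expires);

/* at local timer interrupt entry: */
	ktime_t now = ktime_get();
	ktime_t prog = __this_cpu_read(ce_programmed_expiry);

	/* arriving before the programmed expiry means the event is early */
	if (ktime_before(now, prog))
		trace_printk("local timer early by %lld ns\n",
			     ktime_to_ns(ktime_sub(prog, now)));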
Thanks,
Yu
> cheers,
> -Len
>
> On Tue, Nov 7, 2017 at 6:04 PM, Thomas Ilsche
> <thomas.ilsche@tu-dresden.de> wrote:
> > ...[snip]...
>
> --
> Len Brown, Intel Open Source Technology Center
^ permalink raw reply [flat|nested] 25+ messages in thread
* RE: [PATCH] cpuidle: Allow menu governor to enter deeper sleep states after some time
2017-10-21 14:28 ` Doug Smythies
2017-11-07 23:04 ` Thomas Ilsche
@ 2017-11-08 16:26 ` Doug Smythies
2017-11-10 17:42 ` Doug Smythies
2017-11-14 6:12 ` Doug Smythies
3 siblings, 0 replies; 25+ messages in thread
From: Doug Smythies @ 2017-11-08 16:26 UTC (permalink / raw)
To: 'Thomas Ilsche'
Cc: 'Marcus Hähnel', 'Daniel Hackenberg',
'Robert Schöne', mario.bielert,
'Rafael J. Wysocki', 'Alex Shi',
'Ingo Molnar', 'Rik van Riel',
'Daniel Lezcano', 'Nicholas Piggin', linux-pm,
'Len Brown', 'Yu Chen', Doug Smythies
Hi Thomas,
Thanks for your follow up and detailed information.
I have continued to work on this during these 2+ weeks,
but was unable to get to the root cause.
On 2017.11.07 15:04 Thomas Ilsche wrote:
> thanks to your detailed description I was able to reproduce and track down the
> issue on one of our systems with a similar processor. The effect is similar, the
> core stays in a shallow sleep state for too long.
> This is also amplified on a system with little background noise where a core
> can stay in C0 for seconds.
Agreed.
> But the cause / trigger is different.
Agreed.
> By my observation with many perf probes,
> the next timer is preventing a deep sleep, also overriding the anti-poll
> mechanism.
Agreed, but haven't been able to figure out why. Yu's e-mail on this point
is very interesting.
> This immediate (usually 1-3 us) timer can be both tick_sched_timer and
> watchdog_timer_fn. The timers actually do happen and run, however poll_idle
> directly resumes after the interrupt - there is no need_resched(). The menu
> governor assumes that a timer will trigger another menu_select, but it does not.
> Neither does our fallback timer, so the mitigation does not help here.
Agreed.
> I made a hack[1] to stop poll_idle after timer_expires which prevents the issue
> in my tests.
I did a similar test with the same results. In my case I just used an old
proposed patch from Rik van Riel [2] (well, rebased to current).
However, I still wanted to understand why this issue occurs in the first
place, because, as far as I could tell, it shouldn't.
This is where I got stuck.
...[snip]...
... Doug
[2] https://marc.info/?l=linux-pm&m=145834176720176&w=2
^ permalink raw reply [flat|nested] 25+ messages in thread
* RE: [PATCH] cpuidle: Allow menu governor to enter deeper sleep states after some time
2017-11-08 4:53 ` Len Brown
2017-11-08 6:01 ` Yu Chen
@ 2017-11-08 16:26 ` Doug Smythies
1 sibling, 0 replies; 25+ messages in thread
From: Doug Smythies @ 2017-11-08 16:26 UTC (permalink / raw)
To: 'Yu Chen', 'Len Brown'
Cc: 'Thomas Ilsche', 'Marcus Hähnel',
'Daniel Hackenberg', 'Robert Schöne',
mario.bielert, 'Rafael J. Wysocki', 'Alex Shi',
'Ingo Molnar', 'Rik van Riel',
'Daniel Lezcano', 'Nicholas Piggin',
'Ying Huang', 'Linux PM list', Doug Smythies
On 2017.11.07 22:01 Yu Chen wrote:
... [snip]...
> According to the ftrace, there are mainly two problems here:
>
> 1. A local apic timer seems to always arrive earlier than it should,
> which cuts the deep cstate short; as a result the menu governor
> gets a short 'next timer' event, which brings the cpu into poll_idle:
... [snip]...
> Question 1 is the reason why the cpu chooses poll_idle, ...
> ... and I think question 1 needs further investigation into
> whether we program the clockevent sooner than we should,
Agreed.
Thank you for the information.
...[snip]...
^ permalink raw reply [flat|nested] 25+ messages in thread
* RE: [PATCH] cpuidle: Allow menu governor to enter deeper sleep states after some time
2017-10-21 14:28 ` Doug Smythies
2017-11-07 23:04 ` Thomas Ilsche
2017-11-08 16:26 ` Doug Smythies
@ 2017-11-10 17:42 ` Doug Smythies
2017-11-14 6:12 ` Doug Smythies
3 siblings, 0 replies; 25+ messages in thread
From: Doug Smythies @ 2017-11-10 17:42 UTC (permalink / raw)
To: 'Thomas Ilsche'
Cc: 'Marcus Hähnel', 'Daniel Hackenberg',
'Robert Schöne', mario.bielert,
'Rafael J. Wysocki', 'Alex Shi',
'Ingo Molnar', 'Rik van Riel',
'Daniel Lezcano', 'Nicholas Piggin', linux-pm,
'Len Brown', Doug Smythies
Hi Thomas,
In addition to my previous reply:
On 2017.11.7 15:04 Thomas Ilsche wrote:
> I made a hack[1] to stop poll_idle after timer_expires, which prevents the issue
> in my tests. The idea is to make sure that poll_idle is really exited whenever a
> timer happens. I was trying to use existing information, but I'm not entirely
> sure if tick_sched.timer_expires actually has the right semantics.
Your patch does not prevent the issue in my case.
And actually I wouldn't expect it to. Why not? My (unproven) theory has always been
that by the time it ends up in the loop in drivers/cpuidle/poll_state.c the timer has
already expired, so you are looking at the next timer time, which is long.
As previously mentioned, another old, but rebased, patch does work for my case;
I'll paste it below.
I'll just add a note that I find testing results hard to repeat here. Sometimes my tests
will go for hours without an issue and other times they go for hours with constant issues.
I will say that there does appear to be some repeatable pattern, where forcing the
minimum pstate makes the problem worse than forcing the maximum pstate (i.e. the
time between the original call to tick_nohz_get_sleep_length and reaching the loop
inside drivers/cpuidle/poll_state.c is longer).
[1]: https://github.com/tud-zih-energy/linux/commit/7529b167dc7c2afaacd4551fe01ec576df5097e3
... Doug
From 788fab5531625d7bc7f6db9622984c8e87fe7672 Mon Sep 17 00:00:00 2001
From: Doug Smythies <dsmythies@telus.net>
Date: Sat, 28 Oct 2017 07:15:41 -0700
Subject: [PATCH] cpuidle: break out of idle polling loop after HLT threshold
This was from Rik van Riel on 2016.03.18
Staying in the poll loop can be beneficial for workloads that
wake back up very, very quickly, but staying in the poll loop
for too long can waste a lot of power.
Break out of the idle polling loop if the system has spent more
time in the loop than the threshold for entering the next power
saving mode.
The poll loop uses full CPU power already, so this will not
increase the power use of the poll loop.
Signed-off-by: Doug Smythies <dsmythies@telus.net>
---
drivers/cpuidle/poll_state.c | 21 ++++++++++++++++++++-
1 file changed, 20 insertions(+), 1 deletion(-)
diff --git a/drivers/cpuidle/poll_state.c b/drivers/cpuidle/poll_state.c
index 7416b16..3023d44 100644
--- a/drivers/cpuidle/poll_state.c
+++ b/drivers/cpuidle/poll_state.c
@@ -11,10 +11,29 @@
 static int __cpuidle poll_idle(struct cpuidle_device *dev,
 			       struct cpuidle_driver *drv, int index)
 {
+	/*
+	 * Polling is state 0; break out of the polling loop after the
+	 * threshold for the next power state has passed. Doing this several
+	 * times in a row should cause the menu governor to immediately
+	 * select a deeper power state.
+	 */
+	u64 limit = drv->states[1].target_residency * NSEC_PER_USEC;
+	ktime_t start = ktime_get();
+	int i = 0;
+
 	local_irq_enable();
 	if (!current_set_polling_and_test()) {
-		while (!need_resched())
+		while (!need_resched()) {
+			ktime_t now;
 			cpu_relax();
+
+			/* Occasionally check for a timeout. */
+			if (!(i++ % 1000)) {
+				now = ktime_get();
+				if (ktime_to_ns(ktime_sub(now, start)) > limit)
+					break;
+			}
+		}
 	}
 	current_clr_polling();
--
2.7.4
^ permalink raw reply related [flat|nested] 25+ messages in thread
* RE: [PATCH] cpuidle: Allow menu governor to enter deeper sleep states after some time
2017-10-21 14:28 ` Doug Smythies
` (2 preceding siblings ...)
2017-11-10 17:42 ` Doug Smythies
@ 2017-11-14 6:12 ` Doug Smythies
2017-11-16 16:11 ` Thomas Ilsche
` (3 more replies)
3 siblings, 4 replies; 25+ messages in thread
From: Doug Smythies @ 2017-11-14 6:12 UTC (permalink / raw)
To: 'Thomas Ilsche'
Cc: 'Marcus Hähnel', 'Daniel Hackenberg',
'Robert Schöne', mario.bielert,
'Rafael J. Wysocki', 'Alex Shi',
'Ingo Molnar', 'Rik van Riel',
'Daniel Lezcano', 'Nicholas Piggin', linux-pm,
'Len Brown', 'Yu Chen', 'Doug Smythies'
Hi Thomas,
I am revising and expanding one of my previous replies:
On 2017.11.08 08:26 Doug Smythies wrote:
> On 2017.11.07 15:04 Thomas Ilsche wrote:
>> This immediate (usually 1-3 us) timer can be both tick_sched_timer and
>> watchdog_timer_fn. The timers actually do happen and run, however poll_idle
>> directly resumes after the interrupt - there is no need_resched(). The menu
>> governor assumes that a timer will trigger another menu_select, but it does not.
>> Neither does our fallback timer, so the mitigation does not help here.
> Agreed.
Revised: Mostly agree.
Actually, the watchdog_timer_fn does set the "need_resched" condition, and will
cause the state 0 idle to exit normally.
But yes, tick_sched_timer and a few others (for example: sched_rt_period_timer,
clocksource_watchdog) do not set the "need_resched" condition, and, as you
mentioned, will not cause the state 0 idle to exit as it should.
Conclusion: Currently the exit condition in drivers/cpuidle/poll_state.c
is insufficient to guarantee proper operation.
This:
while (!need_resched())
is not enough.
I was thinking that maybe the number of interrupts could also
be a condition:
while (!need_resched() && interrupt count has not changed)
However, I haven't yet figured out how to code it, or even if there
is an interrupt count per CPU that can be used.
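If it can be coded at all, I imagine the shape would be something like
the following (illustration only, not compile-tested; I am also not sure
kstat's per-CPU irqs_sum covers the x86 local APIC timer, which is counted
separately via inc_irq_stat(apic_timer_irqs) and might be exactly the
interrupt that matters here):

#include <linux/cpuidle.h>
#include <linux/kernel_stat.h>
#include <linux/sched.h>
#include <linux/sched/idle.h>
#include <linux/smp.h>

static int __cpuidle poll_idle(struct cpuidle_device *dev,
			       struct cpuidle_driver *drv, int index)
{
	/* Snapshot of interrupts serviced on this CPU so far. */
	unsigned long irqs = kstat_cpu_irqs_sum(smp_processor_id());

	local_irq_enable();
	if (!current_set_polling_and_test()) {
		/* Poll until a reschedule is needed OR any interrupt ran. */
		while (!need_resched() &&
		       kstat_cpu_irqs_sum(smp_processor_id()) == irqs)
			cpu_relax();
	}
	current_clr_polling();

	return index;
}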
Supporting trace data, reprocessed so that delta microseconds is on
the left (the 1 uSec uncertainty is because I am only using doubles
instead of long doubles for the math):
4 [006] d... 93455.518023: cpu_idle: state=0 cpu_id=6
1 [006] d.h. 93455.518024: local_timer_entry: vector=239
0 [006] d.h. 93455.518025: hrtimer_expire_entry: hrtimer=ffff97329f394880 function=tick_sched_timer now=93453436001688
3 [006] d.h. 93455.518028: hrtimer_expire_exit: hrtimer=ffff97329f394880
0 [006] d.h. 93455.518028: local_timer_exit: vector=239
0 [006] ..s. 93455.518029: timer_expire_entry: timer=ffffffffa378f260 function=clocksource_watchdog now=4318255656
3 [006] ..s. 93455.518032: timer_expire_exit: timer=ffffffffa378f260
1375991 [006] d.h. 93456.894024: local_timer_entry: vector=239
1 [006] d.h. 93456.894025: local_timer_exit: vector=239
29 [006] d.h. 93456.894055: local_timer_entry: vector=239
0 [006] d.h. 93456.894055: hrtimer_expire_entry: hrtimer=ffff97329f394880 function=tick_sched_timer now=93454812001797
2 [006] d.h. 93456.894057: hrtimer_expire_exit: hrtimer=ffff97329f394880
0 [006] d.h. 93456.894057: local_timer_exit: vector=239
1 [006] d.s. 93456.894058: timer_expire_entry: timer=ffff97328c8dd200 function=delayed_work_timer_fn now=4318256000
2 [006] dNs. 93456.894061: timer_expire_exit: timer=ffff97328c8dd200
4 [006] .N.. 93456.894065: cpu_idle: state=4294967295 cpu_id=6
4 [006] d... 93470.150347: cpu_idle: state=0 cpu_id=6
0 [006] d.h. 93470.150348: local_timer_entry: vector=239
1 [006] d.h. 93470.150349: hrtimer_expire_entry: hrtimer=ffff97329f394a00 function=watchdog_timer_fn now=93468068002439
2 [006] dNh. 93470.150352: hrtimer_expire_exit: hrtimer=ffff97329f394a00
1 [006] dNh. 93470.150353: local_timer_exit: vector=239
0 [006] .N.. 93470.150354: cpu_idle: state=4294967295 cpu_id=6 <<<< This one is O.K.
2 [001] d... 93447.137840: cpu_idle: state=0 cpu_id=1
2 [001] d.h. 93447.137842: local_timer_entry: vector=239
1 [001] d.h. 93447.137843: hrtimer_expire_entry: hrtimer=ffffffffa3739998 function=sched_rt_period_timer now=93445056004391
2 [001] d.h. 93447.137845: hrtimer_expire_exit: hrtimer=ffffffffa3739998
0 [001] d.h. 93447.137845: local_timer_exit: vector=239
1372029 [001] .N.. 93448.509875: cpu_idle: state=4294967295 cpu_id=1 <<<<<< I don't know the exit cause for this one.
4 [001] d... 93489.022764: cpu_idle: state=0 cpu_id=1
0 [001] d.h. 93489.022764: local_timer_entry: vector=239
1 [001] d.h. 93489.022765: hrtimer_expire_entry: hrtimer=ffff97329f254880 function=tick_sched_timer now=93486940002114
2 [001] d.h. 93489.022768: hrtimer_expire_exit: hrtimer=ffff97329f254880
1 [001] d.h. 93489.022769: local_timer_exit: vector=239
0 [001] ..s. 93489.022770: timer_expire_entry: timer=ffffffffa378f260 function=clocksource_watchdog now=4318264032
3 [001] ..s. 93489.022773: timer_expire_exit: timer=ffffffffa378f260
1115989 [001] d.h. 93490.138763: local_timer_entry: vector=239
1 [001] d.h. 93490.138764: local_timer_exit: vector=239
25 [001] d.h. 93490.138789: local_timer_entry: vector=239
0 [001] d.h. 93490.138790: hrtimer_expire_entry: hrtimer=ffff97329f254a00 function=watchdog_timer_fn now=93488056002228
3 [001] dNh. 93490.138793: hrtimer_expire_exit: hrtimer=ffff97329f254a00
0 [001] dNh. 93490.138794: local_timer_exit: vector=239
1 [001] .N.. 93490.138795: cpu_idle: state=4294967295 cpu_id=1
Legend (copied):
# tracer: nop
#
# _-----=> irqs-off
# / _----=> need-resched <<<<<<< Important flag.
# | / _---=> hardirq/softirq
# || / _--=> preempt-depth
# ||| / delay
# TASK-PID CPU# |||| TIMESTAMP FUNCTION
# | | | |||| | |
<idle>-0 [005] d... 93446.826313: cpu_idle: state=4294967295 cpu_id=5
... Doug
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH] cpuidle: Allow menu governor to enter deeper sleep states after some time
2017-11-14 6:12 ` Doug Smythies
@ 2017-11-16 16:11 ` Thomas Ilsche
2017-11-16 22:47 ` Doug Smythies
` (2 subsequent siblings)
3 siblings, 0 replies; 25+ messages in thread
From: Thomas Ilsche @ 2017-11-16 16:11 UTC (permalink / raw)
To: Doug Smythies
Cc: 'Marcus Hähnel', 'Daniel Hackenberg',
'Robert Schöne', mario.bielert,
'Rafael J. Wysocki', 'Alex Shi',
'Ingo Molnar', 'Rik van Riel',
'Daniel Lezcano', 'Nicholas Piggin', linux-pm,
'Len Brown', 'Yu Chen'
Hi Doug,
thanks for taking another closer look at this and the specific timers.
> Actually, the watchdog_timer_fn does set the "need_resched" condition, and will
> cause the state 0 idle to exit normally.
>
> But yes, tick_sched_timer and a few others (for example: sched_rt_period_timer,
> clocksource_watchdog) do not set the "need_resched" condition, and, as you
> mentioned, will not cause the state 0 idle to exit as it should.
>
> Conclusion: Currently the exit condition in drivers/cpuidle/poll_state.c
> is insufficient to guarantee proper operation.
>
> This:
>
> while (!need_resched())
>
> is not enough.
I may very well have mistakenly included watchdog_timer_fn in the list,
but as you describe it is inconsequential. If there are timers that do
not set need_resched, and that itself is not considered a bug, then
there should be another break condition. I suppose it is a good idea
to differentiate between the need for rescheduling and the need to
be able to go into another sleep state.
>
> I was thinking that maybe the number of interrupts could also
> be a condition:
>
> while (!need_resched() && interrupt count has not changed)
>
> However, I haven't yet figured out how to code it, or even if there
> is an interrupt count per CPU that can be used.
What do you think about the idea to use idle_expires?
Although on second thought that may have issues regarding accuracy /
race conditions with the interrupt timer.
Best,
Thomas
^ permalink raw reply [flat|nested] 25+ messages in thread
* RE: [PATCH] cpuidle: Allow menu governor to enter deeper sleep states after some time
2017-11-14 6:12 ` Doug Smythies
2017-11-16 16:11 ` Thomas Ilsche
@ 2017-11-16 22:47 ` Doug Smythies
2017-11-24 17:36 ` Doug Smythies
2017-11-25 16:30 ` Doug Smythies
3 siblings, 0 replies; 25+ messages in thread
From: Doug Smythies @ 2017-11-16 22:47 UTC (permalink / raw)
To: 'Thomas Ilsche'
Cc: 'Marcus Hähnel', 'Daniel Hackenberg',
'Robert Schöne', mario.bielert,
'Rafael J. Wysocki', 'Alex Shi',
'Ingo Molnar', 'Rik van Riel',
'Daniel Lezcano', 'Nicholas Piggin', linux-pm,
'Len Brown', 'Yu Chen', Doug Smythies
On 2017.11.16 08:11 Thomas Ilsche wrote:
>> Actually, the watchdog_timer_fn does set the "need_resched" condition, and will
>> cause the state 0 idle to exit normally.
>>
>> But yes, tick_sched_timer and a few others (for example: sched_rt_period_timer,
>> clocksource_watchdog) do not set the "need_resched" condition, and, as you
>> mentioned, will not cause the state 0 idle to exit as it should.
>>
>> Conclusion: Currently the exit condition in drivers/cpuidle/poll_state.c
>> is insufficient to guarantee proper operation.
Or: Any interrupt out of the idle loop must return with "need_resched" set
>>
>> This:
>>
>> while (!need_resched())
>>
>> is not enough.
>
> I may very well have mistakenly included watchdog_timer_fn in the list,
> but as you describe it is inconsequential. If there are timers that do
> not set need_resched, and that itself is not considered a bug, then
> there should be another break condition.
> I suppose it is a good idea
> to differentiate between the need for rescheduling and the need to
> be able to go into another sleep state.
See patch below. I think both conditions are satisfied.
> What do you think about the idea to use idle_expires?
> Although on second thought that may have issues regarding accuracy /
> race conditions with the interrupt timer.
For a couple of days now, and with excellent results, I have
been testing variations on the following theme:
diff --git a/drivers/cpuidle/poll_state.c b/drivers/cpuidle/poll_state.c
index 7416b16..4d17d3d 100644
--- a/drivers/cpuidle/poll_state.c
+++ b/drivers/cpuidle/poll_state.c
@@ -5,16 +5,31 @@
*/
#include <linux/cpuidle.h>
+#include <linux/tick.h>
#include <linux/sched.h>
#include <linux/sched/idle.h>
static int __cpuidle poll_idle(struct cpuidle_device *dev,
struct cpuidle_driver *drv, int index)
{
+ unsigned int next_timer_us, i = 0;
+
local_irq_enable();
if (!current_set_polling_and_test()) {
- while (!need_resched())
+ while (!need_resched()) {
cpu_relax();
+
+ /* Occasionally check for a new and long expected residency time. */
+ if (!(i++ % 1024)) {
+ local_irq_disable();
+ next_timer_us = ktime_to_us(tick_nohz_get_sleep_length());
+ local_irq_enable();
+ /* need a better way to get threshold, including large margin */
+ /* We are only trying to catch really bad cases here. */
+ if (next_timer_us > 100)
+ break;
+ }
+ }
}
current_clr_polling();
Trace example 1:
9 [005] d... 1749.232242: cpu_idle: state=4 cpu_id=5
1055985 [005] d... 1750.288228: cpu_idle: state=4294967295 cpu_id=5
3 [005] d.h. 1750.288231: local_timer_entry: vector=239
1 [005] d.h. 1750.288233: local_timer_exit: vector=239
5 [005] d... 1750.288238: cpu_idle: state=0 cpu_id=5
0 [005] d.h. 1750.288238: local_timer_entry: vector=239
0 [005] d.h. 1750.288239: hrtimer_expire_entry: hrtimer=ffff91ca5f354880 function=tick_sched_timer now=1749980002791
3 [005] d.h. 1750.288242: hrtimer_expire_exit: hrtimer=ffff91ca5f354880
0 [005] d.h. 1750.288243: local_timer_exit: vector=239
1 [005] ..s. 1750.288244: timer_expire_entry: timer=ffffffffb4770ee0 function=__prandom_timer now=4295329792
4 [005] ..s. 1750.288249: timer_expire_exit: timer=ffffffffb4770ee0
5 [005] .... 1750.288254: cpu_idle: state=4294967295 cpu_id=5
"need_resched" is not set, but the next timer is far off, so poll_state.c with the above patch now exits.
And the governor now properly decides to go into idle state 4, because nothing is going to happen for an eternity.
1 [005] d... 1750.288256: cpu_idle: state=4 cpu_id=5
2087982 [005] d... 1752.376239: cpu_idle: state=4294967295 cpu_id=5
3 [005] d.h. 1752.376242: local_timer_entry: vector=239
0 [005] d.h. 1752.376243: local_timer_exit: vector=239
5 [005] d... 1752.376248: cpu_idle: state=1 cpu_id=5
15 [005] d... 1752.376263: cpu_idle: state=4294967295 cpu_id=5
0 [005] d.h. 1752.376263: local_timer_entry: vector=239
0 [005] d.h. 1752.376264: hrtimer_expire_entry: hrtimer=ffff91ca5f354a00 function=watchdog_timer_fn now=1752068001621
3 [005] dNh. 1752.376268: hrtimer_expire_exit: hrtimer=ffff91ca5f354a00
Trace example 2:
4 [000] d... 1792.272757: cpu_idle: state=0 cpu_id=0
1 [000] d.h. 1792.272758: local_timer_entry: vector=239
0 [000] d.h. 1792.272759: hrtimer_expire_entry: hrtimer=ffff91ca5f214880 function=tick_sched_timer now=1791964002768
3 [000] d.h. 1792.272762: hrtimer_expire_exit: hrtimer=ffff91ca5f214880
0 [000] d.h. 1792.272762: local_timer_exit: vector=239
The next timer is very short, so the poll_state.c loop does not exit.
(even if it was going to exit, it might not have had time to. I didn't find a better example.)
0 [000] ..s. 1792.272763: timer_expire_entry: timer=ffff91ca4cde8478 function=dev_watchdog now=4295340288
3 [000] ..s. 1792.272766: timer_expire_exit: timer=ffff91ca4cde8478
The next timer is very short, so the poll_state.c loop does not exit.
0 [000] d.s. 1792.272767: timer_expire_entry: timer=ffffffffc0997440 function=delayed_work_timer_fn now=4295340288
5 [000] dNs. 1792.272772: timer_expire_exit: timer=ffffffffc0997440
This time "need_resched" is set. I assume it didn't have time to exit idle state 0 yet.
0 [000] dNs. 1792.272772: timer_expire_entry: timer=ffffffffb46faa40 function=delayed_work_timer_fn now=4295340288
0 [000] dNs. 1792.272773: timer_expire_exit: timer=ffffffffb46faa40
Now it exits idle state 0.
7 [000] .N.. 1792.272780: cpu_idle: state=4294967295 cpu_id=0
29 [000] d... 1792.272810: cpu_idle: state=4 cpu_id=0
And the governor now properly decides to go into idle state 4, because nothing is going to happen for a while.
91949 [000] d... 1792.364760: cpu_idle: state=4294967295 cpu_id=0
3 [000] d.h. 1792.364763: local_timer_entry: vector=239
0 [000] d.h. 1792.364764: hrtimer_expire_entry: hrtimer=ffff91ca5f214a00 function=watchdog_timer_fn now=1792056006926
... Doug
^ permalink raw reply related [flat|nested] 25+ messages in thread
* RE: [PATCH] cpuidle: Allow menu governor to enter deeper sleep states after some time
2017-11-14 6:12 ` Doug Smythies
2017-11-16 16:11 ` Thomas Ilsche
2017-11-16 22:47 ` Doug Smythies
@ 2017-11-24 17:36 ` Doug Smythies
2017-12-02 12:56 ` Thomas Ilsche
2017-11-25 16:30 ` Doug Smythies
3 siblings, 1 reply; 25+ messages in thread
From: Doug Smythies @ 2017-11-24 17:36 UTC (permalink / raw)
To: 'Thomas Ilsche', 'Yu Chen'
Cc: 'Marcus Hähnel', 'Daniel Hackenberg',
'Robert Schöne', mario.bielert,
'Rafael J. Wysocki', 'Alex Shi',
'Ingo Molnar', 'Rik van Riel',
'Daniel Lezcano', 'Nicholas Piggin', linux-pm,
'Len Brown', Doug Smythies
@Yu: among other things, off-list you asked for some benchmark data. See below.
@Thomas: I did several phoronix tests with a kernel with both your patch,
for idle states deeper than 0, and my patch, specific to idle state 0.
Every test was done with a stock 4.14 kernel and a patched kernel with:
The Thomas part disabled; The default Thomas setting of 10000 uSec
timeout; A Thomas setting of 1000 uSec timeout; Idle states 0,1,2,3
disabled (my system max is state 4). I have yet to find a good phoronix
test to demonstrate the idle states deeper than 0 improvements. There was
never a degradation (other than already listed below) due to your patch.
@All: I am just trying to get some baseline data here, I am not suggesting
either patch is in a final form. For my idle state 0 patch (further below)
I seek help to move the concept to a real robust patch.
All power/energy measurements were processor package power as
measured with turbostat.
Conclusion: The most significant phoronix test improvements are for
single-threaded tests. This is not a surprise.
All test results listed are for the stock kernel versus the idle state 0 fix only:
Test 1: A constant 100% load on one CPU: 23% less energy.
Phoronix compress-lzma: 15.2% less energy; 3.6% performance improvement.
Phoronix encode-mp3: 3.5% less energy; 1% performance improvement.
Phoronix himeno: 13.3% less energy; 5% performance improvement. (Note: lots of test-to-test variability.)
Phoronix crafty: 4.6% less energy; 0.5% performance improvement.
Phoronix apache: 3% less energy; 3% performance improvement.
Phoronix sudokut: undetectable energy or performance change.
Phoronix iozone (1,1,1)(4Kb, 512MB, write): 3% less energy; undetectable performance change.
Phoronix mafft: 1.8% MORE energy; ~3.5% performance DEGRADATION. (investigation pending.)
Phoronix ffmpeg: undetectable energy or performance change.
Anticipated question: The energy improvements make sense, but
why the performance improvements?
Answer: Performance is actually slightly improved because when
idle state 0 powernightmares run on other cores, they reduce
the maximum turbo frequency of my processor.
Excerpt from turbostat output:
35 * 100.0 = 3500.0 MHz max turbo 4 active cores
36 * 100.0 = 3600.0 MHz max turbo 3 active cores
37 * 100.0 = 3700.0 MHz max turbo 2 active cores
38 * 100.0 = 3800.0 MHz max turbo 1 active cores
^ permalink raw reply [flat|nested] 25+ messages in thread
* RE: [PATCH] cpuidle: Allow menu governor to enter deeper sleep states after some time
2017-11-14 6:12 ` Doug Smythies
` (2 preceding siblings ...)
2017-11-24 17:36 ` Doug Smythies
@ 2017-11-25 16:30 ` Doug Smythies
3 siblings, 0 replies; 25+ messages in thread
From: Doug Smythies @ 2017-11-25 16:30 UTC (permalink / raw)
To: 'Thomas Ilsche', 'Yu Chen'
Cc: 'Marcus Hähnel', 'Daniel Hackenberg',
'Robert Schöne', mario.bielert,
'Rafael J. Wysocki', 'Alex Shi',
'Ingo Molnar', 'Rik van Riel',
'Daniel Lezcano', 'Nicholas Piggin', linux-pm,
'Len Brown', Doug Smythies
On 2017.11.24 09:36 Doug Smythies wrote:
> Phoronix mafft: 1.8% MORE energy; ~3.5% performance DEGRADATION. (investigation pending.)
Investigation complete. ~400 more Phoronix mafft tests were done. The revised results are:
Phoronix mafft: no consistent energy or performance change.
... Doug
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH] cpuidle: Allow menu governor to enter deeper sleep states after some time
2017-11-24 17:36 ` Doug Smythies
@ 2017-12-02 12:56 ` Thomas Ilsche
2017-12-15 10:44 ` Thomas Ilsche
0 siblings, 1 reply; 25+ messages in thread
From: Thomas Ilsche @ 2017-12-02 12:56 UTC (permalink / raw)
To: Doug Smythies, 'Len Brown'
Cc: 'Yu Chen', 'Marcus Hähnel',
'Daniel Hackenberg', 'Robert Schöne',
mario.bielert, 'Rafael J. Wysocki', 'Alex Shi',
'Ingo Molnar', 'Rik van Riel',
'Daniel Lezcano', 'Nicholas Piggin', linux-pm
Doug,
thank you so much for the detailed performance investigation! I think the
results are consistent with my expectations.
> @All: I am just trying to get some baseline data here, I am not suggesting
> either patch is in a final form. For my idle state 0 patch (further below)
> I seek help to move the concept to a real robust patch.
I would also like to help towards a joint patch that addresses both aspects.
It seems we need to figure out two key questions on which I would like to
get feedback from the experienced folks, particularly @Len Brown to make
sure the patch can be merged:
1) Regarding C1+ issues
The suggestion to keep the scheduling ticks running instead of using an
additional fallback timer seems very reasonable. But how can we achieve that
across the different software layers, particularly given that the tick-timer
is disabled early in the outer idle-loop in do_idle, whereas the decision is
made later in the governor?
2) Regarding C0 / poll_idle
What is a suitable abort criterion for the idle loop? We discussed
interrupt counts / tick_nohz_get_sleep_length / tick_sched->idle_expires
> @Yu: among other things, off-list you asked for some benchmark data. See below.
>
> @Thomas: I did several phoronix tests with a kernel with both your patch,
> for idle states deeper than 0, and my patch, specific to idle state 0.
> Every test was done with a stock 4.14 kernel and a patched kernel with:
> The Thomas part disabled; The default Thomas setting of 10000 uSec
> timeout; A Thomas setting of 1000 uSec timeout; Idle states 0,1,2,3
> disabled (my system max is state 4). I have yet to find a good phoronix
> test to demonstrate the idle states deeper than 0 improvements. There was
> never a degradation (other than already listed below) due to your patch.
I assume the benchmarks normally fully utilize at least one CPU. In that
case I would not expect a noticeable energy / performance difference.
The large differences in power consumption we observe in the original
scenario are due to package C-states - which are prevented by the active
core in the phoronix tests. The deep C-State powernightmares we noticed
are inherently difficult to benchmark: they show high variability, and lots
of factors can influence the trigger pattern. That's why we used the
synthetic code to generate quantitative numbers, while showing that they
do occur in principle on production systems.
Best,
Thomas
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH] cpuidle: Allow menu governor to enter deeper sleep states after some time
2017-12-02 12:56 ` Thomas Ilsche
@ 2017-12-15 10:44 ` Thomas Ilsche
2017-12-15 14:23 ` Rafael J. Wysocki
2017-12-15 16:16 ` Doug Smythies
0 siblings, 2 replies; 25+ messages in thread
From: Thomas Ilsche @ 2017-12-15 10:44 UTC (permalink / raw)
To: Doug Smythies, 'Len Brown'
Cc: 'Yu Chen', 'Marcus Hähnel',
'Daniel Hackenberg', 'Robert Schöne',
mario.bielert, 'Rafael J. Wysocki', 'Alex Shi',
'Ingo Molnar', 'Rik van Riel',
'Daniel Lezcano', 'Nicholas Piggin', linux-pm
Len, Doug,
we just saw very bad Powernightmares on a freshly installed dual socket 36 core
Skylake system. There is a particularly steep increase in AC system power from
69 W to 120 W just from having a single core not in the deepest state.
Together with high occurrence and persistence, this raises average idle power
from 69 W to 92 W. You can see a typical example power chart at [1] measured
externally right after the installation. This is a default installation of
Ubuntu 16.04, but upgrading to a 4.15.0-rc3 kernel provides no help.
This issue is probably wasting a couple of megawatts globally, and it's getting
worse with newer Intel processor generations. I would like to reinforce the offer
to help develop and test a patch, but the points below need to be addressed
by an experienced kernel developer / maintainer.
We acknowledge your suggestion to keep the scheduling ticks running instead
of using an additional fallback timer. Still, we need a solution with
manageable code changes without tightly coupling different software layers.
Unfortunately, the tick-timer is disabled early in the outer idle-loop in
do_idle(), but the C-state decision is taken in the governor.
In your opinion, what is the best solution?
(a) re-enable the tick-timer? (not a very clean solution)
(b) moving the tick-timer disabling? (could be very intrusive in terms of LOC,
we have no overview of which assumptions would break if we do this)
(c) use the fallback timer (not elegant, since there are already ticks)
Regarding the C0/poll_idle problem, what is suitable as abort criterion:
(a) interrupt counts
(b) tick_nohz_get_sleep_length
(c) tick_sched->idle_expires
Best,
Thomas
[1] https://wwwpub.zih.tu-dresden.de/~tilsche/powernightmares_skl.png
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH] cpuidle: Allow menu governor to enter deeper sleep states after some time
2017-12-15 10:44 ` Thomas Ilsche
@ 2017-12-15 14:23 ` Rafael J. Wysocki
2017-12-21 9:43 ` Thomas Ilsche
2017-12-15 16:16 ` Doug Smythies
1 sibling, 1 reply; 25+ messages in thread
From: Rafael J. Wysocki @ 2017-12-15 14:23 UTC (permalink / raw)
To: Thomas Ilsche
Cc: Doug Smythies, Len Brown, Yu Chen, Marcus Hähnel,
Daniel Hackenberg, Robert Schöne, mario.bielert,
Rafael J. Wysocki, Alex Shi, Ingo Molnar, Rik van Riel,
Daniel Lezcano, Nicholas Piggin, Linux PM, Peter Zijlstra,
Thomas Gleixner
On Fri, Dec 15, 2017 at 11:44 AM, Thomas Ilsche
<thomas.ilsche@tu-dresden.de> wrote:
> Len, Doug,
>
> we just saw very bad Powernightmares on a freshly installed dual socket 36
> core
Can you please refrain from using marketing language for describing
technical issues?
> Skylake system. There is a particularly steep increase in AC system power from
> 69 W to 120 W just from having a single core not in the deepest state.
> Together with high occurrence and persistence, this raises average idle
> power
> from 69 W to 92 W. You can see a typical example power chart at [1] measured
> externally right after the installation. This is a default installation of
> Ubuntu 16.04, but upgrading to a 4.15.0-rc3 kernel provides no help.
>
> This issue is probably wasting a couple of megawatts globally, and it's getting
> worse with newer Intel processor generations. I would like to reinforce the offer
> to help develop and test a patch, but the points below need to be addressed
> by an experienced kernel developer / maintainer.
>
> We acknowledge your suggestion to keep the scheduling ticks running instead
> of using an additional fallback timer. Still, we need a solution with
> manageable code changes without tightly coupling different software layers.
> Unfortunately, the tick-timer is disabled early in the outer idle-loop in
> do_idle(), but the C-state decision is taken in the governor.
>
> In your opinion, what is the best solution?
> (a) re-enable the tick-timer? (not a very clean solution)
> (b) moving the tick-timer disabling?
Yes.
> (could be very intrusive in terms of LOC,
> we have no overview of which assumptions would break if we do this)
Regardless, this is the long-term way to go IMO not just to address
this issue, but also to avoid unneeded overhead of stopping the tick.
The tick should only be stopped when we know for a fact that it is
worth the overhead which is only when we know that the target
residency of the idle state we want to enter is longer than the time
to the next tick. Note that this doesn't depend on the idle governor
in use at all.
And since this is the long-term way to go anyway, why don't we just do
it right away?
> (c) use the fallback timer (not elegant, since there are already ticks)
>
> Regarding the C0/poll_idle problem, what is suitable as abort criterion:
> (a) interrupt counts
> (b) tick_nohz_get_sleep_length
> (c) tick_sched->idle_expires
Can you please remind me what that problem is exactly?
Thanks,
Rafael
^ permalink raw reply [flat|nested] 25+ messages in thread
* RE: [PATCH] cpuidle: Allow menu governor to enter deeper sleep states after some time
2017-12-15 10:44 ` Thomas Ilsche
2017-12-15 14:23 ` Rafael J. Wysocki
@ 2017-12-15 16:16 ` Doug Smythies
2017-12-16 2:34 ` Rafael J. Wysocki
1 sibling, 1 reply; 25+ messages in thread
From: Doug Smythies @ 2017-12-15 16:16 UTC (permalink / raw)
To: 'Rafael J. Wysocki'
Cc: 'Len Brown', 'Yu Chen',
'Marcus Hähnel', 'Daniel Hackenberg',
'Robert Schöne', mario.bielert,
'Rafael J. Wysocki', 'Alex Shi',
'Ingo Molnar', 'Rik van Riel',
'Daniel Lezcano', 'Nicholas Piggin',
'Linux PM', 'Peter Zijlstra',
'Thomas Gleixner', 'Thomas Ilsche', Doug Smythies
On 2017.12.15 06:23 Rafael J. Wysocki wrote:
> On Fri, Dec 15, 2017 at 11:44 AM, Thomas Ilsche wrote:
>
>> we just saw very bad Powernightmares on a freshly installed dual socket 36
>> core
During the gap in this thread, I have seen some pretty bad ones also, but
not of the magnitude that you report.
> Can you please refrain from using marketing language for describing
> technical issues?
Ugh oh. I have just changed my web notes on this to use that term.
>> This issue is probably wasting a couple of megawatts globally,
I agree that there appears to be significant energy that can be saved.
>> I would like to reinforce the offer
>> to help develop and test a patch,
Me also, however and as previously mentioned, I am stuck.
>> We acknowledge your suggestion to keep the scheduling ticks running instead
>> of using an additional fallback timer. Still, we need a solution with
>> manageable code changes without tightly coupling different software layers.
>> Unfortunately, the tick-timer is disabled early in the outer idle-loop in
>> do_idle(), but the C-state decision is taken in the governor.
>>
>> In your opinion, what is the best solution?
>> (a) re-enable the tick-timer? (not a very clean solution)
>> (b) moving the tick-timer disabling?
>
> Yes.
>
>> (could be very intrusive in terms of LOC,
>> we have no overview of which assumptions would break if we do this)
>
> Regardless, this is the long-term way to go IMO not just to address
> this issue, but also to avoid unneeded overhead of stopping the tick.
>
> The tick should only be stopped when we know for a fact that it is
> worth the overhead which is only when we know that the target
> residency of the idle state we want to enter is longer than the time
> to the next tick. Note that this doesn't depend on the idle governor
> in use at all.
>
> And since this is the long-term way to go anyway, why don't we just do
> it right away?
Maybe that would also solve the C0 problem, I don't know.
>>
>> Regarding the C0/poll_idle problem, what is suitable as abort criterion:
>> (a) interrupt counts
>> (b) tick_nohz_get_sleep_length
>> (c) tick_sched->idle_expires
>
> Can you please remind me what that problem is exactly?
The C0 version of the problem is what I have been investigating for a long time
now.
In an off-list e-mail, Yu claims to have found the root issue. I disagree that
it explains all the long poll_idle scenarios I have found. Obviously, I would
test any patch when available.
The problem is within the main loop of drivers/cpuidle/poll_state.c:
while (!need_resched())
cpu_relax();
because not all ISRs set the need_resched flag, and therefore the CPU can end up
sitting in that loop for multiple seconds, instead of the expected few uSecs.
You may or may not recall that my main test scenario for this is a 100% load
on one CPU. Typically I observe something like for every unit of time
properly spent in idle state 0, the most energy costly idle state,
roughly 60,000 units of time are spent incorrectly in idle state 0, wasting
energy. Testing needs to be done over hours, because the issue tends to come
in clusters, aggravated by some sort of beat frequencies
between events.
My proposed solution was this:
diff --git a/drivers/cpuidle/poll_state.c b/drivers/cpuidle/poll_state.c
index 7416b16..4d17d3d 100644
--- a/drivers/cpuidle/poll_state.c
+++ b/drivers/cpuidle/poll_state.c
@@ -5,16 +5,31 @@
*/
#include <linux/cpuidle.h>
+#include <linux/tick.h>
#include <linux/sched.h>
#include <linux/sched/idle.h>
static int __cpuidle poll_idle(struct cpuidle_device *dev,
struct cpuidle_driver *drv, int index)
{
> + unsigned int next_timer_us, i = 0;
+
local_irq_enable();
if (!current_set_polling_and_test()) {
- while (!need_resched())
> + while (!need_resched()) {
cpu_relax();
+
+ /* Occasionally check for a new and long expected residency time. */
+ if (!(i++ % 1024)) {
+ local_irq_disable();
+ next_timer_us = ktime_to_us(tick_nohz_get_sleep_length());
+ local_irq_enable();
+ /* need a better way to get threshold, including large margin */
+ /* We are only trying to catch really bad cases here. */
+ if (next_timer_us > 100)
+ break;
+ }
+ }
}
current_clr_polling();
However,
I do not like that I am disabling and re-enabling local interrupts in some loop,
albeit only once per 1024 main loops. I do not know if there is another way to get
the same information without disabling interrupts.
I have not been able to figure out how to replace my "100" microseconds arbitrary
exit condition with some useful, machine-specific exit criteria.
The root issue for me is that I am not very familiar with the idle code.
... Doug
^ permalink raw reply related [flat|nested] 25+ messages in thread
* Re: [PATCH] cpuidle: Allow menu governor to enter deeper sleep states after some time
2017-12-15 16:16 ` Doug Smythies
@ 2017-12-16 2:34 ` Rafael J. Wysocki
0 siblings, 0 replies; 25+ messages in thread
From: Rafael J. Wysocki @ 2017-12-16 2:34 UTC (permalink / raw)
To: Doug Smythies
Cc: 'Rafael J. Wysocki', 'Len Brown',
'Yu Chen', 'Marcus Hähnel',
'Daniel Hackenberg', 'Robert Schöne',
mario.bielert, 'Rafael J. Wysocki', 'Alex Shi',
'Ingo Molnar', 'Rik van Riel',
'Daniel Lezcano', 'Nicholas Piggin',
'Linux PM', 'Peter Zijlstra',
'Thomas Gleixner', 'Thomas Ilsche'
On Friday, December 15, 2017 5:16:08 PM CET Doug Smythies wrote:
> On 2017.12.15 06:23 Rafael J. Wysocki wrote:
> > On Fri, Dec 15, 2017 at 11:44 AM, Thomas Ilsche wrote:
> >
> >> we just saw very bad Powernightmares on a freshly installed dual socket 36
> >> core
>
> During the gap in this thread, I have seen some pretty bad ones also, but
> not of the magnitude that you report.
>
> > Can you please refrain from using marketing language for describing
> > technical issues?
>
> Ugh oh. I have just changed my web notes on this to use that term.
>
> >> This issue is probably wasting a couple of megawatts globally,
>
> I agree that there appears to be significant energy that can be saved.
>
> >> I would like to reinforce the offer
> >> to help develop and test a patch,
>
> Me also, however and as previously mentioned, I am stuck.
>
> >> We acknowledge your suggestion to keep the scheduling ticks running instead
> >> of using an additional fallback timer. Still, we need a solution with
> >> manageable code changes without tightly coupling different software layers.
> >> Unfortunately, the tick-timer is disabled early in the outer idle-loop in
> >> do_idle(), but the C-state decision is taken in the governor.
> >>
> >> In your opinion, what is the best solution?
> >> (a) re-enable the tick-timer? (not a very clean solution)
> >> (b) moving the tick-timer disabling?
> >
> > Yes.
> >
> >> (could be very intrusive in terms of LOC,
> >> we have no overview of which assumptions would break if we do this)
> >
> > Regardless, this is the long-term way to go IMO not just to address
> > this issue, but also to avoid unneeded overhead of stopping the tick.
> >
> > The tick should only be stopped when we know for a fact that it is
> > worth the overhead which is only when we know that the target
> > residency of the idle state we want to enter is longer than the time
> > to the next tick. Note that this doesn't depend on the idle governor
> > in use at all.
> >
> > And since this is the long-term way to go anyway, why don't we just do
> > it right away?
>
> Maybe that would also solve the C0 problem, I don't know.
It should mitigate it somewhat at least, because it would add an upper bound
on the idle time for states with target residencies below the length of the
tick period.
We may need something on top of it, though.
> >>
> >> Regarding the C0/poll_idle problem, what is suitable as abort criterion:
> >> (a) interrupt counts
> >> (b) tick_nohz_get_sleep_length
> >> (c) tick_sched->idle_expires
> >
> > Can you please remind me what that problem is exactly?
>
> The C0 version of the problem is what I have been investigating for a long time
> now.
>
> In an off-list e-mail, Yu claims to have found the root issue. I disagree that
> it explains all the long poll_idle scenarios I have found. Obviously, I would
> test any patch when available.
>
> The problem is within the main loop of drivers/cpuidle/poll_state.c:
>
> while (!need_resched())
> cpu_relax();
>
> because not all ISRs set the need_resched flag, and therefore the CPU can end up
> sitting in that loop for multiple seconds, instead of the expected few uSecs.
I see, thanks!
> You may or may not recall that my main test scenario for this is a 100% load
> on one CPU. Typically I observe something like for every unit of time
> properly spent in idle state 0, the most energy costly idle state,
> roughly 60,000 units of time are spent incorrectly in idle state 0, wasting
> energy. Testing needs to be done over hours, because the issue tends to come
> in clusters, aggravated by some sort of beat frequencies
> between events.
>
> My proposed solution was this:
>
> diff --git a/drivers/cpuidle/poll_state.c b/drivers/cpuidle/poll_state.c
> index 7416b16..4d17d3d 100644
> --- a/drivers/cpuidle/poll_state.c
> +++ b/drivers/cpuidle/poll_state.c
> @@ -5,16 +5,31 @@
> */
>
> #include <linux/cpuidle.h>
> +#include <linux/tick.h>
> #include <linux/sched.h>
> #include <linux/sched/idle.h>
>
> static int __cpuidle poll_idle(struct cpuidle_device *dev,
> struct cpuidle_driver *drv, int index)
> {
> + unsigned int next_timer_us, i = 0;
> +
> local_irq_enable();
> if (!current_set_polling_and_test()) {
> - while (!need_resched())
> + while (!need_resched()) {
> cpu_relax();
> +
> + /* Occasionally check for a new and long expected residency time. */
> + if (!(i++ % 1024)) {
> + local_irq_disable();
> + next_timer_us = ktime_to_us(tick_nohz_get_sleep_length());
> + local_irq_enable();
> + /* need a better way to get threshold, including large margin */
> + /* We are only trying to catch really bad cases here. */
> + if (next_timer_us > 100)
> + break;
> + }
> + }
> }
> current_clr_polling();
>
> However,
> I do not like that I am disabling and re-enabling local interrupts in some loop,
> albeit only once per 1024 main loops. I do not know if there is another way to get
> the same information without disabling interrupts.
It need not be the same information.
I think you could just set a static time limit for staying in that loop.
> I have not been able to figure out how to replace my "100" microseconds arbitrary
> exit condition with some useful, machine-specific exit criteria.
I would make it a fraction of the tick period or similar.
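Something along these lines, as an untested sketch (local_clock() from
<linux/sched/clock.h>; the divisor and the polling check interval are
arbitrary here):

#define POLL_IDLE_TIME_LIMIT	(TICK_NSEC / 16)
#define POLL_IDLE_RELAX_COUNT	1024

	u64 time_start = local_clock();
	unsigned int loop_count = 0;

	while (!need_resched()) {
		cpu_relax();

		/* Read the clock only occasionally to keep the loop cheap. */
		if (loop_count++ < POLL_IDLE_RELAX_COUNT)
			continue;

		loop_count = 0;
		if (local_clock() - time_start > POLL_IDLE_TIME_LIMIT)
			break;
	}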
Anyway, as I said above, I would defer playing with this till after the
tick-related changes are made.
Thanks,
Rafael
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH] cpuidle: Allow menu governor to enter deeper sleep states after some time
2017-12-15 14:23 ` Rafael J. Wysocki
@ 2017-12-21 9:43 ` Thomas Ilsche
2017-12-22 19:37 ` Rafael J. Wysocki
0 siblings, 1 reply; 25+ messages in thread
From: Thomas Ilsche @ 2017-12-21 9:43 UTC (permalink / raw)
To: Rafael J. Wysocki
Cc: Doug Smythies, Len Brown, Yu Chen, Marcus Hähnel,
Daniel Hackenberg, Robert Schöne, mario.bielert,
Rafael J. Wysocki, Alex Shi, Ingo Molnar, Rik van Riel,
Daniel Lezcano, Nicholas Piggin, Linux PM, Peter Zijlstra,
Thomas Gleixner
Hi Rafael,
you mentioned upcoming tick-related changes. Could you please give me a quick
pointer or summary to the discussion?
> Regardless, this is the long-term way to go IMO not just to address
> this issue, but also to avoid unneeded overhead of stopping the tick.
>
> The tick should only be stopped when we know for a fact that it is
> worth the overhead which is only when we know that the target
> residency of the idle state we want to enter is longer than the time
> to the next tick. Note that this doesn't depend on the idle governor
> in use at all.
The residency is a bad condition. For example, the deepest sleep state on SKL-SP
has a target residency of 600 us, which is typically much less than a scheduling
tick (the tick period is 4 ms at HZ=250 and still 1 ms at HZ=1000). Your
suggestion would keep the tick timer enabled most of the time, effectively
disabling dyntick-idle.
One could consider keeping the sched-tick enabled for sleep states that are not
the deepest as per QOS requirements. Note that currently the QOS-check happens
in the governor. This would be sub-optimal if the deepest sleep state can hardly
be reached, e.g. on systems with very high residencies. Overall residency is not
a good criterion, as it's just the minimum time for the C-state to be effective -
sleeping for longer often improves energy efficiency a lot.
We looked at the possible combination of upcoming events (selected C-state
residency, next scheduling timer, next non-scheduling timer, heuristically
predicted wakeup time). It is more complicated than I originally thought, but we
think that the following decision would be desired:
if next_timer_is_sched {
    if heuristically_predicted_time < next_timer * alpha [0.9] {
        // ** Simple case **
        // This is a potential case where the selected sleep state is
        // suboptimal due to poor heuristic - we use the sched tick as fallback
        // to wakeup
        // If the prediction is correct, there is no need to disable the
        // scheduling tick, but if it is incorrect, we can use the scheduling
        // tick as a fallback wakeup
        // Alpha is used such that if the heuristic predicts the sched time
        // we can still disable the timer.
        keep_sched_timer_enabled();
    } else {
        // ** Complicated case **
        // Usually we must cancel the sched tick here, because it would shorten
        // our sleep time.
        disable_sched_timer();
        // Check if this is a potential case of a mispredicting heuristic
        // causing a long residency in a suboptimal sleep state
        // (a.k.a. a Powernightmare)
        // Unfortunately this is not easy to check and requires a trade-off
        // between false positives (unnecessary timer overhead)
        // and false negatives (certain amount of time spent in bad idle state)
        if (heuristically_predicted_time < next_non_sched_timer * beta [0.1])
            && (next_non_sched_timer > threshold)
            && (selected idle state < deepest_state_qos)
            enable_fallback_timer();
    }
} else {
    // We don't need to bother stopping the sched tick, another timer will wake
    // the CPU before anyway
    keep_sched_timer_enabled();
}
The residency of the selected sleep state doesn't play a role in the decision. I
would consider setting "threshold" (the maximum duration of a tolerated stay in a
bad sleep state) to the residency of the next deeper sleep state times some factor.
It also seems that the next non-sched timer is difficult to determine.
Reading it via tick_nohz_get_sleep_length would force the code to disable the
sched timer before being able to make the actual decision about which state to go to.
Unfortunately in the complicated case, the sched timer is not optimal as a
fallback, as it will fire before the heuristically predicted wakeup and possibly
even before the residency time of the selected sleep state. On the one hand it
may be required to set a timer to prevent a long stay in a suboptimal sleep
state; on the other hand the scheduling tick will decrease sleep time if the
heuristic is correct, and can even cause the selection of worse sleep states.
Therefore in this case a custom callback set to a time between the heuristic
prediction and the actual next timer, further in the future, would prevent issues.
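As an illustration only (reusing the fallback hrtimer from the posted patch;
predicted_us and next_timer_us are placeholder inputs, not existing variables):

	/*
	 * Arm the fallback wakeup between the heuristically predicted
	 * wakeup and the actual next timer, so that a correct prediction
	 * is not cut short while a misprediction is still bounded.
	 */
	u64 fallback_us = predicted_us + (next_timer_us - predicted_us) / 2;

	hrtimer_start(&data->fallback_timer,
		      ns_to_ktime(fallback_us * NSEC_PER_USEC),
		      HRTIMER_MODE_REL_PINNED);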
For a system where the deepest sleep state has a residency << 1/HZ, the simple
case is more likely than the complicated one. However, this is not true on all
systems.
Unfortunately this doesn't yet address the question of how to implement the
decision between the idle loop and the governor.
Best,
Thomas
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH] cpuidle: Allow menu governor to enter deeper sleep states after some time
2017-12-21 9:43 ` Thomas Ilsche
@ 2017-12-22 19:37 ` Rafael J. Wysocki
0 siblings, 0 replies; 25+ messages in thread
From: Rafael J. Wysocki @ 2017-12-22 19:37 UTC (permalink / raw)
To: Thomas Ilsche
Cc: Rafael J. Wysocki, Doug Smythies, Len Brown, Yu Chen,
Marcus Hähnel, Daniel Hackenberg, Robert Schöne,
mario.bielert, Rafael J. Wysocki, Alex Shi, Ingo Molnar,
Rik van Riel, Daniel Lezcano, Nicholas Piggin, Linux PM,
Peter Zijlstra, Thomas Gleixner, Li, Aubrey
On Thursday, December 21, 2017 10:43:20 AM CET Thomas Ilsche wrote:
> Hi Rafael,
>
> you mentioned upcoming tick-related changes. Could you please give me a quick
> pointer or summary to the discussion?
There is another problem with stopping the tick during idle entry before
invoking the governor, which happens if the governor predicts short idle
duration and it is right (note that the problem you are considering happens
when the governor predicts short idle duration and it is wrong). In that
case, stopping the tick and possibly reprogramming the timer adds overhead
that is completely unnecessary (and people have been complaining about that
for a long time AFAICS).
Aubrey has been working on this for quite some time, but his last proposal
still wasn't what we wanted. Basically, however, it is about invoking the
governor before the tick is stopped to avoid issues.
> > Regardless, this is the long-term way to go IMO not just to address
> > this issue, but also to avoid unneeded overhead of stopping the tick.
> >
> > The tick should only be stopped when we know for a fact that it is
> > worth the overhead which is only when we know that the target
> > residency of the idle state we want to enter is longer than the time
> > to the next tick. Note that this doesn't depend on the idle governor
> > in use at all.
>
> The residency is a bad condition. For example, the deepest sleep state on SKL-SP
> has a target residency of 600 us, which is typically much less than a scheduling
> tick. Your suggestion would keep the tick timer enabled most of the time,
> effectively disabling dyntick-idle.
>
> One could consider keeping the sched-tick enabled for sleep states that are not
> the deepest as per QOS requirements.
Quite obviously, the tick should still be stopped if the selected state is the
deepest one that can be used (either overall or because of QoS restrictions)
unless it is a "polling" state.
> Note that currently the QOS-check happens
> in the governor.
But since both the existing governors check it in the same way, this isn't
a limitation.
> This would be sub-optimal if the deepest sleep state can hardly
> be reached, e.g. on systems with very high residencies. Overall residency is not
> a good criterion, as it's just the minimum time for the C-state to be effective -
> sleeping for longer often improves energy efficiency a lot.
Unless there is a deeper state you can use, that is.
The logic there can be tweaked, but anyway adding an extra timer *after*
stopping the tick is not an option, because it adds more overhead to the
other case mentioned above in which there is too much overhead already.
> We looked at the possible combination of upcoming events (selected C-state
> residency, next scheduling timer, next non-scheduling timer, heuristically
> predicted wakeup time). It is more complicated than I originally thought, but we
> think that the following decision would be desired:
>
> if next_timer_is_sched {
>     if heuristically_predicted_time < next_timer * alpha [0.9] {
>         // ** Simple case **
>         // This is a potential case where the selected sleep state is
>         // suboptimal due to poor heuristic - we use the sched tick as fallback
>         // to wakeup
>         // If the prediction is correct, there is no need to disable the
>         // scheduling tick, but if it is incorrect, we can use the scheduling
>         // tick as a fallback wakeup
>         // Alpha is used such that if the heuristic predicts the sched time
>         // we can still disable the timer.
>         keep_sched_timer_enabled();
>     } else {
>         // ** Complicated case **
>         // Usually we must cancel the sched tick here, because it would shorten
>         // our sleep time.
>         disable_sched_timer();
>         // Check if this is a potential case of a mispredicting heuristic
>         // causing a long residency in a suboptimal sleep state
>         // (a.k.a. a Powernightmare)
>         // Unfortunately this is not easy to check and requires a trade-off
>         // between false positives (unnecessary timer overhead)
>         // and false negatives (certain amount of time spent in bad idle state)
>         if (heuristically_predicted_time < next_non_sched_timer * beta [0.1])
>             && (next_non_sched_timer > threshold)
>             && (selected idle state < deepest_state_qos)
>             enable_fallback_timer();
>     }
> } else {
>     // We don't need to bother stopping the sched tick, another timer will wake
>     // the CPU before anyway
>     keep_sched_timer_enabled();
> }
>
> The residency of the selected sleep state doesn't play a role in the decision.
At least in principle it should, because idle duration may still be mispredicted
at least occasionally (it is fundamentally nondeterministic after all) and there
should be some kind of safety net for that (which is missing today).
> I would consider setting "threshold" (the maximum duration of a tolerated stay in a
> bad sleep state) to the residency of the next deeper sleep state times some factor.
>
> It also seems that the next non-sched timer is difficult to determine.
> Reading it via tick_nohz_get_sleep_length would force the code to disable the
> sched timer before being able to make the actual decision about which state to go to.
>
> Unfortunately in the complicated case, the sched timer is not optimal as a
> fallback, as it will fire before the heuristically predicted wakeup and possibly
> even before the residency time of the selected sleep state. On the one hand it
> may be required to set a timer to prevent a long stay in a suboptimal sleep
> state; on the other hand the scheduling tick will decrease sleep time if the
> heuristic is correct, and can even cause the selection of worse sleep states.
> Therefore in this case a custom callback set to a time between the heuristic
> prediction and the actual next timer, further in the future, would prevent issues.
>
> For a system where the deepest sleep state has a residency << 1/HZ, the simple
> case is more likely than the complicated one. However, this is not true on all
> systems.
>
> Unfortunately this doesn't yet address the question of how to implement the
> decision between the idle loop and the governor.
I've been looking at this for some time and I have some ideas, but I still need
to do some verification, so I'm not yet ready to discuss them in detail ATM.
Thanks,
Rafael
^ permalink raw reply [flat|nested] 25+ messages in thread
end of thread
Thread overview: 25+ messages
[not found] <a181bf42-9462-476c-6dcd-39fc7151957f@tu-dresden.de>
2017-07-27 12:50 ` [PATCH] cpuidle: Allow menu governor to enter deeper sleep states after some time Thomas Ilsche
2017-10-19 7:46 ` Len Brown
2017-10-20 16:31 ` Thomas Ilsche
2017-10-21 14:27 ` Doug Smythies
2017-10-20 0:17 ` Doug Smythies
2017-10-20 17:13 ` Thomas Ilsche
2017-10-21 14:28 ` Doug Smythies
2017-11-07 23:04 ` Thomas Ilsche
2017-11-08 4:53 ` Len Brown
2017-11-08 6:01 ` Yu Chen
2017-11-08 16:26 ` Doug Smythies
2017-11-08 16:26 ` Doug Smythies
2017-11-10 17:42 ` Doug Smythies
2017-11-14 6:12 ` Doug Smythies
2017-11-16 16:11 ` Thomas Ilsche
2017-11-16 22:47 ` Doug Smythies
2017-11-24 17:36 ` Doug Smythies
2017-12-02 12:56 ` Thomas Ilsche
2017-12-15 10:44 ` Thomas Ilsche
2017-12-15 14:23 ` Rafael J. Wysocki
2017-12-21 9:43 ` Thomas Ilsche
2017-12-22 19:37 ` Rafael J. Wysocki
2017-12-15 16:16 ` Doug Smythies
2017-12-16 2:34 ` Rafael J. Wysocki
2017-11-25 16:30 ` Doug Smythies