[patch]: fixing of pmtimer overflow that make Cx states time incorrect

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* [patch]: fixing of pmtimer overflow that make Cx states time incorrect
@ 2009-01-19  5:13 alex.shi
  2009-01-27  6:52 ` Andrew Morton
  0 siblings, 1 reply; 9+ messages in thread
From: alex.shi @ 2009-01-19  5:13 UTC (permalink / raw)
  To: linux-kernel; +Cc: Shi, Alex, ";yakui.zhao"

[-- Attachment #1: Type: text/plain, Size: 8890 bytes --]

We found  Cx states time abnormal in our some of machines which have 16 LCPUs, the C0 take too many time while system is really idle when kernel enabled tickless and highres.  powertop output is below: 
     PowerTOP version 1.9       (C) 2007 Intel Corporation

Cn                Avg residency       P-states (frequencies)
C0 (cpu running)        (40.5%)         2.53 Ghz     0.0%
C1                0.0ms ( 0.0%)         2.53 Ghz     0.0%
C2              128.8ms (59.5%)         2.40 Ghz     0.0%
                                        1.60 Ghz   100.0%


Wakeups-from-idle per second :  4.7     interval: 20.0s
no ACPI power usage estimate available

Top causes for wakeups:
  41.4% ( 24.9)       <interrupt> : extra timer interrupt 
  20.2% ( 12.2)     <kernel core> : usb_hcd_poll_rh_status (rh_timer_func)


After tacking detailed for this issue, Yakui and I find it is due to 24 bit PM timer overflows when some of cpu sleep more than 4 seconds. With tickless kernel, the CPU want to sleep as much as possible when system idle. But the Cx sleep time are recorded by pmtimer which length is determined by BIOS. The current Cx time was gotten in the following function from driver/acpi/processor_idle.c:

static inline u32 ticks_elapsed(u32 t1, u32 t2)
{
       if (t2 >= t1)
               return (t2 - t1);
       else if (!(acpi_gbl_FADT.flags & ACPI_FADT_32BIT_TIMER))
               return (((0x00FFFFFF - t1) + t2) & 0x00FFFFFF);
       else
               return ((0xFFFFFFFF - t1) + t2);
}

If pmtimer is 24 bits and it take 5 seconds from t1 to t2, in above function, just about 1 seconds ticks was recorded. So the Cx time will be reduced about 4 seconds. and this is why we see above powertop output. 


To resolve this problem,  Yakui and I use ktime_get() to record the Cx states time instead of PM timer as the following patch. the patch was tested with i386/x86_64 modes on several platforms. 
and the Cx states time become good. this patch do not fully remove PM timer for Cx idle time recording. Maybe it can be recovered if the PM timer get improved in hardware.

Best Regards
Alex
Signed-off-by:  Alex Shi <alex.shi@intel.com>
Signed-off-by:  Yakui.zhao <yakui.zhao@intel.com>

---

Index: linux-2.6/drivers/acpi/processor_idle.c
===================================================================
--- linux-2.6.orig/drivers/acpi/processor_idle.c
+++ linux-2.6/drivers/acpi/processor_idle.c
@@ -185,25 +185,6 @@ static struct dmi_system_id __cpuinitdat
 	{},
 };
 
-static inline u32 ticks_elapsed(u32 t1, u32 t2)
-{
-	if (t2 >= t1)
-		return (t2 - t1);
-	else if (!(acpi_gbl_FADT.flags & ACPI_FADT_32BIT_TIMER))
-		return (((0x00FFFFFF - t1) + t2) & 0x00FFFFFF);
-	else
-		return ((0xFFFFFFFF - t1) + t2);
-}
-
-static inline u32 ticks_elapsed_in_us(u32 t1, u32 t2)
-{
-	if (t2 >= t1)
-		return PM_TIMER_TICKS_TO_US(t2 - t1);
-	else if (!(acpi_gbl_FADT.flags & ACPI_FADT_32BIT_TIMER))
-		return PM_TIMER_TICKS_TO_US(((0x00FFFFFF - t1) + t2) & 0x00FFFFFF);
-	else
-		return PM_TIMER_TICKS_TO_US((0xFFFFFFFF - t1) + t2);
-}
 
 /*
  * Callers should disable interrupts before the call and enable
@@ -397,7 +378,8 @@ static void acpi_processor_idle(void)
 	struct acpi_processor_cx *cx = NULL;
 	struct acpi_processor_cx *next_state = NULL;
 	int sleep_ticks = 0;
-	u32 t1, t2 = 0;
+	ktime_t  kt1, kt2;
+	u32 idle_time;
 
 	/*
 	 * Interrupts must be disabled during bus mastering calculations and
@@ -543,15 +525,16 @@ static void acpi_processor_idle(void)
 		break;
 
 	case ACPI_STATE_C2:
-		/* Get start time (ticks) */
-		t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+		/* Get start time  */
+		kt1 = ktime_get();
 		/* Tell the scheduler that we are going deep-idle: */
 		sched_clock_idle_sleep_event();
 		/* Invoke C2 */
 		acpi_state_timer_broadcast(pr, cx, 1);
 		acpi_cstate_enter(cx);
-		/* Get end time (ticks) */
-		t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+		/* Get end time */
+		kt2 = ktime_get();
+		idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
 
 #if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86)
 		/* TSC halts in C2, so notify users */
@@ -559,7 +542,7 @@ static void acpi_processor_idle(void)
 			mark_tsc_unstable("possible TSC halt in C2");
 #endif
 		/* Compute time (ticks) that we were actually asleep */
-		sleep_ticks = ticks_elapsed(t1, t2);
+		sleep_ticks = US_TO_PM_TIMER_TICKS( idle_time);
 
 		/* Tell the scheduler how much we idled: */
 		sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
@@ -604,14 +587,15 @@ static void acpi_processor_idle(void)
 			ACPI_FLUSH_CPU_CACHE();
 		}
 
-		/* Get start time (ticks) */
-		t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+		/* Get start time  */
+		kt1 = ktime_get();
 		/* Invoke C3 */
 		/* Tell the scheduler that we are going deep-idle: */
 		sched_clock_idle_sleep_event();
 		acpi_cstate_enter(cx);
-		/* Get end time (ticks) */
-		t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+		/* Get end time  */
+		kt2 = ktime_get();
+		idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
 		if (pr->flags.bm_check && pr->flags.bm_control) {
 			/* Enable bus master arbitration */
 			atomic_dec(&c3_cpu_count);
@@ -624,7 +608,7 @@ static void acpi_processor_idle(void)
 			mark_tsc_unstable("TSC halts in C3");
 #endif
 		/* Compute time (ticks) that we were actually asleep */
-		sleep_ticks = ticks_elapsed(t1, t2);
+		sleep_ticks = US_TO_PM_TIMER_TICKS( idle_time);
 		/* Tell the scheduler how much we idled: */
 		sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
 
@@ -1455,7 +1439,8 @@ static inline void acpi_idle_do_entry(st
 static int acpi_idle_enter_c1(struct cpuidle_device *dev,
 			      struct cpuidle_state *state)
 {
-	u32 t1, t2;
+	ktime_t  kt1, kt2;
+	u32 idle_time;
 	struct acpi_processor *pr;
 	struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
 
@@ -1476,14 +1461,15 @@ static int acpi_idle_enter_c1(struct cpu
 	if (pr->flags.bm_check)
 		acpi_idle_update_bm_rld(pr, cx);
 
-	t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+	kt1 = ktime_get();
 	acpi_idle_do_entry(cx);
-	t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+	kt2 = ktime_get();
+	idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
 
 	local_irq_enable();
 	cx->usage++;
 
-	return ticks_elapsed_in_us(t1, t2);
+	return idle_time;
 }
 
 /**
@@ -1496,7 +1482,8 @@ static int acpi_idle_enter_simple(struct
 {
 	struct acpi_processor *pr;
 	struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
-	u32 t1, t2;
+	ktime_t  kt1, kt2;
+	u32 idle_time;
 	int sleep_ticks = 0;
 
 	pr = __get_cpu_var(processors);
@@ -1533,18 +1520,19 @@ static int acpi_idle_enter_simple(struct
 	if (cx->type == ACPI_STATE_C3)
 		ACPI_FLUSH_CPU_CACHE();
 
-	t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+	kt1 = ktime_get();
 	/* Tell the scheduler that we are going deep-idle: */
 	sched_clock_idle_sleep_event();
 	acpi_idle_do_entry(cx);
-	t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+	kt2 = ktime_get();
+	idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
 
 #if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86)
 	/* TSC could halt in idle, so notify users */
 	if (tsc_halts_in_c(cx->type))
 		mark_tsc_unstable("TSC halts in idle");;
 #endif
-	sleep_ticks = ticks_elapsed(t1, t2);
+	sleep_ticks = US_TO_PM_TIMER_TICKS( idle_time);
 
 	/* Tell the scheduler how much we idled: */
 	sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
@@ -1556,7 +1544,7 @@ static int acpi_idle_enter_simple(struct
 
 	acpi_state_timer_broadcast(pr, cx, 0);
 	cx->time += sleep_ticks;
-	return ticks_elapsed_in_us(t1, t2);
+	return idle_time;
 }
 
 static int c3_cpu_count;
@@ -1574,7 +1562,8 @@ static int acpi_idle_enter_bm(struct cpu
 {
 	struct acpi_processor *pr;
 	struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
-	u32 t1, t2;
+	ktime_t  kt1, kt2;
+	u32 idle_time;
 	int sleep_ticks = 0;
 
 	pr = __get_cpu_var(processors);
@@ -1644,9 +1633,10 @@ static int acpi_idle_enter_bm(struct cpu
 		ACPI_FLUSH_CPU_CACHE();
 	}
 
-	t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+	kt1 = ktime_get();
 	acpi_idle_do_entry(cx);
-	t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+	kt2 = ktime_get();
+	idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
 
 	/* Re-enable bus master arbitration */
 	if (pr->flags.bm_check && pr->flags.bm_control) {
@@ -1661,7 +1651,7 @@ static int acpi_idle_enter_bm(struct cpu
 	if (tsc_halts_in_c(ACPI_STATE_C3))
 		mark_tsc_unstable("TSC halts in idle");
 #endif
-	sleep_ticks = ticks_elapsed(t1, t2);
+	sleep_ticks = US_TO_PM_TIMER_TICKS(idle_time);
 	/* Tell the scheduler how much we idled: */
 	sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
 
@@ -1672,7 +1662,7 @@ static int acpi_idle_enter_bm(struct cpu
 
 	acpi_state_timer_broadcast(pr, cx, 0);
 	cx->time += sleep_ticks;
-	return ticks_elapsed_in_us(t1, t2);
+	return idle_time;
 }
 
 struct cpuidle_driver acpi_idle_driver = {












[-- Attachment #2: ctime-ck.patch --]
[-- Type: text/x-patch, Size: 6666 bytes --]

Index: linux-2.6/drivers/acpi/processor_idle.c
===================================================================
--- linux-2.6.orig/drivers/acpi/processor_idle.c
+++ linux-2.6/drivers/acpi/processor_idle.c
@@ -185,25 +185,6 @@ static struct dmi_system_id __cpuinitdat
 	{},
 };
 
-static inline u32 ticks_elapsed(u32 t1, u32 t2)
-{
-	if (t2 >= t1)
-		return (t2 - t1);
-	else if (!(acpi_gbl_FADT.flags & ACPI_FADT_32BIT_TIMER))
-		return (((0x00FFFFFF - t1) + t2) & 0x00FFFFFF);
-	else
-		return ((0xFFFFFFFF - t1) + t2);
-}
-
-static inline u32 ticks_elapsed_in_us(u32 t1, u32 t2)
-{
-	if (t2 >= t1)
-		return PM_TIMER_TICKS_TO_US(t2 - t1);
-	else if (!(acpi_gbl_FADT.flags & ACPI_FADT_32BIT_TIMER))
-		return PM_TIMER_TICKS_TO_US(((0x00FFFFFF - t1) + t2) & 0x00FFFFFF);
-	else
-		return PM_TIMER_TICKS_TO_US((0xFFFFFFFF - t1) + t2);
-}
 
 /*
  * Callers should disable interrupts before the call and enable
@@ -397,7 +378,8 @@ static void acpi_processor_idle(void)
 	struct acpi_processor_cx *cx = NULL;
 	struct acpi_processor_cx *next_state = NULL;
 	int sleep_ticks = 0;
-	u32 t1, t2 = 0;
+	ktime_t  kt1, kt2;
+	u32 idle_time;
 
 	/*
 	 * Interrupts must be disabled during bus mastering calculations and
@@ -543,15 +525,16 @@ static void acpi_processor_idle(void)
 		break;
 
 	case ACPI_STATE_C2:
-		/* Get start time (ticks) */
-		t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+		/* Get start time  */
+		kt1 = ktime_get();
 		/* Tell the scheduler that we are going deep-idle: */
 		sched_clock_idle_sleep_event();
 		/* Invoke C2 */
 		acpi_state_timer_broadcast(pr, cx, 1);
 		acpi_cstate_enter(cx);
-		/* Get end time (ticks) */
-		t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+		/* Get end time */
+		kt2 = ktime_get();
+		idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
 
 #if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86)
 		/* TSC halts in C2, so notify users */
@@ -559,7 +542,7 @@ static void acpi_processor_idle(void)
 			mark_tsc_unstable("possible TSC halt in C2");
 #endif
 		/* Compute time (ticks) that we were actually asleep */
-		sleep_ticks = ticks_elapsed(t1, t2);
+		sleep_ticks = US_TO_PM_TIMER_TICKS( idle_time);
 
 		/* Tell the scheduler how much we idled: */
 		sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
@@ -604,14 +587,15 @@ static void acpi_processor_idle(void)
 			ACPI_FLUSH_CPU_CACHE();
 		}
 
-		/* Get start time (ticks) */
-		t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+		/* Get start time  */
+		kt1 = ktime_get();
 		/* Invoke C3 */
 		/* Tell the scheduler that we are going deep-idle: */
 		sched_clock_idle_sleep_event();
 		acpi_cstate_enter(cx);
-		/* Get end time (ticks) */
-		t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+		/* Get end time  */
+		kt2 = ktime_get();
+		idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
 		if (pr->flags.bm_check && pr->flags.bm_control) {
 			/* Enable bus master arbitration */
 			atomic_dec(&c3_cpu_count);
@@ -624,7 +608,7 @@ static void acpi_processor_idle(void)
 			mark_tsc_unstable("TSC halts in C3");
 #endif
 		/* Compute time (ticks) that we were actually asleep */
-		sleep_ticks = ticks_elapsed(t1, t2);
+		sleep_ticks = US_TO_PM_TIMER_TICKS( idle_time);
 		/* Tell the scheduler how much we idled: */
 		sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
 
@@ -1455,7 +1439,8 @@ static inline void acpi_idle_do_entry(st
 static int acpi_idle_enter_c1(struct cpuidle_device *dev,
 			      struct cpuidle_state *state)
 {
-	u32 t1, t2;
+	ktime_t  kt1, kt2;
+	u32 idle_time;
 	struct acpi_processor *pr;
 	struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
 
@@ -1476,14 +1461,15 @@ static int acpi_idle_enter_c1(struct cpu
 	if (pr->flags.bm_check)
 		acpi_idle_update_bm_rld(pr, cx);
 
-	t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+	kt1 = ktime_get();
 	acpi_idle_do_entry(cx);
-	t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+	kt2 = ktime_get();
+	idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
 
 	local_irq_enable();
 	cx->usage++;
 
-	return ticks_elapsed_in_us(t1, t2);
+	return idle_time;
 }
 
 /**
@@ -1496,7 +1482,8 @@ static int acpi_idle_enter_simple(struct
 {
 	struct acpi_processor *pr;
 	struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
-	u32 t1, t2;
+	ktime_t  kt1, kt2;
+	u32 idle_time;
 	int sleep_ticks = 0;
 
 	pr = __get_cpu_var(processors);
@@ -1533,18 +1520,19 @@ static int acpi_idle_enter_simple(struct
 	if (cx->type == ACPI_STATE_C3)
 		ACPI_FLUSH_CPU_CACHE();
 
-	t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+	kt1 = ktime_get();
 	/* Tell the scheduler that we are going deep-idle: */
 	sched_clock_idle_sleep_event();
 	acpi_idle_do_entry(cx);
-	t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+	kt2 = ktime_get();
+	idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
 
 #if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86)
 	/* TSC could halt in idle, so notify users */
 	if (tsc_halts_in_c(cx->type))
 		mark_tsc_unstable("TSC halts in idle");;
 #endif
-	sleep_ticks = ticks_elapsed(t1, t2);
+	sleep_ticks = US_TO_PM_TIMER_TICKS( idle_time);
 
 	/* Tell the scheduler how much we idled: */
 	sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
@@ -1556,7 +1544,7 @@ static int acpi_idle_enter_simple(struct
 
 	acpi_state_timer_broadcast(pr, cx, 0);
 	cx->time += sleep_ticks;
-	return ticks_elapsed_in_us(t1, t2);
+	return idle_time;
 }
 
 static int c3_cpu_count;
@@ -1574,7 +1562,8 @@ static int acpi_idle_enter_bm(struct cpu
 {
 	struct acpi_processor *pr;
 	struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
-	u32 t1, t2;
+	ktime_t  kt1, kt2;
+	u32 idle_time;
 	int sleep_ticks = 0;
 
 	pr = __get_cpu_var(processors);
@@ -1644,9 +1633,10 @@ static int acpi_idle_enter_bm(struct cpu
 		ACPI_FLUSH_CPU_CACHE();
 	}
 
-	t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+	kt1 = ktime_get();
 	acpi_idle_do_entry(cx);
-	t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+	kt2 = ktime_get();
+	idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
 
 	/* Re-enable bus master arbitration */
 	if (pr->flags.bm_check && pr->flags.bm_control) {
@@ -1661,7 +1651,7 @@ static int acpi_idle_enter_bm(struct cpu
 	if (tsc_halts_in_c(ACPI_STATE_C3))
 		mark_tsc_unstable("TSC halts in idle");
 #endif
-	sleep_ticks = ticks_elapsed(t1, t2);
+	sleep_ticks = US_TO_PM_TIMER_TICKS(idle_time);
 	/* Tell the scheduler how much we idled: */
 	sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
 
@@ -1672,7 +1662,7 @@ static int acpi_idle_enter_bm(struct cpu
 
 	acpi_state_timer_broadcast(pr, cx, 0);
 	cx->time += sleep_ticks;
-	return ticks_elapsed_in_us(t1, t2);
+	return idle_time;
 }
 
 struct cpuidle_driver acpi_idle_driver = {

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [patch]: fixing of pmtimer overflow that make Cx states time incorrect
  2009-01-19  5:13 [patch]: fixing of pmtimer overflow that make Cx states time incorrect alex.shi
@ 2009-01-27  6:52 ` Andrew Morton
  2009-02-01  2:04   ` yakui_zhao
  0 siblings, 1 reply; 9+ messages in thread
From: Andrew Morton @ 2009-01-27  6:52 UTC (permalink / raw)
  To: alex.shi; +Cc: linux-kernel, yakui.zhao, linux-acpi


The fix looks reasonable to me.

Please always cc linux-acpi@vger.kernel.org on acpi patches.

The patch was quite a mess: inlined code plus an attachment, mangled
email headers, funny characters in the email body, 300-column lines,
etc.

I reproduce a cleaned up copy below.

It's a bit odd that several of these functions (for example,
acpi_idle_enter_bm()) return number-of-microseconds in a plain `int'. 
It should be an unsigned scalar - `unsigned int', `u32', `u64', etc.

Also, all those functions have kerneldoc comments, but none of them
document the function's return value, which is a rather important part
of the interface!

Oh well.



From: Alex Shi <alex.shi@intel.com>

We found Cx states time abnormal in our some of machines which have 16
LCPUs, the C0 take too many time while system is really idle when kernel
enabled tickless and highres.  powertop output is below: 

     PowerTOP version 1.9       (C) 2007 Intel Corporation

Cn                Avg residency       P-states (frequencies)
C0 (cpu running)        (40.5%)         2.53 Ghz     0.0%
C1                0.0ms ( 0.0%)         2.53 Ghz     0.0%
C2              128.8ms (59.5%)         2.40 Ghz     0.0%
                                        1.60 Ghz   100.0%


Wakeups-from-idle per second :  4.7     interval: 20.0s
no ACPI power usage estimate available

Top causes for wakeups:
  41.4% ( 24.9)       <interrupt> : extra timer interrupt 
  20.2% ( 12.2)     <kernel core> : usb_hcd_poll_rh_status (rh_timer_func)


After tacking detailed for this issue, Yakui and I find it is due to 24
bit PM timer overflows when some of cpu sleep more than 4 seconds.  With
tickless kernel, the CPU want to sleep as much as possible when system
idle.  But the Cx sleep time are recorded by pmtimer which length is
determined by BIOS.  The current Cx time was gotten in the following
function from driver/acpi/processor_idle.c:

static inline u32 ticks_elapsed(u32 t1, u32 t2)
{
       if (t2 >= t1)
               return (t2 - t1);
       else if (!(acpi_gbl_FADT.flags & ACPI_FADT_32BIT_TIMER))
               return (((0x00FFFFFF - t1) + t2) & 0x00FFFFFF);
       else
               return ((0xFFFFFFFF - t1) + t2);
}

If pmtimer is 24 bits and it take 5 seconds from t1 to t2, in above
function, just about 1 seconds ticks was recorded.  So the Cx time will be
reduced about 4 seconds.  and this is why we see above powertop output.  


To resolve this problem, Yakui and I use ktime_get() to record the Cx
states time instead of PM timer as the following patch.  the patch was
tested with i386/x86_64 modes on several platforms.  and the Cx states
time become good.  this patch do not fully remove PM timer for Cx idle
time recording.  Maybe it can be recovered if the PM timer get improved in
hardware.

Signed-off-by: Alex Shi <alex.shi@intel.com>
Signed-off-by: Yakui.zhao <yakui.zhao@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 drivers/acpi/processor_idle.c |   78 +++++++++++++-------------------
 1 file changed, 34 insertions(+), 44 deletions(-)

diff -puN drivers/acpi/processor_idle.c~fixing-of-pmtimer-overflow-that-make-cx-states-time-incorrect drivers/acpi/processor_idle.c
--- a/drivers/acpi/processor_idle.c~fixing-of-pmtimer-overflow-that-make-cx-states-time-incorrect
+++ a/drivers/acpi/processor_idle.c
@@ -185,25 +185,6 @@ static struct dmi_system_id __cpuinitdat
 	{},
 };
 
-static inline u32 ticks_elapsed(u32 t1, u32 t2)
-{
-	if (t2 >= t1)
-		return (t2 - t1);
-	else if (!(acpi_gbl_FADT.flags & ACPI_FADT_32BIT_TIMER))
-		return (((0x00FFFFFF - t1) + t2) & 0x00FFFFFF);
-	else
-		return ((0xFFFFFFFF - t1) + t2);
-}
-
-static inline u32 ticks_elapsed_in_us(u32 t1, u32 t2)
-{
-	if (t2 >= t1)
-		return PM_TIMER_TICKS_TO_US(t2 - t1);
-	else if (!(acpi_gbl_FADT.flags & ACPI_FADT_32BIT_TIMER))
-		return PM_TIMER_TICKS_TO_US(((0x00FFFFFF - t1) + t2) & 0x00FFFFFF);
-	else
-		return PM_TIMER_TICKS_TO_US((0xFFFFFFFF - t1) + t2);
-}
 
 /*
  * Callers should disable interrupts before the call and enable
@@ -397,7 +378,8 @@ static void acpi_processor_idle(void)
 	struct acpi_processor_cx *cx = NULL;
 	struct acpi_processor_cx *next_state = NULL;
 	int sleep_ticks = 0;
-	u32 t1, t2 = 0;
+	ktime_t  kt1, kt2;
+	u32 idle_time;
 
 	/*
 	 * Interrupts must be disabled during bus mastering calculations and
@@ -543,15 +525,16 @@ static void acpi_processor_idle(void)
 		break;
 
 	case ACPI_STATE_C2:
-		/* Get start time (ticks) */
-		t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+		/* Get start time  */
+		kt1 = ktime_get();
 		/* Tell the scheduler that we are going deep-idle: */
 		sched_clock_idle_sleep_event();
 		/* Invoke C2 */
 		acpi_state_timer_broadcast(pr, cx, 1);
 		acpi_cstate_enter(cx);
-		/* Get end time (ticks) */
-		t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+		/* Get end time */
+		kt2 = ktime_get();
+		idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
 
 #if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86)
 		/* TSC halts in C2, so notify users */
@@ -559,7 +542,7 @@ static void acpi_processor_idle(void)
 			mark_tsc_unstable("possible TSC halt in C2");
 #endif
 		/* Compute time (ticks) that we were actually asleep */
-		sleep_ticks = ticks_elapsed(t1, t2);
+		sleep_ticks = US_TO_PM_TIMER_TICKS( idle_time);
 
 		/* Tell the scheduler how much we idled: */
 		sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
@@ -604,14 +587,15 @@ static void acpi_processor_idle(void)
 			ACPI_FLUSH_CPU_CACHE();
 		}
 
-		/* Get start time (ticks) */
-		t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+		/* Get start time  */
+		kt1 = ktime_get();
 		/* Invoke C3 */
 		/* Tell the scheduler that we are going deep-idle: */
 		sched_clock_idle_sleep_event();
 		acpi_cstate_enter(cx);
-		/* Get end time (ticks) */
-		t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+		/* Get end time  */
+		kt2 = ktime_get();
+		idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
 		if (pr->flags.bm_check && pr->flags.bm_control) {
 			/* Enable bus master arbitration */
 			atomic_dec(&c3_cpu_count);
@@ -624,7 +608,7 @@ static void acpi_processor_idle(void)
 			mark_tsc_unstable("TSC halts in C3");
 #endif
 		/* Compute time (ticks) that we were actually asleep */
-		sleep_ticks = ticks_elapsed(t1, t2);
+		sleep_ticks = US_TO_PM_TIMER_TICKS( idle_time);
 		/* Tell the scheduler how much we idled: */
 		sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
 
@@ -1455,7 +1439,8 @@ static inline void acpi_idle_do_entry(st
 static int acpi_idle_enter_c1(struct cpuidle_device *dev,
 			      struct cpuidle_state *state)
 {
-	u32 t1, t2;
+	ktime_t  kt1, kt2;
+	u32 idle_time;
 	struct acpi_processor *pr;
 	struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
 
@@ -1476,14 +1461,15 @@ static int acpi_idle_enter_c1(struct cpu
 	if (pr->flags.bm_check)
 		acpi_idle_update_bm_rld(pr, cx);
 
-	t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+	kt1 = ktime_get();
 	acpi_idle_do_entry(cx);
-	t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+	kt2 = ktime_get();
+	idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
 
 	local_irq_enable();
 	cx->usage++;
 
-	return ticks_elapsed_in_us(t1, t2);
+	return idle_time;
 }
 
 /**
@@ -1496,7 +1482,8 @@ static int acpi_idle_enter_simple(struct
 {
 	struct acpi_processor *pr;
 	struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
-	u32 t1, t2;
+	ktime_t  kt1, kt2;
+	u32 idle_time;
 	int sleep_ticks = 0;
 
 	pr = __get_cpu_var(processors);
@@ -1533,18 +1520,19 @@ static int acpi_idle_enter_simple(struct
 	if (cx->type == ACPI_STATE_C3)
 		ACPI_FLUSH_CPU_CACHE();
 
-	t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+	kt1 = ktime_get();
 	/* Tell the scheduler that we are going deep-idle: */
 	sched_clock_idle_sleep_event();
 	acpi_idle_do_entry(cx);
-	t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+	kt2 = ktime_get();
+	idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
 
 #if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86)
 	/* TSC could halt in idle, so notify users */
 	if (tsc_halts_in_c(cx->type))
 		mark_tsc_unstable("TSC halts in idle");;
 #endif
-	sleep_ticks = ticks_elapsed(t1, t2);
+	sleep_ticks = US_TO_PM_TIMER_TICKS( idle_time);
 
 	/* Tell the scheduler how much we idled: */
 	sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
@@ -1556,7 +1544,7 @@ static int acpi_idle_enter_simple(struct
 
 	acpi_state_timer_broadcast(pr, cx, 0);
 	cx->time += sleep_ticks;
-	return ticks_elapsed_in_us(t1, t2);
+	return idle_time;
 }
 
 static int c3_cpu_count;
@@ -1574,7 +1562,8 @@ static int acpi_idle_enter_bm(struct cpu
 {
 	struct acpi_processor *pr;
 	struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
-	u32 t1, t2;
+	ktime_t  kt1, kt2;
+	u32 idle_time;
 	int sleep_ticks = 0;
 
 	pr = __get_cpu_var(processors);
@@ -1644,9 +1633,10 @@ static int acpi_idle_enter_bm(struct cpu
 		ACPI_FLUSH_CPU_CACHE();
 	}
 
-	t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+	kt1 = ktime_get();
 	acpi_idle_do_entry(cx);
-	t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+	kt2 = ktime_get();
+	idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
 
 	/* Re-enable bus master arbitration */
 	if (pr->flags.bm_check && pr->flags.bm_control) {
@@ -1661,7 +1651,7 @@ static int acpi_idle_enter_bm(struct cpu
 	if (tsc_halts_in_c(ACPI_STATE_C3))
 		mark_tsc_unstable("TSC halts in idle");
 #endif
-	sleep_ticks = ticks_elapsed(t1, t2);
+	sleep_ticks = US_TO_PM_TIMER_TICKS(idle_time);
 	/* Tell the scheduler how much we idled: */
 	sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
 
@@ -1672,7 +1662,7 @@ static int acpi_idle_enter_bm(struct cpu
 
 	acpi_state_timer_broadcast(pr, cx, 0);
 	cx->time += sleep_ticks;
-	return ticks_elapsed_in_us(t1, t2);
+	return idle_time;
 }
 
 struct cpuidle_driver acpi_idle_driver = {
_


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [patch]: fixing of pmtimer overflow that make Cx states time incorrect
  2009-01-27  6:52 ` Andrew Morton
@ 2009-02-01  2:04   ` yakui_zhao
  2009-02-01 10:03     ` alex.shi
  2009-02-02  1:46     ` alex.shi
  0 siblings, 2 replies; 9+ messages in thread
From: yakui_zhao @ 2009-02-01  2:04 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Shi, Alex, linux-kernel@vger.kernel.org,
	linux-acpi@vger.kernel.org

On Tue, 2009-01-27 at 14:52 +0800, Andrew Morton wrote:
> The fix looks reasonable to me.
> 
> Please always cc linux-acpi@vger.kernel.org on acpi patches.
> 
> The patch was quite a mess: inlined code plus an attachment, mangled
> email headers, funny characters in the email body, 300-column lines,
> etc.
> 
> I reproduce a cleaned up copy below.
Thanks for the clean up.
But it seems that there exists a potential overflow.
In the clean-up patch the data type of idle_time is defined as u32.
And the idle_time will be converted to sleep_tick by using the macro
definition of US_TO_PM_TIMER. 
>#define US_TO_PM_TIMER_TICKS(t) ((t*(PM_TIMER_FREQUENCY/1000))/1000)
 If the idle_time is more than 4.687s, the overflow will happen. 
 So the interval variable(idle_time) had better be defined as 64-bit
type.  

In fact the following patch is already updated based on the Venki's
suggestion. 
    The monotonic time is not required while getting the C-state sleep
state. In such case the function of ktime_get_real is enough 

thanks.

   
> 
> It's a bit odd that several of these functions (for example,
> acpi_idle_enter_bm()) return number-of-microseconds in a plain `int'. 
> It should be an unsigned scalar - `unsigned int', `u32', `u64', etc.

> 
> Also, all those functions have kerneldoc comments, but none of them
> document the function's return value, which is a rather important part
> of the interface!
> 
> Oh well.
> 
> 
> 
> From: Alex Shi <alex.shi@intel.com>
> 
> We found Cx states time abnormal in our some of machines which have 16
> LCPUs, the C0 take too many time while system is really idle when kernel
> enabled tickless and highres.  powertop output is below: 
> 
>      PowerTOP version 1.9       (C) 2007 Intel Corporation
> 
> Cn                Avg residency       P-states (frequencies)
> C0 (cpu running)        (40.5%)         2.53 Ghz     0.0%
> C1                0.0ms ( 0.0%)         2.53 Ghz     0.0%
> C2              128.8ms (59.5%)         2.40 Ghz     0.0%
>                                         1.60 Ghz   100.0%
> 
> 
> Wakeups-from-idle per second :  4.7     interval: 20.0s
> no ACPI power usage estimate available
> 
> Top causes for wakeups:
>   41.4% ( 24.9)       <interrupt> : extra timer interrupt 
>   20.2% ( 12.2)     <kernel core> : usb_hcd_poll_rh_status (rh_timer_func)
> 
> 
> After tacking detailed for this issue, Yakui and I find it is due to 24
> bit PM timer overflows when some of cpu sleep more than 4 seconds.  With
> tickless kernel, the CPU want to sleep as much as possible when system
> idle.  But the Cx sleep time are recorded by pmtimer which length is
> determined by BIOS.  The current Cx time was gotten in the following
> function from driver/acpi/processor_idle.c:
> 
> static inline u32 ticks_elapsed(u32 t1, u32 t2)
> {
>        if (t2 >= t1)
>                return (t2 - t1);
>        else if (!(acpi_gbl_FADT.flags & ACPI_FADT_32BIT_TIMER))
>                return (((0x00FFFFFF - t1) + t2) & 0x00FFFFFF);
>        else
>                return ((0xFFFFFFFF - t1) + t2);
> }
> 
> If pmtimer is 24 bits and it take 5 seconds from t1 to t2, in above
> function, just about 1 seconds ticks was recorded.  So the Cx time will be
> reduced about 4 seconds.  and this is why we see above powertop output.  
> 
> 
> To resolve this problem, Yakui and I use ktime_get() to record the Cx
> states time instead of PM timer as the following patch.  the patch was
> tested with i386/x86_64 modes on several platforms.  and the Cx states
> time become good.  this patch do not fully remove PM timer for Cx idle
> time recording.  Maybe it can be recovered if the PM timer get improved in
> hardware.
> 
> Signed-off-by: Alex Shi <alex.shi@intel.com>
> Signed-off-by: Yakui.zhao <yakui.zhao@intel.com>
> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
> ---
> 
>  drivers/acpi/processor_idle.c |   78 +++++++++++++-------------------
>  1 file changed, 34 insertions(+), 44 deletions(-)
> 
> diff -puN drivers/acpi/processor_idle.c~fixing-of-pmtimer-overflow-that-make-cx-states-time-incorrect drivers/acpi/processor_idle.c
> --- a/drivers/acpi/processor_idle.c~fixing-of-pmtimer-overflow-that-make-cx-states-time-incorrect
> +++ a/drivers/acpi/processor_idle.c
> @@ -185,25 +185,6 @@ static struct dmi_system_id __cpuinitdat
>  	{},
>  };
>  
> -static inline u32 ticks_elapsed(u32 t1, u32 t2)
> -{
> -	if (t2 >= t1)
> -		return (t2 - t1);
> -	else if (!(acpi_gbl_FADT.flags & ACPI_FADT_32BIT_TIMER))
> -		return (((0x00FFFFFF - t1) + t2) & 0x00FFFFFF);
> -	else
> -		return ((0xFFFFFFFF - t1) + t2);
> -}
> -
> -static inline u32 ticks_elapsed_in_us(u32 t1, u32 t2)
> -{
> -	if (t2 >= t1)
> -		return PM_TIMER_TICKS_TO_US(t2 - t1);
> -	else if (!(acpi_gbl_FADT.flags & ACPI_FADT_32BIT_TIMER))
> -		return PM_TIMER_TICKS_TO_US(((0x00FFFFFF - t1) + t2) & 0x00FFFFFF);
> -	else
> -		return PM_TIMER_TICKS_TO_US((0xFFFFFFFF - t1) + t2);
> -}
>  
>  /*
>   * Callers should disable interrupts before the call and enable
> @@ -397,7 +378,8 @@ static void acpi_processor_idle(void)
>  	struct acpi_processor_cx *cx = NULL;
>  	struct acpi_processor_cx *next_state = NULL;
>  	int sleep_ticks = 0;
> -	u32 t1, t2 = 0;
> +	ktime_t  kt1, kt2;
> +	u32 idle_time;
>  
>  	/*
>  	 * Interrupts must be disabled during bus mastering calculations and
> @@ -543,15 +525,16 @@ static void acpi_processor_idle(void)
>  		break;
>  
>  	case ACPI_STATE_C2:
> -		/* Get start time (ticks) */
> -		t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
> +		/* Get start time  */
> +		kt1 = ktime_get();
>  		/* Tell the scheduler that we are going deep-idle: */
>  		sched_clock_idle_sleep_event();
>  		/* Invoke C2 */
>  		acpi_state_timer_broadcast(pr, cx, 1);
>  		acpi_cstate_enter(cx);
> -		/* Get end time (ticks) */
> -		t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
> +		/* Get end time */
> +		kt2 = ktime_get();
> +		idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
>  
>  #if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86)
>  		/* TSC halts in C2, so notify users */
> @@ -559,7 +542,7 @@ static void acpi_processor_idle(void)
>  			mark_tsc_unstable("possible TSC halt in C2");
>  #endif
>  		/* Compute time (ticks) that we were actually asleep */
> -		sleep_ticks = ticks_elapsed(t1, t2);
> +		sleep_ticks = US_TO_PM_TIMER_TICKS( idle_time);
>  
>  		/* Tell the scheduler how much we idled: */
>  		sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
> @@ -604,14 +587,15 @@ static void acpi_processor_idle(void)
>  			ACPI_FLUSH_CPU_CACHE();
>  		}
>  
> -		/* Get start time (ticks) */
> -		t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
> +		/* Get start time  */
> +		kt1 = ktime_get();
>  		/* Invoke C3 */
>  		/* Tell the scheduler that we are going deep-idle: */
>  		sched_clock_idle_sleep_event();
>  		acpi_cstate_enter(cx);
> -		/* Get end time (ticks) */
> -		t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
> +		/* Get end time  */
> +		kt2 = ktime_get();
> +		idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
>  		if (pr->flags.bm_check && pr->flags.bm_control) {
>  			/* Enable bus master arbitration */
>  			atomic_dec(&c3_cpu_count);
> @@ -624,7 +608,7 @@ static void acpi_processor_idle(void)
>  			mark_tsc_unstable("TSC halts in C3");
>  #endif
>  		/* Compute time (ticks) that we were actually asleep */
> -		sleep_ticks = ticks_elapsed(t1, t2);
> +		sleep_ticks = US_TO_PM_TIMER_TICKS( idle_time);
>  		/* Tell the scheduler how much we idled: */
>  		sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
>  
> @@ -1455,7 +1439,8 @@ static inline void acpi_idle_do_entry(st
>  static int acpi_idle_enter_c1(struct cpuidle_device *dev,
>  			      struct cpuidle_state *state)
>  {
> -	u32 t1, t2;
> +	ktime_t  kt1, kt2;
> +	u32 idle_time;
>  	struct acpi_processor *pr;
>  	struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
>  
> @@ -1476,14 +1461,15 @@ static int acpi_idle_enter_c1(struct cpu
>  	if (pr->flags.bm_check)
>  		acpi_idle_update_bm_rld(pr, cx);
>  
> -	t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
> +	kt1 = ktime_get();
>  	acpi_idle_do_entry(cx);
> -	t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
> +	kt2 = ktime_get();
> +	idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
>  
>  	local_irq_enable();
>  	cx->usage++;
>  
> -	return ticks_elapsed_in_us(t1, t2);
> +	return idle_time;
>  }
>  
>  /**
> @@ -1496,7 +1482,8 @@ static int acpi_idle_enter_simple(struct
>  {
>  	struct acpi_processor *pr;
>  	struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
> -	u32 t1, t2;
> +	ktime_t  kt1, kt2;
> +	u32 idle_time;
>  	int sleep_ticks = 0;
>  
>  	pr = __get_cpu_var(processors);
> @@ -1533,18 +1520,19 @@ static int acpi_idle_enter_simple(struct
>  	if (cx->type == ACPI_STATE_C3)
>  		ACPI_FLUSH_CPU_CACHE();
>  
> -	t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
> +	kt1 = ktime_get();
>  	/* Tell the scheduler that we are going deep-idle: */
>  	sched_clock_idle_sleep_event();
>  	acpi_idle_do_entry(cx);
> -	t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
> +	kt2 = ktime_get();
> +	idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
>  
>  #if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86)
>  	/* TSC could halt in idle, so notify users */
>  	if (tsc_halts_in_c(cx->type))
>  		mark_tsc_unstable("TSC halts in idle");;
>  #endif
> -	sleep_ticks = ticks_elapsed(t1, t2);
> +	sleep_ticks = US_TO_PM_TIMER_TICKS( idle_time);
>  
>  	/* Tell the scheduler how much we idled: */
>  	sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
> @@ -1556,7 +1544,7 @@ static int acpi_idle_enter_simple(struct
>  
>  	acpi_state_timer_broadcast(pr, cx, 0);
>  	cx->time += sleep_ticks;
> -	return ticks_elapsed_in_us(t1, t2);
> +	return idle_time;
>  }
>  
>  static int c3_cpu_count;
> @@ -1574,7 +1562,8 @@ static int acpi_idle_enter_bm(struct cpu
>  {
>  	struct acpi_processor *pr;
>  	struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
> -	u32 t1, t2;
> +	ktime_t  kt1, kt2;
> +	u32 idle_time;
>  	int sleep_ticks = 0;
>  
>  	pr = __get_cpu_var(processors);
> @@ -1644,9 +1633,10 @@ static int acpi_idle_enter_bm(struct cpu
>  		ACPI_FLUSH_CPU_CACHE();
>  	}
>  
> -	t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
> +	kt1 = ktime_get();
>  	acpi_idle_do_entry(cx);
> -	t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
> +	kt2 = ktime_get();
> +	idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
>  
>  	/* Re-enable bus master arbitration */
>  	if (pr->flags.bm_check && pr->flags.bm_control) {
> @@ -1661,7 +1651,7 @@ static int acpi_idle_enter_bm(struct cpu
>  	if (tsc_halts_in_c(ACPI_STATE_C3))
>  		mark_tsc_unstable("TSC halts in idle");
>  #endif
> -	sleep_ticks = ticks_elapsed(t1, t2);
> +	sleep_ticks = US_TO_PM_TIMER_TICKS(idle_time);
>  	/* Tell the scheduler how much we idled: */
>  	sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
>  
> @@ -1672,7 +1662,7 @@ static int acpi_idle_enter_bm(struct cpu
>  
>  	acpi_state_timer_broadcast(pr, cx, 0);
>  	cx->time += sleep_ticks;
> -	return ticks_elapsed_in_us(t1, t2);
> +	return idle_time;
>  }
>  
>  struct cpuidle_driver acpi_idle_driver = {
> _
> 


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [patch]: fixing of pmtimer overflow that make Cx states time incorrect
  2009-02-01  2:04   ` yakui_zhao
@ 2009-02-01 10:03     ` alex.shi
  2009-02-02  1:46     ` alex.shi
  1 sibling, 0 replies; 9+ messages in thread
From: alex.shi @ 2009-02-01 10:03 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-acpi@vger.kernel.org, ; linux-kernel@vger.kernel.org,
	";venkatesh.pallipadi"

On Sun, 2009-02-01 at 10:04 +0800, Zhao, Yakui wrote:
> On Tue, 2009-01-27 at 14:52 +0800, Andrew Morton wrote:
> > The fix looks reasonable to me.
> > 
> > Please always cc linux-acpi@vger.kernel.org on acpi patches.
> > 
> > The patch was quite a mess: inlined code plus an attachment, mangled
> > email headers, funny characters in the email body, 300-column lines,
> > etc.
> > 
> > I reproduce a cleaned up copy below.
> Thanks for the clean up.
> But it seems that there exists a potential overflow.
> In the clean-up patch the data type of idle_time is defined as u32.
> And the idle_time will be converted to sleep_tick by using the macro
> definition of US_TO_PM_TIMER. 
> >#define US_TO_PM_TIMER_TICKS(t) ((t*(PM_TIMER_FREQUENCY/1000))/1000)
>  If the idle_time is more than 4.687s, the overflow will happen. 
>  So the interval variable(idle_time) had better be defined as 64-bit
> type.  
> 
> In fact the following patch is already updated based on the Venki's
> suggestion. 
>     The monotonic time is not required while getting the C-state sleep
> state. In such case the function of ktime_get_real is enough 
> 
> thanks.
> 
yes, this kind of overflow will cause a little Cx sleep time omit. So we
try to use s64 bit idle_time to present the Cx sleep time. For this
purpose we need to use div64_u64 to convert US_TO_PM_TICKS in i386
mode. So we rewrite this patch on 2.6.29-rc3 as following. Please revert
previous version before patch it. 

Alex 
> It's a bit odd that several of these functions (for example,
> acpi_idle_enter_bm()) return number-of-microseconds in a plain `int'. 
> It should be an unsigned scalar - `unsigned int', `u32', `u64', etc.

> 
> Also, all those functions have kerneldoc comments, but none of them
> document the function's return value, which is a rather important part
> of the interface!
> 
> Oh well.
> 
> 
> 
> From: Alex Shi <alex.shi@intel.com>
> 
> We found Cx states time abnormal in our some of machines which have 16
> LCPUs, the C0 take too many time while system is really idle when
kernel
> enabled tickless and highres.  powertop output is below: 
> 
>      PowerTOP version 1.9       (C) 2007 Intel Corporation
> 
> Cn                Avg residency       P-states (frequencies)
> C0 (cpu running)        (40.5%)         2.53 Ghz     0.0%
> C1                0.0ms ( 0.0%)         2.53 Ghz     0.0%
> C2              128.8ms (59.5%)         2.40 Ghz     0.0%
>                                         1.60 Ghz   100.0%
> 
> 
> Wakeups-from-idle per second :  4.7     interval: 20.0s
> no ACPI power usage estimate available
> 
> Top causes for wakeups:
>   41.4% ( 24.9)       <interrupt> : extra timer interrupt 
>   20.2% ( 12.2)     <kernel core> : usb_hcd_poll_rh_status
(rh_timer_func)
> 
> 
> After tacking detailed for this issue, Yakui and I find it is due to
24
> bit PM timer overflows when some of cpu sleep more than 4 seconds.
With
> tickless kernel, the CPU want to sleep as much as possible when system
> idle.  But the Cx sleep time are recorded by pmtimer which length is
> determined by BIOS.  The current Cx time was gotten in the following
> function from driver/acpi/processor_idle.c:
> 
> static inline u32 ticks_elapsed(u32 t1, u32 t2)
> {
>        if (t2 >= t1)
>                return (t2 - t1);
>        else if (!(acpi_gbl_FADT.flags & ACPI_FADT_32BIT_TIMER))
>                return (((0x00FFFFFF - t1) + t2) & 0x00FFFFFF);
>        else
>                return ((0xFFFFFFFF - t1) + t2);
> }
> 
> If pmtimer is 24 bits and it take 5 seconds from t1 to t2, in above
> function, just about 1 seconds ticks was recorded.  So the Cx time
will be
> reduced about 4 seconds.  and this is why we see above powertop
output.  
> 
> 
> To resolve this problem, Yakui and I use ktime_get() to record the Cx
> states time instead of PM timer as the following patch.  the patch was
> tested with i386/x86_64 modes on several platforms.  and the Cx states
> time become good.  this patch do not fully remove PM timer for Cx idle
> time recording.  Maybe it can be recovered if the PM timer get
improved in
> hardware.

Signed-off-by: Alex Shi <alex.shi@intel.com>
Signed-off-by: Yakui.zhao <yakui.zhao@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
Index: linux-2.6.29-rc3/drivers/acpi/processor_idle.c
===================================================================
--- linux-2.6.29-rc3.orig/drivers/acpi/processor_idle.c
+++ linux-2.6.29-rc3/drivers/acpi/processor_idle.c
@@ -64,7 +64,8 @@
 #define _COMPONENT              ACPI_PROCESSOR_COMPONENT
 ACPI_MODULE_NAME("processor_idle");
 #define ACPI_PROCESSOR_FILE_POWER	"power"
-#define US_TO_PM_TIMER_TICKS(t)		((t * (PM_TIMER_FREQUENCY/1000)) / 1000)
+#define US_TO_PM_TIMER_TICKS(t)		div64_u64(\
+	(t * (PM_TIMER_FREQUENCY/1000)), 1000ULL)
 #define PM_TIMER_TICK_NS		(1000000000ULL/PM_TIMER_FREQUENCY)
 #ifndef CONFIG_CPU_IDLE
 #define C2_OVERHEAD			4	/* 1us (3.579 ticks per us) */
@@ -396,8 +397,9 @@ static void acpi_processor_idle(void)
 	struct acpi_processor *pr = NULL;
 	struct acpi_processor_cx *cx = NULL;
 	struct acpi_processor_cx *next_state = NULL;
-	int sleep_ticks = 0;
-	u32 t1, t2 = 0;
+	s64 sleep_ticks = 0;
+	ktime_t  kt1, kt2;
+	s64 idle_time;
 
 	/*
 	 * Interrupts must be disabled during bus mastering calculations and
@@ -544,14 +546,15 @@ static void acpi_processor_idle(void)
 
 	case ACPI_STATE_C2:
 		/* Get start time (ticks) */
-		t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+		kt1 = ktime_get_real();
 		/* Tell the scheduler that we are going deep-idle: */
 		sched_clock_idle_sleep_event();
 		/* Invoke C2 */
 		acpi_state_timer_broadcast(pr, cx, 1);
 		acpi_cstate_enter(cx);
 		/* Get end time (ticks) */
-		t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+		kt2 = ktime_get_real();
+		idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
 
 #if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86)
 		/* TSC halts in C2, so notify users */
@@ -559,7 +562,7 @@ static void acpi_processor_idle(void)
 			mark_tsc_unstable("possible TSC halt in C2");
 #endif
 		/* Compute time (ticks) that we were actually asleep */
-		sleep_ticks = ticks_elapsed(t1, t2);
+		sleep_ticks = US_TO_PM_TIMER_TICKS(idle_time);
 
 		/* Tell the scheduler how much we idled: */
 		sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
@@ -605,13 +608,14 @@ static void acpi_processor_idle(void)
 		}
 
 		/* Get start time (ticks) */
-		t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+		kt1 = ktime_get_real();
 		/* Invoke C3 */
 		/* Tell the scheduler that we are going deep-idle: */
 		sched_clock_idle_sleep_event();
 		acpi_cstate_enter(cx);
 		/* Get end time (ticks) */
-		t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+		kt2 = ktime_get_real();
+		idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
 		if (pr->flags.bm_check && pr->flags.bm_control) {
 			/* Enable bus master arbitration */
 			atomic_dec(&c3_cpu_count);
@@ -624,7 +628,7 @@ static void acpi_processor_idle(void)
 			mark_tsc_unstable("TSC halts in C3");
 #endif
 		/* Compute time (ticks) that we were actually asleep */
-		sleep_ticks = ticks_elapsed(t1, t2);
+		sleep_ticks = US_TO_PM_TIMER_TICKS(idle_time);
 		/* Tell the scheduler how much we idled: */
 		sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
 
@@ -1455,7 +1459,8 @@ static inline void acpi_idle_do_entry(st
 static int acpi_idle_enter_c1(struct cpuidle_device *dev,
 			      struct cpuidle_state *state)
 {
-	u32 t1, t2;
+	ktime_t  kt1, kt2;
+	s64 idle_time;
 	struct acpi_processor *pr;
 	struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
 
@@ -1476,14 +1481,15 @@ static int acpi_idle_enter_c1(struct cpu
 	if (pr->flags.bm_check)
 		acpi_idle_update_bm_rld(pr, cx);
 
-	t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+	kt1 = ktime_get_real();
 	acpi_idle_do_entry(cx);
-	t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+	kt2 = ktime_get_real();
+	idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
 
 	local_irq_enable();
 	cx->usage++;
 
-	return ticks_elapsed_in_us(t1, t2);
+	return idle_time;
 }
 
 /**
@@ -1496,8 +1502,9 @@ static int acpi_idle_enter_simple(struct
 {
 	struct acpi_processor *pr;
 	struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
-	u32 t1, t2;
-	int sleep_ticks = 0;
+	ktime_t  kt1, kt2;
+	s64 idle_time;
+	s64 sleep_ticks = 0;
 
 	pr = __get_cpu_var(processors);
 
@@ -1533,18 +1540,19 @@ static int acpi_idle_enter_simple(struct
 	if (cx->type == ACPI_STATE_C3)
 		ACPI_FLUSH_CPU_CACHE();
 
-	t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+	kt1 = ktime_get_real();
 	/* Tell the scheduler that we are going deep-idle: */
 	sched_clock_idle_sleep_event();
 	acpi_idle_do_entry(cx);
-	t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+	kt2 = ktime_get_real();
+	idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
 
 #if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86)
 	/* TSC could halt in idle, so notify users */
 	if (tsc_halts_in_c(cx->type))
 		mark_tsc_unstable("TSC halts in idle");;
 #endif
-	sleep_ticks = ticks_elapsed(t1, t2);
+	sleep_ticks = US_TO_PM_TIMER_TICKS(idle_time);
 
 	/* Tell the scheduler how much we idled: */
 	sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
@@ -1556,7 +1564,7 @@ static int acpi_idle_enter_simple(struct
 
 	acpi_state_timer_broadcast(pr, cx, 0);
 	cx->time += sleep_ticks;
-	return ticks_elapsed_in_us(t1, t2);
+	return idle_time;
 }
 
 static int c3_cpu_count;
@@ -1574,8 +1582,9 @@ static int acpi_idle_enter_bm(struct cpu
 {
 	struct acpi_processor *pr;
 	struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
-	u32 t1, t2;
-	int sleep_ticks = 0;
+	ktime_t  kt1, kt2;
+	s64 idle_time;
+	s64 sleep_ticks = 0;
 
 	pr = __get_cpu_var(processors);
 
@@ -1644,9 +1653,10 @@ static int acpi_idle_enter_bm(struct cpu
 		ACPI_FLUSH_CPU_CACHE();
 	}
 
-	t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+	kt1 = ktime_get_real();
 	acpi_idle_do_entry(cx);
-	t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+	kt2 = ktime_get_real();
+	idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
 
 	/* Re-enable bus master arbitration */
 	if (pr->flags.bm_check && pr->flags.bm_control) {
@@ -1661,7 +1671,7 @@ static int acpi_idle_enter_bm(struct cpu
 	if (tsc_halts_in_c(ACPI_STATE_C3))
 		mark_tsc_unstable("TSC halts in idle");
 #endif
-	sleep_ticks = ticks_elapsed(t1, t2);
+	sleep_ticks = US_TO_PM_TIMER_TICKS(idle_time);
 	/* Tell the scheduler how much we idled: */
 	sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
 
@@ -1672,7 +1682,7 @@ static int acpi_idle_enter_bm(struct cpu
 
 	acpi_state_timer_broadcast(pr, cx, 0);
 	cx->time += sleep_ticks;
-	return ticks_elapsed_in_us(t1, t2);
+	return idle_time;
 }
 
 struct cpuidle_driver acpi_idle_driver = {



^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [patch]: fixing of pmtimer overflow that make Cx states time incorrect
  2009-02-01  2:04   ` yakui_zhao
  2009-02-01 10:03     ` alex.shi
@ 2009-02-02  1:46     ` alex.shi
  2009-02-03  6:18       ` yakui_zhao
                         ` (2 more replies)
  1 sibling, 3 replies; 9+ messages in thread
From: alex.shi @ 2009-02-02  1:46 UTC (permalink / raw)
  To: Andrew Morton, "; lenb"
  Cc: linux-acpi@vger.kernel.org, ; linux-kernel@vger.kernel.org,
	"; venkatesh.pallipadi"

Yagui want to give a clear explanation to be used for commitment. So I resend
this again. 

    On most boxes the ACPI PM timer is 24-bit counter that runs on 3.579545MHz
clock. In such case the max C-state sleep time should be less than 4687ms when
it is used to record C2/C3 duration time. 
    But on some boxes the max C-state sleep time is more than 4687ms. In such
case the overflow happens and the C-state duration time can't be counted
accurately.

    Use clocksource to get the C-state time instead of ACPI PM timer. and use
div64_u64 to convert US_TO_PM_TIME_TICKS in i386 mode.

Thanks
Alex 

Asked-by: venkatesh.pallipadi@intel.com
Tested-by: Alex Shi <alex.shi@intel.com>
Signed-off-by: Alex Shi <alex.shi@intel.com>
Signed-off-by: Yakui.zhao <yakui.zhao@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

---


Index: linux-2.6.29-rc3/drivers/acpi/processor_idle.c
===================================================================
--- linux-2.6.29-rc3.orig/drivers/acpi/processor_idle.c
+++ linux-2.6.29-rc3/drivers/acpi/processor_idle.c
@@ -64,7 +64,8 @@
 #define _COMPONENT              ACPI_PROCESSOR_COMPONENT
 ACPI_MODULE_NAME("processor_idle");
 #define ACPI_PROCESSOR_FILE_POWER	"power"
-#define US_TO_PM_TIMER_TICKS(t)		((t * (PM_TIMER_FREQUENCY/1000)) / 1000)
+#define US_TO_PM_TIMER_TICKS(t)		div64_u64(\
+	(t * (PM_TIMER_FREQUENCY/1000)), 1000ULL)
 #define PM_TIMER_TICK_NS		(1000000000ULL/PM_TIMER_FREQUENCY)
 #ifndef CONFIG_CPU_IDLE
 #define C2_OVERHEAD			4	/* 1us (3.579 ticks per us) */
@@ -396,8 +397,9 @@ static void acpi_processor_idle(void)
 	struct acpi_processor *pr = NULL;
 	struct acpi_processor_cx *cx = NULL;
 	struct acpi_processor_cx *next_state = NULL;
-	int sleep_ticks = 0;
-	u32 t1, t2 = 0;
+	s64 sleep_ticks = 0;
+	ktime_t  kt1, kt2;
+	s64 idle_time;
 
 	/*
 	 * Interrupts must be disabled during bus mastering calculations and
@@ -544,14 +546,15 @@ static void acpi_processor_idle(void)
 
 	case ACPI_STATE_C2:
 		/* Get start time (ticks) */
-		t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+		kt1 = ktime_get_real();
 		/* Tell the scheduler that we are going deep-idle: */
 		sched_clock_idle_sleep_event();
 		/* Invoke C2 */
 		acpi_state_timer_broadcast(pr, cx, 1);
 		acpi_cstate_enter(cx);
 		/* Get end time (ticks) */
-		t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+		kt2 = ktime_get_real();
+		idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
 
 #if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86)
 		/* TSC halts in C2, so notify users */
@@ -559,7 +562,7 @@ static void acpi_processor_idle(void)
 			mark_tsc_unstable("possible TSC halt in C2");
 #endif
 		/* Compute time (ticks) that we were actually asleep */
-		sleep_ticks = ticks_elapsed(t1, t2);
+		sleep_ticks = US_TO_PM_TIMER_TICKS(idle_time);
 
 		/* Tell the scheduler how much we idled: */
 		sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
@@ -605,13 +608,14 @@ static void acpi_processor_idle(void)
 		}
 
 		/* Get start time (ticks) */
-		t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+		kt1 = ktime_get_real();
 		/* Invoke C3 */
 		/* Tell the scheduler that we are going deep-idle: */
 		sched_clock_idle_sleep_event();
 		acpi_cstate_enter(cx);
 		/* Get end time (ticks) */
-		t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+		kt2 = ktime_get_real();
+		idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
 		if (pr->flags.bm_check && pr->flags.bm_control) {
 			/* Enable bus master arbitration */
 			atomic_dec(&c3_cpu_count);
@@ -624,7 +628,7 @@ static void acpi_processor_idle(void)
 			mark_tsc_unstable("TSC halts in C3");
 #endif
 		/* Compute time (ticks) that we were actually asleep */
-		sleep_ticks = ticks_elapsed(t1, t2);
+		sleep_ticks = US_TO_PM_TIMER_TICKS(idle_time);
 		/* Tell the scheduler how much we idled: */
 		sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
 
@@ -1455,7 +1459,8 @@ static inline void acpi_idle_do_entry(st
 static int acpi_idle_enter_c1(struct cpuidle_device *dev,
 			      struct cpuidle_state *state)
 {
-	u32 t1, t2;
+	ktime_t  kt1, kt2;
+	s64 idle_time;
 	struct acpi_processor *pr;
 	struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
 
@@ -1476,14 +1481,15 @@ static int acpi_idle_enter_c1(struct cpu
 	if (pr->flags.bm_check)
 		acpi_idle_update_bm_rld(pr, cx);
 
-	t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+	kt1 = ktime_get_real();
 	acpi_idle_do_entry(cx);
-	t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+	kt2 = ktime_get_real();
+	idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
 
 	local_irq_enable();
 	cx->usage++;
 
-	return ticks_elapsed_in_us(t1, t2);
+	return idle_time;
 }
 
 /**
@@ -1496,8 +1502,9 @@ static int acpi_idle_enter_simple(struct
 {
 	struct acpi_processor *pr;
 	struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
-	u32 t1, t2;
-	int sleep_ticks = 0;
+	ktime_t  kt1, kt2;
+	s64 idle_time;
+	s64 sleep_ticks = 0;
 
 	pr = __get_cpu_var(processors);
 
@@ -1533,18 +1540,19 @@ static int acpi_idle_enter_simple(struct
 	if (cx->type == ACPI_STATE_C3)
 		ACPI_FLUSH_CPU_CACHE();
 
-	t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+	kt1 = ktime_get_real();
 	/* Tell the scheduler that we are going deep-idle: */
 	sched_clock_idle_sleep_event();
 	acpi_idle_do_entry(cx);
-	t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+	kt2 = ktime_get_real();
+	idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
 
 #if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86)
 	/* TSC could halt in idle, so notify users */
 	if (tsc_halts_in_c(cx->type))
 		mark_tsc_unstable("TSC halts in idle");;
 #endif
-	sleep_ticks = ticks_elapsed(t1, t2);
+	sleep_ticks = US_TO_PM_TIMER_TICKS(idle_time);
 
 	/* Tell the scheduler how much we idled: */
 	sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
@@ -1556,7 +1564,7 @@ static int acpi_idle_enter_simple(struct
 
 	acpi_state_timer_broadcast(pr, cx, 0);
 	cx->time += sleep_ticks;
-	return ticks_elapsed_in_us(t1, t2);
+	return idle_time;
 }
 
 static int c3_cpu_count;
@@ -1574,8 +1582,9 @@ static int acpi_idle_enter_bm(struct cpu
 {
 	struct acpi_processor *pr;
 	struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
-	u32 t1, t2;
-	int sleep_ticks = 0;
+	ktime_t  kt1, kt2;
+	s64 idle_time;
+	s64 sleep_ticks = 0;
 
 	pr = __get_cpu_var(processors);
 
@@ -1644,9 +1653,10 @@ static int acpi_idle_enter_bm(struct cpu
 		ACPI_FLUSH_CPU_CACHE();
 	}
 
-	t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+	kt1 = ktime_get_real();
 	acpi_idle_do_entry(cx);
-	t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+	kt2 = ktime_get_real();
+	idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
 
 	/* Re-enable bus master arbitration */
 	if (pr->flags.bm_check && pr->flags.bm_control) {
@@ -1661,7 +1671,7 @@ static int acpi_idle_enter_bm(struct cpu
 	if (tsc_halts_in_c(ACPI_STATE_C3))
 		mark_tsc_unstable("TSC halts in idle");
 #endif
-	sleep_ticks = ticks_elapsed(t1, t2);
+	sleep_ticks = US_TO_PM_TIMER_TICKS(idle_time);
 	/* Tell the scheduler how much we idled: */
 	sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
 
@@ -1672,7 +1682,7 @@ static int acpi_idle_enter_bm(struct cpu
 
 	acpi_state_timer_broadcast(pr, cx, 0);
 	cx->time += sleep_ticks;
-	return ticks_elapsed_in_us(t1, t2);
+	return idle_time;
 }
 
 struct cpuidle_driver acpi_idle_driver = {



^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [patch]: fixing of pmtimer overflow that make Cx states time incorrect
  2009-02-02  1:46     ` alex.shi
@ 2009-02-03  6:18       ` yakui_zhao
  2009-02-03  7:24       ` Andrew Morton
  2009-02-09 19:56       ` Andrew Morton
  2 siblings, 0 replies; 9+ messages in thread
From: yakui_zhao @ 2009-02-03  6:18 UTC (permalink / raw)
  To: Shi, Alex
  Cc: Andrew Morton, "; lenb"@kernel.org,
	linux-acpi@vger.kernel.org, ; linux-kernel@vger.kernel.org,
	"; venkatesh.pallipadi"@intel.com

On Mon, 2009-02-02 at 09:46 +0800, alex.shi wrote:
> Yagui want to give a clear explanation to be used for commitment. So I resend
> this again. 
> 
>     On most boxes the ACPI PM timer is 24-bit counter that runs on 3.579545MHz
> clock. In such case the max C-state sleep time should be less than 4687ms when
> it is used to record C2/C3 duration time. 
>     But on some boxes the max C-state sleep time is more than 4687ms. In such
> case the overflow happens and the C-state duration time can't be counted
> accurately.
Hi, All

   When the clocksource is used to get the C-state idle time, the unit
of idle_time will be microsecond. To be compatible with
the /proc/acpi/processor/*/power interface, it will be converted to ACPI
PM timer sleep ticks by using the macro definition of
US_TO_PM_TIMER_TICKS(this I/F will be used by the powertop tool of early
version).  When the macro definition is used, sometimes the overflow
will happen if the 32-bit data type is used.  
   So the function of div64_u64 is used in the macro definition of
US_TO_PM_TIMER_TICKS.
   On the x86_64 platform the function of div64_u64 is very simple. But
on the i386 platform it will be very complex. 
   
   In fact in the latest powertop the sys I/F will be preferred instead
of /proc I/F. The time unit of sys I/F is microsecond. In such case it
seems redundant that the C-state idle time is converted to PM timer
sleep ticks. 
   How about delete the conversion? If so, the time unit of /proc I/F
will also be microsecond. And the div operation will be omitted.
   
---
 drivers/acpi/processor_idle.c |   83
+++++++++++++++++++++---------------------
 1 file changed, 42 insertions(+), 41 deletions(-)

Index: linux-2.6/drivers/acpi/processor_idle.c
===================================================================
--- linux-2.6.orig/drivers/acpi/processor_idle.c	2009-02-03
13:41:07.000000000 +0800
+++ linux-2.6/drivers/acpi/processor_idle.c	2009-02-03
14:16:30.000000000 +0800
@@ -67,8 +67,8 @@
 #define US_TO_PM_TIMER_TICKS(t)		((t * (PM_TIMER_FREQUENCY/1000)) /
1000)
 #define PM_TIMER_TICK_NS		(1000000000ULL/PM_TIMER_FREQUENCY)
 #ifndef CONFIG_CPU_IDLE
-#define C2_OVERHEAD			4	/* 1us (3.579 ticks per us) */
-#define C3_OVERHEAD			4	/* 1us (3.579 ticks per us) */
+#define C2_OVERHEAD			1	/* 1us (3.579 ticks per us) */
+#define C3_OVERHEAD			1	/* 1us (3.579 ticks per us) */
 static void (*pm_idle_save) (void) __read_mostly;
 #else
 #define C2_OVERHEAD			1	/* 1us */
@@ -396,8 +396,9 @@
 	struct acpi_processor *pr = NULL;
 	struct acpi_processor_cx *cx = NULL;
 	struct acpi_processor_cx *next_state = NULL;
-	int sleep_ticks = 0;
-	u32 t1, t2 = 0;
+	s64 sleep_ticks = 0;
+	ktime_t  kt1, kt2;
+	s64 idle_time;
 
 	/*
 	 * Interrupts must be disabled during bus mastering calculations and
@@ -544,26 +545,26 @@
 
 	case ACPI_STATE_C2:
 		/* Get start time (ticks) */
-		t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+		kt1 = ktime_get_real();
 		/* Tell the scheduler that we are going deep-idle: */
 		sched_clock_idle_sleep_event();
 		/* Invoke C2 */
 		acpi_state_timer_broadcast(pr, cx, 1);
 		acpi_cstate_enter(cx);
 		/* Get end time (ticks) */
-		t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+		kt2 = ktime_get_real();
+		idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
 
 #if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86)
 		/* TSC halts in C2, so notify users */
 		if (tsc_halts_in_c(ACPI_STATE_C2))
 			mark_tsc_unstable("possible TSC halt in C2");
 #endif
-		/* Compute time (ticks) that we were actually asleep */
-		sleep_ticks = ticks_elapsed(t1, t2);
 
 		/* Tell the scheduler how much we idled: */
-		sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
-
+		/* the unit of idle_time is us. We need convert it to NS */
+		sched_clock_idle_wakeup_event(idle_time * 1000);
+		sleep_ticks = idle_time;
 		/* Re-enable interrupts */
 		local_irq_enable();
 		/* Do not account our idle-switching overhead: */
@@ -605,13 +606,14 @@
 		}
 
 		/* Get start time (ticks) */
-		t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+		kt1 = ktime_get_real();
 		/* Invoke C3 */
 		/* Tell the scheduler that we are going deep-idle: */
 		sched_clock_idle_sleep_event();
 		acpi_cstate_enter(cx);
 		/* Get end time (ticks) */
-		t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+		kt2 = ktime_get_real();
+		idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
 		if (pr->flags.bm_check && pr->flags.bm_control) {
 			/* Enable bus master arbitration */
 			atomic_dec(&c3_cpu_count);
@@ -624,9 +626,10 @@
 			mark_tsc_unstable("TSC halts in C3");
 #endif
 		/* Compute time (ticks) that we were actually asleep */
-		sleep_ticks = ticks_elapsed(t1, t2);
+		sleep_ticks = idle_time;
 		/* Tell the scheduler how much we idled: */
-		sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
+		/* the unit of sleept_ticks is us. We need convert it to NS */
+		sched_clock_idle_wakeup_event(idle_time * 1000);
 
 		/* Re-enable interrupts */
 		local_irq_enable();
@@ -1046,12 +1049,8 @@
 	 * Normalize the C2 latency to expidite policy
 	 */
 	cx->valid = 1;
-
-#ifndef CONFIG_CPU_IDLE
-	cx->latency_ticks = US_TO_PM_TIMER_TICKS(cx->latency);
-#else
+	/* the unit of latency_ticks will always be US */
 	cx->latency_ticks = cx->latency;
-#endif
 
 	return;
 }
@@ -1132,11 +1131,7 @@
 	 */
 	cx->valid = 1;
 
-#ifndef CONFIG_CPU_IDLE
-	cx->latency_ticks = US_TO_PM_TIMER_TICKS(cx->latency);
-#else
 	cx->latency_ticks = cx->latency;
-#endif
 
 	return;
 }
@@ -1455,7 +1450,8 @@
 static int acpi_idle_enter_c1(struct cpuidle_device *dev,
 			      struct cpuidle_state *state)
 {
-	u32 t1, t2;
+	ktime_t  kt1, kt2;
+	s64 idle_time;
 	struct acpi_processor *pr;
 	struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
 
@@ -1476,14 +1472,15 @@
 	if (pr->flags.bm_check)
 		acpi_idle_update_bm_rld(pr, cx);
 
-	t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+	kt1 = ktime_get_real();
 	acpi_idle_do_entry(cx);
-	t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+	kt2 = ktime_get_real();
+	idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
 
 	local_irq_enable();
 	cx->usage++;
 
-	return ticks_elapsed_in_us(t1, t2);
+	return idle_time;
 }
 
 /**
@@ -1496,8 +1493,9 @@
 {
 	struct acpi_processor *pr;
 	struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
-	u32 t1, t2;
-	int sleep_ticks = 0;
+	ktime_t  kt1, kt2;
+	s64 idle_time;
+	s64 sleep_ticks = 0;
 
 	pr = __get_cpu_var(processors);
 
@@ -1533,21 +1531,22 @@
 	if (cx->type == ACPI_STATE_C3)
 		ACPI_FLUSH_CPU_CACHE();
 
-	t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+	kt1 = ktime_get_real();
 	/* Tell the scheduler that we are going deep-idle: */
 	sched_clock_idle_sleep_event();
 	acpi_idle_do_entry(cx);
-	t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+	kt2 = ktime_get_real();
+	idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
 
 #if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86)
 	/* TSC could halt in idle, so notify users */
 	if (tsc_halts_in_c(cx->type))
 		mark_tsc_unstable("TSC halts in idle");;
 #endif
-	sleep_ticks = ticks_elapsed(t1, t2);
+	sleep_ticks = idle_time;
 
 	/* Tell the scheduler how much we idled: */
-	sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
+	sched_clock_idle_wakeup_event(idle_time * 1000);
 
 	local_irq_enable();
 	current_thread_info()->status |= TS_POLLING;
@@ -1556,7 +1555,7 @@
 
 	acpi_state_timer_broadcast(pr, cx, 0);
 	cx->time += sleep_ticks;
-	return ticks_elapsed_in_us(t1, t2);
+	return idle_time;
 }
 
 static int c3_cpu_count;
@@ -1574,8 +1573,9 @@
 {
 	struct acpi_processor *pr;
 	struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
-	u32 t1, t2;
-	int sleep_ticks = 0;
+	ktime_t  kt1, kt2;
+	s64 idle_time;
+	s64 sleep_ticks = 0;
 
 	pr = __get_cpu_var(processors);
 
@@ -1644,9 +1644,10 @@
 		ACPI_FLUSH_CPU_CACHE();
 	}
 
-	t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+	kt1 = ktime_get_real();
 	acpi_idle_do_entry(cx);
-	t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+	kt2 = ktime_get_real();
+	idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
 
 	/* Re-enable bus master arbitration */
 	if (pr->flags.bm_check && pr->flags.bm_control) {
@@ -1661,9 +1662,9 @@
 	if (tsc_halts_in_c(ACPI_STATE_C3))
 		mark_tsc_unstable("TSC halts in idle");
 #endif
-	sleep_ticks = ticks_elapsed(t1, t2);
+	sleep_ticks = idle_time;
 	/* Tell the scheduler how much we idled: */
-	sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
+	sched_clock_idle_wakeup_event(idle_time * 1000);
 
 	local_irq_enable();
 	current_thread_info()->status |= TS_POLLING;
@@ -1672,7 +1673,7 @@
 
 	acpi_state_timer_broadcast(pr, cx, 0);
 	cx->time += sleep_ticks;
-	return ticks_elapsed_in_us(t1, t2);
+	return idle_time;
 }
 
 struct cpuidle_driver acpi_idle_driver = {

   
> 
>     Use clocksource to get the C-state time instead of ACPI PM timer. and use
> div64_u64 to convert US_TO_PM_TIME_TICKS in i386 mode.
> 
> Thanks
> Alex 
> 
> Asked-by: venkatesh.pallipadi@intel.com
> Tested-by: Alex Shi <alex.shi@intel.com>
> Signed-off-by: Alex Shi <alex.shi@intel.com>
> Signed-off-by: Yakui.zhao <yakui.zhao@intel.com>
> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
> 
> ---
> 
> 
> Index: linux-2.6.29-rc3/drivers/acpi/processor_idle.c
> ===================================================================
> --- linux-2.6.29-rc3.orig/drivers/acpi/processor_idle.c
> +++ linux-2.6.29-rc3/drivers/acpi/processor_idle.c
> @@ -64,7 +64,8 @@
>  #define _COMPONENT              ACPI_PROCESSOR_COMPONENT
>  ACPI_MODULE_NAME("processor_idle");
>  #define ACPI_PROCESSOR_FILE_POWER	"power"
> -#define US_TO_PM_TIMER_TICKS(t)		((t * (PM_TIMER_FREQUENCY/1000)) / 1000)
> +#define US_TO_PM_TIMER_TICKS(t)		div64_u64(\
> +	(t * (PM_TIMER_FREQUENCY/1000)), 1000ULL)
>  #define PM_TIMER_TICK_NS		(1000000000ULL/PM_TIMER_FREQUENCY)
>  #ifndef CONFIG_CPU_IDLE
>  #define C2_OVERHEAD			4	/* 1us (3.579 ticks per us) */
> @@ -396,8 +397,9 @@ static void acpi_processor_idle(void)
>  	struct acpi_processor *pr = NULL;
>  	struct acpi_processor_cx *cx = NULL;
>  	struct acpi_processor_cx *next_state = NULL;
> -	int sleep_ticks = 0;
> -	u32 t1, t2 = 0;
> +	s64 sleep_ticks = 0;
> +	ktime_t  kt1, kt2;
> +	s64 idle_time;
>  
>  	/*
>  	 * Interrupts must be disabled during bus mastering calculations and
> @@ -544,14 +546,15 @@ static void acpi_processor_idle(void)
>  
>  	case ACPI_STATE_C2:
>  		/* Get start time (ticks) */
> -		t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
> +		kt1 = ktime_get_real();
>  		/* Tell the scheduler that we are going deep-idle: */
>  		sched_clock_idle_sleep_event();
>  		/* Invoke C2 */
>  		acpi_state_timer_broadcast(pr, cx, 1);
>  		acpi_cstate_enter(cx);
>  		/* Get end time (ticks) */
> -		t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
> +		kt2 = ktime_get_real();
> +		idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
>  
>  #if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86)
>  		/* TSC halts in C2, so notify users */
> @@ -559,7 +562,7 @@ static void acpi_processor_idle(void)
>  			mark_tsc_unstable("possible TSC halt in C2");
>  #endif
>  		/* Compute time (ticks) that we were actually asleep */
> -		sleep_ticks = ticks_elapsed(t1, t2);
> +		sleep_ticks = US_TO_PM_TIMER_TICKS(idle_time);
>  
>  		/* Tell the scheduler how much we idled: */
>  		sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
> @@ -605,13 +608,14 @@ static void acpi_processor_idle(void)
>  		}
>  
>  		/* Get start time (ticks) */
> -		t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
> +		kt1 = ktime_get_real();
>  		/* Invoke C3 */
>  		/* Tell the scheduler that we are going deep-idle: */
>  		sched_clock_idle_sleep_event();
>  		acpi_cstate_enter(cx);
>  		/* Get end time (ticks) */
> -		t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
> +		kt2 = ktime_get_real();
> +		idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
>  		if (pr->flags.bm_check && pr->flags.bm_control) {
>  			/* Enable bus master arbitration */
>  			atomic_dec(&c3_cpu_count);
> @@ -624,7 +628,7 @@ static void acpi_processor_idle(void)
>  			mark_tsc_unstable("TSC halts in C3");
>  #endif
>  		/* Compute time (ticks) that we were actually asleep */
> -		sleep_ticks = ticks_elapsed(t1, t2);
> +		sleep_ticks = US_TO_PM_TIMER_TICKS(idle_time);
>  		/* Tell the scheduler how much we idled: */
>  		sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
>  
> @@ -1455,7 +1459,8 @@ static inline void acpi_idle_do_entry(st
>  static int acpi_idle_enter_c1(struct cpuidle_device *dev,
>  			      struct cpuidle_state *state)
>  {
> -	u32 t1, t2;
> +	ktime_t  kt1, kt2;
> +	s64 idle_time;
>  	struct acpi_processor *pr;
>  	struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
>  
> @@ -1476,14 +1481,15 @@ static int acpi_idle_enter_c1(struct cpu
>  	if (pr->flags.bm_check)
>  		acpi_idle_update_bm_rld(pr, cx);
>  
> -	t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
> +	kt1 = ktime_get_real();
>  	acpi_idle_do_entry(cx);
> -	t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
> +	kt2 = ktime_get_real();
> +	idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
>  
>  	local_irq_enable();
>  	cx->usage++;
>  
> -	return ticks_elapsed_in_us(t1, t2);
> +	return idle_time;
>  }
>  
>  /**
> @@ -1496,8 +1502,9 @@ static int acpi_idle_enter_simple(struct
>  {
>  	struct acpi_processor *pr;
>  	struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
> -	u32 t1, t2;
> -	int sleep_ticks = 0;
> +	ktime_t  kt1, kt2;
> +	s64 idle_time;
> +	s64 sleep_ticks = 0;
>  
>  	pr = __get_cpu_var(processors);
>  
> @@ -1533,18 +1540,19 @@ static int acpi_idle_enter_simple(struct
>  	if (cx->type == ACPI_STATE_C3)
>  		ACPI_FLUSH_CPU_CACHE();
>  
> -	t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
> +	kt1 = ktime_get_real();
>  	/* Tell the scheduler that we are going deep-idle: */
>  	sched_clock_idle_sleep_event();
>  	acpi_idle_do_entry(cx);
> -	t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
> +	kt2 = ktime_get_real();
> +	idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
>  
>  #if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86)
>  	/* TSC could halt in idle, so notify users */
>  	if (tsc_halts_in_c(cx->type))
>  		mark_tsc_unstable("TSC halts in idle");;
>  #endif
> -	sleep_ticks = ticks_elapsed(t1, t2);
> +	sleep_ticks = US_TO_PM_TIMER_TICKS(idle_time);
>  
>  	/* Tell the scheduler how much we idled: */
>  	sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
> @@ -1556,7 +1564,7 @@ static int acpi_idle_enter_simple(struct
>  
>  	acpi_state_timer_broadcast(pr, cx, 0);
>  	cx->time += sleep_ticks;
> -	return ticks_elapsed_in_us(t1, t2);
> +	return idle_time;
>  }
>  
>  static int c3_cpu_count;
> @@ -1574,8 +1582,9 @@ static int acpi_idle_enter_bm(struct cpu
>  {
>  	struct acpi_processor *pr;
>  	struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
> -	u32 t1, t2;
> -	int sleep_ticks = 0;
> +	ktime_t  kt1, kt2;
> +	s64 idle_time;
> +	s64 sleep_ticks = 0;
>  
>  	pr = __get_cpu_var(processors);
>  
> @@ -1644,9 +1653,10 @@ static int acpi_idle_enter_bm(struct cpu
>  		ACPI_FLUSH_CPU_CACHE();
>  	}
>  
> -	t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
> +	kt1 = ktime_get_real();
>  	acpi_idle_do_entry(cx);
> -	t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
> +	kt2 = ktime_get_real();
> +	idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
>  
>  	/* Re-enable bus master arbitration */
>  	if (pr->flags.bm_check && pr->flags.bm_control) {
> @@ -1661,7 +1671,7 @@ static int acpi_idle_enter_bm(struct cpu
>  	if (tsc_halts_in_c(ACPI_STATE_C3))
>  		mark_tsc_unstable("TSC halts in idle");
>  #endif
> -	sleep_ticks = ticks_elapsed(t1, t2);
> +	sleep_ticks = US_TO_PM_TIMER_TICKS(idle_time);
>  	/* Tell the scheduler how much we idled: */
>  	sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
>  
> @@ -1672,7 +1682,7 @@ static int acpi_idle_enter_bm(struct cpu
>  
>  	acpi_state_timer_broadcast(pr, cx, 0);
>  	cx->time += sleep_ticks;
> -	return ticks_elapsed_in_us(t1, t2);
> +	return idle_time;
>  }
>  
>  struct cpuidle_driver acpi_idle_driver = {
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [patch]: fixing of pmtimer overflow that make Cx states time incorrect
  2009-02-02  1:46     ` alex.shi
  2009-02-03  6:18       ` yakui_zhao
@ 2009-02-03  7:24       ` Andrew Morton
  2009-02-09 19:56       ` Andrew Morton
  2 siblings, 0 replies; 9+ messages in thread
From: Andrew Morton @ 2009-02-03  7:24 UTC (permalink / raw)
  To: alex.shi; +Cc: Len Brown, linux-kernel, Venkatesh Pallipadi, linux-acpi


(Cc: line rewritten.  What on earth is your email client doing??)

On Mon, 02 Feb 2009 09:46:57 +0800 "alex.shi" <alex.shi@intel.com> wrote:

> Yagui want to give a clear explanation to be used for commitment. So I resend
> this again. 
> 
>     On most boxes the ACPI PM timer is 24-bit counter that runs on 3.579545MHz
> clock. In such case the max C-state sleep time should be less than 4687ms when
> it is used to record C2/C3 duration time. 
>     But on some boxes the max C-state sleep time is more than 4687ms. In such
> case the overflow happens and the C-state duration time can't be counted
> accurately.
> 
>     Use clocksource to get the C-state time instead of ACPI PM timer. and use
> div64_u64 to convert US_TO_PM_TIME_TICKS in i386 mode.
> 

A minor thing..

> --- linux-2.6.29-rc3.orig/drivers/acpi/processor_idle.c
> +++ linux-2.6.29-rc3/drivers/acpi/processor_idle.c
> @@ -64,7 +64,8 @@
>  #define _COMPONENT              ACPI_PROCESSOR_COMPONENT
>  ACPI_MODULE_NAME("processor_idle");
>  #define ACPI_PROCESSOR_FILE_POWER	"power"
> -#define US_TO_PM_TIMER_TICKS(t)		((t * (PM_TIMER_FREQUENCY/1000)) / 1000)
> +#define US_TO_PM_TIMER_TICKS(t)		div64_u64(\
> +	(t * (PM_TIMER_FREQUENCY/1000)), 1000ULL)

I suppose it doesn't matter much, but we could make this function more
accurate via

	(t * PM_TIMER_FREQUENCY) / 1000

Also, it would be nicer to implement this is a regular C function. 
There's no need at all for it to be an ugly all-caps macro.



^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [patch]: fixing of pmtimer overflow that make Cx states time incorrect
  2009-02-02  1:46     ` alex.shi
  2009-02-03  6:18       ` yakui_zhao
  2009-02-03  7:24       ` Andrew Morton
@ 2009-02-09 19:56       ` Andrew Morton
  2009-02-10  3:37         ` alex.shi
  2 siblings, 1 reply; 9+ messages in thread
From: Andrew Morton @ 2009-02-09 19:56 UTC (permalink / raw)
  To: alex.shi; +Cc: linux-acpi, linux-kernel, Venkatesh Pallipadi, Len Brown

On Mon, 02 Feb 2009 09:46:57 +0800
"alex.shi" <alex.shi@intel.com> wrote:

> Yagui want to give a clear explanation to be used for commitment. So I resend
> this again. 
> 
>     On most boxes the ACPI PM timer is 24-bit counter that runs on 3.579545MHz
> clock. In such case the max C-state sleep time should be less than 4687ms when
> it is used to record C2/C3 duration time. 
>     But on some boxes the max C-state sleep time is more than 4687ms. In such
> case the overflow happens and the C-state duration time can't be counted
> accurately.
> 
>     Use clocksource to get the C-state time instead of ACPI PM timer. and use
> div64_u64 to convert US_TO_PM_TIME_TICKS in i386 mode.

It seems that this bugfix was not applied, a cleanup patch was merged
instead and this patch is now wrecked.

I'll need to drop it, sorry.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [patch]: fixing of pmtimer overflow that make Cx states time incorrect
  2009-02-09 19:56       ` Andrew Morton
@ 2009-02-10  3:37         ` alex.shi
  0 siblings, 0 replies; 9+ messages in thread
From: alex.shi @ 2009-02-10  3:37 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-acpi@vger.kernel.org, linux-kernel@vger.kernel.org,
	Pallipadi, Venkatesh, Len Brown


> 
> It seems that this bugfix was not applied, a cleanup patch was merged
> instead and this patch is now wrecked.
> 
> I'll need to drop it, sorry.
I rewrite and retested it base on the latest Linus's git tree. Hope to
catch up it this time. :) 

Best regards
Alex 

We found  Cx states time abnormal in our some of machines which have
16 LCPUs, the C0 take too many time while system is really idle when
kernel enabled tickless and highres.  powertop output is below: 
     PowerTOP version 1.9       (C) 2007 Intel Corporation

Cn                Avg residency       P-states (frequencies)
C0 (cpu running)        (40.5%)         2.53 Ghz     0.0%
C1                0.0ms ( 0.0%)         2.53 Ghz     0.0%
C2              128.8ms (59.5%)         2.40 Ghz     0.0%
                                        1.60 Ghz   100.0%


Wakeups-from-idle per second :  4.7     interval: 20.0s
no ACPI power usage estimate available

Top causes for wakeups:
  41.4% ( 24.9)       <interrupt> : extra timer interrupt 
  20.2% ( 12.2)     <kernel core> : usb_hcd_poll_rh_status
(rh_timer_func)


After tacking detailed for this issue, Yakui and I find it is due to 24
bit PM timer overflows when some of cpu sleep more than 4 seconds. With
tickless kernel, the CPU want to sleep as much as possible when system
idle. But the Cx sleep time are recorded by pmtimer which length is
determined by BIOS. The current Cx time was gotten in the following
function from driver/acpi/processor_idle.c:

static inline u32 ticks_elapsed(u32 t1, u32 t2)
{
       if (t2 >= t1)
               return (t2 - t1);
       else if (!(acpi_gbl_FADT.flags & ACPI_FADT_32BIT_TIMER))
               return (((0x00FFFFFF - t1) + t2) & 0x00FFFFFF);
       else
               return ((0xFFFFFFFF - t1) + t2);
}

If pmtimer is 24 bits and it take 5 seconds from t1 to t2, in above
function, just about 1 seconds ticks was recorded. So the Cx time will
be reduced about 4 seconds. and this is why we see above powertop
output. 


To resolve this problem,  Yakui and I use ktime_get() to record the Cx
states time instead of PM timer as the following patch. the patch was
tested with i386/x86_64 modes on several platforms. 


Acked-by: venkatesh.pallipadi@intel.com
Tested-by: Alex Shi <alex.shi@intel.com>
Signed-off-by: Alex Shi <alex.shi@intel.com>
Signed-off-by: Yakui.zhao <yakui.zhao@intel.com>

--- 

Index: linux-2.6/drivers/acpi/processor_idle.c
===================================================================
--- linux-2.6.orig/drivers/acpi/processor_idle.c
+++ linux-2.6/drivers/acpi/processor_idle.c
@@ -64,7 +64,6 @@
 #define _COMPONENT              ACPI_PROCESSOR_COMPONENT
 ACPI_MODULE_NAME("processor_idle");
 #define ACPI_PROCESSOR_FILE_POWER	"power"
-#define US_TO_PM_TIMER_TICKS(t)		((t * (PM_TIMER_FREQUENCY/1000)) / 1000)
 #define PM_TIMER_TICK_NS		(1000000000ULL/PM_TIMER_FREQUENCY)
 #define C2_OVERHEAD			1	/* 1us */
 #define C3_OVERHEAD			1	/* 1us */
@@ -78,6 +77,10 @@ module_param(nocst, uint, 0000);
 static unsigned int latency_factor __read_mostly = 2;
 module_param(latency_factor, uint, 0644);
 
+static s64 us_to_pm_timer_ticks(s64 t)
+{
+	return div64_u64(t * PM_TIMER_FREQUENCY, 1000000);
+}
 /*
  * IBM ThinkPad R40e crashes mysteriously when going into C2 or C3.
  * For now disable this. Probably a bug somewhere else.
@@ -159,25 +162,6 @@ static struct dmi_system_id __cpuinitdat
 	{},
 };
 
-static inline u32 ticks_elapsed(u32 t1, u32 t2)
-{
-	if (t2 >= t1)
-		return (t2 - t1);
-	else if (!(acpi_gbl_FADT.flags & ACPI_FADT_32BIT_TIMER))
-		return (((0x00FFFFFF - t1) + t2) & 0x00FFFFFF);
-	else
-		return ((0xFFFFFFFF - t1) + t2);
-}
-
-static inline u32 ticks_elapsed_in_us(u32 t1, u32 t2)
-{
-	if (t2 >= t1)
-		return PM_TIMER_TICKS_TO_US(t2 - t1);
-	else if (!(acpi_gbl_FADT.flags & ACPI_FADT_32BIT_TIMER))
-		return PM_TIMER_TICKS_TO_US(((0x00FFFFFF - t1) + t2) & 0x00FFFFFF);
-	else
-		return PM_TIMER_TICKS_TO_US((0xFFFFFFFF - t1) + t2);
-}
 
 /*
  * Callers should disable interrupts before the call and enable
@@ -853,7 +837,8 @@ static inline void acpi_idle_do_entry(st
 static int acpi_idle_enter_c1(struct cpuidle_device *dev,
 			      struct cpuidle_state *state)
 {
-	u32 t1, t2;
+	ktime_t  kt1, kt2;
+	s64 idle_time;
 	struct acpi_processor *pr;
 	struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
 
@@ -871,14 +856,15 @@ static int acpi_idle_enter_c1(struct cpu
 		return 0;
 	}
 
-	t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+	kt1 = ktime_get_real();
 	acpi_idle_do_entry(cx);
-	t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+	kt2 = ktime_get_real();
+	idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
 
 	local_irq_enable();
 	cx->usage++;
 
-	return ticks_elapsed_in_us(t1, t2);
+	return idle_time;
 }
 
 /**
@@ -891,8 +877,9 @@ static int acpi_idle_enter_simple(struct
 {
 	struct acpi_processor *pr;
 	struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
-	u32 t1, t2;
-	int sleep_ticks = 0;
+	ktime_t  kt1, kt2;
+	s64 idle_time;
+	s64 sleep_ticks = 0;
 
 	pr = __get_cpu_var(processors);
 
@@ -925,18 +912,19 @@ static int acpi_idle_enter_simple(struct
 	if (cx->type == ACPI_STATE_C3)
 		ACPI_FLUSH_CPU_CACHE();
 
-	t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+	kt1 = ktime_get_real();
 	/* Tell the scheduler that we are going deep-idle: */
 	sched_clock_idle_sleep_event();
 	acpi_idle_do_entry(cx);
-	t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+	kt2 = ktime_get_real();
+	idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
 
 #if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86)
 	/* TSC could halt in idle, so notify users */
 	if (tsc_halts_in_c(cx->type))
 		mark_tsc_unstable("TSC halts in idle");;
 #endif
-	sleep_ticks = ticks_elapsed(t1, t2);
+	sleep_ticks = us_to_pm_timer_ticks(idle_time);
 
 	/* Tell the scheduler how much we idled: */
 	sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
@@ -948,7 +936,7 @@ static int acpi_idle_enter_simple(struct
 
 	acpi_state_timer_broadcast(pr, cx, 0);
 	cx->time += sleep_ticks;
-	return ticks_elapsed_in_us(t1, t2);
+	return idle_time;
 }
 
 static int c3_cpu_count;
@@ -966,8 +954,10 @@ static int acpi_idle_enter_bm(struct cpu
 {
 	struct acpi_processor *pr;
 	struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
-	u32 t1, t2;
-	int sleep_ticks = 0;
+	ktime_t  kt1, kt2;
+	s64 idle_time;
+	s64 sleep_ticks = 0;
+
 
 	pr = __get_cpu_var(processors);
 
@@ -1034,9 +1024,10 @@ static int acpi_idle_enter_bm(struct cpu
 		ACPI_FLUSH_CPU_CACHE();
 	}
 
-	t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+	kt1 = ktime_get_real();
 	acpi_idle_do_entry(cx);
-	t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+	kt2 = ktime_get_real();
+	idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
 
 	/* Re-enable bus master arbitration */
 	if (pr->flags.bm_check && pr->flags.bm_control) {
@@ -1051,7 +1042,7 @@ static int acpi_idle_enter_bm(struct cpu
 	if (tsc_halts_in_c(ACPI_STATE_C3))
 		mark_tsc_unstable("TSC halts in idle");
 #endif
-	sleep_ticks = ticks_elapsed(t1, t2);
+	sleep_ticks = us_to_pm_timer_ticks(idle_time);
 	/* Tell the scheduler how much we idled: */
 	sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
 
@@ -1062,7 +1053,7 @@ static int acpi_idle_enter_bm(struct cpu
 
 	acpi_state_timer_broadcast(pr, cx, 0);
 	cx->time += sleep_ticks;
-	return ticks_elapsed_in_us(t1, t2);
+	return idle_time;
 }
 
 struct cpuidle_driver acpi_idle_driver = {



^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2009-02-10  3:36 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-01-19  5:13 [patch]: fixing of pmtimer overflow that make Cx states time incorrect alex.shi
2009-01-27  6:52 ` Andrew Morton
2009-02-01  2:04   ` yakui_zhao
2009-02-01 10:03     ` alex.shi
2009-02-02  1:46     ` alex.shi
2009-02-03  6:18       ` yakui_zhao
2009-02-03  7:24       ` Andrew Morton
2009-02-09 19:56       ` Andrew Morton
2009-02-10  3:37         ` alex.shi

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox