boot time, process start time, and NOW time

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* boot time, process start time, and NOW time
@ 2004-06-22 23:57 Albert Cahalan
  2004-06-28 17:56 ` OGAWA Hirofumi
  0 siblings, 1 reply; 57+ messages in thread
From: Albert Cahalan @ 2004-06-22 23:57 UTC (permalink / raw)
  To: linux-kernel mailing list
  Cc: voland, nicolas.george, kaukasoi, tim, george, johnstul,
	david+powerix, Andrew Morton OSDL

Even with the 2.6.7 kernel, I'm still getting reports of process
start times wandering. Here is an example:

   "About 12 hours since reboot to 2.6.7 there was already a
   difference of about 7 seconds between the real start time
   and the start time reported by ps. Now, 24 hours since reboot
   the difference is 10 seconds."

The calculation used is:

   now - uptime + time_from_boot_to_process_start

The code shown below works great on a 2.4.xx or earlier kernel.
It generally relies on USER_HZ, which is supposedly in our ABI.

I have a feeling we'll forever be chasing bugs related to not
using a PLL to drive the clock tick at exactly HZ ticks per second.
Perhaps the DragonflyBSD code could be stolen. Anyway, the code:

///////////////////////////////////////////////////////////////////////////
/* Wall-clock "now" in seconds since the epoch, as returned by time(). */
unsigned long seconds_since_1970 = time(NULL);
/* Seconds since boot as returned by uptime(0,0); presumably the procps
 * helper that reads the kernel's uptime -- confirm against procps. */
unsigned long seconds_since_boot = uptime(0,0);
/* Estimated epoch time of boot: the "now - uptime" part of the calculation. */
unsigned long time_of_boot       = seconds_since_1970 - seconds_since_boot;

/*
 * Format the start time of process pp into outbuf.
 *
 * The start time is computed as time_of_boot + pp->start_time / Hertz, i.e.
 * "now - uptime + time_from_boot_to_process_start".  The format depends on
 * how the start time compares with the current time (seconds_since_1970):
 *   same day of year and same year -> "%H:%M"
 *   different day, same year       -> "%b%d"
 *   different year                 -> "%Y"
 *
 * Returns the number of characters strftime() stored (0 if nothing was
 * written).  strftime() is allowed to write up to 42 bytes, so outbuf is
 * assumed to be at least that large -- confirm against the callers.
 */
int pr_stime(char *restrict const outbuf, const proc_t *restrict const pp){
  const struct tm *tm_ptr;
  const char *fmt;
  time_t now;
  time_t t;
  int tm_year;
  int tm_yday;

  /* seconds_since_1970 is an unsigned long; localtime() needs a time_t *,
   * so go through a properly typed local instead of passing its address. */
  now = (time_t)seconds_since_1970;
  tm_ptr = localtime(&now);                        /* not reentrant */
  if(!tm_ptr){
    outbuf[0] = '\0';
    return 0;
  }
  /* Copy the fields out now: the next localtime() call reuses the buffer. */
  tm_year = tm_ptr->tm_year;
  tm_yday = tm_ptr->tm_yday;

  t = time_of_boot + pp->start_time / Hertz;
  tm_ptr = localtime(&t);                          /* not reentrant */
  if(!tm_ptr){
    outbuf[0] = '\0';
    return 0;
  }

  fmt = "%H:%M";                                   /* 03:02 23:59 */
  if(tm_yday != tm_ptr->tm_yday) fmt = "%b%d";     /* Jun06 Aug27 */
  if(tm_year != tm_ptr->tm_year) fmt = "%Y";       /* 1991 2001 */
  return (int)strftime(outbuf, 42, fmt, tm_ptr);
}
///////////////////////////////////////////////////////////////////////////

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: boot time, process start time, and NOW time
  2004-06-22 23:57 boot time, process start time, and NOW time Albert Cahalan
@ 2004-06-28 17:56 ` OGAWA Hirofumi
  2004-08-16 19:41   ` Andrew Morton
  0 siblings, 1 reply; 57+ messages in thread
From: OGAWA Hirofumi @ 2004-06-28 17:56 UTC (permalink / raw)
  To: Albert Cahalan
  Cc: linux-kernel mailing list, voland, nicolas.george, kaukasoi, tim,
	george, johnstul, david+powerix, Andrew Morton OSDL

Albert Cahalan <albert@users.sf.net> writes:

> Even with the 2.6.7 kernel, I'm still getting reports of process
> start times wandering. Here is an example:
> 
>    "About 12 hours since reboot to 2.6.7 there was already a
>    difference of about 7 seconds between the real start time
>    and the start time reported by ps. Now, 24 hours since reboot
>    the difference is 10 seconds."
> 
> The calculation used is:
> 
>    now - uptime + time_from_boot_to_process_start

Start-time and uptime are using different sources. It looks like bogus
lost counts were added to jiffies.

quick hack. Does this change the behavior?
-- 
OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>

---

 arch/i386/kernel/smpboot.c          |   16 +++++++++-------
 arch/i386/kernel/timers/timer_tsc.c |    8 ++++++--
 include/linux/time.h                |    2 +-
 3 files changed, 16 insertions(+), 10 deletions(-)

diff -puN arch/i386/kernel/timers/timer_tsc.c~uptime-fix arch/i386/kernel/timers/timer_tsc.c
--- linux-2.6.7/arch/i386/kernel/timers/timer_tsc.c~uptime-fix	2004-06-29 01:21:26.000000000 +0900
+++ linux-2.6.7-hirofumi/arch/i386/kernel/timers/timer_tsc.c	2004-06-29 01:21:26.000000000 +0900
@@ -467,8 +467,6 @@ static int __init init_tsc(char* overrid
  	 *	moaned if you have the only one in the world - you fix it!
  	 */
 
-	count2 = LATCH; /* initialize counter for mark_offset_tsc() */
-
 	if (cpu_has_tsc) {
 		unsigned long tsc_quotient;
 #ifdef CONFIG_HPET_TIMER
@@ -512,6 +510,12 @@ static int __init init_tsc(char* overrid
 				printk("Detected %lu.%03lu MHz processor.\n", cpu_khz / 1000, cpu_khz % 1000);
 			}
 			set_cyc2ns_scale(cpu_khz/1000);
+
+			/* initialize for mark_offset_tsc() */
+			count2 = LATCH;
+			rdtsc(last_tsc_low, last_tsc_high);
+			printk("initial tsc: %lu.%lu\n",
+				last_tsc_high, last_tsc_low);
 			return 0;
 		}
 	}
diff -puN include/linux/time.h~uptime-fix include/linux/time.h
--- linux-2.6.7/include/linux/time.h~uptime-fix	2004-06-29 01:21:26.000000000 +0900
+++ linux-2.6.7-hirofumi/include/linux/time.h	2004-06-29 01:21:26.000000000 +0900
@@ -41,7 +41,7 @@ struct timezone {
  * Have the 32 bit jiffies value wrap 5 minutes after boot
  * so jiffies wrap bugs show up earlier.
  */
-#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ))
+#define INITIAL_JIFFIES		(-300L*HZ)
 
 /*
  * Change timeval to jiffies, trying to avoid the
diff -puN arch/i386/kernel/smpboot.c~uptime-fix arch/i386/kernel/smpboot.c
--- linux-2.6.7/arch/i386/kernel/smpboot.c~uptime-fix	2004-06-29 01:25:59.000000000 +0900
+++ linux-2.6.7-hirofumi/arch/i386/kernel/smpboot.c	2004-06-29 01:34:55.000000000 +0900
@@ -210,6 +210,8 @@ static unsigned long long __init div64 (
 	return res;
 }
 
+static unsigned long __initdata sync_tsc_high, sync_tsc_low;
+
 static void __init synchronize_tsc_bp (void)
 {
 	int i;
@@ -251,11 +253,6 @@ static void __init synchronize_tsc_bp (v
 		atomic_inc(&tsc_count_start);
 
 		rdtscll(tsc_values[smp_processor_id()]);
-		/*
-		 * We clear the TSC in the last loop:
-		 */
-		if (i == NR_LOOPS-1)
-			write_tsc(0, 0);
 
 		/*
 		 * Wait for all APs to leave the synchronization point:
@@ -264,8 +261,14 @@ static void __init synchronize_tsc_bp (v
 			mb();
 		atomic_set(&tsc_count_start, 0);
 		wmb();
+
+		/* We save the TSC in the last loop: */
+		if (i == NR_LOOPS-1)
+			rdtsc(sync_tsc_low, sync_tsc_high);
+
 		atomic_inc(&tsc_count_stop);
 	}
+	write_tsc(sync_tsc_low, sync_tsc_high);
 
 	sum = 0;
 	for (i = 0; i < NR_CPUS; i++) {
@@ -323,12 +326,11 @@ static void __init synchronize_tsc_ap (v
 			mb();
 
 		rdtscll(tsc_values[smp_processor_id()]);
-		if (i == NR_LOOPS-1)
-			write_tsc(0, 0);
 
 		atomic_inc(&tsc_count_stop);
 		while (atomic_read(&tsc_count_stop) != num_booting_cpus()) mb();
 	}
+	write_tsc(sync_tsc_low, sync_tsc_high);
 }
 #undef NR_LOOPS
 

_

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: boot time, process start time, and NOW time
  2004-06-28 17:56 ` OGAWA Hirofumi
@ 2004-08-16 19:41   ` Andrew Morton
  2004-08-16 21:49     ` john stultz
                       ` (3 more replies)
  0 siblings, 4 replies; 57+ messages in thread
From: Andrew Morton @ 2004-08-16 19:41 UTC (permalink / raw)
  To: OGAWA Hirofumi
  Cc: albert, linux-kernel, voland, nicolas.george, kaukasoi, tim,
	george, johnstul, david+powerix

OGAWA Hirofumi <hirofumi@mail.parknet.co.jp> wrote:
>
> Albert Cahalan <albert@users.sf.net> writes:
> 
> > Even with the 2.6.7 kernel, I'm still getting reports of process
> > start times wandering. Here is an example:
> > 
> >    "About 12 hours since reboot to 2.6.7 there was already a
> >    difference of about 7 seconds between the real start time
> >    and the start time reported by ps. Now, 24 hours since reboot
> >    the difference is 10 seconds."
> > 
> > The calculation used is:
> > 
> >    now - uptime + time_from_boot_to_process_start
> 
> Start-time and uptime is using different source. Looks like the
> jiffies was added bogus lost counts.
> 
> quick hack. Does this change the behavior?

Where did this all end up?  Complaints about wandering start times are
persistent, and it'd be nice to get some fix in place...

Thanks.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: boot time, process start time, and NOW time
  2004-08-16 19:41   ` Andrew Morton
@ 2004-08-16 21:49     ` john stultz
  2004-08-16 23:08     ` Tim Schmielau
                       ` (2 subsequent siblings)
  3 siblings, 0 replies; 57+ messages in thread
From: john stultz @ 2004-08-16 21:49 UTC (permalink / raw)
  To: Andrew Morton
  Cc: OGAWA Hirofumi, albert, lkml, voland, nicolas.george, kaukasoi,
	tim, george anzinger, david+powerix

On Mon, 2004-08-16 at 12:41, Andrew Morton wrote:
> OGAWA Hirofumi <hirofumi@mail.parknet.co.jp> wrote:
> >
> > Albert Cahalan <albert@users.sf.net> writes:
> > 
> > > Even with the 2.6.7 kernel, I'm still getting reports of process
> > > start times wandering. Here is an example:
> > > 
> > >    "About 12 hours since reboot to 2.6.7 there was already a
> > >    difference of about 7 seconds between the real start time
> > >    and the start time reported by ps. Now, 24 hours since reboot
> > >    the difference is 10 seconds."
> > > 
> > > The calculation used is:
> > > 
> > >    now - uptime + time_from_boot_to_process_start
> > 
> > Start-time and uptime is using different source. Looks like the
> > jiffies was added bogus lost counts.
> > 
> > quick hack. Does this change the behavior?
> 
> Where did this all end up?  Complaints about wandering start times are
> persistent, and it'd be nice to get some fix in place...

Yea, I think I dropped this. Not sure what the trouble is just yet. Let
me go digging. 

thanks
-john


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: boot time, process start time, and NOW time
  2004-08-16 19:41   ` Andrew Morton
  2004-08-16 21:49     ` john stultz
@ 2004-08-16 23:08     ` Tim Schmielau
  2004-08-16 23:56       ` Tim Schmielau
                         ` (2 more replies)
  2004-08-16 23:24     ` boot time, process start time, and NOW time Albert Cahalan
  2004-08-17 20:25     ` [PATCH] " Tim Schmielau
  3 siblings, 3 replies; 57+ messages in thread
From: Tim Schmielau @ 2004-08-16 23:08 UTC (permalink / raw)
  To: Andrew Morton
  Cc: OGAWA Hirofumi, albert, lkml, voland, nicolas.george, kaukasoi,
	george, johnstul, david+powerix

On Mon, 16 Aug 2004, Andrew Morton wrote:

> OGAWA Hirofumi <hirofumi@mail.parknet.co.jp> wrote:
> >
> > Albert Cahalan <albert@users.sf.net> writes:
> > 
> > > Even with the 2.6.7 kernel, I'm still getting reports of process
> > > start times wandering. Here is an example:
> > > 
> > >    "About 12 hours since reboot to 2.6.7 there was already a
> > >    difference of about 7 seconds between the real start time
> > >    and the start time reported by ps. Now, 24 hours since reboot
> > >    the difference is 10 seconds."
> > > 
> > > The calculation used is:
> > > 
> > >    now - uptime + time_from_boot_to_process_start
> > 
> > Start-time and uptime is using different source. Looks like the
> > jiffies was added bogus lost counts.
> > 
> > quick hack. Does this change the behavior?
> 
> Where did this all end up?  Complaints about wandering start times are
> persistent, and it'd be nice to get some fix in place...



The trouble seems to be due to the patch below, part of a larger cleanup
(http://linus.bkbits.net:8080/linux-2.5/cset%403ef4851dGg0fxX58R9Zv8SIq9fzNmQ?nav=index.html|src/.|src/fs|src/fs/proc|related/fs/proc/proc_misc.c)
by George.

Quoting from the changelog entry:

"Changes the uptime code to use the posix_clock_monotonic notion of 
uptime instead of the jiffies.  This time will track NTP changes and so should 
be better than your standard wristwatch (if your using ntp)."

George is absolutely right that it's more precise. However, it's also 
inconsistent with the process start times which use plain uncorrected 
jiffies. ps stumbles over this inconsistency.

Simple fix: revert the patch below.
Complicated fix: correct process start times in fork.c (no patch provided, 
too complicated for me to do).

George?




diff -Nru a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
--- a/fs/proc/proc_misc.c	2004-08-16 15:48:44 -07:00
+++ b/fs/proc/proc_misc.c	2004-08-16 15:48:44 -07:00
@@ -137,36 +137,19 @@
 static int uptime_read_proc(char *page, char **start, off_t off,
 				 int count, int *eof, void *data)
 {
-	u64 uptime;
-	unsigned long uptime_remainder;
+	struct timespec uptime;
+	struct timespec idle;
 	int len;
+	u64 idle_jiffies = init_task.utime + init_task.stime;
 
-	uptime = get_jiffies_64() - INITIAL_JIFFIES;
-	uptime_remainder = (unsigned long) do_div(uptime, HZ);
+	do_posix_clock_monotonic_gettime(&uptime);
+	jiffies_to_timespec(idle_jiffies, &idle);
+	len = sprintf(page,"%lu.%02lu %lu.%02lu\n",
+			(unsigned long) uptime.tv_sec,
+			(uptime.tv_nsec / (NSEC_PER_SEC / 100)),
+			(unsigned long) idle.tv_sec,
+			(idle.tv_nsec / (NSEC_PER_SEC / 100)));
 
-#if HZ!=100
-	{
-		u64 idle = init_task.utime + init_task.stime;
-		unsigned long idle_remainder;
-
-		idle_remainder = (unsigned long) do_div(idle, HZ);
-		len = sprintf(page,"%lu.%02lu %lu.%02lu\n",
-			(unsigned long) uptime,
-			(uptime_remainder * 100) / HZ,
-			(unsigned long) idle,
-			(idle_remainder * 100) / HZ);
-	}
-#else
-	{
-		unsigned long idle = init_task.utime + init_task.stime;
-
-		len = sprintf(page,"%lu.%02lu %lu.%02lu\n",
-			(unsigned long) uptime,
-			uptime_remainder,
-			idle / HZ,
-			idle % HZ);
-	}
-#endif
 	return proc_calc_metrics(page, start, off, count, eof, len);
 }


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: boot time, process start time, and NOW time
  2004-08-16 23:08     ` Tim Schmielau
@ 2004-08-16 23:56       ` Tim Schmielau
  2004-08-17  0:21       ` john stultz
  2004-08-17  0:31       ` George Anzinger
  2 siblings, 0 replies; 57+ messages in thread
From: Tim Schmielau @ 2004-08-16 23:56 UTC (permalink / raw)
  To: Andrew Morton
  Cc: OGAWA Hirofumi, albert, lkml, voland, nicolas.george, kaukasoi,
	george, johnstul, david+powerix

> > Where did this all end up?  Complaints about wandering start times are
> > persistent, and it'd be nice to get some fix in place...
[...]

> Simple fix: revert the patch below.
> Complicated fix: correct process start times in fork.c (no patch provided, 
> too complicated for me to do).

Well, if we actually revert the patch, we'd also need to revert the 
changes made to jiffies_to_clock_t() & Co. in response to the patch.
Otherwise we again end up inconsistent.

I could make a patch for that tomorrow, if there's interest. At least to 
show that my analysis actually is correct.

Alternatively we might formulate uptime in terms of jiffies_to_clock_t()). 


Tim



> diff -Nru a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
> --- a/fs/proc/proc_misc.c	2004-08-16 15:48:44 -07:00
> +++ b/fs/proc/proc_misc.c	2004-08-16 15:48:44 -07:00
> @@ -137,36 +137,19 @@
>  static int uptime_read_proc(char *page, char **start, off_t off,
>  				 int count, int *eof, void *data)
>  {
> -	u64 uptime;
> -	unsigned long uptime_remainder;
> +	struct timespec uptime;
> +	struct timespec idle;
>  	int len;
> +	u64 idle_jiffies = init_task.utime + init_task.stime;
>  
> -	uptime = get_jiffies_64() - INITIAL_JIFFIES;
> -	uptime_remainder = (unsigned long) do_div(uptime, HZ);
> +	do_posix_clock_monotonic_gettime(&uptime);
> +	jiffies_to_timespec(idle_jiffies, &idle);
> +	len = sprintf(page,"%lu.%02lu %lu.%02lu\n",
> +			(unsigned long) uptime.tv_sec,
> +			(uptime.tv_nsec / (NSEC_PER_SEC / 100)),
> +			(unsigned long) idle.tv_sec,
> +			(idle.tv_nsec / (NSEC_PER_SEC / 100)));
>  
> -#if HZ!=100
> -	{
> -		u64 idle = init_task.utime + init_task.stime;
> -		unsigned long idle_remainder;
> -
> -		idle_remainder = (unsigned long) do_div(idle, HZ);
> -		len = sprintf(page,"%lu.%02lu %lu.%02lu\n",
> -			(unsigned long) uptime,
> -			(uptime_remainder * 100) / HZ,
> -			(unsigned long) idle,
> -			(idle_remainder * 100) / HZ);
> -	}
> -#else
> -	{
> -		unsigned long idle = init_task.utime + init_task.stime;
> -
> -		len = sprintf(page,"%lu.%02lu %lu.%02lu\n",
> -			(unsigned long) uptime,
> -			uptime_remainder,
> -			idle / HZ,
> -			idle % HZ);
> -	}
> -#endif
>  	return proc_calc_metrics(page, start, off, count, eof, len);
>  }
> 
> 


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: boot time, process start time, and NOW time
  2004-08-16 23:08     ` Tim Schmielau
  2004-08-16 23:56       ` Tim Schmielau
@ 2004-08-17  0:21       ` john stultz
  2004-08-17  0:37         ` George Anzinger
  2004-08-17  0:31       ` George Anzinger
  2 siblings, 1 reply; 57+ messages in thread
From: john stultz @ 2004-08-17  0:21 UTC (permalink / raw)
  To: Tim Schmielau
  Cc: Andrew Morton, OGAWA Hirofumi, albert, lkml, voland,
	nicolas.george, kaukasoi, george anzinger, david+powerix

On Mon, 2004-08-16 at 16:08, Tim Schmielau wrote:
> On Mon, 16 Aug 2004, Andrew Morton wrote:
> 
> > OGAWA Hirofumi <hirofumi@mail.parknet.co.jp> wrote:
> > >
> > > Albert Cahalan <albert@users.sf.net> writes:
> > > 
> > > > Even with the 2.6.7 kernel, I'm still getting reports of process
> > > > start times wandering. Here is an example:
> > > > 
> > > >    "About 12 hours since reboot to 2.6.7 there was already a
> > > >    difference of about 7 seconds between the real start time
> > > >    and the start time reported by ps. Now, 24 hours since reboot
> > > >    the difference is 10 seconds."
> > > > 
> > > > The calculation used is:
> > > > 
> > > >    now - uptime + time_from_boot_to_process_start
> > > 
> > > Start-time and uptime is using different source. Looks like the
> > > jiffies was added bogus lost counts.
> > > 
> > > quick hack. Does this change the behavior?
> > 
> > Where did this all end up?  Complaints about wandering start times are
> > persistent, and it'd be nice to get some fix in place...
> 
> 
> 
> The trouble seems to be due to the patch below, part of a larger cleanup
> (http://linus.bkbits.net:8080/linux-2.5/cset%403ef4851dGg0fxX58R9Zv8SIq9fzNmQ?nav=index.html|src/.|src/fs|src/fs/proc|related/fs/proc/proc_misc.c)
> by George.
> 
> Quoting from the changelog entry:
> 
> "Changes the uptime code to use the posix_clock_monotonic notion of 
> uptime instead of the jiffies.  This time will track NTP changes and so should 
> be better than your standard wristwatch (if your using ntp)."
> 
> George is absolutely right that it's more precise. However, it's also 
> inconsistent with the process start times which use plain uncorrected 
> jiffies. ps stumbles over this inconsistency.
> 
> Simple fix: revert the patch below.
> Complicated fix: correct process start times in fork.c (no patch provided, 
> too complicated for me to do).

Hmm. While that patch fixed the uptime proc entry, I thought the issue
was with process start times. I'm looking at fixing the start_time
assignment in proc_pid_stat(). My suspicion is that we need to use ACTHZ
in jiffies64_to_clock_t().

Something like the patch below.

thanks
-john

===== include/linux/times.h 1.6 vs edited =====
--- 1.6/include/linux/times.h	2004-05-10 04:25:49 -07:00
+++ edited/include/linux/times.h	2004-08-16 16:22:13 -07:00
@@ -48,6 +48,7 @@
 	 * but even this doesn't overflow in hundreds of years
 	 * in 64 bits, so..
 	 */
+	x = (x * ACT_HZ)>>8;  /* compensate for ACT_HZ != HZ */
 	x *= TICK_NSEC;
 	do_div(x, (NSEC_PER_SEC / USER_HZ));
 #endif



^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: boot time, process start time, and NOW time
  2004-08-17  0:21       ` john stultz
@ 2004-08-17  0:37         ` George Anzinger
  2004-08-17  0:49           ` john stultz
  0 siblings, 1 reply; 57+ messages in thread
From: George Anzinger @ 2004-08-17  0:37 UTC (permalink / raw)
  To: john stultz
  Cc: Tim Schmielau, Andrew Morton, OGAWA Hirofumi, albert, lkml,
	voland, nicolas.george, kaukasoi, david+powerix

john stultz wrote:
> On Mon, 2004-08-16 at 16:08, Tim Schmielau wrote:
> 
>>On Mon, 16 Aug 2004, Andrew Morton wrote:
>>
>>
>>>OGAWA Hirofumi <hirofumi@mail.parknet.co.jp> wrote:
>>>
>>>>Albert Cahalan <albert@users.sf.net> writes:
>>>>
>>>>
>>>>>Even with the 2.6.7 kernel, I'm still getting reports of process
>>>>>start times wandering. Here is an example:
>>>>>
>>>>>   "About 12 hours since reboot to 2.6.7 there was already a
>>>>>   difference of about 7 seconds between the real start time
>>>>>   and the start time reported by ps. Now, 24 hours since reboot
>>>>>   the difference is 10 seconds."
>>>>>
>>>>>The calculation used is:
>>>>>
>>>>>   now - uptime + time_from_boot_to_process_start
>>>>
>>>>Start-time and uptime is using different source. Looks like the
>>>>jiffies was added bogus lost counts.
>>>>
>>>>quick hack. Does this change the behavior?
>>>
>>>Where did this all end up?  Complaints about wandering start times are
>>>persistent, and it'd be nice to get some fix in place...
>>
>>
>>
>>The trouble seems to be due to the patch below, part of a larger cleanup
>>(http://linus.bkbits.net:8080/linux-2.5/cset%403ef4851dGg0fxX58R9Zv8SIq9fzNmQ?nav=index.html|src/.|src/fs|src/fs/proc|related/fs/proc/proc_misc.c)
>>by George.
>>
>>Quoting from the changelog entry:
>>
>>"Changes the uptime code to use the posix_clock_monotonic notion of 
>>uptime instead of the jiffies.  This time will track NTP changes and so should 
>>be better than your standard wristwatch (if your using ntp)."
>>
>>George is absolutely right that it's more precise. However, it's also 
>>inconsistent with the process start times which use plain uncorrected 
>>jiffies. ps stumbles over this inconsistency.
>>
>>Simple fix: revert the patch below.
>>Complicated fix: correct process start times in fork.c (no patch provided, 
>>too complicated for me to do).
> 
> 
> Hmm. While that patch fixed the uptime proc entry, I thought the issue
> was with process start times. I'm looking at fixing the start_time
> assignment in proc_pid_stat(). My suspicion is that we need to use ACTHZ
> in jiffies64_to_clock_t().

I really don't see how the start_time that proc_pid_stat() is producing could be 
anything but a constant.  The complaint is that it moves, not that it is 
incorrect, right?
> 
> Something like the patch below.
> 
> thanks
> -john
> 
> ===== include/linux/times.h 1.6 vs edited =====
> --- 1.6/include/linux/times.h	2004-05-10 04:25:49 -07:00
> +++ edited/include/linux/times.h	2004-08-16 16:22:13 -07:00
> @@ -48,6 +48,7 @@
>  	 * but even this doesn't overflow in hundreds of years
>  	 * in 64 bits, so..
>  	 */
> +	x = (x * ACT_HZ)>>8;  /* compensate for ACT_HZ != HZ */
>  	x *= TICK_NSEC;
>  	do_div(x, (NSEC_PER_SEC / USER_HZ));
>  #endif
> 
> 

-- 
George Anzinger   george@mvista.com
High-res-timers:  http://sourceforge.net/projects/high-res-timers/
Preemption patch: http://www.kernel.org/pub/linux/kernel/people/rml


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: boot time, process start time, and NOW time
  2004-08-17  0:37         ` George Anzinger
@ 2004-08-17  0:49           ` john stultz
  0 siblings, 0 replies; 57+ messages in thread
From: john stultz @ 2004-08-17  0:49 UTC (permalink / raw)
  To: george anzinger
  Cc: Tim Schmielau, Andrew Morton, OGAWA Hirofumi, albert, lkml,
	voland, nicolas.george, kaukasoi, david+powerix

On Mon, 2004-08-16 at 17:37, George Anzinger wrote:
> john stultz wrote:
> > On Mon, 2004-08-16 at 16:08, Tim Schmielau wrote:
> >>Simple fix: revert the patch below.
> >>Complicated fix: correct process start times in fork.c (no patch provided, 
> >>too complicated for me to do).
> > 
> > Hmm. While that patch fixed the uptime proc entry, I thought the issue
> > was with process start times. I'm looking at fixing the start_time
> > assignment in proc_pid_stat(). My suspicion is that we need to use ACTHZ
> > in jiffies64_to_clock_t().
> 
> I really don't see how the start_time that proc_pid_stat() is producing could be 
> anything but a constant.  The complaint is that it moves, not that it is 
> incorrect, right?

My impression was that it was both. 

Regardless, your point stands, it would just be a constant. Good catch.
I'll have to think about this some more. 

Let me look at procps to see how exactly it comes up w/ STIME. 

thanks
-john



^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: boot time, process start time, and NOW time
  2004-08-16 23:08     ` Tim Schmielau
  2004-08-16 23:56       ` Tim Schmielau
  2004-08-17  0:21       ` john stultz
@ 2004-08-17  0:31       ` George Anzinger
  2004-08-16 22:32         ` Albert Cahalan
  2004-08-17  6:56         ` Tim Schmielau
  2 siblings, 2 replies; 57+ messages in thread
From: George Anzinger @ 2004-08-17  0:31 UTC (permalink / raw)
  To: Tim Schmielau
  Cc: Andrew Morton, OGAWA Hirofumi, albert, lkml, voland,
	nicolas.george, kaukasoi, johnstul, david+powerix

Tim Schmielau wrote:
> On Mon, 16 Aug 2004, Andrew Morton wrote:
> 
> 
>>OGAWA Hirofumi <hirofumi@mail.parknet.co.jp> wrote:
>>
>>>Albert Cahalan <albert@users.sf.net> writes:
>>>
>>>
>>>>Even with the 2.6.7 kernel, I'm still getting reports of process
>>>>start times wandering. Here is an example:
>>>>
>>>>   "About 12 hours since reboot to 2.6.7 there was already a
>>>>   difference of about 7 seconds between the real start time
>>>>   and the start time reported by ps. Now, 24 hours since reboot
>>>>   the difference is 10 seconds."
>>>>
>>>>The calculation used is:
>>>>
>>>>   now - uptime + time_from_boot_to_process_start
>>>
>>>Start-time and uptime is using different source. Looks like the
>>>jiffies was added bogus lost counts.
>>>
>>>quick hack. Does this change the behavior?
>>
>>Where did this all end up?  Complaints about wandering start times are
>>persistent, and it'd be nice to get some fix in place...
> 
> 
> 
> 
> The trouble seems to be due to the patch below, part of a larger cleanup
> (http://linus.bkbits.net:8080/linux-2.5/cset%403ef4851dGg0fxX58R9Zv8SIq9fzNmQ?nav=index.html|src/.|src/fs|src/fs/proc|related/fs/proc/proc_misc.c)
> by George.
> 
> Quoting from the changelog entry:
> 
> "Changes the uptime code to use the posix_clock_monotonic notion of 
> uptime instead of the jiffies.  This time will track NTP changes and so should 
> be better than your standard wristwatch (if your using ntp)."
> 
> George is absolutely right that it's more precise. However, it's also 
> inconsistent with the process start times which use plain uncorrected 
> jiffies. ps stumbles over this inconsistency.
> 
> Simple fix: revert the patch below.
> Complicated fix: correct process start times in fork.c (no patch provided, 
> too complicated for me to do).
> 
> George?

Hm...  That patch was for a reason...  It seems to me that doing anything short 
of putting "xtime" (or better, clock_gettime() :)) in at fork time is not going 
to fix anything.   As written the start_time in the task_struct is fixed.  If 
"now - uptime + time_from_boot_to_process_start" it is wandering, it must be the 
fault of "now - uptime".  Since this seems to be wandering, and we corrected 
uptime in the referenced patch, is it safe to assume that "now" is actually 
being computed from "jiffies" rather than a gettimeofday()?

Seems like that is where we should be changing things.


-- 
George Anzinger   george@mvista.com
High-res-timers:  http://sourceforge.net/projects/high-res-timers/
Preemption patch: http://www.kernel.org/pub/linux/kernel/people/rml


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: boot time, process start time, and NOW time
  2004-08-17  0:31       ` George Anzinger
@ 2004-08-16 22:32         ` Albert Cahalan
  2004-08-17  1:26           ` George Anzinger
  2004-08-17  6:56         ` Tim Schmielau
  1 sibling, 1 reply; 57+ messages in thread
From: Albert Cahalan @ 2004-08-16 22:32 UTC (permalink / raw)
  To: george
  Cc: Tim Schmielau, Andrew Morton OSDL, OGAWA Hirofumi, albert, lkml,
	voland, nicolas.george, kaukasoi, johnstul, david+powerix

On Mon, 2004-08-16 at 20:31, George Anzinger wrote:

> Hm...  That patch was for a reason...  It seems to me that doing anything short 
> of putting "xtime" (or better, clock_gettime() :)) in at fork time is not going 
> to fix anything.   As written the start_time in the task_struct is fixed.  If 
> "now - uptime + time_from_boot_to_process_start" it is wandering, it must be the 
> fault of "now - uptime".  Since this seems to be wandering, and we corrected 
> uptime in the referenced patch, is it safe to assume that "now" is actually 
> being computed from "jiffies" rather than a gettimeofday()?
> 
> Seems like that is where we should be changing things.

That's userspace, which works fine on a 2.4.xx kernel.
If userspace were to change, it wouldn't work OK for
a 2.4.xx kernel anymore. So consider that cast in stone.

"now" is the time() function. Using gettimeofday()
would only make sense if I decided to pay the cost
of asking for the time every time I look at a task.

Here is the "now - uptime + time_from_boot_to_process_start"
calculation, unsimplified, ripped from the procps code:

////////////////////////////////////////////////////////////////
/* Seconds since boot; starts as -1 converted to unsigned long and is
 * overwritten by the init function. */
unsigned long   seconds_since_boot = -1;
/* Wall-clock time in seconds since the epoch, sampled by the init function. */
static unsigned long seconds_since_1970;
/* seconds_since_1970 - seconds_since_boot, computed by the init function. */
static unsigned long time_of_boot;

/*
 * Sample the uptime and the wall-clock time, then derive the boot time as
 * "now - uptime".  pr_stime() reads seconds_since_1970 and time_of_boot.
 * NOTE(review): placeholder name with no declared return type, as posted;
 * uptime(0,0) is presumably the procps uptime helper -- confirm.
 */
some_init_function(){
  seconds_since_boot = uptime(0,0);
  seconds_since_1970 = time(NULL);
  time_of_boot = seconds_since_1970 - seconds_since_boot;
}

/*
 * Format the start time of process pp into outbuf.
 *
 * The start time is computed as time_of_boot + pp->start_time / Hertz, i.e.
 * "now - uptime + time_from_boot_to_process_start".  The format depends on
 * how the start time compares with the current time (seconds_since_1970):
 *   same day of year and same year -> "%H:%M"
 *   different day, same year       -> "%b%d"
 *   different year                 -> "%Y"
 *
 * Returns the number of characters strftime() stored (0 if nothing was
 * written).  strftime() is allowed to write up to 42 bytes, so outbuf is
 * assumed to be at least that large -- confirm against the callers.
 */
static int pr_stime(char *restrict const outbuf, const proc_t *restrict const pp){
  const struct tm *tm_ptr;
  const char *fmt;
  time_t now;
  time_t t;
  int tm_year;
  int tm_yday;

  /* seconds_since_1970 is an unsigned long; localtime() needs a time_t *,
   * so go through a properly typed local instead of passing its address. */
  now = (time_t)seconds_since_1970;
  tm_ptr = localtime(&now);                        /* not reentrant */
  if(!tm_ptr){
    outbuf[0] = '\0';
    return 0;
  }
  /* Copy the fields out now: the next localtime() call reuses the buffer. */
  tm_year = tm_ptr->tm_year;
  tm_yday = tm_ptr->tm_yday;

  t = time_of_boot + pp->start_time / Hertz;
  tm_ptr = localtime(&t);                          /* not reentrant */
  if(!tm_ptr){
    outbuf[0] = '\0';
    return 0;
  }

  fmt = "%H:%M";                                   /* 03:02 23:59 */
  if(tm_yday != tm_ptr->tm_yday) fmt = "%b%d";     /* Jun06 Aug27 */
  if(tm_year != tm_ptr->tm_year) fmt = "%Y";       /* 1991 2001 */
  return (int)strftime(outbuf, 42, fmt, tm_ptr);
}
////////////////////////////////////////////////////////////////



^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: boot time, process start time, and NOW time
  2004-08-16 22:32         ` Albert Cahalan
@ 2004-08-17  1:26           ` George Anzinger
  2004-08-16 23:08             ` Albert Cahalan
  0 siblings, 1 reply; 57+ messages in thread
From: George Anzinger @ 2004-08-17  1:26 UTC (permalink / raw)
  To: Albert Cahalan
  Cc: Tim Schmielau, Andrew Morton OSDL, OGAWA Hirofumi, lkml, voland,
	nicolas.george, kaukasoi, johnstul, david+powerix

Albert Cahalan wrote:
> On Mon, 2004-08-16 at 20:31, George Anzinger wrote:
> 
> 
>>Hm...  That patch was for a reason...  It seems to me that doing anything short 
>>of putting "xtime" (or better, clock_gettime() :)) in at fork time is not going 
>>to fix anything.   As written the start_time in the task_struct is fixed.  If 
>>"now - uptime + time_from_boot_to_process_start" it is wandering, it must be the 
>>fault of "now - uptime".  Since this seems to be wandering, and we corrected 
>>uptime in the referenced patch, is it safe to assume that "now" is actually 
>>being computed from "jiffies" rather than a gettimeofday()?
>>
>>Seems like that is where we should be changing things.
> 
> 
> That's userspace, which works fine on a 2.4.xx kernel.
> If userspace were to change, it wouldn't work OK for
> a 2.4.xx kernel anymore. So consider that cast in stone.
> 
> "now" is the time() function. Using gettimeofday()
> would only make sense if I decided to pay the cost
> of asking for the time every time I look at a task.
> 
> Here is the "now - uptime + time_from_boot_to_process_start"
> calculation, unsimplified, ripped from the procps code:
> 
> ////////////////////////////////////////////////////////////////
> unsigned long   seconds_since_boot = -1;
> static unsigned long seconds_since_1970;
> static unsigned long time_of_boot;
> 
> some_init_function(){
>   seconds_since_boot = uptime(0,0);
>   seconds_since_1970 = time(NULL);
>   time_of_boot = seconds_since_1970 - seconds_since_boot;
> }
> 
> static int pr_stime(char *restrict const outbuf, const proc_t *restrict const pp){
>   struct tm *proc_time;
>   struct tm *our_time;
>   time_t t;
>   const char *fmt;
>   int tm_year;
>   int tm_yday;
>   our_time = localtime(&seconds_since_1970);   /* not reentrant */
>   tm_year = our_time->tm_year;
>   tm_yday = our_time->tm_yday;
>   t = time_of_boot + pp->start_time / Hertz;
>   proc_time = localtime(&t); /* not reentrant, this corrupts our_time */
>   fmt = "%H:%M";                                   /* 03:02 23:59 */
>   if(tm_yday != proc_time->tm_yday) fmt = "%b%d";  /* Jun06 Aug27 */
>   if(tm_year != proc_time->tm_year) fmt = "%Y";    /* 1991 2001 */
>   return strftime(outbuf, 42, fmt, proc_time);
> }
> ////////////////////////////////////////////////////////////////
> 
> 
Hm, I assume time() just returns the seconds part of gettimeofday().  Is 
uptime() local to procps?  What does it do?  You implied it uses the kernel 
version of up time, right?  Given all this, I don't see how it can wander.

An interesting question: does it wander if ntp is not in the mix?

> 

-- 
George Anzinger   george@mvista.com
High-res-timers:  http://sourceforge.net/projects/high-res-timers/
Preemption patch: http://www.kernel.org/pub/linux/kernel/people/rml


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: boot time, process start time, and NOW time
  2004-08-17  1:26           ` George Anzinger
@ 2004-08-16 23:08             ` Albert Cahalan
  2004-08-17  1:54               ` James Courtier-Dutton
  0 siblings, 1 reply; 57+ messages in thread
From: Albert Cahalan @ 2004-08-16 23:08 UTC (permalink / raw)
  To: george
  Cc: Albert Cahalan, Tim Schmielau, Andrew Morton OSDL, OGAWA Hirofumi,
	lkml, voland, nicolas.george, kaukasoi, johnstul, david+powerix

On Mon, 2004-08-16 at 21:26, George Anzinger wrote:
> Albert Cahalan wrote:
> > On Mon, 2004-08-16 at 20:31, George Anzinger wrote:
> > 
> > 
> >>Hm...  That patch was for a reason...  It seems to me that doing anything short 
> >>of putting "xtime" (or better, clock_gettime() :)) in at fork time is not going 
> >>to fix anything.   As written the start_time in the task_struct is fixed.  If 
> >>"now - uptime + time_from_boot_to_process_start" it is wandering, it must be the 
> >>fault of "now - uptime".  Since this seems to be wandering, and we corrected 
> >>uptime in the referenced patch, is it safe to assume that "now" is actually 
> >>being computed from "jiffies" rather than a gettimeofday()?
> >>
> >>Seems like that is where we should be changing things.
> > 
> > 
> > That's userspace, which works fine on a 2.4.xx kernel.
> > If userspace were to change, it wouldn't work OK for
> > a 2.4.xx kernel anymore. So consider that cast in stone.
> > 
> > "now" is the time() function. Using gettimeofday()
> > would only make sense if I decided to pay the cost
> > of asking for the time every time I look at a task.
> > 
> > Here is the "now - uptime + time_from_boot_to_process_start"
> > calculation, unsimplified, ripped from the procps code:
> > 
> > ////////////////////////////////////////////////////////////////
> > unsigned long   seconds_since_boot = -1;
> > static unsigned long seconds_since_1970;
> > static unsigned long time_of_boot;
> > 
> > some_init_function(){
> >   seconds_since_boot = uptime(0,0);
> >   seconds_since_1970 = time(NULL);
> >   time_of_boot = seconds_since_1970 - seconds_since_boot;
> > }
> > 
> > static int pr_stime(char *restrict const outbuf, const proc_t *restrict const pp){
> >   struct tm *proc_time;
> >   struct tm *our_time;
> >   time_t t;
> >   const char *fmt;
> >   int tm_year;
> >   int tm_yday;
> >   our_time = localtime(&seconds_since_1970);   /* not reentrant */
> >   tm_year = our_time->tm_year;
> >   tm_yday = our_time->tm_yday;
> >   t = time_of_boot + pp->start_time / Hertz;
> >   proc_time = localtime(&t); /* not reentrant, this corrupts our_time */
> >   fmt = "%H:%M";                                   /* 03:02 23:59 */
> >   if(tm_yday != proc_time->tm_yday) fmt = "%b%d";  /* Jun06 Aug27 */
> >   if(tm_year != proc_time->tm_year) fmt = "%Y";    /* 1991 2001 */
> >   return strftime(outbuf, 42, fmt, proc_time);
> > }
> > ////////////////////////////////////////////////////////////////
> > 
> > 
> Hm, I assume time() just returns the seconds part of gettimeofday().  Is 
> uptime() local to procps?  What does it do?  You implied it uses the kernel 
> version of up time, right?  Given all this, I don't see how it can wander.

uptime() returns the first number from /proc/uptime as an int.
(currently it rounds down -- perhaps not the best)

> An interesting question: does it wander if ntp is not in the mix?

I think yes. I just get the bug reports. (well, 1/2 of them)
I'm guessing this is a PC problem; I have a Mac.

 


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: boot time, process start time, and NOW time
  2004-08-16 23:08             ` Albert Cahalan
@ 2004-08-17  1:54               ` James Courtier-Dutton
  2004-08-17  2:03                 ` Lee Revell
  2004-08-17 20:52                 ` George Anzinger
  0 siblings, 2 replies; 57+ messages in thread
From: James Courtier-Dutton @ 2004-08-17  1:54 UTC (permalink / raw)
  To: Albert Cahalan
  Cc: george, Albert Cahalan, Tim Schmielau, Andrew Morton OSDL,
	OGAWA Hirofumi, lkml, voland, nicolas.george, kaukasoi, johnstul,
	david+powerix

Albert Cahalan wrote:
>>>
>>>
>>>That's userspace, which works fine on a 2.4.xx kernel.
>>>If userspace were to change, it wouldn't work OK for
>>>a 2.4.xx kernel anymore. So consider that cast in stone.
>>>
>>>"now" is the time() function. Using gettimeofday()
>>>would only make sense if I decided to pay the cost
>>>of asking for the time every time I look at a task.
>>>

While on the subject of time, is it possible to get a monotonic timer 
with 1ms or better resolution?
We need this for linux multimedia applications, and it is used to sync 
audio and video. Currently we use gettimeofday(). If a movie is playing, 
and the user goes and changes the time, or changes the timezone, we do 
not want that to affect the movie playing. I have not been able to find 
a monotonic 1ms accurate timer in the linux kernel, that is available to 
applications, and has little overhead. Some efficient ioctl or function 
call for uptime to 1ms accuracy would do perfectly.

James

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: boot time, process start time, and NOW time
  2004-08-17  1:54               ` James Courtier-Dutton
@ 2004-08-17  2:03                 ` Lee Revell
  2004-08-17 20:52                 ` George Anzinger
  1 sibling, 0 replies; 57+ messages in thread
From: Lee Revell @ 2004-08-17  2:03 UTC (permalink / raw)
  To: James Courtier-Dutton
  Cc: Albert Cahalan, george, Albert Cahalan, Tim Schmielau,
	Andrew Morton OSDL, OGAWA Hirofumi, lkml, voland, nicolas.george,
	kaukasoi, johnstul, david+powerix

On Mon, 2004-08-16 at 21:54, James Courtier-Dutton wrote:
> Albert Cahalan wrote:

> While on the subject of time, is it possible to get a monotonic timer 
> with 1ms or better resolution?

mplayer uses /dev/rtc for this.  Any reason why it won't work for you?

Lee


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: boot time, process start time, and NOW time
  2004-08-17  1:54               ` James Courtier-Dutton
  2004-08-17  2:03                 ` Lee Revell
@ 2004-08-17 20:52                 ` George Anzinger
  1 sibling, 0 replies; 57+ messages in thread
From: George Anzinger @ 2004-08-17 20:52 UTC (permalink / raw)
  To: James Courtier-Dutton
  Cc: Albert Cahalan, Tim Schmielau, Andrew Morton OSDL, OGAWA Hirofumi,
	lkml, voland, nicolas.george, kaukasoi, johnstul, david+powerix

James Courtier-Dutton wrote:
> Albert Cahalan wrote:
> 
>>>>
>>>>
>>>> That's userspace, which works fine on a 2.4.xx kernel.
>>>> If userspace were to change, it wouldn't work OK for
>>>> a 2.4.xx kernel anymore. So consider that cast in stone.
>>>>
>>>> "now" is the time() function. Using gettimeofday()
>>>> would only make sense if I decided to pay the cost
>>>> of asking for the time every time I look at a task.
>>>>
> 
> While on the subject of time, is it possible to get a monotonic timer 
> with 1ms or better resolution?
> We need this for linux multimedia applications, and it is used to sync 
> audio and video. Currently we use gettimeofday(). If a movie is playing, 
> and the user goes and changes the time, or changes the timezone, we do 
> not want that to effect the movie playing. I have not been able to find 
> a monotonic 1ms accurate timer in the linux kernel, that is available to 
> applications, and has little overhead. Some efficient ioctl or function 
> call for uptime to 1ms accuracy would do perfectly.

If all you want is the time try
clock_gettime(CLOCK_MONOTONIC, struct timespec *tv)

Should work fine on 2.6.x kernels.  This is good to what ever the fine structure 
is on the box, e.g. TSC cycles on most x86 or pm_timer cycles on some, in any 
case it is good to better than a micro second.
> 
   If you want a timer, look into the posix clocks & timers which were added at 
2.6.
-- 
George Anzinger   george@mvista.com
High-res-timers:  http://sourceforge.net/projects/high-res-timers/
Preemption patch: http://www.kernel.org/pub/linux/kernel/people/rml


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: boot time, process start time, and NOW time
  2004-08-17  0:31       ` George Anzinger
  2004-08-16 22:32         ` Albert Cahalan
@ 2004-08-17  6:56         ` Tim Schmielau
  2004-08-17 20:07           ` john stultz
  1 sibling, 1 reply; 57+ messages in thread
From: Tim Schmielau @ 2004-08-17  6:56 UTC (permalink / raw)
  To: George Anzinger
  Cc: Andrew Morton, OGAWA Hirofumi, albert, lkml, voland,
	nicolas.george, kaukasoi, johnstul, david+powerix

(Whoops, this generated quite some traffic while I was asleep.
I'll just comment on some of the posts in a single mail.)

On Mon, 16 Aug 2004, George Anzinger wrote:

> > George is absolutely right that it's more precise. However, it's also
> > inconsistent with the process start times which use plain uncorrected
> > jiffies. ps stumbles over this inconsistency.
> >
> > Simple fix: revert the patch below.
> > Complicated fix: correct process start times in fork.c (no patch provided,
> > too complicated for me to do).
> >
> > George?
>
> Hm...  That patch was for a reason...  It seems to me that doing anything short
> of putting "xtime" (or better, clock_gettime() :)) in at fork time is not going
> to fix anything.

Yep. I think that's the way to go.

>                   As written the start_time in the task_struct is fixed.  If
> "now - uptime + time_from_boot_to_process_start" it is wandering, it must be the
> fault of "now - uptime".  Since this seems to be wandering, and we corrected
> uptime in the referenced patch, is it safe to assume that "now" is actually
> being computed from "jiffies" rather than a gettimeofday()?

No, it's not "now" which is wandering, but the difference between "uptime"
and "time_from_boot_to_process_start". The former gets corrected by ntp,
while the latter is computed from "jiffies" and thus uncorrected.

On Mon, 16 Aug 2004, john stultz wrote:

> Hmm. While that patch fixed the uptime proc entry, I thought the issue
> was with process start times. I'm looking at fixing the start_time
> assignment in proc_pid_stat(). My suspicion is that we need to use ACTHZ
> in jiffies64_to_clock_t().

No, we already fixed jiffies64_to_clock_t() by using TICK_NSEC instead of
HZ.

On Mon, 16 Aug 2004, George Anzinger wrote:

> I really don't see how the start_time that proc_pid_stat() is producing could be
> anything but a constant.  The complaint is that it moves, not that it is
> incorrect, right?

No, proc_pid_stat() indeed gives a constant. But userspace somehow has to
figure out what a value in "jiffies" means. Since "jiffies" started from zero
at boot time, "uptime" is needed for that. However, we "fixed" uptime to
get corrected by ntp, so that userspace now has a drifting notion of "jiffies".

On Tue, 16 Aug 2004, Albert Cahalan wrote:

> If you're interested in reducing (not solving)
> the problem for the 2.6.x series, you might change
> HZ to something that works better with the PIT.

No, that's not needed anymore. We've already started to account for the
difference, e.g. by using TICK_NSEC in jiffies64_to_clock_t().

Problem is, we are only halfway through the attempt to remove the use
of "jiffies" as a clock, so currently two incompatible time sources get mixed
up.

The other problem seems to be that this move away from "jiffies" seems to
happen on an ad-hoc basis whenever we encounter a problem, rather than
with a big picture in mind.
John Stultz once laid out a concept for a (coordinated) rewrite in 2.7,
and I think this still is a good idea.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: boot time, process start time, and NOW time
  2004-08-17  6:56         ` Tim Schmielau
@ 2004-08-17 20:07           ` john stultz
  2004-08-17 20:13             ` [RFC] New timeofday implementation proposal john stultz
  0 siblings, 1 reply; 57+ messages in thread
From: john stultz @ 2004-08-17 20:07 UTC (permalink / raw)
  To: Tim Schmielau
  Cc: george anzinger, Andrew Morton, OGAWA Hirofumi, albert, lkml,
	voland, nicolas.george, kaukasoi, david+powerix

On Mon, 2004-08-16 at 23:56, Tim Schmielau wrote:
> On Tue, 16 Aug 2004, Albert Cahalan wrote:
> 
> > If you're interested in reducing (not solving)
> > the problem for the 2.6.x series, you might change
> > HZ to something that works better with the PIT.
> 
> No, that's not needed anymore. We've already started to account for the
> difference, e.g. by using TICK_NSEC in jiffies64_to_clock_t().

Well, unfortunately TICK_NSEC just gives the *current* tick length as
requested by ntpd. So it won't work over a long interval of jiffies
where TICK_NSEC might have changed.

> Problem is, we are only halfway through the attempt to remove the use
> of "jiffies" as a clock, so currently to incompatible time sources get 
> mixed
> up.
> 
> The other problem seems to be that this move away from "jiffies" seems to
> happen on an ad-hoc basis whenever we encounter a problem, rather than
> with a big picture in mind.

Indeed you are correct. Since timer interrupts are not precisely or
accurately delivered, a timer interrupt counter (jiffies), cannot be
used as a reliable time source (except where there is no other time
source). The problem is that it is difficult to discern where jiffies is
just being used as a timer subsystem counter, or where it's being used as
a time of day time source.

Even worse, this is a userspace visible usage of jiffies as a time
stamp, so in this case we have to preserve the interface and find a way
to emulate it. So proc_pid_stats() may need to do the reverse of what
procps is doing. 

> John Stultz once laid out a concept for a (coordinated) rewrite in 2.7,
> and I think this still is a good idea.

Yep, I've been working like crazy on just this (well, when my work isn't
swamping me). Unfortunately it is a major overhaul and causes cascading
changes (removal of xtime), so it's not going as quickly as I'd like.
However I feel the design is quite good and I will attach a copy of it
and the first pass of the code in a reply to this email.

thanks
-john

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [RFC] New timeofday implementation proposal
  2004-08-17 20:07           ` john stultz
@ 2004-08-17 20:13             ` john stultz
  2004-08-17 20:58               ` [RFC] New timeofday code john stultz
  2004-09-01 23:16               ` [RFC] New timeofday implementation proposal Christoph Lameter
  0 siblings, 2 replies; 57+ messages in thread
From: john stultz @ 2004-08-17 20:13 UTC (permalink / raw)
  To: Tim Schmielau
  Cc: george anzinger, Andrew Morton, OGAWA Hirofumi, albert, lkml,
	voland, nicolas.george, kaukasoi, david+powerix

As promised, here is my proposal for overhauling the time of day
subsystem. This would likely be 2.7 material.

Any comments and suggestions would be appreciated.
thanks
-john


Proposal for an architecture independent time of day implementation.
-------------------------------------------------------------------
John Stultz (johnstul@us.ibm.com)
DRAFT
Tue Aug 17 12:50:34 PDT 2004

Credits:
	Keith Mannthey:	Aided initial design.
			Aided greatly to implementation details.
	George Anzinger: Initial review and corrections.
	Ulrich Windl: Review and suggestions for clarity.

	Many of the time of day related issues that cropped up in 2.5
development occurred where a fix or change was made to a number of
architectures, but missed a few others. Currently every architecture has
its own set of timekeeping functions that basically do the same thing,
only using different (or frequently, not so different) types of
hardware. As hardware has changed, many architectures have had to
re-engineer their time system to handle multiple time and interrupt
sources. With little common infrastructure, either each separate
implementation has its own quirks and bugs, or we end up with a
reasonable quantity of duplicated code. Additionally the lack of a clear
time of day interface has led developers to use jiffies, HZ, and the raw
xtime values to calculate the time of day themselves. This has led to a
number of troublesome bugs.

	With the goal to simplify, streamline and consolidate the time-of-day
infrastructure, I propose the following common implementation across all
arches. This will allow generic bugs to be fixed once, reduce code
duplication, and with many architectures sharing the same time source,
this allows drivers to be written once for multiple architectures.
Additionally it will better delineate the lines between the timer
subsystem and the time-of-day subsystem, opening the door for more
flexible and better timekeeping.

Features of this design:
========================

o Splits time of day management from timer interrupts:
	This is necessary for virtualization & tickless systems. It allows us
to no longer care how often clock_interrupt() is called. Missing, early
or lost interrupts do not affect time keeping (within bounds - ie: the
time source cannot overflow). This isolates HZ and jiffies to the timer
subsystem (mostly), as they are frequently and incorrectly used to
calculate time.
	Additionally, it allows for dynamic tick interrupts / high-res ticks.
Avoid the need to interpolate between multiple shoddy time sources, and
lets us be agnostic to where the periodic interrupts come from (cleans
up i386 HPET interrupt code).

o Consolidates a large amount of code:
	Allows for shared time source implementations, such as: i386, x86-64
and ia64 all use HPET, i386 and x86-64 both have ACPI PM timers, and
i386 and ia64 both have cyclone counters. Time sources are just drivers!
Also work for user space gettimeofday implementations will be able to be
shared across all arches (assuming the hardware time source can be
safely accessed from user space).

o Generic algorithms which use time-source drivers chosen at runtime:
	Drivers are just simple hw accessor functions with no internal state
needed. They can be loaded and changed while the system is running, like
normal modules.

o More consistent and readable code:
	Drop wall_to_monotonic & xtime in favor of a more simple system_time
and wall_time_offset variables. Where system_time is the monotonically
increasing nanoseconds since boot time and wall_time_offset is the
offset added to system_time to calculate time of day.

o Uses nanoseconds as the kernel's base time unit.
	Rather than doing ugly manipulations to timevals or timespecs, this
simplifies math, and gives us plenty of room to grow (64bits of
nanoseconds ~= 584 years).

o Clearly separates the NTP code from the time code:
	Creates a clean and clear interface, keeping all the NTP related code
in a single place. Save brains, normal people shouldn't have to think
about the in kernel ntp machinery.


Brief Pseudo-code to illustrate the design:
===========================================

Globals:
--------
offset_base: timesource cycle value at last call to timeofday_hook()
system_time: time in ns calculated at last call to timeofday_hook()
wall_offset: offset to monotonic_clock() to get current time of day

Functions:
----------
timeofday_hook()
	now = read();			/* read the timesource */
	ns = cyc2ns(now - offset_base); /* calc nsecs since last call */
	ntp_ns = ntp_scale(ns);		/* apply ntp scaling */
	system_time += ntp_ns;		/* add scaled value to system_time */
	ntp_advance(ns);		/* advance ntp state machine by ns */
	offset_base = now;		/* set new offset_base */

monotonic_clock()
	now = read();			/* read the timesource */
	ns = cyc2ns(now - offset_base);	/* calculate nsecs since last hook */
	ntp_ns = ntp_scale(ns);		/* apply ntp scaling */
	return system_time + ntp_ns; 	/* return system_time and scaled value
					 */

settimeofday(desired)
	wall_offset = desired - monotonic_clock(); /* set wall offset */

gettimeofday()
	return wall_offset + monotonic_clock();	/* return current timeofday */


Points I'm glossing over for now:
====================================================

o Have to convert back to time_val for syscall interface

o ntp_scale(ns):  scales ns by NTP scaling factor
	- costly, but correct.

o ntp_advance(ns): advances NTP state machine by ns
	- we have to do the whole NTP state machine

o What is the cost of throwing around 64bit values for everything?
	- Do we need an arch specific time structure that varies size
accordingly?

o Some arches (arm, for example) do not have high res  timing hardware
	- In this case we can have a "jiffies" timesource
		- cyc2ns(x) =  x*(NSEC_PER_SEC/HZ)
		- doesn't work for tickless systems

o vsyscalls/userspace gettimeofday()
	- Mark functions and data w/  __vsyscall attribute
	- Use linker to put all __vsyscall data in the same set of pages
	- Mark those pages user-executable
	- Should work for all arches

o suspend/resume
	- need to pause and restart the timesource reads
	- we don't want a gigantic or negative offset!

Anything else? What am I missing or just being ignorant of?




^ permalink raw reply	[flat|nested] 57+ messages in thread

* [RFC] New timeofday code
  2004-08-17 20:13             ` [RFC] New timeofday implementation proposal john stultz
@ 2004-08-17 20:58               ` john stultz
  2004-09-01 23:16               ` [RFC] New timeofday implementation proposal Christoph Lameter
  1 sibling, 0 replies; 57+ messages in thread
From: john stultz @ 2004-08-17 20:58 UTC (permalink / raw)
  To: Tim Schmielau
  Cc: george anzinger, Andrew Morton, OGAWA Hirofumi, albert, lkml,
	voland, nicolas.george, kaukasoi, david+powerix

Here's the first pass of the core code for the time of day overhaul. 
Since the changes affect so much of the kernel, I'm just sending out the
important files. Those files being:

kernel/timeofday.c: core time of day implementation and interfaces
include/linux/timeofday.h: interface definition and helper functions
include/linux/timesource.h: timesource interface definition
drivers/timesource/cyclone.c: example timesource

I'm still heavily working on this, so any comments or suggestions would
be greatly appreciated. 

thanks
-john



=[linux/kernel/timeofday.c]======================================

/*	linux/kernel/timeofday.c
 *
 *	Copyright (C) 2003 IBM
 *
 *  This file contains the functions which access and manage
 *	the system's time of day functionality.
 */


/* TODO:
 *		o NTP functions & testing
 */

#include <linux/timeofday.h>
#include <linux/timesource.h>
#include <linux/ntp.h>

/*XXX - remove later */
#define TIME_DBG 1
#define TIME_DBG_FREQ 120000

/*[Nanosecond based variables]----------------
 * system_time:
 *		Monotonically increasing counter of the number of nanoseconds
 *		since boot.
 * wall_time_offset:
 *		Offset added to system_time to provide accurate time-of-day
 */
static nsec_t system_time;
static nsec_t wall_time_offset;


/*[Cycle based variables]----------------
 * offset_base:
 *		Timesource cycle value recorded by the last
 *		timeofday_interrupt_hook() (less the cycles that cyc2ns reported
 *		as remainder), or by timeofday_init() before the first hook.
 *		__monotonic_clock() measures elapsed cycles against this value.
 */
static cycle_t offset_base;

/*[Time source data]-------------------
 * timesource
 *		current timesource pointer (initialized to timesource_jiffies)
 * next_timesource:
 *		pointer to the timesource that will be installed at the next hook
 */
extern struct timesource_t timesource_jiffies;
static struct timesource_t *timesource = &timesource_jiffies;
static struct timesource_t *next_timesource;

/*[Locks]----------------------------
 * system_time_lock:
 *		generic lock for all locally scoped time values
 */
static seqlock_t system_time_lock = SEQLOCK_UNLOCKED;


/* [XXX - Hacks]--------------------
 *			Makes stuff compile
 */
extern unsigned long get_cmos_time(void);


/* get_lowres_timestamp():
 *		Low-resolution timestamp: a consistent copy of system_time,
 *		i.e. the value stored by the most recent
 *		timeofday_interrupt_hook(). The timesource itself is not read.
 */
nsec_t get_lowres_timestamp(void)
{
	unsigned long seq;
	nsec_t snapshot;

	/* copy system_time; retry if a writer held system_time_lock meanwhile */
	for (;;) {
		seq = read_seqbegin(&system_time_lock);
		snapshot = system_time;
		if (!read_seqretry(&system_time_lock, seq))
			break;
	}

	return snapshot;
}

/* get_lowres_timeofday():
 *		Low-resolution time of day: system_time as of the last
 *		timeofday_interrupt_hook() plus wall_time_offset. The timesource
 *		itself is not read.
 */
nsec_t get_lowres_timeofday(void)
{
	unsigned long seq;
	nsec_t tod;

	/* both variables must come from the same write generation */
	for (;;) {
		seq = read_seqbegin(&system_time_lock);
		tod = system_time + wall_time_offset;
		if (!read_seqretry(&system_time_lock, seq))
			break;
	}

	return tod;
}


/* __monotonic_clock():
 *		Private helper; the caller must hold system_time_lock (as reader
 *		or writer). Returns system_time plus the NTP-scaled nanoseconds
 *		that have elapsed on the current timesource since offset_base.
 */
static nsec_t __monotonic_clock(void)
{
	/* cycles elapsed since offset_base was last recorded */
	cycle_t elapsed = timesource->delta(timesource->read(), offset_base);

	/* cycles -> ns (no remainder wanted here), then apply NTP scaling */
	nsec_t scaled_ns = ntp_scale(timesource->cyc2ns(elapsed, 0));

	return system_time + scaled_ns;
}


/* do_monotonic_clock():
 *		Lock-protected wrapper around __monotonic_clock(): returns the
 *		monotonically increasing number of nanoseconds since boot.
 */
nsec_t do_monotonic_clock(void)
{
	unsigned long seq;
	nsec_t mono_ns;

	/* re-run the computation until it was not raced by a writer */
	for (;;) {
		seq = read_seqbegin(&system_time_lock);
		mono_ns = __monotonic_clock();
		if (!read_seqretry(&system_time_lock, seq))
			break;
	}

	return mono_ns;
}


/* do_gettimeofday():
 *		Stores the current time of day in *tv, computed as
 *		wall_time_offset + __monotonic_clock().
 */
void do_gettimeofday(struct timeval *tv)
{
	unsigned long seq;
	nsec_t offset_ns, mono_ns;

	/* take a consistent (offset, monotonic) pair under the seqlock */
	for (;;) {
		seq = read_seqbegin(&system_time_lock);
		offset_ns = wall_time_offset;
		mono_ns = __monotonic_clock();
		if (!read_seqretry(&system_time_lock, seq))
			break;
	}

	/* the sum is nanoseconds of wall time; hand it back as a timeval */
	*tv = ns2timeval(offset_ns + mono_ns);
}


/* do_settimeofday():
 *		Sets the time of day to *tv by recomputing wall_time_offset
 *		relative to the monotonic clock, then resets the NTP state.
 *		Always returns 0.
 */
int do_settimeofday(struct timespec *tv)
{
	nsec_t target_ns = timespec2ns(tv);

	/* writer side, interrupts off: the offset update and the NTP reset
	 * become visible to readers together */
	write_seqlock_irq(&system_time_lock);
	wall_time_offset = target_ns - __monotonic_clock();
	ntp_clear();
	write_sequnlock_irq(&system_time_lock);

	return 0;
}

/* do_adjtimex():
 *		Userspace NTP daemon's interface to the kernel NTP variables.
 *		Returns whatever ntp_adjtimex() returns.
 */
int do_adjtimex(struct timex *tx)
{
	struct timeval *stamp = &tx->time;

	/* fill in tx->time before handing tx over: ntp_adjtimex uses it */
	do_gettimeofday(stamp);

	return ntp_adjtimex(tx);
}


/* timeofday_interrupt_hook:
 *		Periodic accumulation step, run under the system_time_lock
 *		write lock:
 *		  - measures the timesource cycles elapsed since offset_base,
 *		  - converts them to ns and adds the NTP-scaled amount to
 *		    system_time,
 *		  - moves offset_base forward to "now" minus the cycles that
 *		    cyc2ns reported as remainder,
 *		  - advances the NTP state machine by the unscaled ns,
 *		  - installs next_timesource if one is pending.
 *		likely called by timer_interrupt()
 */
void timeofday_interrupt_hook(void)
{
	cycle_t now, delta, remainder;
	nsec_t ns, ntp_ns;

	write_seqlock(&system_time_lock);

	/* read time source */
	now = timesource->read();

	/* calculate cycle delta */
	delta = timesource->delta(now, offset_base);

	/* convert cycles to ns  and save remainder */
	/* NOTE(review): remainder is only defined if the timesource's cyc2ns
	 * stores through a non-NULL remainder pointer - confirm for every
	 * timesource, and confirm it is expressed in cycles. */
	ns = timesource->cyc2ns(delta, &remainder);

	/* apply NTP scaling factor for this tick */
	ntp_ns = ntp_scale(ns);

#if TIME_DBG /* XXX - remove later*/
{
	/* debug trace: printed on the first call and then once every
	 * TIME_DBG_FREQ calls; offset_base is still the old value here */
	static int dbg=0;
	if(!(dbg++%TIME_DBG_FREQ)){
		printk("now: %lluc - then: %lluc = delta: %lluc -> %llu ns + %llu cyc (ntp: %lluc)\n",
			now, offset_base, delta, ns, remainder, ntp_ns);
	}
}
#endif
	/* update system_time */
	system_time += ntp_ns;

	/* reset the offset_base */
	offset_base = now;

	/* subtract remainder to account for rounded off cycles */
	/* (the timesource's delta() is reused as a wrap-aware subtraction, so
	 * the leftover cycles are counted again in the next interval) */
	offset_base = timesource->delta(offset_base,remainder);

	/* advance the ntp state machine by ns*/
	/* note: this is the unscaled ns, not ntp_ns */
	ntp_advance(ns);

	/* if necessary, switch timesources */
	if (next_timesource) {
		/* immediately set new offset_base */
		/* (cycle values of different timesources are not comparable, so
		 * the remainder adjustment above is discarded on a switch) */
		offset_base = next_timesource->read();
		/* swap timesources */
		timesource = next_timesource;
		next_timesource = 0;

		printk(KERN_INFO "Time: %s timesource has been installed\n",
					timesource->name);
	}

	write_sequnlock(&system_time_lock);
}

/* register_timesource():
 *		Offers timesource t for installation. t becomes the pending
 *		next_timesource only if its priority is strictly higher than
 *		that of the best timesource known so far: the pending
 *		next_timesource if one is queued, otherwise the active
 *		timesource. The actual switch happens in
 *		timeofday_interrupt_hook().
 *
 *		A candidate that loses to the pending timesource must not
 *		replace it merely because it beats the active one; that would
 *		downgrade the pending choice.
 *
 *		NOTE(review): timeofday_interrupt_hook() takes the same lock and
 *		is described as likely called from timer_interrupt(); confirm
 *		whether this path needs an irq-disabling lock variant like the
 *		one do_settimeofday() uses.
 */
void register_timesource(struct timesource_t* t)
{
	struct timesource_t *best;

	write_seqlock(&system_time_lock);

	/* XXX - check override */

	/* compare against the pending timesource if any, else the active one */
	best = next_timesource ? next_timesource : timesource;
	if (t->priority > best->priority)
		next_timesource = t;

	write_sequnlock(&system_time_lock);
}


/* timeofday_init():
 *		Initializes time variables
 */
void timeofday_init(void)
{
	write_seqlock(&system_time_lock);

	/* clear and initialize offsets*/
	offset_base = timesource->read();
	wall_time_offset = ((u64)get_cmos_time()) * NSEC_PER_SEC;

	/* clear NTP scaling factor*/
	ntp_clear();

	write_sequnlock(&system_time_lock);

	return;
}


=[include/linux/timeofday.h]======================================
/*	linux/include/linux/timeofday.h
 *
 *	Copyright (C) 2003 IBM
 *
 *	This file contains the interface to the time of day subsystem
 */
#ifndef _LINUX_TIMEOFDAY_H
#define _LINUX_TIMEOFDAY_H
#include <linux/types.h>
#include <linux/time.h>

nsec_t get_lowres_timestamp(void);
nsec_t get_lowres_timeofday(void);
nsec_t do_monotonic_clock(void);


void do_gettimeofday(struct timeval *tv);
int do_settimeofday(struct timespec *tv);
int do_adjtimex(struct timex *tx);

void timeofday_interrupt_hook(void);
void timeofday_init(void);


/* Helper functions */

/* Conversion factor between nanoseconds and microseconds (1000 ns per us).
 * Deliberately has no trailing ';' so it can be used inside expressions. */
#define USEC_PER_NSEC 1000

/* ns2timeval(): converts a nanosecond count into a struct timeval. */
static inline struct timeval ns2timeval(nsec_t ns)
{
	struct timeval result;

	/* whole seconds go to tv_sec; the leftover nanoseconds land in tv_usec */
	result.tv_sec = div_long_long_rem(ns, NSEC_PER_SEC, &result.tv_usec);

	/* leftover nanoseconds -> microseconds (truncating) */
	result.tv_usec /= USEC_PER_NSEC;

	return result;
}

/* ns2timespec(): converts a nanosecond count into a struct timespec. */
static inline struct timespec ns2timespec(nsec_t ns)
{
	struct timespec result;

	/* quotient is whole seconds, remainder is the nanosecond part */
	result.tv_sec = div_long_long_rem(ns, NSEC_PER_SEC, &result.tv_nsec);

	return result;
}

/* timespec2ns(): converts a struct timespec into a nanosecond count.
 * Returns nsec_t like the other conversion helpers in this header
 * (the value was already computed in an nsec_t).
 */
static inline nsec_t timespec2ns(struct timespec* ts)
{
	nsec_t ret;

	/* widen the seconds before scaling so the product is done in nsec_t */
	ret = ((nsec_t)ts->tv_sec) * NSEC_PER_SEC;
	ret += ts->tv_nsec;
	return ret;
}

/* timeval2ns(): converts a struct timeval into a nanosecond count. */
static inline nsec_t timeval2ns(struct timeval* tv)
{
	/* seconds part, widened before scaling */
	nsec_t total = ((nsec_t)tv->tv_sec) * NSEC_PER_SEC;

	/* microseconds part, scaled up to nanoseconds */
	total += tv->tv_usec*USEC_PER_NSEC;

	return total;
}

#endif

=[include/linux/timesource.h]======================================
/*	linux/include/linux/timesource.h
 *
 *	Copyright (C) 2003 IBM
 *
 *	This file contains the structure definitions for timesources. 
 *
 *	If you are not a timesource, or the time of day code, you should
 *	not be including this file.
 */
#ifndef _LINUX_TIMESOURCE_H
#define _LINUX_TIMESOURCE_H

#include <linux/types.h>
#include <linux/time.h>

/* struct timesource_t:
 *		Provides mostly state-free accessors to the underlying
 *		hardware.
 * @name:	ptr to timesource name (printed when the timesource is
 *		installed)
 * @priority:	priority value (higher is better); register_timesource()
 *		only queues a timesource whose priority is strictly higher
 * @read:	returns a cycle value
 * @delta:	calculates the difference between two cycle values; the
 *		time of day code also uses it to subtract a cycle remainder
 *		from a cycle value
 * @cyc2ns:	converts a cycle value to ns (expected to be expensive).
 *		If remainder is non-NULL it receives the cycles left over
 *		from the conversion; callers that do not need it pass 0.
 */
struct timesource_t {
	char* name;
	int priority;
	cycle_t (*read)(void);
	cycle_t (*delta)(cycle_t now, cycle_t then);
	nsec_t (*cyc2ns)(cycle_t cycles, cycle_t* remainder);
};

/* Offers a timesource to the time of day code for installation. */
void register_timesource(struct timesource_t*);

#endif

=[drivers/timesource/cyclone.c]==================================
#include <linux/timesource.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/timex.h>
#include <linux/init.h>

#include <asm/io.h>
#include <asm/pgtable.h>
#include <asm/fixmap.h>
#include "mach_timer.h"

/* Cyclone chipset constants.
 * CYCLONE_CBAR_ADDR is the physical address the init code maps to locate the
 * CBAR register; the *_OFFSET values are presumably register offsets from the
 * base address read out of CBAR - TODO confirm against the init code.
 */
#define CYCLONE_CBAR_ADDR 0xFEB00CD0
#define CYCLONE_PMCC_OFFSET 0x51A0
#define CYCLONE_MPMC_OFFSET 0x51D0
#define CYCLONE_MPCS_OFFSET 0x51A8
#define CYCLONE_TIMER_FREQ 100000000
/* the counter is treated as 40 bits wide: cyclone_delta() masks with this */
#define CYCLONE_TIMER_MASK (((u64)1<<40)-1) /* 40 bit mask */

/* counter frequency in kHz, filled in by calibrate_cyclone() */
unsigned long cyclone_freq_khz;

/* non-zero when the cyclone timesource should be initialized */
int use_cyclone = 0;

/* Cyclone MPMC0 register.
 * The volatile qualifier belongs on the pointed-to hardware register, not on
 * the pointer variable: every cyclone_timer[n] access must be a real load. */
static volatile u32 *cyclone_timer;

/* helper macro to atomically read both cyclone counter registers:
 * reads the high word, then the low word, and retries if the high word
 * changed in between (i.e. the low word wrapped during the read).
 * No trailing ';' after the while so the macro behaves like a single
 * statement, including in front of an else.
 */
#define read_cyclone_counter(low,high) \
	do { \
		(high) = cyclone_timer[1]; \
		(low) = cyclone_timer[0]; \
	} while ((high) != cyclone_timer[1])


static cycle_t cyclone_read(void)
{
	u32 low, high;
	u64 ret;

	read_cyclone_counter(low,high);
	ret = ((u64)high << 32)|low;

	return (cycle_t)ret;
}

/*
 * cyclone_delta - timesource .delta hook.
 * @now:	later counter sample
 * @then:	earlier counter sample
 *
 * Returns now - then reduced with CYCLONE_TIMER_MASK (40 bits).
 */
static cycle_t cyclone_delta(cycle_t now, cycle_t then)
{
	cycle_t elapsed = now - then;

	return elapsed & CYCLONE_TIMER_MASK;
}

/*
 * cyclone_cyc2ns - timesource .cyc2ns hook.
 * @cyc:	cycle count to convert
 * @remainder:	optional; when non-NULL receives what do_div() left over
 *
 * Computes cyc * 1000000 / cyclone_freq_khz and returns the quotient.
 */
static nsec_t cyclone_cyc2ns(cycle_t cyc, cycle_t* remainder)
{
	u64 leftover;

	cyc *= 1000000;
	/* do_div() leaves the quotient in cyc and hands back the remainder */
	leftover = do_div(cyc, cyclone_freq_khz);
	if (!remainder)
		return (nsec_t)cyc;

	*remainder = leftover;
	return (nsec_t)cyc;
}

/*
 * Timesource descriptor for the Cyclone counter: plugs the three
 * accessors defined in this file into struct timesource_t.  It is passed
 * to register_timesource() by init_cyclone_timesource() after the
 * counter has been mapped and calibrated.
 */
struct timesource_t timesource_cyclone = {
	.name = "cyclone",
	.priority = 100,
	.read = cyclone_read,
	.delta = cyclone_delta,
	.cyc2ns = cyclone_cyc2ns,
};


static void calibrate_cyclone(void)
{
	u32 startlow, starthigh, endlow, endhigh, delta32;
	u64 start, end, delta64;
	unsigned long i, count;
	/* repeat 3 times to make sure the cache is warm */
	for(i=0; i < 3; i++) {
		mach_prepare_counter();
		read_cyclone_counter(startlow,starthigh);
		mach_countup(&count);
		read_cyclone_counter(endlow,endhigh);
	}
	start = (u64)starthigh<<32|startlow;
	end = (u64)endhigh<<32|endlow;

	delta64 = end - start;
	printk("cyclone delta: %llu\n", delta64);
	delta64 *= (ACTHZ/1000)>>8;
	printk("delta*hz = %llu\n", delta64);
	delta32 = (u32)delta64;
	cyclone_freq_khz = delta32/CALIBRATE_ITERATION;
	printk("calculated cyclone_freq: %lu khz\n", cyclone_freq_khz);
}

/*
 * Map the page containing physical address phys through the
 * FIX_CYCLONE_TIMER fixmap slot (using set_fixmap_nocache()) and return
 * a pointer to phys itself inside that mapping.  All callers share the
 * one fixmap slot, so each call replaces the mapping set up by the
 * previous one.
 */
static u32* cyclone_map_reg(u32 phys)
{
	u32 page = phys&PAGE_MASK;
	u32 off = phys&(~PAGE_MASK);

	set_fixmap_nocache(FIX_CYCLONE_TIMER, page);
	return (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + off);
}

/*
 * init_cyclone_timesource - find, start, calibrate and register the
 * Cyclone counter.
 *
 * Steps: read the base address out of CBAR, write 1 to the PMCC and MPCS
 * registers, leave the fixmap pointing at MPMC (cyclone_timer), check
 * that the low counter word changes, then calibrate and register
 * timesource_cyclone.
 *
 * Returns 0 on success and -ENODEV if use_cyclone is clear, a register
 * lookup yields nothing, CBAR reads as 0, or the counter does not move.
 */
static int init_cyclone_timesource(void)
{
	u32* reg;
	u32 base;		/* saved cyclone base address */
	int i;

	/*make sure we're on a summit box*/
	if(!use_cyclone)
		return -ENODEV;

	printk(KERN_INFO "Summit chipset: Starting Cyclone Counter.\n");

	/* find base address */
	reg = cyclone_map_reg(CYCLONE_CBAR_ADDR);
	if(!reg){
		printk(KERN_ERR "Summit chipset: Could not find valid CBAR register.\n");
		return -ENODEV;
	}
	base = *reg;
	if(!base){
		printk(KERN_ERR "Summit chipset: Could not find valid CBAR value.\n");
		return -ENODEV;
	}

	/* setup PMCC */
	reg = cyclone_map_reg(base + CYCLONE_PMCC_OFFSET);
	if(!reg){
		printk(KERN_ERR "Summit chipset: Could not find valid PMCC register.\n");
		return -ENODEV;
	}
	reg[0] = 0x00000001;

	/* setup MPCS */
	reg = cyclone_map_reg(base + CYCLONE_MPCS_OFFSET);
	if(!reg){
		printk(KERN_ERR "Summit chipset: Could not find valid MPCS register.\n");
		return -ENODEV;
	}
	reg[0] = 0x00000001;

	/* map in cyclone_timer */
	cyclone_timer = cyclone_map_reg(base + CYCLONE_MPMC_OFFSET);
	if(!cyclone_timer){
		printk(KERN_ERR "Summit chipset: Could not find valid MPMC register.\n");
		return -ENODEV;
	}

	/*quick test to make sure its ticking*/
	for(i=0; i<3; i++){
		u32 before = cyclone_timer[0];
		int spin;

		/* short busy-wait: 100 compiler barriers */
		for(spin=100; spin>0; spin--)
			barrier();

		if(cyclone_timer[0] != before)
			continue;

		printk(KERN_ERR "Summit chipset: Counter not counting! DISABLED\n");
		cyclone_timer = 0;
		return -ENODEV;
	}

	calibrate_cyclone();
	register_timesource(&timesource_cyclone);

	return 0;
}

module_init(init_cyclone_timesource);




^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [RFC] New timeofday implementation proposal
  2004-08-17 20:13             ` [RFC] New timeofday implementation proposal john stultz
  2004-08-17 20:58               ` [RFC] New timeofday code john stultz
@ 2004-09-01 23:16               ` Christoph Lameter
  1 sibling, 0 replies; 57+ messages in thread
From: Christoph Lameter @ 2004-09-01 23:16 UTC (permalink / raw)
  To: john stultz
  Cc: Tim Schmielau, george anzinger, Andrew Morton, OGAWA Hirofumi,
	albert, lkml, voland, nicolas.george, kaukasoi, david+powerix

On Tue, 17 Aug 2004, john stultz wrote:

> o Consolidates a large amount of code:
> 	Allows for shared times source implementations, such as: i386, x86-64
> and ia64 all use HPET, i386 and x86-64 both have ACPI PM timers, and
> i386 and ia64 both have cyclone counters. Time sources are just drivers!
> Also work for user space gettimeofday implementations will be able to be
> shared across all arches (assuming the hardware time source can be
> safely accessed from user space).

What about a hardware time source that can be safely accessed with a fast
system call (f.e. via epc on IA64)? My tests indicate that such an
implementation is comparable to a user space memory mapped solution. The
user space memory mapping might generate complexities. Especially since
the page mapped for a memory mapped timer may allow access to hardware
information that should not be exposed to user space.

The time interpolator patches that I posted a while back provide a
generic interface to timer registers / values which may be useful to what
you are trying to accomplish. The C code for that patch is platform
independent but there is also an asm fast path that is specific to IA64.
Other arches could develop similar fastpaths.

> o Uses nanoseconds as the kernel's base time unit.
> 	Rather then doing ugly manipulations to timevals or timespecs, this
> simplifies math, and gives us plenty of room to grow (64bits of
> nanoseconds ~= 584 years).

The nanoseconds patch that was accepted into 2.6.9-rc1 does do that
partially by providing a getnstimeofday and centralizing the instances
where microseconds are multiplied by 1000 to get nanoseconds.

> o Have to convert back to time_val for syscall interface

This is mostly covered by gettimeofday()

> o ntp_scale(ns):  scales ns by NTP scaling factor
> 	- costly, but correct.

Maybe we would need 128-bit arithmetic to increase the accuracy of the
scaling?

> o Some arches (arm, for example) do not have high res  timing hardware
> 	- In this case we can have a "jiffies" timesource
> 		- cyc2ns(x) =  x*(NSEC_PER_SEC/HZ)
> 		- doesn't work for tickless systems

Most arches have already high res time sources. I think we just need to
make proper use of them.

> o suspend/resume
> 	- need to pause and restart the timesource reads
> 	- we don't want a gigantic or negative offset!

Some intelligent timer needs to survive the suspend. Timers may need an
attribute to show if they continue counting through suspend/resume etc
(various power conditions etc....)

Hope this helps ....

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: boot time, process start time, and NOW time
  2004-08-16 19:41   ` Andrew Morton
  2004-08-16 21:49     ` john stultz
  2004-08-16 23:08     ` Tim Schmielau
@ 2004-08-16 23:24     ` Albert Cahalan
  2004-08-17 19:00       ` john stultz
  2004-08-17 20:25     ` [PATCH] " Tim Schmielau
  3 siblings, 1 reply; 57+ messages in thread
From: Albert Cahalan @ 2004-08-16 23:24 UTC (permalink / raw)
  To: Andrew Morton OSDL
  Cc: OGAWA Hirofumi, albert, linux-kernel mailing list, voland,
	nicolas.george, kaukasoi, tim, george, johnstul, david+powerix

On Mon, 2004-08-16 at 15:41, Andrew Morton wrote:

> Where did this all end up?  Complaints about
> wandering start times are persistent, and it'd
> be nice to get some fix in place...

If you're interested in reducing (not solving)
the problem for the 2.6.x series, you might change
HZ to something that works better with the PIT.

Here is a table showing % error for various HZ choices:

wrongness_%   HZ_diff   PIT_#   HZ     actual_HZ   
-0.00150855  -0.001509  11932   100    99.998491  
-0.00150855  -0.009474   1900   628   627.990526  
-0.00083809  -0.003051   3278   364   363.996949  
-0.00083809  -0.008389   1192  1001  1000.991611  
+0.00000000  +0.000000  14551    82    82.000000  
+0.00008381  +0.000304   3287   363   363.000304  
+0.00008381  +0.000435   2299   519   519.000435  
+0.00008381  +0.000525   1903   627   627.000525  
+0.01525566  +0.152557   1193  1000  1000.152557  
+0.01860917  +0.190558   1165  1024  1024.190558

As you can see, 1000 HZ and 1024 HZ are really bad.
They're worse than typical quartz crystal variation.

The old 100 HZ tick was just barely tolerable.
While 82 is perfect, it's a bit low. :-(

Some of the other choices are nice. How about 363,
519, or 627?

For the AMD Elan: 300, 400, 600, 991, 1200
(the AMD Elan PIT runs at 1189200 instead of 1193182)



^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: boot time, process start time, and NOW time
  2004-08-16 23:24     ` boot time, process start time, and NOW time Albert Cahalan
@ 2004-08-17 19:00       ` john stultz
  2004-08-17 17:41         ` Albert Cahalan
  0 siblings, 1 reply; 57+ messages in thread
From: john stultz @ 2004-08-17 19:00 UTC (permalink / raw)
  To: Albert Cahalan
  Cc: Andrew Morton OSDL, OGAWA Hirofumi, linux-kernel mailing list,
	voland, nicolas.george, kaukasoi, tim, george anzinger,
	david+powerix

On Mon, 2004-08-16 at 16:24, Albert Cahalan wrote:
> On Mon, 2004-08-16 at 15:41, Andrew Morton wrote:
> 
> > Where did this all end up?  Complaints about
> > wandering start times are persistent, and it'd
> > be nice to get some fix in place...
> 
> If you're interested in reducing (not solving)
> the problem for the 2.6.x series, you might change
> HZ to something that works better with the PIT.
> 
> Here is a table showing % error for various HZ choices:
> 
> wrongness_%   HZ_diff   PIT_#   HZ     actual_HZ   
> -0.00150855  -0.001509  11932   100    99.998491  
> -0.00150855  -0.009474   1900   628   627.990526  
> -0.00083809  -0.003051   3278   364   363.996949  
> -0.00083809  -0.008389   1192  1001  1000.991611  
> +0.00000000  +0.000000  14551    82    82.000000  
> +0.00008381  +0.000304   3287   363   363.000304  
> +0.00008381  +0.000435   2299   519   519.000435  
> +0.00008381  +0.000525   1903   627   627.000525  
> +0.01525566  +0.152557   1193  1000  1000.152557  
> +0.01860917  +0.190558   1165  1024  1024.190558
> 
> As you can see, 1000 HZ and 1024 HZ are really bad.
> They're worse than typical quartz crystal variation.
> 
> The old 100 HZ tick was just barely tolerable.
> While 82 is perfect, it's a bit low. :-(
> 
> Some of the other choices are nice. How about 363,
> 519, or 627?

What about 1001? That looks reasonably accurate.

thanks
-john



^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: boot time, process start time, and NOW time
  2004-08-17 19:00       ` john stultz
@ 2004-08-17 17:41         ` Albert Cahalan
  2004-08-17 20:58           ` john stultz
  0 siblings, 1 reply; 57+ messages in thread
From: Albert Cahalan @ 2004-08-17 17:41 UTC (permalink / raw)
  To: john stultz
  Cc: Albert Cahalan, Andrew Morton OSDL, OGAWA Hirofumi,
	linux-kernel mailing list, voland, nicolas.george, kaukasoi, tim,
	george anzinger, david+powerix

On Tue, 2004-08-17 at 15:00, john stultz wrote:
> On Mon, 2004-08-16 at 16:24, Albert Cahalan wrote:
> > On Mon, 2004-08-16 at 15:41, Andrew Morton wrote:
> > 
> > > Where did this all end up?  Complaints about
> > > wandering start times are persistent, and it'd
> > > be nice to get some fix in place...
> > 
> > If you're interested in reducing (not solving)
> > the problem for the 2.6.x series, you might change
> > HZ to something that works better with the PIT.
> > 
> > Here is a table showing % error for various HZ choices:
> > 
> > wrongness_%   HZ_diff   PIT_#   HZ     actual_HZ   
> > -0.00150855  -0.001509  11932   100    99.998491  
> > -0.00150855  -0.009474   1900   628   627.990526  
> > -0.00083809  -0.003051   3278   364   363.996949  
> > -0.00083809  -0.008389   1192  1001  1000.991611  
> > +0.00000000  +0.000000  14551    82    82.000000  
> > +0.00008381  +0.000304   3287   363   363.000304  
> > +0.00008381  +0.000435   2299   519   519.000435  
> > +0.00008381  +0.000525   1903   627   627.000525  
> > +0.01525566  +0.152557   1193  1000  1000.152557  
> > +0.01860917  +0.190558   1165  1024  1024.190558
> > 
> > As you can see, 1000 HZ and 1024 HZ are really bad.
> > They're worse than typical quartz crystal variation.
> > 
> > The old 100 HZ tick was just barely tolerable.
> > While 82 is perfect, it's a bit low. :-(
> > 
> > Some of the other choices are nice. How about 363,
> > 519, or 627?
> 
> What about 1001? That looks reasonably accurate.

Sure. (it's 10x worse, but the crystals aren't good
enough to tell the difference) Supposing that a
choice near 1000 HZ is good, here are some more:

wrongness_%   HZ_diff   PIT_#   HZ     actual_HZ   
-0.00217900  -0.021703   1198   996   995.978297
-0.00083809  -0.008389   1192  1001  1000.991611
-0.00050285  -0.006376    941  1268  1267.993624
+0.00050286  +0.005396   1112  1073  1073.005396
+0.00150859  +0.014950   1204   991   991.014950

I think it's better to drop down a bit, because people
have also been suffering problems with lost ticks.
The BIOS can grab the CPU for too long.

We need to deal well with a few different frequencies:

100    the old clock tick
59.94  NTSC field rate
50     PAL field rate

The theory is that you need a frequency of just over 2x
the one you'd like, but in practice you need about 4x.
So that's why I suggested 363, 519, and 627.

I'd really rather just run everything off the RTC or HPET,
with an arbitrary rate interrupt source, and just call into
the regular jiffies handling code as needed to catch up.
This would allow steering the jiffies tick to an exact
integer HZ. High-precision timers could be fired off of
the RTC or HPET interrupt if that is running faster.



^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: boot time, process start time, and NOW time
  2004-08-17 17:41         ` Albert Cahalan
@ 2004-08-17 20:58           ` john stultz
  0 siblings, 0 replies; 57+ messages in thread
From: john stultz @ 2004-08-17 20:58 UTC (permalink / raw)
  To: Albert Cahalan
  Cc: Andrew Morton OSDL, OGAWA Hirofumi, linux-kernel mailing list,
	voland, nicolas.george, kaukasoi, tim, george anzinger,
	david+powerix

On Tue, 2004-08-17 at 10:41, Albert Cahalan wrote:
> On Tue, 2004-08-17 at 15:00, john stultz wrote:
> > What about 1001? That looks reasonably accurate.
> 
> Sure. (it's 10x worse, but the crystals aren't good
> enough to tell the difference) Supposing that a
> choice near 1000 HZ is good, here are some more:
> 
> wrongness_%   HZ_diff   PIT_#   HZ     actual_HZ   
> -0.00217900  -0.021703   1198   996   995.978297
> -0.00083809  -0.008389   1192  1001  1000.991611
> -0.00050285  -0.006376    941  1268  1267.993624
> +0.00050286  +0.005396   1112  1073  1073.005396
> +0.00150859  +0.014950   1204   991   991.014950
> 
> I think it's better to drop down a bit, because people
> have also been suffering problems with lost ticks.
> The BIOS can grab the CPU for too long.

Well, the move to HZ=1000 from HZ=100 was wanted to improve latency
requirements, so I don't know if folks would go for something like
HZ=519. The lost tick issue is a problem, but the real solution there is
to move the time subsystem away from depending on timer interrupts to
keep accurate time. See the proposal I just sent out for more details.
That way timer interrupts just become scheduler preemption points and
lost ticks become just an issue for folks with latency requirements. 

> I'd really rather just run everything off the RTC or HPET,
> with an arbitrary rate interrupt source, and just call into
> the regular jiffies handling code as needed to catch up.
> This would allow steering the jiffies tick to an exact
> integer HZ. High-precision timers could be fired off of
> the RTC or HPET interrupt if that is running faster.

Indeed, having alternate or multiple timer interrupt sources would be
nice. Hopefully once the time of day subsystem is untangled from the
timer subsystem, using alternate interrupt sources will be much easier
(and cleaner!). 

thanks
-john

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [PATCH] Re: boot time, process start time, and NOW time
  2004-08-16 19:41   ` Andrew Morton
                       ` (2 preceding siblings ...)
  2004-08-16 23:24     ` boot time, process start time, and NOW time Albert Cahalan
@ 2004-08-17 20:25     ` Tim Schmielau
  2004-08-17 22:24       ` George Anzinger
  3 siblings, 1 reply; 57+ messages in thread
From: Tim Schmielau @ 2004-08-17 20:25 UTC (permalink / raw)
  To: Andrew Morton
  Cc: OGAWA Hirofumi, albert, linux-kernel, voland, nicolas.george,
	kaukasoi, george, johnstul, david+powerix

On Mon, 16 Aug 2004, Andrew Morton wrote:

> OGAWA Hirofumi <hirofumi@mail.parknet.co.jp> wrote:
> >
> > Albert Cahalan <albert@users.sf.net> writes:
> > 
> > > Even with the 2.6.7 kernel, I'm still getting reports of process
> > > start times wandering. Here is an example:
> > > 
> > >    "About 12 hours since reboot to 2.6.7 there was already a
> > >    difference of about 7 seconds between the real start time
> > >    and the start time reported by ps. Now, 24 hours since reboot
> > >    the difference is 10 seconds."
> > > 
> > > The calculation used is:
> > > 
> > >    now - uptime + time_from_boot_to_process_start
> > 
> > Start-time and uptime is using different source. Looks like the
> > jiffies was added bogus lost counts.
> > 
> > quick hack. Does this change the behavior?
> 
> Where did this all end up?  Complaints about wandering start times are
> persistent, and it'd be nice to get some fix in place...
> 
> Thanks.
> 

Seems my analysis of the problem wasn't perceived as such.

The problem is that in the above calculation 

  now - uptime + time_from_boot_to_process_start

"uptime" currently is an ntp-corrected precise time, while 
"time_from_boot_to_process_start" just is the free-running "jiffies"
value.

The problem is easily reproducible for me. It goes away if the change
that rebased /proc/uptime on posix monotonic time and my followup patch to 
fix the resulting rounding issues in jiffies64_to_clock_t() are backed out 
with the following patch.

Tim



--- linux-2.6.8.1/fs/proc/proc_misc.c	2004-08-17 21:38:54.000000000 +0200
+++ linux-2.6.8.1-uf/fs/proc/proc_misc.c	2004-08-17 21:41:53.000000000 +0200
@@ -133,19 +133,36 @@ static struct vmalloc_info get_vmalloc_i
 static int uptime_read_proc(char *page, char **start, off_t off,
 				 int count, int *eof, void *data)
 {
-	struct timespec uptime;
-	struct timespec idle;
+	u64 uptime;
+	unsigned long uptime_remainder;
 	int len;
-	u64 idle_jiffies = init_task.utime + init_task.stime;
 
-	do_posix_clock_monotonic_gettime(&uptime);
-	jiffies_to_timespec(idle_jiffies, &idle);
-	len = sprintf(page,"%lu.%02lu %lu.%02lu\n",
-			(unsigned long) uptime.tv_sec,
-			(uptime.tv_nsec / (NSEC_PER_SEC / 100)),
-			(unsigned long) idle.tv_sec,
-			(idle.tv_nsec / (NSEC_PER_SEC / 100)));
+	uptime = get_jiffies_64() - INITIAL_JIFFIES;
+	uptime_remainder = (unsigned long) do_div(uptime, HZ);
 
+#if HZ!=100
+	{
+		u64 idle = init_task.utime + init_task.stime;
+		unsigned long idle_remainder;
+
+		idle_remainder = (unsigned long) do_div(idle, HZ);
+		len = sprintf(page,"%lu.%02lu %lu.%02lu\n",
+			(unsigned long) uptime,
+			(uptime_remainder * 100) / HZ,
+			(unsigned long) idle,
+			(idle_remainder * 100) / HZ);
+	}
+#else
+	{
+		unsigned long idle = init_task.utime + init_task.stime;
+
+		len = sprintf(page,"%lu.%02lu %lu.%02lu\n",
+			(unsigned long) uptime,
+			uptime_remainder,
+			idle / HZ,
+			idle % HZ);
+	}
+#endif
 	return proc_calc_metrics(page, start, off, count, eof, len);
 }
 

--- linux-2.6.8.1/include/linux/times.h	2004-08-17 00:13:35.000000000 +0200
+++ linux-2.6.8.1-uf/include/linux/times.h	2004-08-17 21:44:26.000000000 +0200
@@ -7,16 +7,11 @@
 #include <asm/types.h>
 #include <asm/param.h>
 
-static inline clock_t jiffies_to_clock_t(long x)
-{
-#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
-	return x / (HZ / USER_HZ);
+#if (HZ % USER_HZ)==0
+# define jiffies_to_clock_t(x) ((x) / (HZ / USER_HZ))
 #else
-	u64 tmp = (u64)x * TICK_NSEC;
-	do_div(tmp, (NSEC_PER_SEC / USER_HZ));
-	return (long)tmp;
+# define jiffies_to_clock_t(x) ((clock_t) jiffies_64_to_clock_t((u64) x))
 #endif
-}
 
 static inline unsigned long clock_t_to_jiffies(unsigned long x)
 {
@@ -40,7 +35,7 @@ static inline unsigned long clock_t_to_j
 
 static inline u64 jiffies_64_to_clock_t(u64 x)
 {
-#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
+#if (HZ % USER_HZ)==0
 	do_div(x, HZ / USER_HZ);
 #else
 	/*
@@ -48,8 +43,8 @@ static inline u64 jiffies_64_to_clock_t(
 	 * but even this doesn't overflow in hundreds of years
 	 * in 64 bits, so..
 	 */
-	x *= TICK_NSEC;
-	do_div(x, (NSEC_PER_SEC / USER_HZ));
+	x *= USER_HZ;
+	do_div(x, HZ);
 #endif
 	return x;
 }

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH] Re: boot time, process start time, and NOW time
  2004-08-17 20:25     ` [PATCH] " Tim Schmielau
@ 2004-08-17 22:24       ` George Anzinger
  2004-08-17 22:37         ` john stultz
  0 siblings, 1 reply; 57+ messages in thread
From: George Anzinger @ 2004-08-17 22:24 UTC (permalink / raw)
  To: Tim Schmielau
  Cc: Andrew Morton, OGAWA Hirofumi, albert, linux-kernel, voland,
	nicolas.george, kaukasoi, johnstul, david+powerix

Tim Schmielau wrote:
> On Mon, 16 Aug 2004, Andrew Morton wrote:
> 
> 
>>OGAWA Hirofumi <hirofumi@mail.parknet.co.jp> wrote:
>>
>>>Albert Cahalan <albert@users.sf.net> writes:
>>>
>>>
>>>>Even with the 2.6.7 kernel, I'm still getting reports of process
>>>>start times wandering. Here is an example:
>>>>
>>>>   "About 12 hours since reboot to 2.6.7 there was already a
>>>>   difference of about 7 seconds between the real start time
>>>>   and the start time reported by ps. Now, 24 hours since reboot
>>>>   the difference is 10 seconds."
>>>>
>>>>The calculation used is:
>>>>
>>>>   now - uptime + time_from_boot_to_process_start
>>>
>>>Start-time and uptime is using different source. Looks like the
>>>jiffies was added bogus lost counts.
>>>
>>>quick hack. Does this change the behavior?
>>
>>Where did this all end up?  Complaints about wandering start times are
>>persistent, and it'd be nice to get some fix in place...
>>
>>Thanks.
>>
> 
> 
> Seems my analysis of the problem wasn't perceived as such.
> 
> The problem is that in the above calculation 
> 
>   now - uptime + time_from_boot_to_process_start
> 
> "uptime" currently is an ntp-corrected precise time, while 
> "time_from_boot_to_process_start" just is the free-running "jiffies"
> value.

I see you think you have the solution, but I guess I am just dense here.  Maybe
you could help me to see the error of my ways.  Here is my thinking:

"now" is from gettimeofday() and as such is ntp corrected.
"uptime" is also corrected.  In fact it is "now" + "wall_to_monotonic".  And 
"wall_to_monotonic" is _only_ changed by do_settime() when the clock is set.
"time_from_boot_to_process_start" is the same as "start_time" restated in 
seconds, i.e. it is a constant.  So, either one or more of the above assumptions
is wrong, or  somebody is twiddling the clock.  Otherwise I don't see how the 
start time can move at all.
> 
> The problem is easily reproducible for me. It goes away if the change
> that rebased /proc/uptime on posix monotonic time and my followup patch to 
> fix the resulting rounding issues in jiffies64_to_clock_t() are backed out 
> with the following patch.
> 

-- 
George Anzinger   george@mvista.com
High-res-timers:  http://sourceforge.net/projects/high-res-timers/
Preemption patch: http://www.kernel.org/pub/linux/kernel/people/rml


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH] Re: boot time, process start time, and NOW time
  2004-08-17 22:24       ` George Anzinger
@ 2004-08-17 22:37         ` john stultz
  2004-08-17 23:07           ` Tim Schmielau
  0 siblings, 1 reply; 57+ messages in thread
From: john stultz @ 2004-08-17 22:37 UTC (permalink / raw)
  To: george anzinger
  Cc: Tim Schmielau, Andrew Morton, OGAWA Hirofumi, albert, lkml,
	voland, nicolas.george, kaukasoi, david+powerix

On Tue, 2004-08-17 at 15:24, George Anzinger wrote:
> Tim Schmielau wrote:
> > On Mon, 16 Aug 2004, Andrew Morton wrote:
> > 
> > 
> >>OGAWA Hirofumi <hirofumi@mail.parknet.co.jp> wrote:
> >>
> >>>Albert Cahalan <albert@users.sf.net> writes:
> >>>
> >>>
> >>>>Even with the 2.6.7 kernel, I'm still getting reports of process
> >>>>start times wandering. Here is an example:
> >>>>
> >>>>   "About 12 hours since reboot to 2.6.7 there was already a
> >>>>   difference of about 7 seconds between the real start time
> >>>>   and the start time reported by ps. Now, 24 hours since reboot
> >>>>   the difference is 10 seconds."
> >>>>
> >>>>The calculation used is:
> >>>>
> >>>>   now - uptime + time_from_boot_to_process_start
> >>>
> >>>Start-time and uptime is using different source. Looks like the
> >>>jiffies was added bogus lost counts.
> >>>
> >>>quick hack. Does this change the behavior?
> >>
> >>Where did this all end up?  Complaints about wandering start times are
> >>persistent, and it'd be nice to get some fix in place...
> >>
> >>Thanks.
> >>
> > 
> > 
> > Seems my analysis of the problem wasn't perceived as such.
> > 
> > The problem is that in the above calculation 
> > 
> >   now - uptime + time_from_boot_to_process_start
> > 
> > "uptime" currently is an ntp-corrected precise time, while 
> > "time_from_boot_to_process_start" just is the free-running "jiffies"
> > value.
> 
> I see you think you have the solution, but I guess I am just dense here.  May be 
> you could help me to see the error of my ways.  Here is my thinking:
> 
> "now" is from gettimeofday() and as such is ntp corrected.
> "uptime" is also corrected.  In fact it is "now" + "wall_to_monotonic".  And 
> "wall_to_monotonic" is _only_ changed by do_settime() when the clock is set.
> "time_from_boot_to_process_start" is the same as "start_time" restated in 
> seconds, i.e. it is a constant.  So, either one or more of the above assumtions 
> is wrong, or  somebody is twiddling the clock.  Otherwise I don't see how the 
> start time can move at all.

The problem is start time is derived from task->start_time which is the
jiffies value at the time the process started. Thus interval calculated
by: (start_time = p->start_time - INITIAL_JIFFIES) or (run_time =
get_jiffies_64() - p->start_time) is not NTP adjusted. 

So both (uptime - run_time) or (boot_time + start_time) will have
problems. 

What needs to happen is task->start_time is changed to a timespec which
is set at fork time to be do_posix_clock_monotonic_gettime(). Then in
proc_pid_stat() we can calculate the appropriate user-jiffies value.


task->start_time is used at the following lines:

include/linux/sched.h: 460
kernel/fork.c: 964
fs/proc/array.c: 359
kernel/acct.c: 404
mm/oom_kill.c: 64

I'm stuck trying to fix the last two files at the moment. Please let me
know if you see any other uses.

thanks
-john



^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH] Re: boot time, process start time, and NOW time
  2004-08-17 22:37         ` john stultz
@ 2004-08-17 23:07           ` Tim Schmielau
  2004-08-18  0:11             ` john stultz
  0 siblings, 1 reply; 57+ messages in thread
From: Tim Schmielau @ 2004-08-17 23:07 UTC (permalink / raw)
  To: john stultz
  Cc: george anzinger, Andrew Morton, OGAWA Hirofumi, albert, lkml,
	voland, nicolas.george, kaukasoi, david+powerix

On Tue, 17 Aug 2004, john stultz wrote:

> On Tue, 2004-08-17 at 15:24, George Anzinger wrote:
> > I see you think you have the solution, but I guess I am just dense here.  May be 
> > you could help me to see the error of my ways.  Here is my thinking:
> > 
> > "now" is from gettimeofday() and as such is ntp corrected.
> > "uptime" is also corrected.  In fact it is "now" + "wall_to_monotonic".  And 
> > "wall_to_monotonic" is _only_ changed by do_settime() when the clock is set.
> > "time_from_boot_to_process_start" is the same as "start_time" restated in 
> > seconds, i.e. it is a constant.  So, either one or more of the above assumtions 
> > is wrong, or  somebody is twiddling the clock.  Otherwise I don't see how the 
> > start time can move at all.

Start time indeed is a constant for each process, and doesn't drift. 
The problem rather is that a (slightly) wrong start time is assigned
to newly created processes.

> The problem is start time is derived from task->start_time which is the
> jiffies value at the time the process started. Thus interval calculated
> by: (start_time = p->start_time - INITIAL_JIFFIES) or (run_time =
> get_jiffies_64() - p->start_time) is not NTP adjusted. 
> 
> So both (uptime - run_time) or (boot_time + start_time) will have
> problems. 
> 
> What needs to happen is task->start_time is changed to a timespec which
> is set at fork time to be do_posix_clock_monotonic_gettime(). Then in
> proc_pid_stat() we can calculate the appropriate user-jiffies value.

Yep.

> task->start_time is used at the following lines:
> 
> include/linux/sched.h: 460
> kernel/fork.c: 964
> fs/proc/array.h: 359
> kernel/acct.c: 404
> mm/oom_kill.c: 64
> 
> I'm stuck trying to fix the last two files at the moment. Please let me
> know if you see any other uses.

Where's the problem with the last two of them?
I think I can do them if you fix the first three, so that I can see which
time source is going to replace jiffies here.


Tim

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH] Re: boot time, process start time, and NOW time
  2004-08-17 23:07           ` Tim Schmielau
@ 2004-08-18  0:11             ` john stultz
  2004-08-17 22:19               ` Albert Cahalan
  0 siblings, 1 reply; 57+ messages in thread
From: john stultz @ 2004-08-18  0:11 UTC (permalink / raw)
  To: Tim Schmielau
  Cc: george anzinger, Andrew Morton, OGAWA Hirofumi, albert, lkml,
	voland, nicolas.george, kaukasoi, david+powerix

On Tue, 2004-08-17 at 16:07, Tim Schmielau wrote:
> On Tue, 17 Aug 2004, john stultz wrote:
> > On Tue, 2004-08-17 at 15:24, George Anzinger wrote:
> > > I see you think you have the solution, but I guess I am just dense here.  May be 
> > > you could help me to see the error of my ways.  Here is my thinking:
> > > 
> > > "now" is from gettimeofday() and as such is ntp corrected.
> > > "uptime" is also corrected.  In fact it is "now" + "wall_to_monotonic".  And 
> > > "wall_to_monotonic" is _only_ changed by do_settime() when the clock is set.
> > > "time_from_boot_to_process_start" is the same as "start_time" restated in 
> > > seconds, i.e. it is a constant.  So, either one or more of the above assumtions 
> > > is wrong, or  somebody is twiddling the clock.  Otherwise I don't see how the 
> > > start time can move at all.
> 
> Start time indeed is a constant for each process, and doesn't drift. 
> The problem trather is that a (slightly) wrong start time is assigned
> to newly created processes.
> 
> > The problem is start time is derived from task->start_time which is the
> > jiffies value at the time the process started. Thus interval calculated
> > by: (start_time = p->start_time - INITIAL_JIFFIES) or (run_time =
> > get_jiffies_64() - p->start_time) is not NTP adjusted. 
> > 
> > So both (uptime - run_time) or (boot_time + start_time) will have
> > problems. 
> > 
> > What needs to happen is task->start_time is changed to a timespec which
> > is set at fork time to be do_posix_clock_monotonic_gettime(). Then in
> > proc_pid_stat() we can calculate the appropriate user-jiffies value.
> 
> Yep.

Ok, I think I've got something to start working with. It compiles, but I
don't have a free machine to test on, so Tim, maybe you could give this
a run? 

thanks
-john

===== fs/proc/array.c 1.62 vs edited =====
--- 1.62/fs/proc/array.c	2004-08-05 13:36:53 -07:00
+++ edited/fs/proc/array.c	2004-08-17 17:08:07 -07:00
@@ -356,7 +356,14 @@
 	read_unlock(&tasklist_lock);
 
 	/* Temporary variable needed for gcc-2.96 */
-	start_time = jiffies_64_to_clock_t(task->start_time - INITIAL_JIFFIES);
+	/* convert timespec -> nsec*/
+	start_time = (unsigned long long)task->start_time.tv_sec * NSEC_PER_SEC 
+				+ task->start_time.tv_nsec;
+	/* convert nsec -> ticks */
+	start_time *= HZ;
+	do_div(start_time, NSEC_PER_SEC);
+	/* convert ticks -> USER_HZ ticks */
+	start_time = jiffies_64_to_clock_t(start_time);
 
 	res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \
 %lu %lu %lu %lu %lu %ld %ld %ld %ld %d %ld %llu %lu %ld %lu %lu %lu %lu %lu \
===== include/linux/sched.h 1.228 vs edited =====
--- 1.228/include/linux/sched.h	2004-07-28 21:58:54 -07:00
+++ edited/include/linux/sched.h	2004-08-17 15:49:38 -07:00
@@ -457,7 +457,7 @@
 	struct timer_list real_timer;
 	unsigned long utime, stime, cutime, cstime;
 	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; /* context switch counts */
-	u64 start_time;
+	struct timespec start_time;
 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
 	unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
 /* process credentials */
===== kernel/acct.c 1.34 vs edited =====
--- 1.34/kernel/acct.c	2004-08-02 01:00:40 -07:00
+++ edited/kernel/acct.c	2004-08-17 17:09:03 -07:00
@@ -384,6 +384,8 @@
 	unsigned long vsize;
 	unsigned long flim;
 	u64 elapsed;
+	u64 run_time;
+	struct timespec uptime;
 
 	/*
 	 * First check to see if there is enough free_space to continue
@@ -401,7 +403,16 @@
 	ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER;
 	strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
 
-	elapsed = jiffies_64_to_AHZ(get_jiffies_64() - current->start_time);
+	/* calculate run_time in nsec*/
+	do_posix_clock_monotonic_gettime(&uptime);
+	run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec;	
+	run_time -= (u64)current->start_time.tv_sec*NSEC_PER_SEC 
+					+ current->start_time.tv_nsec;
+	/* convert nsec -> ticks */
+	run_time *= HZ;
+	do_div(run_time, NSEC_PER_SEC);
+	
+	elapsed = jiffies_64_to_AHZ(run_time);
 #if ACCT_VERSION==3
 	ac.ac_etime = encode_float(elapsed);
 #else
===== kernel/fork.c 1.186 vs edited =====
--- 1.186/kernel/fork.c	2004-07-28 21:58:55 -07:00
+++ edited/kernel/fork.c	2004-08-17 15:51:30 -07:00
@@ -961,7 +961,7 @@
 	p->utime = p->stime = 0;
 	p->cutime = p->cstime = 0;
 	p->lock_depth = -1;		/* -1 = no lock */
-	p->start_time = get_jiffies_64();
+	do_posix_clock_monotonic_gettime(&p->start_time);
 	p->security = NULL;
 	p->io_context = NULL;
 	p->audit_context = NULL;
===== mm/oom_kill.c 1.28 vs edited =====
--- 1.28/mm/oom_kill.c	2004-08-02 01:00:42 -07:00
+++ edited/mm/oom_kill.c	2004-08-17 17:09:32 -07:00
@@ -44,6 +44,7 @@
 static int badness(struct task_struct *p)
 {
 	int points, cpu_time, run_time, s;
+	struct timespec uptime;
 
 	if (!p->mm)
 		return 0;
@@ -61,7 +62,9 @@
 	 * very well in practice.
 	 */
 	cpu_time = (p->utime + p->stime) >> (SHIFT_HZ + 3);
-	run_time = (get_jiffies_64() - p->start_time) >> (SHIFT_HZ + 10);
+
+	do_posix_clock_monotonic_gettime(&uptime);
+	run_time = (uptime.tv_sec - p->start_time.tv_sec)/60;
 
 	s = int_sqrt(cpu_time);
 	if (s)



^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH] Re: boot time, process start time, and NOW time
  2004-08-18  0:11             ` john stultz
@ 2004-08-17 22:19               ` Albert Cahalan
  2004-08-18  1:09                 ` john stultz
  0 siblings, 1 reply; 57+ messages in thread
From: Albert Cahalan @ 2004-08-17 22:19 UTC (permalink / raw)
  To: john stultz
  Cc: Tim Schmielau, george anzinger, Andrew Morton OSDL,
	OGAWA Hirofumi, albert, lkml, voland, nicolas.george, kaukasoi,
	david+powerix

On Tue, 2004-08-17 at 20:11, john stultz wrote:

> --- 1.62/fs/proc/array.c	2004-08-05 13:36:53 -07:00
> +++ edited/fs/proc/array.c	2004-08-17 17:08:07 -07:00
> @@ -356,7 +356,14 @@
>  	read_unlock(&tasklist_lock);
>  
>  	/* Temporary variable needed for gcc-2.96 */
> -	start_time = jiffies_64_to_clock_t(task->start_time - INITIAL_JIFFIES);
> +	/* convert timespec -> nsec*/
> +	start_time = (unsigned long long)task->start_time.tv_sec * NSEC_PER_SEC 
> +				+ task->start_time.tv_nsec;
> +	/* convert nsec -> ticks */
> +	start_time *= HZ;
> +	do_div(start_time, NSEC_PER_SEC);
> +	/* convert ticks -> USER_HZ ticks */
> +	start_time = jiffies_64_to_clock_t(start_time);

This would overflow in about 6 months at 1024 USER_HZ.
Various possible alternatives:

// 6 months to overflow at 1024 USER_HZ
value = ns64 * USER_HZ / BILLION;

// 2 years to overflow at 1024 USER_HZ
// (assuming USER_HZ is always divisible by 4)
value = ns64 * (USER_HZ/4) / (BILLION/4);

// faster, and never overflows (for 100, 128, 1000)
#if ! (BILLION % USER_HZ)
value = ns64 / (BILLION/USER_HZ);
#endif

// 256 years to overflow (for 1024)
#if ! (USER_HZ % 512)
value = ns64 * (USER_HZ/512) / (BILLION/512);
#endif



^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH] Re: boot time, process start time, and NOW time
  2004-08-17 22:19               ` Albert Cahalan
@ 2004-08-18  1:09                 ` john stultz
  2004-08-17 22:45                   ` Albert Cahalan
  2004-08-18  7:42                   ` Tim Schmielau
  0 siblings, 2 replies; 57+ messages in thread
From: john stultz @ 2004-08-18  1:09 UTC (permalink / raw)
  To: Albert Cahalan
  Cc: Tim Schmielau, george anzinger, Andrew Morton OSDL,
	OGAWA Hirofumi, lkml, voland, nicolas.george, kaukasoi,
	david+powerix

On Tue, 2004-08-17 at 15:19, Albert Cahalan wrote:
> On Tue, 2004-08-17 at 20:11, john stultz wrote:
> 
> > --- 1.62/fs/proc/array.c	2004-08-05 13:36:53 -07:00
> > +++ edited/fs/proc/array.c	2004-08-17 17:08:07 -07:00
> > @@ -356,7 +356,14 @@
> >  	read_unlock(&tasklist_lock);
> >  
> >  	/* Temporary variable needed for gcc-2.96 */
> > -	start_time = jiffies_64_to_clock_t(task->start_time - INITIAL_JIFFIES);
> > +	/* convert timespec -> nsec*/
> > +	start_time = (unsigned long long)task->start_time.tv_sec * NSEC_PER_SEC 
> > +				+ task->start_time.tv_nsec;
> > +	/* convert nsec -> ticks */
> > +	start_time *= HZ;
> > +	do_div(start_time, NSEC_PER_SEC);
> > +	/* convert ticks -> USER_HZ ticks */
> > +	start_time = jiffies_64_to_clock_t(start_time);
> 
> This would overflow in about 6 months at 1024 USER_HZ.
> Various possible alternatives:

Everybody sing: Thanks, nice catch/Here's an updated patch!

-john

===== fs/proc/array.c 1.62 vs edited =====
--- 1.62/fs/proc/array.c	2004-08-05 13:36:53 -07:00
+++ edited/fs/proc/array.c	2004-08-17 18:03:55 -07:00
@@ -356,7 +356,13 @@
 	read_unlock(&tasklist_lock);
 
 	/* Temporary variable needed for gcc-2.96 */
-	start_time = jiffies_64_to_clock_t(task->start_time - INITIAL_JIFFIES);
+	/* convert timespec -> nsec*/
+	start_time = (unsigned long long)task->start_time.tv_sec * NSEC_PER_SEC 
+				+ task->start_time.tv_nsec;
+	/* convert nsec -> ticks */
+	do_div(start_time, NSEC_PER_SEC/HZ);
+	/* convert ticks -> USER_HZ ticks */
+	start_time = jiffies_64_to_clock_t(start_time);
 
 	res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \
 %lu %lu %lu %lu %lu %ld %ld %ld %ld %d %ld %llu %lu %ld %lu %lu %lu %lu %lu \
===== include/linux/sched.h 1.228 vs edited =====
--- 1.228/include/linux/sched.h	2004-07-28 21:58:54 -07:00
+++ edited/include/linux/sched.h	2004-08-17 15:49:38 -07:00
@@ -457,7 +457,7 @@
 	struct timer_list real_timer;
 	unsigned long utime, stime, cutime, cstime;
 	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; /* context switch counts */
-	u64 start_time;
+	struct timespec start_time;
 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
 	unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
 /* process credentials */
===== kernel/acct.c 1.34 vs edited =====
--- 1.34/kernel/acct.c	2004-08-02 01:00:40 -07:00
+++ edited/kernel/acct.c	2004-08-17 18:04:27 -07:00
@@ -384,6 +384,8 @@
 	unsigned long vsize;
 	unsigned long flim;
 	u64 elapsed;
+	u64 run_time;
+	struct timespec uptime;
 
 	/*
 	 * First check to see if there is enough free_space to continue
@@ -401,7 +403,15 @@
 	ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER;
 	strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
 
-	elapsed = jiffies_64_to_AHZ(get_jiffies_64() - current->start_time);
+	/* calculate run_time in nsec*/
+	do_posix_clock_monotonic_gettime(&uptime);
+	run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec;	
+	run_time -= (u64)current->start_time.tv_sec*NSEC_PER_SEC 
+					+ current->start_time.tv_nsec;
+	/* convert nsec -> ticks */
+	do_div(run_time, NSEC_PER_SEC/HZ);
+	
+	elapsed = jiffies_64_to_AHZ(run_time);
 #if ACCT_VERSION==3
 	ac.ac_etime = encode_float(elapsed);
 #else
===== kernel/fork.c 1.186 vs edited =====
--- 1.186/kernel/fork.c	2004-07-28 21:58:55 -07:00
+++ edited/kernel/fork.c	2004-08-17 15:51:30 -07:00
@@ -961,7 +961,7 @@
 	p->utime = p->stime = 0;
 	p->cutime = p->cstime = 0;
 	p->lock_depth = -1;		/* -1 = no lock */
-	p->start_time = get_jiffies_64();
+	do_posix_clock_monotonic_gettime(&p->start_time);
 	p->security = NULL;
 	p->io_context = NULL;
 	p->audit_context = NULL;
===== mm/oom_kill.c 1.28 vs edited =====
--- 1.28/mm/oom_kill.c	2004-08-02 01:00:42 -07:00
+++ edited/mm/oom_kill.c	2004-08-17 17:09:32 -07:00
@@ -44,6 +44,7 @@
 static int badness(struct task_struct *p)
 {
 	int points, cpu_time, run_time, s;
+	struct timespec uptime;
 
 	if (!p->mm)
 		return 0;
@@ -61,7 +62,9 @@
 	 * very well in practice.
 	 */
 	cpu_time = (p->utime + p->stime) >> (SHIFT_HZ + 3);
-	run_time = (get_jiffies_64() - p->start_time) >> (SHIFT_HZ + 10);
+
+	do_posix_clock_monotonic_gettime(&uptime);
+	run_time = (uptime.tv_sec - p->start_time.tv_sec)/60;
 
 	s = int_sqrt(cpu_time);
 	if (s)



^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH] Re: boot time, process start time, and NOW time
  2004-08-18  1:09                 ` john stultz
@ 2004-08-17 22:45                   ` Albert Cahalan
  2004-08-18  7:42                   ` Tim Schmielau
  1 sibling, 0 replies; 57+ messages in thread
From: Albert Cahalan @ 2004-08-17 22:45 UTC (permalink / raw)
  To: john stultz
  Cc: Albert Cahalan, Tim Schmielau, george anzinger,
	Andrew Morton OSDL, OGAWA Hirofumi, lkml, voland, nicolas.george,
	kaukasoi, david+powerix

On Tue, 2004-08-17 at 21:09, john stultz wrote:
> On Tue, 2004-08-17 at 15:19, Albert Cahalan wrote:
> > On Tue, 2004-08-17 at 20:11, john stultz wrote:
> > 
> > > --- 1.62/fs/proc/array.c	2004-08-05 13:36:53 -07:00
> > > +++ edited/fs/proc/array.c	2004-08-17 17:08:07 -07:00
> > > @@ -356,7 +356,14 @@
> > >  	read_unlock(&tasklist_lock);
> > >  
> > >  	/* Temporary variable needed for gcc-2.96 */
> > > -	start_time = jiffies_64_to_clock_t(task->start_time - INITIAL_JIFFIES);
> > > +	/* convert timespec -> nsec*/
> > > +	start_time = (unsigned long long)task->start_time.tv_sec * NSEC_PER_SEC 
> > > +				+ task->start_time.tv_nsec;
> > > +	/* convert nsec -> ticks */
> > > +	start_time *= HZ;
> > > +	do_div(start_time, NSEC_PER_SEC);
> > > +	/* convert ticks -> USER_HZ ticks */
> > > +	start_time = jiffies_64_to_clock_t(start_time);
> > 
> > This would overflow in about 6 months at 1024 USER_HZ.
> > Various possible alternatives:
> 
> Everybody sing: Thanks, nice catch/Here's an updated patch!
> 
> -john
> 
> ===== fs/proc/array.c 1.62 vs edited =====
> --- 1.62/fs/proc/array.c	2004-08-05 13:36:53 -07:00
> +++ edited/fs/proc/array.c	2004-08-17 18:03:55 -07:00
> @@ -356,7 +356,13 @@
>  	read_unlock(&tasklist_lock);
>  
>  	/* Temporary variable needed for gcc-2.96 */
> -	start_time = jiffies_64_to_clock_t(task->start_time - INITIAL_JIFFIES);
> +	/* convert timespec -> nsec*/
> +	start_time = (unsigned long long)task->start_time.tv_sec * NSEC_PER_SEC 
> +				+ task->start_time.tv_nsec;
> +	/* convert nsec -> ticks */
> +	do_div(start_time, NSEC_PER_SEC/HZ);
> +	/* convert ticks -> USER_HZ ticks */
> +	start_time = jiffies_64_to_clock_t(start_time);

NSEC_PER_SEC/HZ isn't an integer when HZ is 1024.
Also, you're doing two conversions. You can go directly
from nanoseconds to USER_HZ, without using HZ at all.

I think you really need the #if for this.
It could go in a header if you like, creating
a timespec_to_clock_t macro for use in proc.



^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH] Re: boot time, process start time, and NOW time
  2004-08-18  1:09                 ` john stultz
  2004-08-17 22:45                   ` Albert Cahalan
@ 2004-08-18  7:42                   ` Tim Schmielau
  2004-08-19 19:15                     ` Petri Kaukasoina
  1 sibling, 1 reply; 57+ messages in thread
From: Tim Schmielau @ 2004-08-18  7:42 UTC (permalink / raw)
  To: john stultz
  Cc: Albert Cahalan, george anzinger, Andrew Morton OSDL,
	OGAWA Hirofumi, lkml, voland, nicolas.george, kaukasoi,
	david+powerix

On Tue, 17 Aug 2004, john stultz wrote:

> Everybody sing: Thanks, nice catch/Here's an updated patch!

> ===== fs/proc/array.c 1.62 vs edited =====
> --- 1.62/fs/proc/array.c	2004-08-05 13:36:53 -07:00
> +++ edited/fs/proc/array.c	2004-08-17 18:03:55 -07:00
> @@ -356,7 +356,13 @@
>  	read_unlock(&tasklist_lock);
>  
>  	/* Temporary variable needed for gcc-2.96 */
> -	start_time = jiffies_64_to_clock_t(task->start_time - INITIAL_JIFFIES);
> +	/* convert timespec -> nsec*/
> +	start_time = (unsigned long long)task->start_time.tv_sec * NSEC_PER_SEC 
> +				+ task->start_time.tv_nsec;
> +	/* convert nsec -> ticks */
> +	do_div(start_time, NSEC_PER_SEC/HZ);
> +	/* convert ticks -> USER_HZ ticks */
> +	start_time = jiffies_64_to_clock_t(start_time);

As Albert already noted, we can collapse the two conversions into one.

> ===== kernel/acct.c 1.34 vs edited =====
[...]
> +	/* convert nsec -> ticks */
> +	do_div(run_time, NSEC_PER_SEC/HZ);
> +	
> +	elapsed = jiffies_64_to_AHZ(run_time);

ditto

> ===== mm/oom_kill.c 1.28 vs edited =====

> @@ -61,7 +62,9 @@
>  	 * very well in practice.
>  	 */
>  	cpu_time = (p->utime + p->stime) >> (SHIFT_HZ + 3);
> -	run_time = (get_jiffies_64() - p->start_time) >> (SHIFT_HZ + 10);
> +
> +	do_posix_clock_monotonic_gettime(&uptime);
> +	run_time = (uptime.tv_sec - p->start_time.tv_sec)/60;

Doh, now I understand what you scratched your head over...
Since these shifts are magical constants found by empirical tuning and 
anecdotal evidence, I'd rather keep the behavior of the code and fix the 
comment accordingly.

Also, we might optimize this to call do_posix_clock_monotonic_gettime()
only once per oom killer invocation, rather than once per process.
Haven't yet looked into how expensive do_posix_clock_monotonic_gettime()
is, though.


Updated patch below. It's not very well tested, but it compiles, boots, 
and fixes the problem on i386 with the default HZ=1000 and USER_HZ=100.

Time to do some work for my day-job now.

Thanks,
Tim


--- linux-2.6.8.1/fs/proc/array.c	2004-08-17 21:38:54.000000000 +0200
+++ linux-2.6.8.1-uf/fs/proc/array.c	2004-08-18 08:35:23.000000000 +0200
@@ -356,7 +356,11 @@ int proc_pid_stat(struct task_struct *ta
 	read_unlock(&tasklist_lock);
 
 	/* Temporary variable needed for gcc-2.96 */
-	start_time = jiffies_64_to_clock_t(task->start_time - INITIAL_JIFFIES);
+	/* convert timespec -> nsec*/
+	start_time = (unsigned long long)task->start_time.tv_sec * NSEC_PER_SEC 
+				+ task->start_time.tv_nsec;
+	/* convert nsec -> ticks */
+	start_time = nsec_to_clock_t(start_time);
 
 	res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \
 %lu %lu %lu %lu %lu %ld %ld %ld %ld %d %ld %llu %lu %ld %lu %lu %lu %lu %lu \

--- linux-2.6.8.1/include/linux/acct.h	2004-08-17 21:38:55.000000000 +0200
+++ linux-2.6.8.1-uf/include/linux/acct.h	2004-08-18 08:41:38.000000000 +0200
@@ -172,17 +172,23 @@ static inline u32 jiffies_to_AHZ(unsigne
 #endif
 }
 
-static inline u64 jiffies_64_to_AHZ(u64 x)
+static inline u64 nsec_to_AHZ(u64 x)
 {
-#if (TICK_NSEC % (NSEC_PER_SEC / AHZ)) == 0
-#if HZ != AHZ
-	do_div(x, HZ / AHZ);
-#endif
-#else
-	x *= TICK_NSEC;
+#if (NSEC_PER_SEC % AHZ) == 0
 	do_div(x, (NSEC_PER_SEC / AHZ));
+#elif (AHZ % 512) == 0
+	x *= AHZ/512;
+	do_div(x, (NSEC_PER_SEC / 512));
+#else
+	/* 
+         * max relative error 1.28e-7 for AHZ <= 1024,
+         * overflow after 146 years.
+         * Note that 4*NSEC_PER_SEC just fits into an unsigned long.
+         */
+	x *= 4;
+	do_div(x, (4ul * NSEC_PER_SEC + (AHZ/2)) / AHZ);
 #endif
-       return x;
+	return x;
 }
 
 #endif  /* __KERNEL */

--- linux-2.6.8.1/include/linux/sched.h	2004-08-17 21:38:55.000000000 +0200
+++ linux-2.6.8.1-uf/include/linux/sched.h	2004-08-18 07:59:52.000000000 +0200
@@ -457,7 +457,7 @@ struct task_struct {
 	struct timer_list real_timer;
 	unsigned long utime, stime, cutime, cstime;
 	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; /* context switch counts */
-	u64 start_time;
+	struct timespec start_time;
 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
 	unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
 /* process credentials */

--- linux-2.6.8.1/include/linux/times.h	2004-08-17 00:13:35.000000000 +0200
+++ linux-2.6.8.1-uf/include/linux/times.h	2004-08-18 08:38:45.000000000 +0200
@@ -55,6 +55,25 @@ static inline u64 jiffies_64_to_clock_t(
 }
 #endif
 
+static inline u64 nsec_to_clock_t(u64 x)
+{
+#if (NSEC_PER_SEC % USER_HZ) == 0
+	do_div(x, (NSEC_PER_SEC / USER_HZ));
+#elif (USER_HZ % 512) == 0
+	x *= USER_HZ/512;
+	do_div(x, (NSEC_PER_SEC / 512));
+#else
+	/* 
+         * max relative error 1.28e-7 for USER_HZ <= 1024,
+         * overflow after 146 years.
+         * Note that 4*NSEC_PER_SEC just fits into an unsigned long.
+         */
+	x *= 4;
+	do_div(x, (4ul * NSEC_PER_SEC + (USER_HZ/2)) / USER_HZ);
+#endif
+	return x;
+}
+
 struct tms {
 	clock_t tms_utime;
 	clock_t tms_stime;

--- linux-2.6.8.1/kernel/acct.c	2004-08-17 21:38:55.000000000 +0200
+++ linux-2.6.8.1-uf/kernel/acct.c	2004-08-18 08:41:44.000000000 +0200
@@ -384,6 +384,8 @@ static void do_acct_process(long exitcod
 	unsigned long vsize;
 	unsigned long flim;
 	u64 elapsed;
+	u64 run_time;
+	struct timespec uptime;
 
 	/*
 	 * First check to see if there is enough free_space to continue
@@ -401,7 +403,13 @@ static void do_acct_process(long exitcod
 	ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER;
 	strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
 
-	elapsed = jiffies_64_to_AHZ(get_jiffies_64() - current->start_time);
+	/* calculate run_time in nsec*/
+	do_posix_clock_monotonic_gettime(&uptime);
+	run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec;	
+	run_time -= (u64)current->start_time.tv_sec*NSEC_PER_SEC 
+					+ current->start_time.tv_nsec;
+	/* convert nsec -> AHZ */
+	elapsed = nsec_to_AHZ(run_time);
 #if ACCT_VERSION==3
 	ac.ac_etime = encode_float(elapsed);
 #else

--- linux-2.6.8.1/kernel/fork.c	2004-08-17 21:38:55.000000000 +0200
+++ linux-2.6.8.1-uf/kernel/fork.c	2004-08-18 07:59:52.000000000 +0200
@@ -961,7 +961,7 @@ struct task_struct *copy_process(unsigne
 	p->utime = p->stime = 0;
 	p->cutime = p->cstime = 0;
 	p->lock_depth = -1;		/* -1 = no lock */
-	p->start_time = get_jiffies_64();
+	do_posix_clock_monotonic_gettime(&p->start_time);
 	p->security = NULL;
 	p->io_context = NULL;
 	p->audit_context = NULL;

--- linux-2.6.8.1/mm/oom_kill.c	2004-08-17 21:38:55.000000000 +0200
+++ linux-2.6.8.1-uf/mm/oom_kill.c	2004-08-18 08:49:51.000000000 +0200
@@ -44,6 +44,7 @@
 static int badness(struct task_struct *p)
 {
 	int points, cpu_time, run_time, s;
+	struct timespec uptime;
 
 	if (!p->mm)
 		return 0;
@@ -56,12 +57,14 @@ static int badness(struct task_struct *p
 	points = p->mm->total_vm;
 
 	/*
-	 * CPU time is in seconds and run time is in minutes. There is no
-	 * particular reason for this other than that it turned out to work
-	 * very well in practice.
+	 * CPU time is in tens of seconds and run time is in thousands
+	 * of seconds. There is no particular reason for this other than
+	 * that it turned out to work very well in practice.
 	 */
 	cpu_time = (p->utime + p->stime) >> (SHIFT_HZ + 3);
-	run_time = (get_jiffies_64() - p->start_time) >> (SHIFT_HZ + 10);
+
+	do_posix_clock_monotonic_gettime(&uptime);
+	run_time = (uptime.tv_sec - p->start_time.tv_sec) >> 10;
 
 	s = int_sqrt(cpu_time);
 	if (s)

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH] Re: boot time, process start time, and NOW time
  2004-08-18  7:42                   ` Tim Schmielau
@ 2004-08-19 19:15                     ` Petri Kaukasoina
  2004-08-26 11:04                       ` Andrew Morton
  0 siblings, 1 reply; 57+ messages in thread
From: Petri Kaukasoina @ 2004-08-19 19:15 UTC (permalink / raw)
  To: Tim Schmielau
  Cc: john stultz, Albert Cahalan, george anzinger, Andrew Morton OSDL,
	OGAWA Hirofumi, lkml, voland, nicolas.george, david+powerix

On Wed, Aug 18, 2004 at 09:42:17AM +0200, Tim Schmielau wrote:
> Updated patch below. It's not very well tested, but it compiles, boots, 
> and fixes the problem on i386 with the default HZ=1000 and USER_HZ=100.

Yes, it works nicely now.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH] Re: boot time, process start time, and NOW time
  2004-08-19 19:15                     ` Petri Kaukasoina
@ 2004-08-26 11:04                       ` Andrew Morton
  2004-08-26 12:07                         ` Tim Schmielau
  0 siblings, 1 reply; 57+ messages in thread
From: Andrew Morton @ 2004-08-26 11:04 UTC (permalink / raw)
  To: Petri Kaukasoina
  Cc: tim, johnstul, albert, george, hirofumi, linux-kernel, voland,
	nicolas.george, david+powerix

Petri Kaukasoina <kaukasoi@elektroni.ee.tut.fi> wrote:
>
> On Wed, Aug 18, 2004 at 09:42:17AM +0200, Tim Schmielau wrote:
> > Updated patch below. It's not very well tested, but it compiles, boots, 
> > and fixes the problem on i386 with the default HZ=1000 and USER_HZ=100.
> 
> Yes, it works nicely now.

So...  is this settled now?

If so, could you (Tim) please send out a fresh, changelogged version of the
patch for review?

Thanks.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH] Re: boot time, process start time, and NOW time
  2004-08-26 11:04                       ` Andrew Morton
@ 2004-08-26 12:07                         ` Tim Schmielau
  2004-08-30 23:00                           ` Tim Schmielau
  0 siblings, 1 reply; 57+ messages in thread
From: Tim Schmielau @ 2004-08-26 12:07 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Petri Kaukasoina, John Stultz, albert, george, hirofumi, lkml,
	voland, nicolas.george, david+powerix

On Thu, 26 Aug 2004, Andrew Morton wrote:

> Petri Kaukasoina <kaukasoi@elektroni.ee.tut.fi> wrote:
> >
> > On Wed, Aug 18, 2004 at 09:42:17AM +0200, Tim Schmielau wrote:
> > > Updated patch below. It's not very well tested, but it compiles, boots, 
> > > and fixes the problem on i386 with the default HZ=1000 and USER_HZ=100.
> > 
> > Yes, it works nicely now.
> 
> So...  is this settled now?
> 
> If so, could you (Tim) please send out a fresh, changelogged version of the
> patch for review?
> 
> Thanks.

I still want to do some basic testing, (boot with different values of HZ /
USER_HZ, check that I didn't spoil the OOM killer), and we might have some
arguments about the helper functions, but it might already go into -mm as
it is:


Derive process start times from the posix_clock_monotonic notion of 
uptime instead of "jiffies", consistent with the earlier change to 
/proc/uptime itself.
(http://linus.bkbits.net:8080/linux-2.5/cset@3ef4851dGg0fxX58R9Zv8SIq9fzNmQ?nav=index.html|src/.|src/fs|src/fs/proc|related/fs/proc/proc_misc.c)

Process start times are reported to userspace in units of 1/USER_HZ since
boot, thus applications such as procps need the value of "uptime" to convert
them into absolute time.

Currently "uptime" is derived from an ntp-corrected time base, but process
start time is derived from the free-running "jiffies" counter.
This results in inaccurate, drifting process start times as seen by the
user, even if the exported number stays constant, because the user's notion
of "jiffies" changes in time.

It's John Stultz's patch anyways, which I only messed up a bit, but since
people started trading signed-off lines on lkml:

Signed-off-by: Tim Schmielau <tim@physik3.uni-rostock.de>


--- linux-2.6.8.1-oom/fs/proc/array.c	2004-08-17 21:38:54.000000000 +0200
+++ linux-2.6.8.1-oom-uf/fs/proc/array.c	2004-08-25 23:24:20.000000000 +0200
@@ -356,7 +356,11 @@ int proc_pid_stat(struct task_struct *ta
 	read_unlock(&tasklist_lock);
 
 	/* Temporary variable needed for gcc-2.96 */
-	start_time = jiffies_64_to_clock_t(task->start_time - INITIAL_JIFFIES);
+	/* convert timespec -> nsec*/
+	start_time = (unsigned long long)task->start_time.tv_sec * NSEC_PER_SEC 
+				+ task->start_time.tv_nsec;
+	/* convert nsec -> ticks */
+	start_time = nsec_to_clock_t(start_time);
 
 	res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \
 %lu %lu %lu %lu %lu %ld %ld %ld %ld %d %ld %llu %lu %ld %lu %lu %lu %lu %lu \

--- linux-2.6.8.1-oom/include/linux/acct.h	2004-08-17 21:38:55.000000000 +0200
+++ linux-2.6.8.1-oom-uf/include/linux/acct.h	2004-08-25 23:50:15.000000000 +0200
@@ -172,17 +172,24 @@ static inline u32 jiffies_to_AHZ(unsigne
 #endif
 }
 
-static inline u64 jiffies_64_to_AHZ(u64 x)
+static inline u64 nsec_to_AHZ(u64 x)
 {
-#if (TICK_NSEC % (NSEC_PER_SEC / AHZ)) == 0
-#if HZ != AHZ
-	do_div(x, HZ / AHZ);
-#endif
-#else
-	x *= TICK_NSEC;
+#if (NSEC_PER_SEC % AHZ) == 0
 	do_div(x, (NSEC_PER_SEC / AHZ));
+#elif (AHZ % 512) == 0
+	x *= AHZ/512;
+	do_div(x, (NSEC_PER_SEC / 512));
+#else
+	/* 
+         * max relative error 5.7e-8 (1.8s per year) for AHZ <= 1024,
+         * overflow after 64.99 years.
+         * exact for AHZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ...
+         */
+	x *= 9;
+	do_div(x, (unsigned long)((9ull * NSEC_PER_SEC + (AHZ/2))
+	                          / AHZ));
 #endif
-       return x;
+	return x;
 }
 
 #endif  /* __KERNEL */

--- linux-2.6.8.1-oom/include/linux/sched.h	2004-08-17 21:38:55.000000000 +0200
+++ linux-2.6.8.1-oom-uf/include/linux/sched.h	2004-08-25 23:24:20.000000000 +0200
@@ -457,7 +457,7 @@ struct task_struct {
 	struct timer_list real_timer;
 	unsigned long utime, stime, cutime, cstime;
 	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; /* context switch counts */
-	u64 start_time;
+	struct timespec start_time;
 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
 	unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
 /* process credentials */

--- linux-2.6.8.1-oom/include/linux/times.h	2004-08-17 00:13:35.000000000 +0200
+++ linux-2.6.8.1-oom-uf/include/linux/times.h	2004-08-25 23:24:20.000000000 +0200
@@ -55,6 +55,26 @@ static inline u64 jiffies_64_to_clock_t(
 }
 #endif
 
+static inline u64 nsec_to_clock_t(u64 x)
+{
+#if (NSEC_PER_SEC % USER_HZ) == 0
+	do_div(x, (NSEC_PER_SEC / USER_HZ));
+#elif (USER_HZ % 512) == 0
+	x *= USER_HZ/512;
+	do_div(x, (NSEC_PER_SEC / 512));
+#else
+	/* 
+         * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024,
+         * overflow after 64.99 years.
+         * exact for USER_HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ...
+         */
+	x *= 9;
+	do_div(x, (unsigned long)((9ull * NSEC_PER_SEC + (USER_HZ/2))
+	                          / USER_HZ));
+#endif
+	return x;
+}
+
 struct tms {
 	clock_t tms_utime;
 	clock_t tms_stime;

--- linux-2.6.8.1-oom/kernel/acct.c	2004-08-17 21:38:55.000000000 +0200
+++ linux-2.6.8.1-oom-uf/kernel/acct.c	2004-08-25 23:24:20.000000000 +0200
@@ -384,6 +384,8 @@ static void do_acct_process(long exitcod
 	unsigned long vsize;
 	unsigned long flim;
 	u64 elapsed;
+	u64 run_time;
+	struct timespec uptime;
 
 	/*
 	 * First check to see if there is enough free_space to continue
@@ -401,7 +403,13 @@ static void do_acct_process(long exitcod
 	ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER;
 	strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
 
-	elapsed = jiffies_64_to_AHZ(get_jiffies_64() - current->start_time);
+	/* calculate run_time in nsec*/
+	do_posix_clock_monotonic_gettime(&uptime);
+	run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec;	
+	run_time -= (u64)current->start_time.tv_sec*NSEC_PER_SEC 
+					+ current->start_time.tv_nsec;
+	/* convert nsec -> AHZ */
+	elapsed = nsec_to_AHZ(run_time);
 #if ACCT_VERSION==3
 	ac.ac_etime = encode_float(elapsed);
 #else

--- linux-2.6.8.1-oom/kernel/fork.c	2004-08-17 21:38:55.000000000 +0200
+++ linux-2.6.8.1-oom-uf/kernel/fork.c	2004-08-25 23:24:20.000000000 +0200
@@ -961,7 +961,7 @@ struct task_struct *copy_process(unsigne
 	p->utime = p->stime = 0;
 	p->cutime = p->cstime = 0;
 	p->lock_depth = -1;		/* -1 = no lock */
-	p->start_time = get_jiffies_64();
+	do_posix_clock_monotonic_gettime(&p->start_time);
 	p->security = NULL;
 	p->io_context = NULL;
 	p->audit_context = NULL;

--- linux-2.6.8.1-oom/mm/oom_kill.c	2004-08-24 17:40:58.000000000 +0200
+++ linux-2.6.8.1-oom-uf/mm/oom_kill.c	2004-08-25 23:32:03.000000000 +0200
@@ -26,6 +26,7 @@
 /**
  * oom_badness - calculate a numeric value for how bad this task has been
  * @p: task struct of which task we should calculate
+ * @uptime: current uptime in seconds
  *
  * The formula used is relatively simple and documented inline in the
  * function. The main rationale is that we want to select a good task
@@ -41,7 +42,7 @@
  *    of least surprise ... (be careful when you change it)
  */
 
-static unsigned long badness(struct task_struct *p)
+static unsigned long badness(struct task_struct *p, unsigned long uptime)
 {
 	unsigned long points, cpu_time, run_time, s;
 
@@ -56,12 +57,16 @@ static unsigned long badness(struct task
 	points = p->mm->total_vm;
 
 	/*
-	 * CPU time is in seconds and run time is in minutes. There is no
-	 * particular reason for this other than that it turned out to work
-	 * very well in practice.
+	 * CPU time is in tens of seconds and run time is in thousands
+         * of seconds. There is no particular reason for this other than
+         * that it turned out to work very well in practice.
 	 */
 	cpu_time = (p->utime + p->stime) >> (SHIFT_HZ + 3);
-	run_time = (get_jiffies_64() - p->start_time) >> (SHIFT_HZ + 10);
+
+	if (uptime >= p->start_time.tv_sec)
+		run_time = (uptime - p->start_time.tv_sec) >> 10;
+	else
+		run_time = 0;
 
 	s = int_sqrt(cpu_time);
 	if (s)
@@ -111,10 +116,12 @@ static struct task_struct * select_bad_p
 	unsigned long maxpoints = 0;
 	struct task_struct *g, *p;
 	struct task_struct *chosen = NULL;
+	struct timespec uptime;
 
+	do_posix_clock_monotonic_gettime(&uptime);
 	do_each_thread(g, p)
 		if (p->pid) {
-			unsigned long points = badness(p);
+			unsigned long points = badness(p, uptime.tv_sec);
 			if (points > maxpoints) {
 				chosen = p;
 				maxpoints = points;

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH] Re: boot time, process start time, and NOW time
  2004-08-26 12:07                         ` Tim Schmielau
@ 2004-08-30 23:00                           ` Tim Schmielau
  2004-08-30 23:38                             ` john stultz
  2004-08-31  1:34                             ` john stultz
  0 siblings, 2 replies; 57+ messages in thread
From: Tim Schmielau @ 2004-08-30 23:00 UTC (permalink / raw)
  To: george, Andrew Morton
  Cc: Petri Kaukasoina, John Stultz, albert, hirofumi, lkml, voland,
	nicolas.george, david+powerix

> > So...  is this settled now?
> > 
> > If so, could you (Tim) please send out a fresh, changelogged version of the
> > patch for review?
> > 
> > Thanks.
> 
> I still want to do some basic testing, (boot with different values of HZ /
> USER_HZ, check that I didn't spoil the OOM killer), and we might have some
> arguments about the helper functions, but it might already go into -mm as
> it is:

I rediffed it for 2.6.9-rc1-mm1 and did some more testing
(HZ/USER_HZ = 1000/100, 1024/1024, 60/60, verified the oom killer still 
selects the right task).

Crosschecking an unpatched kernel I once got a whopping 15 minutes error 
in process start times, again emphasizing that something needs to be done 
about this. Of course no such error occurred with the patch.

All tests went ok, but wondering about the difference between /proc/stat 
and /proc/uptime it strikes me that with this patch userland now might 
detect wrong HZ values. Sigh.

So I think we should not apply the patch, but rather back out the patch 
that rebased uptime on a ntp-corrected timesource.
There are too many statistics that are still based on jiffies or clock 
ticks, and we cannot immediately change that without a large rework
(although this might eventually happen according to John's proposal).
And mixing two different timesources just won't work, regardless where we 
draw the borderline between them.

George, please excuse my lack of understanding. What again were the
precise reasons to have an ntp-corrected uptime?

Tim



Just for reference, the current patch against 2.6.9-rc1-mm1:


Derive process start times from the posix_clock_monotonic notion of 
uptime instead of "jiffies", consistent with the earlier change to 
/proc/uptime itself.
(http://linus.bkbits.net:8080/linux-2.5/cset@3ef4851dGg0fxX58R9Zv8SIq9fzNmQ?nav=index.html|src/.|src/fs|src/fs/proc|related/fs/proc/proc_misc.c)

Process start times are reported to userspace in units of 1/USER_HZ since
boot, thus applications such as procps need the value of "uptime" to convert
them into absolute time.

Currently "uptime" is derived from an ntp-corrected time base, but process
start time is derived from the free-running "jiffies" counter.
This results in inaccurate, drifting process start times as seen by the
user, even if the exported number stays constant, because the user's notion
of "jiffies" changes over time.

It's John Stultz's patch anyways, which I only messed up a bit, but since
people started trading signed-off lines on lkml:

Signed-off-by: Tim Schmielau <tim@physik3.uni-rostock.de>


--- linux-2.6.9-rc1-mm1/fs/proc/array.c	2004-08-30 21:51:05.000000000 +0200
+++ linux-2.6.9-rc1-mm1-uf/fs/proc/array.c	2004-08-30 21:51:36.000000000 +0200
@@ -359,7 +359,11 @@ int proc_pid_stat(struct task_struct *ta
 	read_unlock(&tasklist_lock);
 
 	/* Temporary variable needed for gcc-2.96 */
-	start_time = jiffies_64_to_clock_t(task->start_time - INITIAL_JIFFIES);
+	/* convert timespec -> nsec*/
+	start_time = (unsigned long long)task->start_time.tv_sec * NSEC_PER_SEC 
+				+ task->start_time.tv_nsec;
+	/* convert nsec -> ticks */
+	start_time = nsec_to_clock_t(start_time);
 
 	res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \
 %lu %lu %lu %lu %lu %ld %ld %ld %ld %d %ld %llu %lu %ld %lu %lu %lu %lu %lu \

--- linux-2.6.9-rc1-mm1/include/linux/acct.h	2004-08-17 21:38:55.000000000 +0200
+++ linux-2.6.9-rc1-mm1-uf/include/linux/acct.h	2004-08-30 21:51:36.000000000 +0200
@@ -172,17 +172,24 @@ static inline u32 jiffies_to_AHZ(unsigne
 #endif
 }
 
-static inline u64 jiffies_64_to_AHZ(u64 x)
+static inline u64 nsec_to_AHZ(u64 x)
 {
-#if (TICK_NSEC % (NSEC_PER_SEC / AHZ)) == 0
-#if HZ != AHZ
-	do_div(x, HZ / AHZ);
-#endif
-#else
-	x *= TICK_NSEC;
+#if (NSEC_PER_SEC % AHZ) == 0
 	do_div(x, (NSEC_PER_SEC / AHZ));
+#elif (AHZ % 512) == 0
+	x *= AHZ/512;
+	do_div(x, (NSEC_PER_SEC / 512));
+#else
+	/* 
+         * max relative error 5.7e-8 (1.8s per year) for AHZ <= 1024,
+         * overflow after 64.99 years.
+         * exact for AHZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ...
+         */
+	x *= 9;
+	do_div(x, (unsigned long)((9ull * NSEC_PER_SEC + (AHZ/2))
+	                          / AHZ));
 #endif
-       return x;
+	return x;
 }
 
 #endif  /* __KERNEL */

--- linux-2.6.9-rc1-mm1/include/linux/sched.h	2004-08-30 21:51:06.000000000 +0200
+++ linux-2.6.9-rc1-mm1-uf/include/linux/sched.h	2004-08-30 21:52:34.000000000 +0200
@@ -515,7 +515,7 @@ struct task_struct {
 	struct timer_list real_timer;
 	unsigned long utime, stime;
 	unsigned long nvcsw, nivcsw; /* context switch counts */
-	u64 start_time;
+	struct timespec start_time;
 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
 	unsigned long min_flt, maj_flt;
 /* process credentials */

--- linux-2.6.9-rc1-mm1/include/linux/times.h	2004-08-17 00:13:35.000000000 +0200
+++ linux-2.6.9-rc1-mm1-uf/include/linux/times.h	2004-08-30 21:51:36.000000000 +0200
@@ -55,6 +55,26 @@ static inline u64 jiffies_64_to_clock_t(
 }
 #endif
 
+static inline u64 nsec_to_clock_t(u64 x)
+{
+#if (NSEC_PER_SEC % USER_HZ) == 0
+	do_div(x, (NSEC_PER_SEC / USER_HZ));
+#elif (USER_HZ % 512) == 0
+	x *= USER_HZ/512;
+	do_div(x, (NSEC_PER_SEC / 512));
+#else
+	/* 
+         * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024,
+         * overflow after 64.99 years.
+         * exact for USER_HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ...
+         */
+	x *= 9;
+	do_div(x, (unsigned long)((9ull * NSEC_PER_SEC + (USER_HZ/2))
+	                          / USER_HZ));
+#endif
+	return x;
+}
+
 struct tms {
 	clock_t tms_utime;
 	clock_t tms_stime;

--- linux-2.6.9-rc1-mm1/kernel/acct.c	2004-08-17 21:38:55.000000000 +0200
+++ linux-2.6.9-rc1-mm1-uf/kernel/acct.c	2004-08-30 21:51:36.000000000 +0200
@@ -384,6 +384,8 @@ static void do_acct_process(long exitcod
 	unsigned long vsize;
 	unsigned long flim;
 	u64 elapsed;
+	u64 run_time;
+	struct timespec uptime;
 
 	/*
 	 * First check to see if there is enough free_space to continue
@@ -401,7 +403,13 @@ static void do_acct_process(long exitcod
 	ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER;
 	strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
 
-	elapsed = jiffies_64_to_AHZ(get_jiffies_64() - current->start_time);
+	/* calculate run_time in nsec*/
+	do_posix_clock_monotonic_gettime(&uptime);
+	run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec;	
+	run_time -= (u64)current->start_time.tv_sec*NSEC_PER_SEC 
+					+ current->start_time.tv_nsec;
+	/* convert nsec -> AHZ */
+	elapsed = nsec_to_AHZ(run_time);
 #if ACCT_VERSION==3
 	ac.ac_etime = encode_float(elapsed);
 #else

--- linux-2.6.9-rc1-mm1/kernel/fork.c	2004-08-30 21:51:07.000000000 +0200
+++ linux-2.6.9-rc1-mm1-uf/kernel/fork.c	2004-08-30 21:51:36.000000000 +0200
@@ -996,7 +996,7 @@ static task_t *copy_process(unsigned lon
 
 	p->utime = p->stime = 0;
 	p->lock_depth = -1;		/* -1 = no lock */
-	p->start_time = get_jiffies_64();
+	do_posix_clock_monotonic_gettime(&p->start_time);
 	p->security = NULL;
 	p->io_context = NULL;
 	p->io_wait = NULL;

--- linux-2.6.9-rc1-mm1/mm/oom_kill.c	2004-08-30 21:51:07.000000000 +0200
+++ linux-2.6.9-rc1-mm1-uf/mm/oom_kill.c	2004-08-30 21:51:36.000000000 +0200
@@ -26,6 +26,7 @@
 /**
  * oom_badness - calculate a numeric value for how bad this task has been
  * @p: task struct of which task we should calculate
+ * @uptime: current uptime in seconds
  *
  * The formula used is relatively simple and documented inline in the
  * function. The main rationale is that we want to select a good task
@@ -41,7 +42,7 @@
  *    of least surprise ... (be careful when you change it)
  */
 
-static unsigned long badness(struct task_struct *p)
+static unsigned long badness(struct task_struct *p, unsigned long uptime)
 {
 	unsigned long points, cpu_time, run_time, s;
 
@@ -56,12 +57,16 @@ static unsigned long badness(struct task
 	points = p->mm->total_vm;
 
 	/*
-	 * CPU time is in seconds and run time is in minutes. There is no
-	 * particular reason for this other than that it turned out to work
-	 * very well in practice.
+	 * CPU time is in tens of seconds and run time is in thousands
+         * of seconds. There is no particular reason for this other than
+         * that it turned out to work very well in practice.
 	 */
 	cpu_time = (p->utime + p->stime) >> (SHIFT_HZ + 3);
-	run_time = (get_jiffies_64() - p->start_time) >> (SHIFT_HZ + 10);
+
+	if (uptime >= p->start_time.tv_sec)
+		run_time = (uptime - p->start_time.tv_sec) >> 10;
+	else
+		run_time = 0;
 
 	s = int_sqrt(cpu_time);
 	if (s)
@@ -111,10 +116,12 @@ static struct task_struct * select_bad_p
 	unsigned long maxpoints = 0;
 	struct task_struct *g, *p;
 	struct task_struct *chosen = NULL;
+	struct timespec uptime;
 
+	do_posix_clock_monotonic_gettime(&uptime);
 	do_each_thread(g, p)
 		if (p->pid) {
-			unsigned long points = badness(p);
+			unsigned long points = badness(p, uptime.tv_sec);
 			if (points > maxpoints) {
 				chosen = p;
 				maxpoints = points;

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH] Re: boot time, process start time, and NOW time
  2004-08-30 23:00                           ` Tim Schmielau
@ 2004-08-30 23:38                             ` john stultz
  2004-08-31  0:37                               ` Albert Cahalan
  2004-08-31  0:45                               ` Tim Schmielau
  2004-08-31  1:34                             ` john stultz
  1 sibling, 2 replies; 57+ messages in thread
From: john stultz @ 2004-08-30 23:38 UTC (permalink / raw)
  To: Tim Schmielau
  Cc: george anzinger, Andrew Morton, Petri Kaukasoina, albert,
	hirofumi, lkml, voland, nicolas.george, david+powerix

On Mon, 2004-08-30 at 16:00, Tim Schmielau wrote:
> So I think we should not apply the patch, but rather back out the patch 
> that rebased uptime on a ntp-corrected timesource.
> There are too many statistics that are still based on jiffies or clock 
> ticks, and we cannot immediately change that without a large rework
> (although this might eventually happen according to John's proposal).
> And mixing two different timesources just won't work, regardles where we 
> draw the borderline between them.
> 
> George, please excuse my lack of understanding. What again where the
> precise reasons to have an ntp-corrected uptime?

If I remember correctly, folks were complaining that boot time was
drifting due to the same issue. 

So yes, a full rework of the time subsystem is needed, but it alone
won't fix all of these problems, it's just the first step. Once we have a
sane time base that isn't dependent on regular timer ticks, we then need
to make the timer subsystem and every other subsystem to use that time
base instead of Jiffies/HZ. 

This isn't going to happen instantly by any means. I'm trying to get the
time of day rework finished as soon as I can, but I've got the day job
to do as well. In the meantime, we can staple gun any user visible
exported HZ/jiffies values so they are accurate (using ACTHZ or
gettimeofday), and also look into changing HZ to a less error-ful
value.  HZ=1001 has been suggested and looks quite promising (although
/net/sched/estimator.c wants a power of 4).

thanks
-john

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH] Re: boot time, process start time, and NOW time
  2004-08-30 23:38                             ` john stultz
@ 2004-08-31  0:37                               ` Albert Cahalan
  2004-08-31  0:49                                 ` Tim Schmielau
  2004-08-31  0:45                               ` Tim Schmielau
  1 sibling, 1 reply; 57+ messages in thread
From: Albert Cahalan @ 2004-08-31  0:37 UTC (permalink / raw)
  To: john stultz
  Cc: Tim Schmielau, george anzinger, Andrew Morton OSDL,
	Petri Kaukasoina, albert, hirofumi, lkml, voland, nicolas.george,
	david+powerix

On Mon, 2004-08-30 at 19:38, john stultz wrote:

> This isn't going to happen instantly by any means. I'm trying to
> get the time of day rework finished as soon as I can, but I've
> got the day job to do as well. In the mean time, we can staple gun
> any user visible exported HZ/jiffies values so they are accurate
> (using ACTHZ or gettimeofday), and also look into changing HZ to a
> less error-ful value.  HZ=1001 has been suggested and looks quite  
> promising (although /net/schec/estimator.c wants a power of 4).

Well, pick something else. Here's a list of choices with
error under 0.0025%, in the 240..1300 range, that can be
evenly divided by four.

Dropping back a bit would be good, to better tolerate        
systems with firmware that steals enough time to cause
lost clock ticks.

I like 400, 488, and 556. 864 and 1112 are decent too.
It might be useful having 400 as a multiple of 100.

I don't think /net/sched/estimator.c needs such
great accuracy. If that were sacrificed, one could
do much better with HZ at 363, 519, or 627.

%error       HZerror     PIT#    HZ   actual
-0.00217900  -0.007234   3594   332   331.992766
-0.00217900  -0.014469   1797   664   663.985531
-0.00217900  -0.021703   1198   996   995.978297
-0.00184378  -0.014676   1499   796   795.985324
-0.00150855  -0.004586   3925   304   303.995414
-0.00150855  -0.005732   3140   380   379.994268
-0.00150855  -0.006034   2983   400   399.993966
-0.00150855  -0.009474   1900   628   627.990526
-0.00150855  -0.011465   1570   760   759.988535
-0.00150855  -0.018947    950  1256  1255.981053
-0.00083809  -0.002581   3874   308   307.997419
-0.00083809  -0.003051   3278   364   363.996949
-0.00083809  -0.004794   2086   572   571.995206
-0.00083809  -0.004995   2002   596   595.995005
-0.00083809  -0.005163   1937   616   615.994837
-0.00083809  -0.006101   1639   728   727.993899
-0.00083809  -0.009588   1043  1144  1143.990412
-0.00083809  -0.009990   1001  1192  1191.990010
-0.00050285  -0.006376    941  1268  1267.993624
-0.00016762  -0.000483   4143   288   287.999517
-0.00016762  -0.000724   2762   432   431.999276
-0.00016762  -0.001448   1381   864   863.998552
+0.00050286  +0.001488   4031   296   296.001488
+0.00050286  +0.002796   2146   556   556.002796
+0.00050286  +0.005592   1073  1112  1112.005592
+0.00150859  +0.018163    991  1204  1204.018163
+0.00184384  +0.004499   4890   244   244.004499
+0.00184384  +0.008998   2445   488   488.008998
+0.00184384  +0.012022   1830   652   652.012022
+0.00184384  +0.013497   1630   732   732.013497
+0.00184384  +0.022495    978  1220  1220.022495
+0.00217909  +0.027282    953  1252  1252.027282



^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH] Re: boot time, process start time, and NOW time
  2004-08-31  0:37                               ` Albert Cahalan
@ 2004-08-31  0:49                                 ` Tim Schmielau
  0 siblings, 0 replies; 57+ messages in thread
From: Tim Schmielau @ 2004-08-31  0:49 UTC (permalink / raw)
  To: Albert Cahalan
  Cc: john stultz, george anzinger, Andrew Morton OSDL,
	Petri Kaukasoina, hirofumi, lkml, voland, nicolas.george,
	david+powerix

On Tue, 30 Aug 2004, Albert Cahalan wrote:

> On Mon, 2004-08-30 at 19:38, john stultz wrote:
> 
> > This isn't going to happen instantly by any means. I'm trying to
> > get the time of day rework finished as soon as I can, but I've
> > got the day job to do as well. In the mean time, we can staple gun
> > any user visible exported HZ/jiffies values so they are accurate
> > (using ACTHZ or gettimeofday), and also look into changing HZ to a
> > less error-ful value.  HZ=1001 has been suggested and looks quite  
> > promising (although /net/schec/estimator.c wants a power of 4).
> 
> Well, pick something else. Here's a list of choices with
> error under 0.0025%, in the 240..1300 range, that can be
> evenly divided by four.

I don't think this is the main issue. We are not talking about slight 
rounding errors that might be (and in many places are) accounted for,
but with real inconsistencies that arise whenever the system clock is
adjusted. This e.g. was the source of the 15 minutes error that I saw in 
my recent testing.

Tim

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH] Re: boot time, process start time, and NOW time
  2004-08-30 23:38                             ` john stultz
  2004-08-31  0:37                               ` Albert Cahalan
@ 2004-08-31  0:45                               ` Tim Schmielau
  2004-08-31  1:23                                 ` john stultz
  1 sibling, 1 reply; 57+ messages in thread
From: Tim Schmielau @ 2004-08-31  0:45 UTC (permalink / raw)
  To: john stultz
  Cc: george anzinger, Andrew Morton, Petri Kaukasoina, albert,
	hirofumi, lkml, voland, nicolas.george, david+powerix

On Mon, 30 Aug 2004, john stultz wrote:

> On Mon, 2004-08-30 at 16:00, Tim Schmielau wrote:
> > George, please excuse my lack of understanding. What again where the
> > precise reasons to have an ntp-corrected uptime?
> 
> If I remember correctly, folks were complaining that boot time was
> drifting due to the same issue. 

Yes, I remember this was discussed at the same time. However, I don't see
that boot time display actually is connected to uptime. Boot time is
available in /proc/stat as seconds since the beginning of the epoch. It's
now derived from wall_to_monotonic, thus should be reasonably constant
(and I'm not aware of recent reports that say otherwise).

If some program thinks it has to calculate boot time from 
gettimeofday() - uptime, and it drifts, so what?
It's got a better way to do that. But there are no better ways to recover 
from inconsistent statistics in /proc.

Tim

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH] Re: boot time, process start time, and NOW time
  2004-08-31  0:45                               ` Tim Schmielau
@ 2004-08-31  1:23                                 ` john stultz
  0 siblings, 0 replies; 57+ messages in thread
From: john stultz @ 2004-08-31  1:23 UTC (permalink / raw)
  To: Tim Schmielau
  Cc: george anzinger, Andrew Morton, Petri Kaukasoina, albert,
	hirofumi, lkml, voland, nicolas.george, david+powerix

On Mon, 2004-08-30 at 17:45, Tim Schmielau wrote:
> On Mon, 30 Aug 2004, john stultz wrote:
> 
> > On Mon, 2004-08-30 at 16:00, Tim Schmielau wrote:
> > > George, please excuse my lack of understanding. What again where the
> > > precise reasons to have an ntp-corrected uptime?
> > 
> > If I remember correctly, folks were complaining that boot time was
> > drifting due to the same issue. 
> 
> Yes, I remember this was discussed at the same time. However, I don't see
> that boot time display actually is connected to uptime. Boot time is
> available in /proc/stat as seconds sice the beginning of the epoch. It's
> now derived from wall_to_monotonic, thus should be reasonable constant
> (and I'm not aware of recent reports that say otherwise).
> 
> If some program thinks it has to calculate boot time from 
> gettimeofday() - uptime, and it drifts, so what?
> It's got a better way to do that. 

Hmm. Maybe I'm confusing problems. I need to re-read my mail archive. In
the meantime, maybe George could remind us why exactly the patch was
needed.

thanks
-john


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH] Re: boot time, process start time, and NOW time
  2004-08-30 23:00                           ` Tim Schmielau
  2004-08-30 23:38                             ` john stultz
@ 2004-08-31  1:34                             ` john stultz
  2004-08-31  6:07                               ` Tim Schmielau
  1 sibling, 1 reply; 57+ messages in thread
From: john stultz @ 2004-08-31  1:34 UTC (permalink / raw)
  To: Tim Schmielau
  Cc: george anzinger, Andrew Morton, Petri Kaukasoina, albert,
	hirofumi, lkml, voland, nicolas.george, david+powerix

On Mon, 2004-08-30 at 16:00, Tim Schmielau wrote:
> George, please excuse my lack of understanding. What again where the
> precise reasons to have an ntp-corrected uptime?

Ah, here's the thread with the first mention of it that I could find.

http://www.uwsg.iu.edu/hypermail/linux/kernel/0306.1/1471.html

thanks
-john


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH] Re: boot time, process start time, and NOW time
  2004-08-31  1:34                             ` john stultz
@ 2004-08-31  6:07                               ` Tim Schmielau
  2004-08-31 19:27                                 ` George Anzinger
  2004-09-01 19:14                                 ` OGAWA Hirofumi
  0 siblings, 2 replies; 57+ messages in thread
From: Tim Schmielau @ 2004-08-31  6:07 UTC (permalink / raw)
  To: john stultz
  Cc: george anzinger, Andrew Morton, Petri Kaukasoina, albert,
	hirofumi, lkml, voland, nicolas.george, david+powerix

On Mon, 30 Aug 2004, john stultz wrote:

> On Mon, 2004-08-30 at 16:00, Tim Schmielau wrote:
> > George, please excuse my lack of understanding. What again where the
> > precise reasons to have an ntp-corrected uptime?
> 
> Ah, here's the thread with the first mention of it that I could find.
> 
> http://www.uwsg.iu.edu/hypermail/linux/kernel/0306.1/1471.html

Ah, it seems George indeed did the patch because of these problems:

  http://www.uwsg.iu.edu/hypermail/linux/kernel/0306.1/1641.html

However, the actual reason was just missing wall_to_monotonic 
initializations:

  http://www.uwsg.iu.edu/hypermail/linux/kernel/0306.2/1330.html

This was fixed in mainline:

  http://linus.bkbits.net:8080/linux-2.5/cset%403f0e60dcpIosK3b5_uJ-aD9Mare17w

Tim

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH] Re: boot time, process start time, and NOW time
  2004-08-31  6:07                               ` Tim Schmielau
@ 2004-08-31 19:27                                 ` George Anzinger
  2004-08-31 20:56                                   ` john stultz
  2004-09-01 19:14                                 ` OGAWA Hirofumi
  1 sibling, 1 reply; 57+ messages in thread
From: George Anzinger @ 2004-08-31 19:27 UTC (permalink / raw)
  To: Tim Schmielau
  Cc: john stultz, Andrew Morton, Petri Kaukasoina, albert, hirofumi,
	lkml, voland, nicolas.george, david+powerix

Tim Schmielau wrote:
> On Mon, 30 Aug 2004, john stultz wrote:
> 
> 
>>On Mon, 2004-08-30 at 16:00, Tim Schmielau wrote:
>>
>>>George, please excuse my lack of understanding. What again where the
>>>precise reasons to have an ntp-corrected uptime?
>>
>>Ah, here's the thread with the first mention of it that I could find.
>>
>>http://www.uwsg.iu.edu/hypermail/linux/kernel/0306.1/1471.html

As I recall the problem was that jiffies since boot was being converted to get 
uptime based on 1/HZ = 1 jiffie.  Since it is really not quite that, there was an 
error.  Using clock_monotonic seemed like the right answer as it eliminated the 
error AND made the result consistent with get_clock(CLOCK_MONOTONIC,..).

The alternate answer is, of course, to directly convert the elapsed jiffies. 
The main problem with this is that this can be a BIG number and, therefore, the 
math needs to be done carefully.  And, of course, it is inconsistent with 
get_clock(), but that is a new interface...

George
> 
> 
> Ah, it seems George indeed did the patch because of these problems:
> 
>   http://www.uwsg.iu.edu/hypermail/linux/kernel/0306.1/1641.html
> 
> However, the actual reason were just missing wall_to_monotonic 
> initializations:
> 
>   http://www.uwsg.iu.edu/hypermail/linux/kernel/0306.2/1330.html
> 
> This was fixed in mainline:
> 
>   http://linus.bkbits.net:8080/linux-2.5/cset%403f0e60dcpIosK3b5_uJ-aD9Mare17w
> 
> Tim
> 

-- 
George Anzinger   george@mvista.com
High-res-timers:  http://sourceforge.net/projects/high-res-timers/
Preemption patch: http://www.kernel.org/pub/linux/kernel/people/rml


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH] Re: boot time, process start time, and NOW time
  2004-08-31 19:27                                 ` George Anzinger
@ 2004-08-31 20:56                                   ` john stultz
  2004-08-31 21:10                                     ` David Ford
  2004-09-02 20:39                                     ` George Anzinger
  0 siblings, 2 replies; 57+ messages in thread
From: john stultz @ 2004-08-31 20:56 UTC (permalink / raw)
  To: george anzinger
  Cc: Tim Schmielau, Andrew Morton, Petri Kaukasoina, albert, hirofumi,
	lkml, voland, nicolas.george, david+powerix

On Tue, 2004-08-31 at 12:27, George Anzinger wrote:
> Tim Schmielau wrote:
> > On Mon, 30 Aug 2004, john stultz wrote:
> >>On Mon, 2004-08-30 at 16:00, Tim Schmielau wrote:
> >>
> >>>George, please excuse my lack of understanding. What again where the
> >>>precise reasons to have an ntp-corrected uptime?
> >>
> >>Ah, here's the thread with the first mention of it that I could find.
> >>
> >>http://www.uwsg.iu.edu/hypermail/linux/kernel/0306.1/1471.html
> 
> As I recall the problem was that jiffies since boot was being converted to get 
> uptime base on 1/HZ = 1 jiffie.  Since it is really not quite that, there was an 
> error.  Using clock_monotonic seemed like the right answer as it eliminated the 
> error AND made the result consistant with get_clock(CLOCK_MONOTONIC,..).
> 
> The alternate answer is, of course, to directly convert the elapsed jiffies. 
> The main problem with this is that this can be a BIG number and, therefor, the 
> math needs to be carefully.  And, of course, it is inconsistant with 
> get_clock(), but that is a new interface...

Hmmm. Well, I may be starting to lean in Tim's direction of pulling the
clock_monotonic based uptime and going back to the jiffies based uptime.
At least until we can make all the /proc/ output consistent. 

I just worry that it actually fixed a problem for someone, and backing
it out would just reopen that.

Thoughts?

-john



^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH] Re: boot time, process start time, and NOW time
  2004-08-31 20:56                                   ` john stultz
@ 2004-08-31 21:10                                     ` David Ford
  2004-09-02 20:39                                     ` George Anzinger
  1 sibling, 0 replies; 57+ messages in thread
From: David Ford @ 2004-08-31 21:10 UTC (permalink / raw)
  To: john stultz
  Cc: george anzinger, Tim Schmielau, Andrew Morton, Petri Kaukasoina,
	albert, hirofumi, lkml, voland, nicolas.george

[-- Attachment #1: Type: text/plain, Size: 700 bytes --]

>
>
>Hmmm. Well, I may be starting to lean in Tim's direction of pulling the
>clock_monotonic based uptime and going back to the jiffies based uptime.
>Atleast until we can make all the /proc/ output consistent. 
>
>I just worry that it actually fixed a problem for someone, and backing
>it out would just reopen that.
>
>Thoughts?
>
>-john
>

I would rather deal with some aesthetic breakage and get it fixed.  
Right now, having been mildly affected by differing times, and having 
seen the lengthy discussion about it, it feels like a huge octopus of 
timelines in the kernel.  Each one of them different, some just a 
little, some significantly - especially after suspend/resume events.

-david


[-- Attachment #2: david+challenge-response.vcf --]
[-- Type: text/x-vcard, Size: 183 bytes --]

begin:vcard
fn:David Ford
n:Ford;David
email;internet:david@blue-labs.org
title:Industrial Geek
tel;home:Ask please
tel;cell:(203) 650-3611
x-mozilla-html:TRUE
version:2.1
end:vcard


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH] Re: boot time, process start time, and NOW time
  2004-08-31 20:56                                   ` john stultz
  2004-08-31 21:10                                     ` David Ford
@ 2004-09-02 20:39                                     ` George Anzinger
  1 sibling, 0 replies; 57+ messages in thread
From: George Anzinger @ 2004-09-02 20:39 UTC (permalink / raw)
  To: john stultz
  Cc: Tim Schmielau, Andrew Morton, Petri Kaukasoina, albert, hirofumi,
	lkml, voland, nicolas.george, david+powerix

john stultz wrote:
> On Tue, 2004-08-31 at 12:27, George Anzinger wrote:
> 
>>Tim Schmielau wrote:
>>
>>>On Mon, 30 Aug 2004, john stultz wrote:
>>>
>>>>On Mon, 2004-08-30 at 16:00, Tim Schmielau wrote:
>>>>
>>>>
>>>>>George, please excuse my lack of understanding. What again where the
>>>>>precise reasons to have an ntp-corrected uptime?
>>>>
>>>>Ah, here's the thread with the first mention of it that I could find.
>>>>
>>>>http://www.uwsg.iu.edu/hypermail/linux/kernel/0306.1/1471.html
>>
>>As I recall the problem was that jiffies since boot was being converted to get 
>>uptime base on 1/HZ = 1 jiffie.  Since it is really not quite that, there was an 
>>error.  Using clock_monotonic seemed like the right answer as it eliminated the 
>>error AND made the result consistant with get_clock(CLOCK_MONOTONIC,..).
>>
>>The alternate answer is, of course, to directly convert the elapsed jiffies. 
>>The main problem with this is that this can be a BIG number and, therefor, the 
>>math needs to be carefully.  And, of course, it is inconsistant with 
>>get_clock(), but that is a new interface...
> 
> 
> Hmmm. Well, I may be starting to lean in Tim's direction of pulling the
> clock_monotonic based uptime and going back to the jiffies based uptime.
> Atleast until we can make all the /proc/ output consistent. 
> 
> I just worry that it actually fixed a problem for someone, and backing
> it out would just reopen that.
> 
> Thoughts?

Well, it was done in reaction to some complaint.  I don't, at this point, recall 
who or why.  If it is done with the correct values (i.e. NOT 1/HZ, but what the 
wall clock uses) I think it will not reopen that complaint.

-- 
George Anzinger   george@mvista.com
High-res-timers:  http://sourceforge.net/projects/high-res-timers/
Preemption patch: http://www.kernel.org/pub/linux/kernel/people/rml


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH] Re: boot time, process start time, and NOW time
  2004-08-31  6:07                               ` Tim Schmielau
  2004-08-31 19:27                                 ` George Anzinger
@ 2004-09-01 19:14                                 ` OGAWA Hirofumi
  2004-09-02 20:58                                   ` George Anzinger
  1 sibling, 1 reply; 57+ messages in thread
From: OGAWA Hirofumi @ 2004-09-01 19:14 UTC (permalink / raw)
  To: Tim Schmielau
  Cc: john stultz, george anzinger, Andrew Morton, Petri Kaukasoina,
	albert, lkml, voland, nicolas.george, david+powerix

Tim Schmielau <tim@physik3.uni-rostock.de> writes:

> However, the actual reason were just missing wall_to_monotonic 
> initializations:
> 
>   http://www.uwsg.iu.edu/hypermail/linux/kernel/0306.2/1330.html

Sorry for a question that may not be related to this thread.


         xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);

#include <stdio.h>

#define HZ 1000
#define NSEC_PER_SEC (1000000000L)
#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ))

/*
 * Print the value that the xtime.tv_nsec expression quoted above yields
 * for the HZ, NSEC_PER_SEC and INITIAL_JIFFIES definitions given above.
 */
int main()
{
	/* NOTE(review): the product has type unsigned long but is printed with %ld */
	printf("%ld\n", (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ));
	return 0;
}

hirofumi@devron (a)[1006]$ ./c
296000000

xtime.tv_nsec was not 0. Is this a bug?
-- 
OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH] Re: boot time, process start time, and NOW time
  2004-09-01 19:14                                 ` OGAWA Hirofumi
@ 2004-09-02 20:58                                   ` George Anzinger
  2004-09-02 21:38                                     ` OGAWA Hirofumi
  0 siblings, 1 reply; 57+ messages in thread
From: George Anzinger @ 2004-09-02 20:58 UTC (permalink / raw)
  To: OGAWA Hirofumi
  Cc: Tim Schmielau, john stultz, Andrew Morton, Petri Kaukasoina,
	albert, lkml, voland, nicolas.george, david+powerix

OGAWA Hirofumi wrote:
> Tim Schmielau <tim@physik3.uni-rostock.de> writes:
> 
> 
>>However, the actual reason were just missing wall_to_monotonic 
>>initializations:
>>
>>  http://www.uwsg.iu.edu/hypermail/linux/kernel/0306.2/1330.html
> 
> 
> Sorry for may not be related question in this thread.
> 
> 
>          xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
> 
> #include <stdio.h>
> 
> #define HZ 1000
> #define NSEC_PER_SEC (1000000000L)
> #define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ))
> 
> int main()
> {
> 	printf("%ld\n", (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ));
> 	return 0;
> }
> 
> hirofumi@devron (a)[1006]$ ./c
> 296000000
> 
> xtime.tv_nsec was not 0. Is this bug?

Well, my machine says the result should be 996000000, so something is wrong with 
your or my math.  As to if the initial jiffie value should be a multiple of HZ, 
I don't see why.  I think it is several counts off of this value when the system 
wall clock is set in any case.

-- 
George Anzinger   george@mvista.com
High-res-timers:  http://sourceforge.net/projects/high-res-timers/
Preemption patch: http://www.kernel.org/pub/linux/kernel/people/rml


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH] Re: boot time, process start time, and NOW time
  2004-09-02 20:58                                   ` George Anzinger
@ 2004-09-02 21:38                                     ` OGAWA Hirofumi
  2004-09-03  0:59                                       ` George Anzinger
  2004-09-03  7:15                                       ` Tim Schmielau
  0 siblings, 2 replies; 57+ messages in thread
From: OGAWA Hirofumi @ 2004-09-02 21:38 UTC (permalink / raw)
  To: george
  Cc: Tim Schmielau, john stultz, Andrew Morton, Petri Kaukasoina,
	albert, lkml, voland, nicolas.george, david+powerix

George Anzinger <george@mvista.com> writes:

> OGAWA Hirofumi wrote:
> Well, my machine says the result should be 996000000, so something is
> wrong with your or my math.

Hmm.. I don't know why. I'm using an x86 CPU machine.

> As to if the initial jiffie value should
> be a multiple of HZ, I don't see why.  I think it is several counts
> off of this value when the system wall clock is set in any case.

Ah, sorry for the quite insufficient explanation.

Since INITIAL_JIFFIES is -5 minutes, I thought tv.tv_nsec should be 0.
The cause of this is

     INITIAL_JIFFIES % HZ (4294667296 % 1000)

because INITIAL_JIFFIES is unsigned long.

So, I guessed this is not intentional.
Looks like this should be (-300*1000) % 1000.

What do you think of this?
-- 
OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH] Re: boot time, process start time, and NOW time
  2004-09-02 21:38                                     ` OGAWA Hirofumi
@ 2004-09-03  0:59                                       ` George Anzinger
  2004-09-03  3:35                                         ` OGAWA Hirofumi
  2004-09-03  7:15                                       ` Tim Schmielau
  1 sibling, 1 reply; 57+ messages in thread
From: George Anzinger @ 2004-09-03  0:59 UTC (permalink / raw)
  To: OGAWA Hirofumi
  Cc: Tim Schmielau, john stultz, Andrew Morton, Petri Kaukasoina,
	albert, lkml, voland, nicolas.george, david+powerix

OGAWA Hirofumi wrote:
> George Anzinger <george@mvista.com> writes:
> 
> 
>>OGAWA Hirofumi wrote:
>>Well, my machine says the result should be 996000000, so something is
>>wrong with your or my math.
> 
> 
> Hmm.. I don't know why. I'm using x86 cpu machine.
> 
> 
>>As to if the initial jiffie value should
>>be a multiple of HZ, I don't see why.  I think it is several counts
>>off of this value when the system wall clock is set in any case.
> 
> 
> Ah, sorry for quite insufficiency explanation.
> 
> Since INITIAL_JIFFIES is -5 minutes, so I though tv.tv_nsec should be 0.
> The cause of this is
> 
>      INITIAL_JIFFIES % HZ (4294667296 % 1000)
> 
> because INITIAL_JIFFIES is unsigned long.
> 
> So, I guessed this is not intention.
> Looks like this should be (-300*1000) % 1000.

What "should be"?  Are you referring to some real code or some thoughts you had? 
   I am not aware of the kernel converting INITIAL_JIFFIES to time ....
> 
> What do you think of this?

The actual initial value of jiffies is not important.  The reason this value was 
chosen was to catch problems that occur when the unsigned value rolls over to 
zero (and several were found and fixed).

-- 
George Anzinger   george@mvista.com
High-res-timers:  http://sourceforge.net/projects/high-res-timers/
Preemption patch: http://www.kernel.org/pub/linux/kernel/people/rml


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH] Re: boot time, process start time, and NOW time
  2004-09-03  0:59                                       ` George Anzinger
@ 2004-09-03  3:35                                         ` OGAWA Hirofumi
  2004-09-03  7:31                                           ` George Anzinger
  0 siblings, 1 reply; 57+ messages in thread
From: OGAWA Hirofumi @ 2004-09-03  3:35 UTC (permalink / raw)
  To: george
  Cc: Tim Schmielau, john stultz, Andrew Morton, Petri Kaukasoina,
	albert, lkml, voland, nicolas.george, david+powerix

George Anzinger <george@mvista.com> writes:

> > The cause of this is
> >      INITIAL_JIFFIES % HZ (4294667296 % 1000)
> > because INITIAL_JIFFIES is unsigned long.
> > So, I guessed this is not intention.
> > Looks like this should be (-300*1000) % 1000.
> 
> What "should be"?

in time_init(), and hpet_time_init(),
        xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
should be
        xtime.tv_nsec = ((long)INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);

because
	(INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ)		== 296000000
and
	((long)INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ)	== 0
-- 
OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH] Re: boot time, process start time, and NOW time
  2004-09-03  3:35                                         ` OGAWA Hirofumi
@ 2004-09-03  7:31                                           ` George Anzinger
  2004-09-03  7:51                                             ` Tim Schmielau
  0 siblings, 1 reply; 57+ messages in thread
From: George Anzinger @ 2004-09-03  7:31 UTC (permalink / raw)
  To: OGAWA Hirofumi
  Cc: Tim Schmielau, john stultz, Andrew Morton, Petri Kaukasoina,
	albert, lkml, voland, nicolas.george, david+powerix

OGAWA Hirofumi wrote:
> George Anzinger <george@mvista.com> writes:
> 
> 
>>>The cause of this is
>>>     INITIAL_JIFFIES % HZ (4294667296 % 1000)
>>>because INITIAL_JIFFIES is unsigned long.
>>>So, I guessed this is not intention.
>>>Looks like this should be (-300*1000) % 1000.
>>
>>What "should be"?
> 
> 
> in time_init(), and hpet_time_init(),
>         xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
> should be
>         xtime.tv_nsec = ((long)INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
> 
> because
> 	(INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ)		== 296000000
> and
> 	((long)INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ)	== 0

It is possible that I am missing something here, but I just don't see that it 
matters.  If the wall clock is set, jiffies is not changed, so there is no implied 
or actual alignment between these two.

Is there a calculation in the system that would differ if this were changed?

-- 
George Anzinger   george@mvista.com
High-res-timers:  http://sourceforge.net/projects/high-res-timers/
Preemption patch: http://www.kernel.org/pub/linux/kernel/people/rml


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH] Re: boot time, process start time, and NOW time
  2004-09-03  7:31                                           ` George Anzinger
@ 2004-09-03  7:51                                             ` Tim Schmielau
  0 siblings, 0 replies; 57+ messages in thread
From: Tim Schmielau @ 2004-09-03  7:51 UTC (permalink / raw)
  To: George Anzinger
  Cc: OGAWA Hirofumi, john stultz, Andrew Morton, Petri Kaukasoina,
	albert, lkml, voland, nicolas.george, david+powerix

On Fri, 3 Sep 2004, George Anzinger wrote:

> OGAWA Hirofumi wrote:
> > in time_init(), and hpet_time_init(),
> >         xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
> > should be
> >         xtime.tv_nsec = ((long)INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
> > 
> > because
> > 	(INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ)		== 296000000
> > and
> > 	((long)INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ)	== 0
> 
> It is possible that I am missing something here, but I just don't see that it 
> matters.  If the wall clock is set jiffies is not changed so there is no implied 
> or actual alignment between these two.
> 
> Is there a calculation in the system that would differ if this were changed?

Yep, I also think it _should_ not matter at all. That's why I suggested 
setting it to zero, but maybe we just shouldn't touch it..

Tim

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH] Re: boot time, process start time, and NOW time
  2004-09-02 21:38                                     ` OGAWA Hirofumi
  2004-09-03  0:59                                       ` George Anzinger
@ 2004-09-03  7:15                                       ` Tim Schmielau
  1 sibling, 0 replies; 57+ messages in thread
From: Tim Schmielau @ 2004-09-03  7:15 UTC (permalink / raw)
  To: OGAWA Hirofumi
  Cc: george, john stultz, Andrew Morton, Petri Kaukasoina, albert,
	lkml, voland, nicolas.george, david+powerix

On Fri, 3 Sep 2004, OGAWA Hirofumi wrote:

> Since INITIAL_JIFFIES is -5 minutes, so I though tv.tv_nsec should be 0.
> The cause of this is
> 
>      INITIAL_JIFFIES % HZ (4294667296 % 1000)
> 
> because INITIAL_JIFFIES is unsigned long.
> 
> So, I guessed this is not intention.
> Looks like this should be (-300*1000) % 1000.

I think actually the whole xtime initialisation

        xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);

is bogus. INITIAL_JIFFIES should not be connected to any actual time, so 
this should really just be

        xtime.tv_nsec = 0;

I'll try to do a patch later on, and see what happens. Have to do some  
work-related things now.

Tim

^ permalink raw reply	[flat|nested] 57+ messages in thread

end of thread, other threads:[~2004-09-03  7:55 UTC | newest]

Thread overview: 57+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2004-06-22 23:57 boot time, process start time, and NOW time Albert Cahalan
2004-06-28 17:56 ` OGAWA Hirofumi
2004-08-16 19:41   ` Andrew Morton
2004-08-16 21:49     ` john stultz
2004-08-16 23:08     ` Tim Schmielau
2004-08-16 23:56       ` Tim Schmielau
2004-08-17  0:21       ` john stultz
2004-08-17  0:37         ` George Anzinger
2004-08-17  0:49           ` john stultz
2004-08-17  0:31       ` George Anzinger
2004-08-16 22:32         ` Albert Cahalan
2004-08-17  1:26           ` George Anzinger
2004-08-16 23:08             ` Albert Cahalan
2004-08-17  1:54               ` James Courtier-Dutton
2004-08-17  2:03                 ` Lee Revell
2004-08-17 20:52                 ` George Anzinger
2004-08-17  6:56         ` Tim Schmielau
2004-08-17 20:07           ` john stultz
2004-08-17 20:13             ` [RFC] New timeofday implementation proposal john stultz
2004-08-17 20:58               ` [RFC] New timeofday code john stultz
2004-09-01 23:16               ` [RFC] New timeofday implementation proposal Christoph Lameter
2004-08-16 23:24     ` boot time, process start time, and NOW time Albert Cahalan
2004-08-17 19:00       ` john stultz
2004-08-17 17:41         ` Albert Cahalan
2004-08-17 20:58           ` john stultz
2004-08-17 20:25     ` [PATCH] " Tim Schmielau
2004-08-17 22:24       ` George Anzinger
2004-08-17 22:37         ` john stultz
2004-08-17 23:07           ` Tim Schmielau
2004-08-18  0:11             ` john stultz
2004-08-17 22:19               ` Albert Cahalan
2004-08-18  1:09                 ` john stultz
2004-08-17 22:45                   ` Albert Cahalan
2004-08-18  7:42                   ` Tim Schmielau
2004-08-19 19:15                     ` Petri Kaukasoina
2004-08-26 11:04                       ` Andrew Morton
2004-08-26 12:07                         ` Tim Schmielau
2004-08-30 23:00                           ` Tim Schmielau
2004-08-30 23:38                             ` john stultz
2004-08-31  0:37                               ` Albert Cahalan
2004-08-31  0:49                                 ` Tim Schmielau
2004-08-31  0:45                               ` Tim Schmielau
2004-08-31  1:23                                 ` john stultz
2004-08-31  1:34                             ` john stultz
2004-08-31  6:07                               ` Tim Schmielau
2004-08-31 19:27                                 ` George Anzinger
2004-08-31 20:56                                   ` john stultz
2004-08-31 21:10                                     ` David Ford
2004-09-02 20:39                                     ` George Anzinger
2004-09-01 19:14                                 ` OGAWA Hirofumi
2004-09-02 20:58                                   ` George Anzinger
2004-09-02 21:38                                     ` OGAWA Hirofumi
2004-09-03  0:59                                       ` George Anzinger
2004-09-03  3:35                                         ` OGAWA Hirofumi
2004-09-03  7:31                                           ` George Anzinger
2004-09-03  7:51                                             ` Tim Schmielau
2004-09-03  7:15                                       ` Tim Schmielau

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox