From: Tim Schmielau Derive process start times from the posix_clock_monotonic notion of uptime instead of "jiffies", consistent with the earlier change to /proc/uptime itself. (http://linus.bkbits.net:8080/linux-2.5/cset@3ef4851dGg0fxX58R9Zv8SIq9fzNmQ?nav=index.html|src/.|src/fs|src/fs/proc|related/fs/proc/proc_misc.c) Process start times are reported to userspace in units of 1/USER_HZ since boot, thus applications such as procps need the value of "uptime" to convert them into absolute time. Currently "uptime" is derived from an ntp-corrected time base, but process start time is derived from the free-running "jiffies" counter. This results in inaccurate, drifting process start times as seen by the user, even if the exported number stays constant, because the user's notion of "jiffies" changes over time. It's John Stultz's patch anyways, which I only messed up a bit, but since people started trading signed-off lines on lkml: Signed-off-by: Tim Schmielau Signed-off-by: Andrew Morton --- 25-akpm/fs/proc/array.c | 6 +++++- 25-akpm/include/linux/acct.h | 23 +++++++++++++++-------- 25-akpm/include/linux/sched.h | 2 +- 25-akpm/include/linux/times.h | 20 ++++++++++++++++++++ 25-akpm/kernel/acct.c | 10 +++++++++- 25-akpm/kernel/fork.c | 2 +- 25-akpm/mm/oom_kill.c | 19 +++++++++++++------ 7 files changed, 64 insertions(+), 18 deletions(-) diff -puN fs/proc/array.c~fix-process-start-times fs/proc/array.c --- 25/fs/proc/array.c~fix-process-start-times 2004-10-05 01:47:53.828317688 -0700 +++ 25-akpm/fs/proc/array.c 2004-10-05 01:47:53.840315864 -0700 @@ -390,7 +390,11 @@ static int do_task_stat(struct task_stru nice = task_nice(task); /* Temporary variable needed for gcc-2.96 */ - start_time = jiffies_64_to_clock_t(task->start_time - INITIAL_JIFFIES); + /* convert timespec -> nsec*/ + start_time = (unsigned long long)task->start_time.tv_sec * NSEC_PER_SEC + + task->start_time.tv_nsec; + /* convert nsec -> ticks */ + start_time = nsec_to_clock_t(start_time); res = sprintf(buffer,"%d 
(%s) %c %d %d %d %d %d %lu %lu \ %lu %lu %lu %lu %lu %ld %ld %ld %ld %d %ld %llu %lu %ld %lu %lu %lu %lu %lu \ diff -puN include/linux/acct.h~fix-process-start-times include/linux/acct.h --- 25/include/linux/acct.h~fix-process-start-times 2004-10-05 01:47:53.830317384 -0700 +++ 25-akpm/include/linux/acct.h 2004-10-05 01:47:53.841315712 -0700 @@ -172,17 +172,24 @@ static inline u32 jiffies_to_AHZ(unsigne #endif } -static inline u64 jiffies_64_to_AHZ(u64 x) +static inline u64 nsec_to_AHZ(u64 x) { -#if (TICK_NSEC % (NSEC_PER_SEC / AHZ)) == 0 -#if HZ != AHZ - do_div(x, HZ / AHZ); -#endif -#else - x *= TICK_NSEC; +#if (NSEC_PER_SEC % AHZ) == 0 do_div(x, (NSEC_PER_SEC / AHZ)); +#elif (AHZ % 512) == 0 + x *= AHZ/512; + do_div(x, (NSEC_PER_SEC / 512)); +#else + /* + * max relative error 5.7e-8 (1.8s per year) for AHZ <= 1024, + * overflow after 64.99 years. + * exact for AHZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ... + */ + x *= 9; + do_div(x, (unsigned long)((9ull * NSEC_PER_SEC + (AHZ/2)) + / AHZ)); #endif - return x; + return x; } #endif /* __KERNEL */ diff -puN include/linux/sched.h~fix-process-start-times include/linux/sched.h --- 25/include/linux/sched.h~fix-process-start-times 2004-10-05 01:47:53.831317232 -0700 +++ 25-akpm/include/linux/sched.h 2004-10-05 01:47:53.842315560 -0700 @@ -607,7 +607,7 @@ struct task_struct { struct timer_list real_timer; unsigned long utime, stime; unsigned long nvcsw, nivcsw; /* context switch counts */ - u64 start_time; + struct timespec start_time; /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ unsigned long min_flt, maj_flt; /* process credentials */ diff -puN include/linux/times.h~fix-process-start-times include/linux/times.h --- 25/include/linux/times.h~fix-process-start-times 2004-10-05 01:47:53.833316928 -0700 +++ 25-akpm/include/linux/times.h 2004-10-05 01:47:53.843315408 -0700 @@ -55,6 +55,26 @@ static inline u64 jiffies_64_to_clock_t( } #endif +static inline u64 
nsec_to_clock_t(u64 x) +{ +#if (NSEC_PER_SEC % USER_HZ) == 0 + do_div(x, (NSEC_PER_SEC / USER_HZ)); +#elif (USER_HZ % 512) == 0 + x *= USER_HZ/512; + do_div(x, (NSEC_PER_SEC / 512)); +#else + /* + * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024, + * overflow after 64.99 years. + * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ... + */ + x *= 9; + do_div(x, (unsigned long)((9ull * NSEC_PER_SEC + (USER_HZ/2)) + / USER_HZ)); +#endif + return x; +} + struct tms { clock_t tms_utime; clock_t tms_stime; diff -puN kernel/acct.c~fix-process-start-times kernel/acct.c --- 25/kernel/acct.c~fix-process-start-times 2004-10-05 01:47:53.834316776 -0700 +++ 25-akpm/kernel/acct.c 2004-10-05 01:47:53.843315408 -0700 @@ -385,6 +385,8 @@ static void do_acct_process(long exitcod unsigned long vsize; unsigned long flim; u64 elapsed; + u64 run_time; + struct timespec uptime; /* * First check to see if there is enough free_space to continue @@ -402,7 +404,13 @@ static void do_acct_process(long exitcod ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER; strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm)); - elapsed = jiffies_64_to_AHZ(get_jiffies_64() - current->start_time); + /* calculate run_time in nsec*/ + do_posix_clock_monotonic_gettime(&uptime); + run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec; + run_time -= (u64)current->start_time.tv_sec*NSEC_PER_SEC + + current->start_time.tv_nsec; + /* convert nsec -> AHZ */ + elapsed = nsec_to_AHZ(run_time); #if ACCT_VERSION==3 ac.ac_etime = encode_float(elapsed); #else diff -puN kernel/fork.c~fix-process-start-times kernel/fork.c --- 25/kernel/fork.c~fix-process-start-times 2004-10-05 01:47:53.836316472 -0700 +++ 25-akpm/kernel/fork.c 2004-10-05 01:47:53.844315256 -0700 @@ -1003,7 +1003,7 @@ static task_t *copy_process(unsigned lon p->utime = p->stime = 0; p->lock_depth = -1; /* -1 = no lock */ - p->start_time = get_jiffies_64(); + do_posix_clock_monotonic_gettime(&p->start_time); p->security = NULL; 
p->io_context = NULL; p->io_wait = NULL; diff -puN mm/oom_kill.c~fix-process-start-times mm/oom_kill.c --- 25/mm/oom_kill.c~fix-process-start-times 2004-10-05 01:47:53.837316320 -0700 +++ 25-akpm/mm/oom_kill.c 2004-10-05 01:47:53.845315104 -0700 @@ -26,6 +26,7 @@ /** * oom_badness - calculate a numeric value for how bad this task has been * @p: task struct of which task we should calculate + * @uptime: current uptime in seconds * * The formula used is relatively simple and documented inline in the * function. The main rationale is that we want to select a good task @@ -41,7 +42,7 @@ * of least surprise ... (be careful when you change it) */ -static unsigned long badness(struct task_struct *p) +static unsigned long badness(struct task_struct *p, unsigned long uptime) { unsigned long points, cpu_time, run_time, s; @@ -56,12 +57,16 @@ static unsigned long badness(struct task points = p->mm->total_vm; /* - * CPU time is in seconds and run time is in minutes. There is no - * particular reason for this other than that it turned out to work - * very well in practice. + * CPU time is in tens of seconds and run time is in thousands + * of seconds. There is no particular reason for this other than + * that it turned out to work very well in practice. */ cpu_time = (p->utime + p->stime) >> (SHIFT_HZ + 3); - run_time = (get_jiffies_64() - p->start_time) >> (SHIFT_HZ + 10); + + if (uptime >= p->start_time.tv_sec) + run_time = (uptime - p->start_time.tv_sec) >> 10; + else + run_time = 0; s = int_sqrt(cpu_time); if (s) @@ -111,10 +116,12 @@ static struct task_struct * select_bad_p unsigned long maxpoints = 0; struct task_struct *g, *p; struct task_struct *chosen = NULL; + struct timespec uptime; + do_posix_clock_monotonic_gettime(&uptime); do_each_thread(g, p) if (p->pid) { - unsigned long points = badness(p); + unsigned long points = badness(p, uptime.tv_sec); if (points > maxpoints) { chosen = p; maxpoints = points; _