All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] sched/cputime: fix clock_nanosleep/clock_gettime inconsistency
@ 2014-11-12 10:29 Stanislaw Gruszka
  2014-11-12 11:15 ` Peter Zijlstra
  0 siblings, 1 reply; 17+ messages in thread
From: Stanislaw Gruszka @ 2014-11-12 10:29 UTC (permalink / raw)
  To: linux-kernel
  Cc: Rik van Riel, Frederic Weisbecker, KOSAKI Motohiro, Oleg Nesterov,
	Thomas Gleixner, Peter Zijlstra, Ingo Molnar

Commit d670ec13178d "posix-cpu-timers: Cure SMP wobbles" fixes one glibc
test case in cost of breaking another one. After that commit, calling
clock_nanosleep(TIMER_ABSTIME, X) and then clock_gettime(&Y) can result
of Y time being smaller than X time.

Below is full reproducer (tst-cpuclock2.c) :

#define _GNU_SOURCE
#include <unistd.h>
#include <sys/syscall.h>
#include <stdio.h>
#include <time.h>
#include <pthread.h>
#include <stdint.h>
#include <inttypes.h>

/* Parameters for the Linux kernel ABI for CPU clocks.  */
#define CPUCLOCK_SCHED          2
#define MAKE_PROCESS_CPUCLOCK(pid, clock) \
        ((~(clockid_t) (pid) << 3) | (clockid_t) (clock))

static pthread_barrier_t barrier;

/* Help advance the clock.  */
static void *chew_cpu(void *arg)
{
	pthread_barrier_wait(&barrier);
	while (1) ;

	return NULL;
}

/* Don't use the glibc wrapper.  */
static int do_nanosleep(int flags, const struct timespec *req)
{
	clockid_t clock_id = MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED);

	return syscall(SYS_clock_nanosleep, clock_id, flags, req, NULL);
}

static int64_t tsdiff(const struct timespec *before, const struct timespec *after)
{
	int64_t before_i = before->tv_sec * 1000000000ULL + before->tv_nsec;
	int64_t after_i = after->tv_sec * 1000000000ULL + after->tv_nsec;

	return after_i - before_i;
}

int main(void)
{
	int result = 0;
	pthread_t th;

	pthread_barrier_init(&barrier, NULL, 2);

	if (pthread_create(&th, NULL, chew_cpu, NULL) != 0) {
		perror("pthread_create");
		return 1;
	}

	pthread_barrier_wait(&barrier);

	/* The test.  */
	struct timespec before, after, sleeptimeabs;
	int64_t sleepdiff, diffabs;
	const struct timespec sleeptime = {.tv_sec = 0,.tv_nsec = 100000000 };

	/* The relative nanosleep.  Not sure why this is needed, but its presence
	   seems to make it easier to reproduce the problem.  */
	if (do_nanosleep(0, &sleeptime) != 0) {
		perror("clock_nanosleep");
		return 1;
	}

	/* Get the current time.  */
	if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &before) < 0) {
		perror("clock_gettime[2]");
		return 1;
	}

	/* Compute the absolute sleep time based on the current time.  */
	uint64_t nsec = before.tv_nsec + sleeptime.tv_nsec;
	sleeptimeabs.tv_sec = before.tv_sec + nsec / 1000000000;
	sleeptimeabs.tv_nsec = nsec % 1000000000;

	/* Sleep for the computed time.  */
	if (do_nanosleep(TIMER_ABSTIME, &sleeptimeabs) != 0) {
		perror("absolute clock_nanosleep");
		return 1;
	}

	/* Get the time after the sleep.  */
	if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &after) < 0) {
		perror("clock_gettime[3]");
		return 1;
	}

	/* The time after sleep should always be equal to or after the absolute sleep
	   time passed to clock_nanosleep.  */
	sleepdiff = tsdiff(&sleeptimeabs, &after);
	if (sleepdiff < 0) {
		printf("absolute clock_nanosleep woke too early: %" PRId64 "\n", sleepdiff);
		result = 1;

		printf("Before %llu.%09llu\n", before.tv_sec, before.tv_nsec);
		printf("After  %llu.%09llu\n", after.tv_sec, after.tv_nsec);
		printf("Sleep  %llu.%09llu\n", sleeptimeabs.tv_sec, sleeptimeabs.tv_nsec);
	}

	/* The difference between the timestamps taken before and after the
	   clock_nanosleep call should be equal to or more than the duration of the
	   sleep.  */
	diffabs = tsdiff(&before, &after);
	if (diffabs < sleeptime.tv_nsec) {
		printf("clock_gettime difference too small: %" PRId64 "\n", diffabs);
		result = 1;
	}

	pthread_cancel(th);

	return result;
}

It can be compiled and ran by:

gcc -o tst-cpuclock2 tst-cpuclock2.c -pthread
while ./tst-cpuclock2 ; do : ; done

Issue happens because on start in thread_group_cputimer() we initialize
sum_exec_runtime of cputimer with threads runtime not yet accounted and
then add the threads runtime again on scheduler tick. When cputimer
finish, it's sum_exec_runtime value is bigger than current sum counted
by iterating over the threads in thread_group_cputime().

KOSAKI Motohiro posted fix for this problem, but that patch was never
applied: https://lkml.org/lkml/2013/5/26/191.

This patch takes different approach. It revert change from d670ec13178d,
but to assure process times are in sync with thread times, before
accounting the times it calls update_curr() to update current thread
sum_exec_runtime. This fixes the test case from commit d670ec13178d and
also make things like they were before i.e. process cpu times can be
(nr_threads - 1) * TICK_NSEC behind the clock (this is a drawback
unfortunately). Good news is that patch improve performance of
thread_group_cputime(), i.e. we do not need optimizations from commit
911b289 "sched: Optimize task_sched_runtime()"

Note I'm not familiar with scheduler subsystem, hence I'm not sure if
calling update_curr() will not affect scheduler functionality somehow.

Cc: Rik van Riel <riel@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
---
 include/linux/kernel_stat.h    |  5 +--
 kernel/sched/core.c            | 69 +++++-------------------------------------
 kernel/sched/cputime.c         |  2 +-
 kernel/sched/deadline.c        |  2 ++
 kernel/sched/fair.c            |  7 +++++
 kernel/sched/rt.c              |  2 ++
 kernel/sched/sched.h           |  2 ++
 kernel/time/posix-cpu-timers.c |  7 +++--
 8 files changed, 27 insertions(+), 69 deletions(-)

diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 8422b4e..d5bf373 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -77,10 +77,7 @@ static inline unsigned int kstat_cpu_irqs_sum(unsigned int cpu)
 	return kstat_cpu(cpu).irqs_sum;
 }
 
-/*
- * Lock/unlock the current runqueue - to extract task statistics:
- */
-extern unsigned long long task_delta_exec(struct task_struct *);
+extern void scheduler_update_curr(struct task_struct *);
 
 extern void account_user_time(struct task_struct *, cputime_t, cputime_t);
 extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 240157c..117c852 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2475,75 +2475,20 @@ EXPORT_PER_CPU_SYMBOL(kstat);
 EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
 
 /*
- * Return any ns on the sched_clock that have not yet been accounted in
- * @p in case that task is currently running.
- *
- * Called with task_rq_lock() held on @rq.
+ * If the task is currently running, update the statistics. Main purpose
+ * of this function is assure that the accounted runtime is updated.
  */
-static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
-{
-	u64 ns = 0;
-
-	/*
-	 * Must be ->curr _and_ ->on_rq.  If dequeued, we would
-	 * project cycles that may never be accounted to this
-	 * thread, breaking clock_gettime().
-	 */
-	if (task_current(rq, p) && task_on_rq_queued(p)) {
-		update_rq_clock(rq);
-		ns = rq_clock_task(rq) - p->se.exec_start;
-		if ((s64)ns < 0)
-			ns = 0;
-	}
-
-	return ns;
-}
-
-unsigned long long task_delta_exec(struct task_struct *p)
+void scheduler_update_curr(struct task_struct *p)
 {
 	unsigned long flags;
 	struct rq *rq;
-	u64 ns = 0;
 
 	rq = task_rq_lock(p, &flags);
-	ns = do_task_delta_exec(p, rq);
-	task_rq_unlock(rq, p, &flags);
-
-	return ns;
-}
-
-/*
- * Return accounted runtime for the task.
- * In case the task is currently running, return the runtime plus current's
- * pending runtime that have not been accounted yet.
- */
-unsigned long long task_sched_runtime(struct task_struct *p)
-{
-	unsigned long flags;
-	struct rq *rq;
-	u64 ns = 0;
-
-#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
-	/*
-	 * 64-bit doesn't need locks to atomically read a 64bit value.
-	 * So we have a optimization chance when the task's delta_exec is 0.
-	 * Reading ->on_cpu is racy, but this is ok.
-	 *
-	 * If we race with it leaving cpu, we'll take a lock. So we're correct.
-	 * If we race with it entering cpu, unaccounted time is 0. This is
-	 * indistinguishable from the read occurring a few cycles earlier.
-	 * If we see ->on_cpu without ->on_rq, the task is leaving, and has
-	 * been accounted, so we're correct here as well.
-	 */
-	if (!p->on_cpu || !task_on_rq_queued(p))
-		return p->se.sum_exec_runtime;
-#endif
-
-	rq = task_rq_lock(p, &flags);
-	ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
+	if (task_current(rq, p) && task_on_rq_queued(p)) {
+		update_rq_clock(rq);
+		p->sched_class->update_curr(rq);
+	}
 	task_rq_unlock(rq, p, &flags);
-
-	return ns;
 }
 
 /*
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 8394b1e..afcf114 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -305,7 +305,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
 			task_cputime(t, &utime, &stime);
 			times->utime += utime;
 			times->stime += stime;
-			times->sum_exec_runtime += task_sched_runtime(t);
+			times->sum_exec_runtime += t->se.sum_exec_runtime;
 		}
 		/* If lockless access failed, take the lock. */
 		nextseq = 1;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 5285332..28fa9d9 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1701,4 +1701,6 @@ const struct sched_class dl_sched_class = {
 	.prio_changed           = prio_changed_dl,
 	.switched_from		= switched_from_dl,
 	.switched_to		= switched_to_dl,
+
+	.update_curr		= update_curr_dl,
 };
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 34baa60..99995e0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -726,6 +726,11 @@ static void update_curr(struct cfs_rq *cfs_rq)
 	account_cfs_rq_runtime(cfs_rq, delta_exec);
 }
 
+static void update_curr_rq(struct rq *rq)
+{
+	update_curr(&rq->cfs);
+}
+
 static inline void
 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
@@ -7949,6 +7954,8 @@ const struct sched_class fair_sched_class = {
 
 	.get_rr_interval	= get_rr_interval_fair,
 
+	.update_curr		= update_curr_rq,
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	.task_move_group	= task_move_group_fair,
 #endif
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index d024e6c..20bca39 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2128,6 +2128,8 @@ const struct sched_class rt_sched_class = {
 
 	.prio_changed		= prio_changed_rt,
 	.switched_to		= switched_to_rt,
+
+	.update_curr		= update_curr_rt,
 };
 
 #ifdef CONFIG_SCHED_DEBUG
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 24156c84..2df8ef0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1135,6 +1135,8 @@ struct sched_class {
 	unsigned int (*get_rr_interval) (struct rq *rq,
 					 struct task_struct *task);
 
+	void (*update_curr) (struct rq *rq);
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	void (*task_move_group) (struct task_struct *p, int on_rq);
 #endif
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 492b986..c2a5401 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -190,7 +190,8 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
 		*sample = virt_ticks(p);
 		break;
 	case CPUCLOCK_SCHED:
-		*sample = task_sched_runtime(p);
+		scheduler_update_curr(p);
+		*sample = p->se.sum_exec_runtime;
 		break;
 	}
 	return 0;
@@ -221,6 +222,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
 		 * to synchronize the timer to the clock every time we start
 		 * it.
 		 */
+		scheduler_update_curr(tsk);
 		thread_group_cputime(tsk, &sum);
 		raw_spin_lock_irqsave(&cputimer->lock, flags);
 		cputimer->running = 1;
@@ -254,6 +256,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
 		*sample = cputime_to_expires(cputime.utime);
 		break;
 	case CPUCLOCK_SCHED:
+		scheduler_update_curr(p);
 		thread_group_cputime(p, &cputime);
 		*sample = cputime.sum_exec_runtime;
 		break;
@@ -553,7 +556,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
 		*sample = cputime_to_expires(cputime.utime);
 		break;
 	case CPUCLOCK_SCHED:
-		*sample = cputime.sum_exec_runtime + task_delta_exec(p);
+		*sample = cputime.sum_exec_runtime;
 		break;
 	}
 	return 0;
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2014-11-16  9:51 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2014-11-12 10:29 [PATCH] sched/cputime: fix clock_nanosleep/clock_gettime inconsistency Stanislaw Gruszka
2014-11-12 11:15 ` Peter Zijlstra
2014-11-12 11:37   ` Peter Zijlstra
2014-11-12 11:45     ` Peter Zijlstra
2014-11-12 12:27       ` Stanislaw Gruszka
2014-11-12 12:52         ` Peter Zijlstra
2014-11-16  9:50     ` [tip:sched/urgent] sched/cputime: Fix cpu_timer_sample_group() double accounting tip-bot for Peter Zijlstra
2014-11-12 12:21   ` [PATCH] sched/cputime: fix clock_nanosleep/clock_gettime inconsistency Stanislaw Gruszka
2014-11-12 12:51     ` Peter Zijlstra
2014-11-12 15:58       ` [PATCH v2] " Stanislaw Gruszka
2014-11-12 16:06         ` Peter Zijlstra
2014-11-12 16:17           ` Stanislaw Gruszka
2014-11-12 17:12           ` Peter Zijlstra
2014-11-12 16:45         ` Peter Zijlstra
2014-11-12 16:53         ` Stanislaw Gruszka
2014-11-12 16:56         ` Peter Zijlstra
2014-11-16  9:50         ` [tip:sched/urgent] sched/cputime: Fix clock_nanosleep()/ clock_gettime() inconsistency tip-bot for Stanislaw Gruszka

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.